fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at tngl 40 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "cmp" 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 35 "github.com/go-git/go-billy/v5/osfs" 36 "github.com/go-git/go-git/v5/config" 37 "github.com/go-git/go-git/v5/plumbing" 38 "github.com/go-git/go-git/v5/plumbing/cache" 39 "github.com/go-git/go-git/v5/plumbing/object" 40 "github.com/go-git/go-git/v5/storage/filesystem" 41 42 "github.com/sourcegraph/zoekt" 43 "github.com/sourcegraph/zoekt/ignore" 44 "github.com/sourcegraph/zoekt/index" 45 46 git "github.com/go-git/go-git/v5" 47) 48 49// FindGitRepos finds directories holding git repositories below the 50// given directory. It will find both bare and the ".git" dirs in 51// non-bare repositories. It returns the full path including the dir 52// passed in. 53func FindGitRepos(dir string) ([]string, error) { 54 arg, err := filepath.Abs(dir) 55 if err != nil { 56 return nil, err 57 } 58 var dirs []string 59 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 60 // Best-effort, ignore filepath.Walk failing 61 if err != nil { 62 return nil 63 } 64 65 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 66 dirs = append(dirs, filepath.Join(name, ".git")) 67 return filepath.SkipDir 68 } 69 70 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 71 return nil 72 } 73 74 fi, err = os.Lstat(filepath.Join(name, "objects")) 75 if err != nil || !fi.IsDir() { 76 return nil 77 } 78 79 dirs = append(dirs, name) 80 return filepath.SkipDir 81 }); err != nil { 82 return nil, err 83 } 84 85 return dirs, nil 86} 87 88// setTemplates fills in URL templates for known git hosting 89// sites. 90func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 91 if u.Scheme == "ssh+git" { 92 u.Scheme = "https" 93 u.User = nil 94 } 95 96 // helper to generate u.JoinPath as a template 97 varVersion := ".Version" 98 varPath := ".Path" 99 urlJoinPath := func(elem ...string) string { 100 elem = append([]string{u.String()}, elem...) 101 var parts []string 102 for _, e := range elem { 103 if e == varVersion || e == varPath { 104 parts = append(parts, e) 105 } else { 106 parts = append(parts, strconv.Quote(e)) 107 } 108 } 109 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " ")) 110 } 111 112 repo.URL = u.String() 113 switch typ { 114 case "gitiles": 115 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 116 repo.CommitURLTemplate = urlJoinPath("+", varVersion) 117 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath) 118 repo.LineFragmentTemplate = "#{{.LineNumber}}" 119 case "github": 120 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 121 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 122 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath) 123 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 124 case "cgit": 125 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 126 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}" 127 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}" 128 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 129 case "gitweb": 130 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 131 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 132 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 133 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 134 case "source.bazel.build": 135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 136 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 137 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}" 138 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}" 139 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 140 case "bitbucket-server": 141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 142 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 143 repo.CommitURLTemplate = urlJoinPath("commits", varVersion) 144 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}" 145 repo.LineFragmentTemplate = "#{{.LineNumber}}" 146 case "bitbucket-cloud": 147 // https://bitbucket.org/<workspace>/<repo_slug>/commits/<version> 148 // https://bitbucket.org/<workspace>/<repo_slug>/src/<version>/<path> 149 repo.CommitURLTemplate = urlJoinPath("commits", varVersion) 150 repo.FileURLTemplate = urlJoinPath("src", varVersion, varPath) 151 repo.LineFragmentTemplate = "#{{.LineNumber}}" 152 case "azuredevops": 153 // https://dev.azure.com/<organization>/<project>/_git/<repo>/commit/<version> 154 // https://dev.azure.com/<organization>/<project>/_git/<repo>?path=/<path>&version=GC<version> 155 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 156 repo.FileURLTemplate = urlJoinPath() + "?path=/{{.Path}}&version=GC{{.Version}}&_a=contents" 157 repo.LineFragmentTemplate = "&line={{.LineNumber}}&lineEnd={{.LineNumber}}&lineStartColumn=1&lineEndColumn=200" 158 case "gitlab": 159 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 160 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 161 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion) 162 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath) 163 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 164 case "gitea": 165 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 166 // NOTE The `display=source` query parameter is required to disable file rendering. 167 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 168 // a line without `display=source`. This is supported since gitea 1.17.0. 169 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 170 // but the query parameters are obmitted. 171 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source" 172 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 173 default: 174 return fmt.Errorf("URL scheme type %q unknown", typ) 175 } 176 return nil 177} 178 179// getCommit returns a tree object for the given reference. 180func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 181 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 182 // ref might be a branch name (e.g. "master") add branch prefix and try again. 183 if err != nil { 184 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 185 } 186 if err != nil { 187 return nil, err 188 } 189 190 commitObj, err := repo.CommitObject(*sha1) 191 if err != nil { 192 return nil, err 193 } 194 return commitObj, nil 195} 196 197func plainOpenRepo(repoDir string) (*git.Repository, error) { 198 // Try repoDir as the repository root first so bare repositories open 199 // correctly. If repoDir itself is not a repository, fall back to searching 200 // for a .git entry to preserve compatibility with worktree paths. 201 repo, err := git.PlainOpenWithOptions(repoDir, &git.PlainOpenOptions{ 202 EnableDotGitCommonDir: true, 203 }) 204 if err == nil || !errors.Is(err, git.ErrRepositoryNotExists) { 205 return repo, err 206 } 207 208 return git.PlainOpenWithOptions(repoDir, &git.PlainOpenOptions{ 209 DetectDotGit: true, 210 EnableDotGitCommonDir: true, 211 }) 212} 213 214func configLookupRemoteURL(cfg *config.Config, key string) string { 215 rc := cfg.Remotes[key] 216 if rc == nil || len(rc.URLs) == 0 { 217 return "" 218 } 219 return rc.URLs[0] 220} 221 222var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 223 224func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 225 repo, err := plainOpenRepo(repoDir) 226 if err != nil { 227 return err 228 } 229 230 cfg, err := repo.Config() 231 if err != nil { 232 return err 233 } 234 235 return setTemplatesFromRepoConfig(desc, cfg) 236} 237 238func setTemplatesFromRepo(desc *zoekt.Repository, repo *git.Repository, repoDir string) error { 239 cfg, err := repo.Config() 240 if err == nil { 241 return setTemplatesFromRepoConfig(desc, cfg) 242 } 243 244 return setTemplatesFromConfig(desc, repoDir) 245} 246 247func setTemplatesFromRepoConfig(desc *zoekt.Repository, cfg *config.Config) error { 248 sec := cfg.Raw.Section("zoekt") 249 250 webURLStr := sec.Options.Get("web-url") 251 webURLType := sec.Options.Get("web-url-type") 252 253 if webURLType != "" && webURLStr != "" { 254 webURL, err := url.Parse(webURLStr) 255 if err != nil { 256 return err 257 } 258 if err := setTemplates(desc, webURL, webURLType); err != nil { 259 return err 260 } 261 } else if webURLStr != "" { 262 desc.URL = webURLStr 263 } 264 265 name := sec.Options.Get("name") 266 if name != "" { 267 desc.Name = name 268 } else { 269 remoteURL := configLookupRemoteURL(cfg, "origin") 270 if remoteURL == "" { 271 return nil 272 } 273 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 274 user := sm[1] 275 host := sm[2] 276 path := sm[3] 277 278 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 279 } 280 281 u, err := url.Parse(remoteURL) 282 if err != nil { 283 return err 284 } 285 if err := SetTemplatesFromOrigin(desc, u); err != nil { 286 return err 287 } 288 } 289 290 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 291 desc.ID = uint32(id) 292 293 desc.TenantID, _ = strconv.Atoi(sec.Options.Get("tenantID")) 294 295 if desc.RawConfig == nil { 296 desc.RawConfig = map[string]string{} 297 } 298 for _, o := range sec.Options { 299 desc.RawConfig[o.Key] = o.Value 300 } 301 302 // Ranking info. 303 304 // Github: 305 traction := 0 306 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 307 f, err := strconv.Atoi(sec.Options.Get(s)) 308 if err == nil { 309 traction += f 310 } 311 } 312 313 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 314 // Pretend everything on googlesource.com has 1000 315 // github stars. 316 traction = 1000 317 } 318 319 if traction > 0 { 320 l := math.Log(float64(traction)) 321 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 322 } 323 324 return nil 325} 326 327// This attempts to get a repo URL similar to the main repository template processing as in setTemplatesFromConfig() 328func normalizeSubmoduleRemoteURL(cfg *config.Config) (string, error) { 329 sec := cfg.Raw.Section("zoekt") 330 remoteURL := sec.Options.Get("web-url") 331 if remoteURL == "" { 332 // fall back to "origin" remote 333 remoteURL = configLookupRemoteURL(cfg, "origin") 334 if remoteURL == "" { 335 return "", nil 336 } 337 } 338 339 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 340 user := sm[1] 341 host := sm[2] 342 path := sm[3] 343 344 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 345 } 346 347 u, err := url.Parse(remoteURL) 348 if err != nil { 349 return "", fmt.Errorf("unable to parse remote URL %q: %w", remoteURL, err) 350 } 351 352 if u.Scheme == "ssh+git" { 353 u.Scheme = "https" 354 u.User = nil 355 } 356 357 // Assume we cannot build templates for this URL, leave it empty 358 if u.Scheme == "" { 359 return "", nil 360 } 361 362 return u.String(), nil 363} 364 365// SetTemplatesFromOrigin fills in templates based on the origin URL. 366func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 367 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 368 369 if strings.HasSuffix(u.Host, ".googlesource.com") { 370 return setTemplates(desc, u, "gitiles") 371 } else if u.Host == "github.com" { 372 u.Path = strings.TrimSuffix(u.Path, ".git") 373 return setTemplates(desc, u, "github") 374 } else { 375 return fmt.Errorf("unknown git hosting site %q", u) 376 } 377} 378 379// The Options structs controls details of the indexing process. 380type Options struct { 381 // The repository to be indexed. 382 RepoDir string 383 384 // If set, follow submodule links. This requires RepoCacheDir to be set. 385 Submodules bool 386 387 // If set, skip indexing if the existing index shard is newer 388 // than the refs in the repository. 389 Incremental bool 390 391 // Don't error out if some branch is missing 392 AllowMissingBranch bool 393 394 // Specifies the root of a Repository cache. Needed for submodule indexing. 395 RepoCacheDir string 396 397 // Indexing options. 398 BuildOptions index.Options 399 400 // Prefix of the branch to index, e.g. `remotes/origin`. 401 BranchPrefix string 402 403 // List of branch names to index, e.g. []string{"HEAD", "stable"} 404 Branches []string 405 406 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 407 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 408 // then a normal build will be performed instead. 409 // 410 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 411 // a delta build will always be performed regardless of the number of preexisting shards. 412 DeltaShardNumberFallbackThreshold uint64 413} 414 415func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 416 var result []string 417 for _, b := range bs { 418 // Sourcegraph: We disable resolving refs. We want to return the exact ref 419 // requested so we can match it up. 420 if b == "HEAD" && false { 421 ref, err := repo.Head() 422 if err != nil { 423 return nil, err 424 } 425 426 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 427 continue 428 } 429 430 if strings.Contains(b, "*") { 431 iter, err := repo.Branches() 432 if err != nil { 433 return nil, err 434 } 435 436 defer iter.Close() 437 for { 438 ref, err := iter.Next() 439 if err == io.EOF { 440 break 441 } 442 if err != nil { 443 return nil, err 444 } 445 446 name := ref.Name().Short() 447 if matched, err := filepath.Match(b, name); err != nil { 448 return nil, err 449 } else if !matched { 450 continue 451 } 452 453 result = append(result, strings.TrimPrefix(name, prefix)) 454 } 455 continue 456 } 457 458 result = append(result, b) 459 } 460 461 return result, nil 462} 463 464// IndexGitRepo indexes the git repository as specified by the options. 465// The returned bool indicates whether the index was updated as a result. This 466// can be informative if doing incremental indexing. 467func IndexGitRepo(opts Options) (bool, error) { 468 return indexGitRepo(opts, gitIndexConfig{}) 469} 470 471// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 472// The returned bool indicates whether the index was updated as a result. This 473// can be informative if doing incremental indexing. 474func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 475 prepareDeltaBuild := prepareDeltaBuild 476 if config.prepareDeltaBuild != nil { 477 prepareDeltaBuild = config.prepareDeltaBuild 478 } 479 480 prepareNormalBuild := prepareNormalBuild 481 if config.prepareNormalBuild != nil { 482 prepareNormalBuild = config.prepareNormalBuild 483 } 484 485 // Set max thresholds, since we use them in this function. 486 opts.BuildOptions.SetDefaults() 487 if opts.RepoDir == "" { 488 return false, fmt.Errorf("gitindex: must set RepoDir") 489 } 490 491 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 492 493 var repo *git.Repository 494 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false") 495 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil { 496 repo, err = plainOpenRepo(opts.RepoDir) 497 if err != nil { 498 return false, fmt.Errorf("plainOpenRepo: %w", err) 499 } 500 } else { 501 var repoCloser io.Closer 502 repo, repoCloser, err = openRepo(opts.RepoDir) 503 if err != nil { 504 return false, fmt.Errorf("openRepo: %w", err) 505 } 506 defer repoCloser.Close() 507 } 508 509 if err := setTemplatesFromRepo(&opts.BuildOptions.RepositoryDescription, repo, opts.RepoDir); err != nil { 510 log.Printf("setTemplatesFromRepo(%s): %s", opts.RepoDir, err) 511 } 512 513 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 514 if err != nil { 515 return false, fmt.Errorf("expandBranches: %w", err) 516 } 517 for _, b := range branches { 518 commit, err := getCommit(repo, opts.BranchPrefix, b) 519 if err != nil { 520 if opts.AllowMissingBranch && err.Error() == "reference not found" { 521 continue 522 } 523 524 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 525 } 526 527 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 528 Name: b, 529 Version: commit.Hash.String(), 530 }) 531 532 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 533 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 534 } 535 } 536 537 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 538 return false, nil 539 } 540 541 // branch => (path, sha1) => repo. 542 var repos map[fileKey]BlobLocation 543 544 // Branch => Repo => SHA1 545 var branchVersions map[string]map[string]plumbing.Hash 546 547 // set of file paths that have been changed or deleted since 548 // the last indexed commit 549 // 550 // These only have an effect on delta builds 551 var changedOrRemovedFiles []string 552 553 if opts.BuildOptions.IsDelta { 554 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 555 if err != nil { 556 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 557 opts.BuildOptions.IsDelta = false 558 } 559 } 560 561 if !opts.BuildOptions.IsDelta { 562 repos, branchVersions, err = prepareNormalBuild(opts, repo) 563 if err != nil { 564 return false, fmt.Errorf("preparing normal build: %w", err) 565 } 566 } 567 568 reposByPath := map[string]BlobLocation{} 569 for key, info := range repos { 570 reposByPath[key.SubRepoPath] = info 571 } 572 573 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 574 for path, info := range reposByPath { 575 tpl := opts.BuildOptions.RepositoryDescription 576 if path != "" { 577 tpl = zoekt.Repository{URL: info.URL.String()} 578 if info.URL.String() != "" { 579 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { 580 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) 581 } 582 } 583 if tpl.Name == "" { 584 tpl.Name = path 585 } 586 } 587 opts.BuildOptions.SubRepositories[path] = &tpl 588 } 589 590 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 591 for path, repo := range opts.BuildOptions.SubRepositories { 592 id := branchVersions[br.Name][path] 593 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 594 Name: br.Name, 595 Version: id.String(), 596 }) 597 } 598 } 599 600 builder, err := index.NewBuilder(opts.BuildOptions) 601 if err != nil { 602 return false, fmt.Errorf("build.NewBuilder: %w", err) 603 } 604 605 // Preparing the build can consume substantial memory, so check usage before starting to index. 606 builder.CheckMemoryUsage() 607 608 // we don't need to check error, since we either already have an error, or 609 // we returning the first call to builder.Finish. 610 defer builder.Finish() // nolint:errcheck 611 612 for _, f := range changedOrRemovedFiles { 613 builder.MarkFileAsChangedOrRemoved(f) 614 } 615 616 var names []string 617 fileKeys := map[string][]fileKey{} 618 totalFiles := 0 619 620 for key := range repos { 621 n := key.FullPath() 622 fileKeys[n] = append(fileKeys[n], key) 623 names = append(names, n) 624 totalFiles++ 625 } 626 627 sort.Strings(names) 628 names = uniq(names) 629 630 // Separate main-repo keys from submodule keys, collecting blob SHAs 631 // for the main repo so we can stream them via git cat-file --batch. 632 // ZOEKT_DISABLE_CATFILE_BATCH=true falls back to the go-git path for 633 // all files, useful as a kill switch if the cat-file path causes issues. 634 // 635 // 2026-04-02(keegan) we are regularly seeing git growing to over 9GB in 636 // memory usage in our production cluster. Disabling by default until the 637 // issue is resolved. 638 catfileBatchDisabled := cmp.Or(os.Getenv("ZOEKT_DISABLE_CATFILE_BATCH"), "true") 639 useCatfileBatch := true 640 if disabled, _ := strconv.ParseBool(catfileBatchDisabled); disabled { 641 useCatfileBatch = false 642 log.Printf("cat-file batch disabled via ZOEKT_DISABLE_CATFILE_BATCH, using go-git") 643 } 644 645 mainRepoKeys := make([]fileKey, 0, totalFiles) 646 mainRepoIDs := make([]plumbing.Hash, 0, totalFiles) 647 var submoduleKeys []fileKey 648 649 for _, name := range names { 650 for _, key := range fileKeys[name] { 651 if useCatfileBatch && key.SubRepoPath == "" { 652 mainRepoKeys = append(mainRepoKeys, key) 653 mainRepoIDs = append(mainRepoIDs, key.ID) 654 } else { 655 submoduleKeys = append(submoduleKeys, key) 656 } 657 } 658 } 659 660 log.Printf("attempting to index %d total files (%d via cat-file, %d via go-git)", totalFiles, len(mainRepoIDs), len(submoduleKeys)) 661 662 // Stream main-repo blobs via pipelined cat-file --batch --buffer. 663 // Large blobs are skipped without reading content into memory. 664 if len(mainRepoIDs) > 0 { 665 crOpts := catfileReaderOptions{ 666 filterSpec: catfileFilterSpec(opts), 667 } 668 cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs, crOpts) 669 if err != nil { 670 return false, fmt.Errorf("newCatfileReader: %w", err) 671 } 672 673 if err := indexCatfileBlobs(cr, mainRepoKeys, repos, opts, builder); err != nil { 674 return false, err 675 } 676 } 677 678 // Index submodule blobs via go-git. 679 for idx, key := range submoduleKeys { 680 doc, err := createDocument(key, repos, opts.BuildOptions) 681 if err != nil { 682 return false, err 683 } 684 685 if err := builder.Add(doc); err != nil { 686 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 687 } 688 689 if idx%10_000 == 0 { 690 builder.CheckMemoryUsage() 691 } 692 } 693 694 return true, builder.Finish() 695} 696 697// indexCatfileBlobs streams main-repo blobs from the catfileReader into the 698// builder. Large blobs are skipped without reading content into memory. 699// keys must correspond 1:1 (in order) with the ids passed to newCatfileReader. 700// The reader is always closed when this function returns. 701func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error { 702 defer cr.Close() 703 704 slab := newContentSlab(16 << 20) // 16 MB per slab 705 706 for idx, key := range keys { 707 size, missing, excluded, err := cr.Next() 708 if err != nil { 709 return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err) 710 } 711 712 branches := repos[key].Branches 713 var doc index.Document 714 715 if missing { 716 // Unexpected for local repos — may indicate corruption, shallow 717 // clone, or a race with git gc. Log a warning and skip. 718 log.Printf("warning: blob %s missing for %s", key.ID, key.FullPath()) 719 doc = skippedDoc(key, branches, index.SkipReasonMissing) 720 } else if excluded { 721 doc = skippedDoc(key, branches, index.SkipReasonTooLarge) 722 } else { 723 keyFullPath := key.FullPath() 724 if size > opts.BuildOptions.SizeMax && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) { 725 // Skip without reading content into memory. 726 doc = skippedDoc(key, branches, index.SkipReasonTooLarge) 727 } else { 728 content := slab.alloc(size) 729 if _, err := io.ReadFull(cr, content); err != nil { 730 return fmt.Errorf("read blob %s: %w", keyFullPath, err) 731 } 732 doc = index.Document{ 733 SubRepositoryPath: key.SubRepoPath, 734 Name: keyFullPath, 735 Content: content, 736 Branches: branches, 737 } 738 } 739 } 740 741 if err := builder.Add(doc); err != nil { 742 return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 743 } 744 745 if idx%10_000 == 0 { 746 builder.CheckMemoryUsage() 747 } 748 } 749 750 return nil 751} 752 753// openRepo opens a git repository in a way that's optimized for indexing. 754// 755// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options. 756func openRepo(repoDir string) (*git.Repository, io.Closer, error) { 757 fs := osfs.New(repoDir) 758 759 // Check if the root directory exists. 760 if _, err := fs.Stat(""); err != nil { 761 if os.IsNotExist(err) { 762 return nil, nil, git.ErrRepositoryNotExists 763 } 764 return nil, nil, err 765 } 766 767 fi, err := fs.Stat(git.GitDirName) 768 if err == nil && !fi.IsDir() { 769 return openCompatibleRepo(repoDir) 770 } 771 772 return openOptimizedRepo(repoDir) 773} 774 775func openCompatibleRepo(repoDir string) (*git.Repository, io.Closer, error) { 776 repo, err := plainOpenRepo(repoDir) 777 if err != nil { 778 return nil, nil, err 779 } 780 781 return repo, noopCloser{}, nil 782} 783 784func openOptimizedRepo(repoDir string) (*git.Repository, io.Closer, error) { 785 fs := osfs.New(repoDir) 786 wt := fs 787 788 // If there's a .git directory, use that as the new root. 789 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() { 790 if fs, err = fs.Chroot(git.GitDirName); err != nil { 791 return nil, nil, fmt.Errorf("fs.Chroot: %w", err) 792 } 793 } 794 795 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ 796 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access 797 KeepDescriptors: true, 798 }) 799 800 // Because we're keeping descriptors open, we need to close the storage object when we're done. 801 repo, err := git.Open(s, wt) 802 return repo, s, err 803} 804 805type noopCloser struct{} 806 807func (noopCloser) Close() error { return nil } 808 809func catfileFilterSpec(opts Options) string { 810 // Can't filter by size if we have large file exceptions 811 if len(opts.BuildOptions.LargeFiles) > 0 { 812 return "" 813 } 814 815 if opts.BuildOptions.SizeMax <= 0 { 816 return "" 817 } 818 819 // Git's blob:limit filter excludes blobs whose size is >= the given limit, 820 // while zoekt indexes files up to and including SizeMax bytes. 821 return fmt.Sprintf("blob:limit=%d", int64(opts.BuildOptions.SizeMax)+1) 822} 823 824func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 825 ignoreFile, err := tree.File(ignore.IgnoreFile) 826 if err == object.ErrFileNotFound { 827 return &ignore.Matcher{}, nil 828 } 829 if err != nil { 830 return nil, err 831 } 832 content, err := ignoreFile.Contents() 833 if err != nil { 834 return nil, err 835 } 836 return ignore.ParseIgnoreFile(strings.NewReader(content)) 837} 838 839// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 840// a build.Builder instance for generating a delta build. 841type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 842 843// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 844// a build.Builder instance for generating a normal build. 845type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) 846 847type gitIndexConfig struct { 848 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 849 // prepare the build.Builder instance for generating a delta build. 850 // 851 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 852 prepareDeltaBuild prepareDeltaBuildFunc 853 854 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 855 // prepare the build.Builder instance for generating a normal build. 856 // 857 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 858 prepareNormalBuild prepareNormalBuildFunc 859} 860 861func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 862 if options.Submodules { 863 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 864 } 865 866 // discover what commits we indexed during our last build 867 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 868 if err != nil { 869 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 870 } 871 872 if !ok { 873 return nil, nil, nil, fmt.Errorf("no existing shards found for repository") 874 } 875 876 if options.DeltaShardNumberFallbackThreshold > 0 { 877 // HACK: For our interim compaction strategy, we force a full normal index once 878 // the number of shards on disk for this repository exceeds the provided threshold. 879 // 880 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 881 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 882 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 883 // while we create a better compaction strategy). 884 885 oldShards := options.BuildOptions.FindAllShards() 886 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 887 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 888 } 889 } 890 891 // Check to see if the set of branch names is consistent with what we last indexed. 892 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 893 // normal one). 894 895 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 896 var existingBranchNames []string 897 for _, b := range existingRepository.Branches { 898 existingBranchNames = append(existingBranchNames, b.Name) 899 } 900 901 var optionsBranchNames []string 902 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 903 optionsBranchNames = append(optionsBranchNames, b.Name) 904 } 905 906 existingBranchList := strings.Join(existingBranchNames, ", ") 907 optionsBranchList := strings.Join(optionsBranchNames, ", ") 908 909 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 910 } 911 912 // Check if the build options hash does not match the repository metadata's hash 913 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build 914 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 915 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 916 } 917 918 // branch => (path, sha1) => repo. 919 repos = map[fileKey]BlobLocation{} 920 921 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 922 if err != nil { 923 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 924 } 925 926 // branch name -> git worktree at most current commit 927 branchToCurrentTree := make(map[string]*object.Tree, len(branches)) 928 929 for _, b := range branches { 930 commit, err := getCommit(repository, options.BranchPrefix, b) 931 if err != nil { 932 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 933 } 934 935 tree, err := commit.Tree() 936 if err != nil { 937 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 938 } 939 940 branchToCurrentTree[b] = tree 941 } 942 943 rawURL := options.BuildOptions.RepositoryDescription.URL 944 u, err := url.Parse(rawURL) 945 if err != nil { 946 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 947 } 948 949 // TODO: Support repository submodules for delta builds 950 951 // loop over all branches, calculate the diff between our 952 // last indexed commit and the current commit, and add files mentioned in the diff 953 for _, branch := range existingRepository.Branches { 954 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 955 if err != nil { 956 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 957 } 958 959 lastIndexedTree, err := lastIndexedCommit.Tree() 960 if err != nil { 961 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 962 } 963 964 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 965 if err != nil { 966 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 967 } 968 969 for i, c := range changes { 970 oldFile, newFile, err := c.Files() 971 if err != nil { 972 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 973 } 974 975 if newFile != nil { 976 // note: newFile.Name could be a path that isn't relative to the repository root - using the 977 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 978 newFileRelativeRootPath := c.To.Name 979 980 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 981 if newFileRelativeRootPath == ignore.IgnoreFile { 982 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 983 } 984 985 // either file is added or renamed, so we need to add the new version to the build 986 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 987 if existing, ok := repos[file]; ok { 988 existing.Branches = append(existing.Branches, branch.Name) 989 repos[file] = existing 990 } else { 991 repos[file] = BlobLocation{ 992 GitRepo: repository, 993 URL: u, 994 Branches: []string{branch.Name}, 995 } 996 } 997 } 998 999 if oldFile == nil { 1000 // file added - nothing more to do 1001 continue 1002 } 1003 1004 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 1005 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 1006 oldFileRelativeRootPath := c.From.Name 1007 1008 if oldFileRelativeRootPath == ignore.IgnoreFile { 1009 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 1010 } 1011 1012 // The file is either modified or deleted. So, we need to add ALL versions 1013 // of the old file (across all branches) to the build. 1014 for b, currentTree := range branchToCurrentTree { 1015 f, err := currentTree.File(oldFileRelativeRootPath) 1016 if err != nil { 1017 // the file doesn't exist in this branch 1018 if errors.Is(err, object.ErrFileNotFound) { 1019 continue 1020 } 1021 1022 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 1023 } 1024 1025 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 1026 if existing, ok := repos[file]; ok { 1027 existing.Branches = append(existing.Branches, b) 1028 repos[file] = existing 1029 } else { 1030 repos[file] = BlobLocation{ 1031 GitRepo: repository, 1032 URL: u, 1033 Branches: []string{b}, 1034 } 1035 } 1036 } 1037 1038 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 1039 } 1040 } 1041 1042 // we need to de-duplicate the branch map before returning it - it's possible for the same 1043 // branch to have been added multiple times if a file has been modified across multiple commits 1044 for _, info := range repos { 1045 sort.Strings(info.Branches) 1046 info.Branches = uniq(info.Branches) 1047 } 1048 1049 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 1050 // for the same reasoning as above 1051 sort.Strings(changedOrDeletedPaths) 1052 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 1053 1054 return repos, nil, changedOrDeletedPaths, nil 1055} 1056 1057func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 1058 var repoCache *RepoCache 1059 if options.Submodules && options.RepoCacheDir != "" { 1060 repoCache = NewRepoCache(options.RepoCacheDir) 1061 } 1062 return prepareNormalBuildRecurse(options, repository, repoCache, false) 1063} 1064 1065func prepareNormalBuildRecurse(options Options, repository *git.Repository, repoCache *RepoCache, isSubrepo bool) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 1066 // Branch => Repo => SHA1 1067 branchVersions = map[string]map[string]plumbing.Hash{} 1068 1069 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 1070 if err != nil { 1071 return nil, nil, fmt.Errorf("expandBranches: %w", err) 1072 } 1073 1074 repoURL := options.BuildOptions.RepositoryDescription.URL 1075 1076 if isSubrepo { 1077 cfg, err := repository.Config() 1078 if err != nil { 1079 return nil, nil, fmt.Errorf("unable to get repository config: %w", err) 1080 } 1081 1082 u, err := normalizeSubmoduleRemoteURL(cfg) 1083 if err != nil { 1084 return nil, nil, fmt.Errorf("failed to identify subrepository URL: %w", err) 1085 } 1086 repoURL = u 1087 } 1088 1089 rw := NewRepoWalker(repository, repoURL, repoCache) 1090 for _, b := range branches { 1091 commit, err := getCommit(repository, options.BranchPrefix, b) 1092 if err != nil { 1093 if options.AllowMissingBranch && err.Error() == "reference not found" { 1094 continue 1095 } 1096 1097 return nil, nil, fmt.Errorf("getCommit: %w", err) 1098 } 1099 1100 tree, err := commit.Tree() 1101 if err != nil { 1102 return nil, nil, fmt.Errorf("commit.Tree: %w", err) 1103 } 1104 1105 ig, err := newIgnoreMatcher(tree) 1106 if err != nil { 1107 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 1108 } 1109 1110 subVersions, err := rw.CollectFiles(tree, b, ig) 1111 if err != nil { 1112 return nil, nil, fmt.Errorf("CollectFiles: %w", err) 1113 } 1114 1115 branchVersions[b] = subVersions 1116 } 1117 1118 // Index submodules using go-git if we didn't do so using the repo cache 1119 if options.Submodules && options.RepoCacheDir == "" { 1120 worktree, err := repository.Worktree() 1121 if err != nil { 1122 return nil, nil, fmt.Errorf("failed to get repository worktree: %w", err) 1123 } 1124 1125 submodules, err := worktree.Submodules() 1126 if err != nil { 1127 return nil, nil, fmt.Errorf("failed to get submodules: %w", err) 1128 } 1129 1130 for _, submodule := range submodules { 1131 subRepository, err := submodule.Repository() 1132 if err != nil { 1133 log.Printf("failed to open submodule repository: %s, %s", submodule.Config().Name, err) 1134 continue 1135 } 1136 1137 sw, subVersions, err := prepareNormalBuildRecurse(options, subRepository, repoCache, true) 1138 if err != nil { 1139 log.Printf("failed to index submodule repository: %s, %s", submodule.Config().Name, err) 1140 continue 1141 } 1142 1143 log.Printf("adding subrepository files from: %s", submodule.Config().Name) 1144 1145 for k, repo := range sw { 1146 rw.Files[fileKey{ 1147 SubRepoPath: filepath.Join(submodule.Config().Path, k.SubRepoPath), 1148 Path: k.Path, 1149 ID: k.ID, 1150 }] = repo 1151 } 1152 1153 for k, v := range subVersions { 1154 branchVersions[filepath.Join(submodule.Config().Path, k)] = v 1155 } 1156 } 1157 } 1158 1159 return rw.Files, branchVersions, nil 1160} 1161 1162func createDocument(key fileKey, 1163 repos map[fileKey]BlobLocation, 1164 opts index.Options, 1165) (index.Document, error) { 1166 repo := repos[key] 1167 blob, err := repo.GitRepo.BlobObject(key.ID) 1168 branches := repos[key].Branches 1169 1170 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 1171 if errors.Is(err, plumbing.ErrObjectNotFound) { 1172 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil 1173 } 1174 1175 if err != nil { 1176 return index.Document{}, err 1177 } 1178 1179 keyFullPath := key.FullPath() 1180 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 1181 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil 1182 } 1183 1184 contents, err := blobContents(blob) 1185 if err != nil { 1186 return index.Document{}, err 1187 } 1188 1189 return index.Document{ 1190 SubRepositoryPath: key.SubRepoPath, 1191 Name: keyFullPath, 1192 Content: contents, 1193 Branches: branches, 1194 }, nil 1195} 1196 1197// skippedDoc creates a Document placeholder for a blob that was not indexed. 1198func skippedDoc(key fileKey, branches []string, reason index.SkipReason) index.Document { 1199 return index.Document{ 1200 SkipReason: reason, 1201 Name: key.FullPath(), 1202 Branches: branches, 1203 SubRepositoryPath: key.SubRepoPath, 1204 } 1205} 1206 1207func blobContents(blob *object.Blob) ([]byte, error) { 1208 r, err := blob.Reader() 1209 if err != nil { 1210 return nil, err 1211 } 1212 defer r.Close() 1213 1214 var buf bytes.Buffer 1215 buf.Grow(int(blob.Size)) 1216 _, err = buf.ReadFrom(r) 1217 if err != nil { 1218 return nil, err 1219 } 1220 return buf.Bytes(), nil 1221} 1222 1223func uniq(ss []string) []string { 1224 result := ss[:0] 1225 var last string 1226 for i, s := range ss { 1227 if i == 0 || s != last { 1228 result = append(result, s) 1229 } 1230 last = s 1231 } 1232 return result 1233}