fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "cmp"
21 "context"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34
35 "github.com/go-git/go-billy/v5/osfs"
36 "github.com/go-git/go-git/v5/config"
37 "github.com/go-git/go-git/v5/plumbing"
38 "github.com/go-git/go-git/v5/plumbing/cache"
39 "github.com/go-git/go-git/v5/plumbing/object"
40 "github.com/go-git/go-git/v5/storage/filesystem"
41
42 "github.com/sourcegraph/zoekt"
43 "github.com/sourcegraph/zoekt/ignore"
44 "github.com/sourcegraph/zoekt/index"
45
46 git "github.com/go-git/go-git/v5"
47)
48
// FindGitRepos walks the tree rooted at dir and returns every git directory
// it finds, as absolute paths. Two layouts are recognised:
//
//   - a directory containing a ".git" subdirectory (non-bare checkout); the
//     path of the ".git" subdirectory is reported, and
//   - a directory whose name ends in ".git" and which contains an "objects"
//     subdirectory (bare repository); the directory itself is reported.
//
// Once a repository is found the walk does not descend into it. Errors
// encountered while walking individual entries are ignored (best effort).
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best effort: unreadable entries are skipped rather than aborting the walk.
		if walkErr != nil {
			return nil
		}

		// Non-bare checkout: the entry holds a ".git" directory.
		dotGit := filepath.Join(p, ".git")
		if st, statErr := os.Lstat(dotGit); statErr == nil && st.IsDir() {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository candidate: a directory named "*.git" ...
		if !info.IsDir() || !strings.HasSuffix(p, ".git") {
			return nil
		}

		// ... that also has an "objects" directory.
		objects, statErr := os.Lstat(filepath.Join(p, "objects"))
		if statErr != nil || !objects.IsDir() {
			return nil
		}

		found = append(found, p)
		return filepath.SkipDir
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
87
88// setTemplates fills in URL templates for known git hosting
89// sites.
90func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
91 if u.Scheme == "ssh+git" {
92 u.Scheme = "https"
93 u.User = nil
94 }
95
96 // helper to generate u.JoinPath as a template
97 varVersion := ".Version"
98 varPath := ".Path"
99 urlJoinPath := func(elem ...string) string {
100 elem = append([]string{u.String()}, elem...)
101 var parts []string
102 for _, e := range elem {
103 if e == varVersion || e == varPath {
104 parts = append(parts, e)
105 } else {
106 parts = append(parts, strconv.Quote(e))
107 }
108 }
109 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " "))
110 }
111
112 repo.URL = u.String()
113 switch typ {
114 case "gitiles":
115 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
116 repo.CommitURLTemplate = urlJoinPath("+", varVersion)
117 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath)
118 repo.LineFragmentTemplate = "#{{.LineNumber}}"
119 case "github":
120 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
121 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
122 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath)
123 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
124 case "cgit":
125 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
126 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}"
127 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}"
128 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
129 case "gitweb":
130 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
131 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
132 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
133 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
134 case "source.bazel.build":
135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
136 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
137 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}"
138 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}"
139 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
140 case "bitbucket-server":
141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
142 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
143 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
144 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}"
145 repo.LineFragmentTemplate = "#{{.LineNumber}}"
146 case "bitbucket-cloud":
147 // https://bitbucket.org/<workspace>/<repo_slug>/commits/<version>
148 // https://bitbucket.org/<workspace>/<repo_slug>/src/<version>/<path>
149 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
150 repo.FileURLTemplate = urlJoinPath("src", varVersion, varPath)
151 repo.LineFragmentTemplate = "#{{.LineNumber}}"
152 case "azuredevops":
153 // https://dev.azure.com/<organization>/<project>/_git/<repo>/commit/<version>
154 // https://dev.azure.com/<organization>/<project>/_git/<repo>?path=/<path>&version=GC<version>
155 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
156 repo.FileURLTemplate = urlJoinPath() + "?path=/{{.Path}}&version=GC{{.Version}}&_a=contents"
157 repo.LineFragmentTemplate = "&line={{.LineNumber}}&lineEnd={{.LineNumber}}&lineStartColumn=1&lineEndColumn=200"
158 case "gitlab":
159 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
160 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
161 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion)
162 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath)
163 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
164 case "gitea":
165 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
166 // NOTE The `display=source` query parameter is required to disable file rendering.
167 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
168 // a line without `display=source`. This is supported since gitea 1.17.0.
169 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
170 // but the query parameters are obmitted.
171 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source"
172 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
173 default:
174 return fmt.Errorf("URL scheme type %q unknown", typ)
175 }
176 return nil
177}
178
179// getCommit returns a tree object for the given reference.
180func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
181 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
182 // ref might be a branch name (e.g. "master") add branch prefix and try again.
183 if err != nil {
184 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
185 }
186 if err != nil {
187 return nil, err
188 }
189
190 commitObj, err := repo.CommitObject(*sha1)
191 if err != nil {
192 return nil, err
193 }
194 return commitObj, nil
195}
196
197func plainOpenRepo(repoDir string) (*git.Repository, error) {
198 // Try repoDir as the repository root first so bare repositories open
199 // correctly. If repoDir itself is not a repository, fall back to searching
200 // for a .git entry to preserve compatibility with worktree paths.
201 repo, err := git.PlainOpenWithOptions(repoDir, &git.PlainOpenOptions{
202 EnableDotGitCommonDir: true,
203 })
204 if err == nil || !errors.Is(err, git.ErrRepositoryNotExists) {
205 return repo, err
206 }
207
208 return git.PlainOpenWithOptions(repoDir, &git.PlainOpenOptions{
209 DetectDotGit: true,
210 EnableDotGitCommonDir: true,
211 })
212}
213
214func configLookupRemoteURL(cfg *config.Config, key string) string {
215 rc := cfg.Remotes[key]
216 if rc == nil || len(rc.URLs) == 0 {
217 return ""
218 }
219 return rc.URLs[0]
220}
221
// sshRelativeURLRegexp matches scp-style remote URLs of the form
// "user@host:path". The three capture groups are the user, the host and the
// path, in that order.
var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
223
224func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
225 repo, err := plainOpenRepo(repoDir)
226 if err != nil {
227 return err
228 }
229
230 cfg, err := repo.Config()
231 if err != nil {
232 return err
233 }
234
235 return setTemplatesFromRepoConfig(desc, cfg)
236}
237
238func setTemplatesFromRepo(desc *zoekt.Repository, repo *git.Repository, repoDir string) error {
239 cfg, err := repo.Config()
240 if err == nil {
241 return setTemplatesFromRepoConfig(desc, cfg)
242 }
243
244 return setTemplatesFromConfig(desc, repoDir)
245}
246
247func setTemplatesFromRepoConfig(desc *zoekt.Repository, cfg *config.Config) error {
248 sec := cfg.Raw.Section("zoekt")
249
250 webURLStr := sec.Options.Get("web-url")
251 webURLType := sec.Options.Get("web-url-type")
252
253 if webURLType != "" && webURLStr != "" {
254 webURL, err := url.Parse(webURLStr)
255 if err != nil {
256 return err
257 }
258 if err := setTemplates(desc, webURL, webURLType); err != nil {
259 return err
260 }
261 } else if webURLStr != "" {
262 desc.URL = webURLStr
263 }
264
265 name := sec.Options.Get("name")
266 if name != "" {
267 desc.Name = name
268 } else {
269 remoteURL := configLookupRemoteURL(cfg, "origin")
270 if remoteURL == "" {
271 return nil
272 }
273 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
274 user := sm[1]
275 host := sm[2]
276 path := sm[3]
277
278 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
279 }
280
281 u, err := url.Parse(remoteURL)
282 if err != nil {
283 return err
284 }
285 if err := SetTemplatesFromOrigin(desc, u); err != nil {
286 return err
287 }
288 }
289
290 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
291 desc.ID = uint32(id)
292
293 desc.TenantID, _ = strconv.Atoi(sec.Options.Get("tenantID"))
294
295 if desc.RawConfig == nil {
296 desc.RawConfig = map[string]string{}
297 }
298 for _, o := range sec.Options {
299 desc.RawConfig[o.Key] = o.Value
300 }
301
302 // Ranking info.
303
304 // Github:
305 traction := 0
306 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
307 f, err := strconv.Atoi(sec.Options.Get(s))
308 if err == nil {
309 traction += f
310 }
311 }
312
313 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
314 // Pretend everything on googlesource.com has 1000
315 // github stars.
316 traction = 1000
317 }
318
319 if traction > 0 {
320 l := math.Log(float64(traction))
321 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
322 }
323
324 return nil
325}
326
327// This attempts to get a repo URL similar to the main repository template processing as in setTemplatesFromConfig()
328func normalizeSubmoduleRemoteURL(cfg *config.Config) (string, error) {
329 sec := cfg.Raw.Section("zoekt")
330 remoteURL := sec.Options.Get("web-url")
331 if remoteURL == "" {
332 // fall back to "origin" remote
333 remoteURL = configLookupRemoteURL(cfg, "origin")
334 if remoteURL == "" {
335 return "", nil
336 }
337 }
338
339 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
340 user := sm[1]
341 host := sm[2]
342 path := sm[3]
343
344 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
345 }
346
347 u, err := url.Parse(remoteURL)
348 if err != nil {
349 return "", fmt.Errorf("unable to parse remote URL %q: %w", remoteURL, err)
350 }
351
352 if u.Scheme == "ssh+git" {
353 u.Scheme = "https"
354 u.User = nil
355 }
356
357 // Assume we cannot build templates for this URL, leave it empty
358 if u.Scheme == "" {
359 return "", nil
360 }
361
362 return u.String(), nil
363}
364
365// SetTemplatesFromOrigin fills in templates based on the origin URL.
366func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
367 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
368
369 if strings.HasSuffix(u.Host, ".googlesource.com") {
370 return setTemplates(desc, u, "gitiles")
371 } else if u.Host == "github.com" {
372 u.Path = strings.TrimSuffix(u.Path, ".git")
373 return setTemplates(desc, u, "github")
374 } else {
375 return fmt.Errorf("unknown git hosting site %q", u)
376 }
377}
378
// Options controls the details of the indexing process performed by
// IndexGitRepo.
type Options struct {
	// RepoDir is the path of the repository to be indexed. It must be set;
	// indexing fails with an error when it is empty.
	RepoDir string

	// If set, follow submodule links. This requires RepoCacheDir to be set.
	// Delta builds do not support submodule indexing.
	Submodules bool

	// If set, skip indexing if the existing index shard is newer
	// than the refs in the repository.
	Incremental bool

	// Don't error out if some branch is missing: branches whose reference
	// cannot be found are skipped instead.
	AllowMissingBranch bool

	// Specifies the root of a Repository cache. Needed for submodule indexing.
	RepoCacheDir string

	// Indexing options.
	BuildOptions index.Options

	// Prefix of the branch to index, e.g. `remotes/origin`.
	BranchPrefix string

	// List of branch names to index, e.g. []string{"HEAD", "stable"}.
	// Names containing "*" are treated as patterns and expanded against the
	// repository's branches.
	Branches []string

	// DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
	// that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
	// then a normal build will be performed instead.
	//
	// If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
	// a delta build will always be performed regardless of the number of preexisting shards.
	DeltaShardNumberFallbackThreshold uint64
}
414
415func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
416 var result []string
417 for _, b := range bs {
418 // Sourcegraph: We disable resolving refs. We want to return the exact ref
419 // requested so we can match it up.
420 if b == "HEAD" && false {
421 ref, err := repo.Head()
422 if err != nil {
423 return nil, err
424 }
425
426 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
427 continue
428 }
429
430 if strings.Contains(b, "*") {
431 iter, err := repo.Branches()
432 if err != nil {
433 return nil, err
434 }
435
436 defer iter.Close()
437 for {
438 ref, err := iter.Next()
439 if err == io.EOF {
440 break
441 }
442 if err != nil {
443 return nil, err
444 }
445
446 name := ref.Name().Short()
447 if matched, err := filepath.Match(b, name); err != nil {
448 return nil, err
449 } else if !matched {
450 continue
451 }
452
453 result = append(result, strings.TrimPrefix(name, prefix))
454 }
455 continue
456 }
457
458 result = append(result, b)
459 }
460
461 return result, nil
462}
463
464// IndexGitRepo indexes the git repository as specified by the options.
465// The returned bool indicates whether the index was updated as a result. This
466// can be informative if doing incremental indexing.
467func IndexGitRepo(opts Options) (bool, error) {
468 return indexGitRepo(opts, gitIndexConfig{})
469}
470
471// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
472// The returned bool indicates whether the index was updated as a result. This
473// can be informative if doing incremental indexing.
474func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
475 prepareDeltaBuild := prepareDeltaBuild
476 if config.prepareDeltaBuild != nil {
477 prepareDeltaBuild = config.prepareDeltaBuild
478 }
479
480 prepareNormalBuild := prepareNormalBuild
481 if config.prepareNormalBuild != nil {
482 prepareNormalBuild = config.prepareNormalBuild
483 }
484
485 // Set max thresholds, since we use them in this function.
486 opts.BuildOptions.SetDefaults()
487 if opts.RepoDir == "" {
488 return false, fmt.Errorf("gitindex: must set RepoDir")
489 }
490
491 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
492
493 var repo *git.Repository
494 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false")
495 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil {
496 repo, err = plainOpenRepo(opts.RepoDir)
497 if err != nil {
498 return false, fmt.Errorf("plainOpenRepo: %w", err)
499 }
500 } else {
501 var repoCloser io.Closer
502 repo, repoCloser, err = openRepo(opts.RepoDir)
503 if err != nil {
504 return false, fmt.Errorf("openRepo: %w", err)
505 }
506 defer repoCloser.Close()
507 }
508
509 if err := setTemplatesFromRepo(&opts.BuildOptions.RepositoryDescription, repo, opts.RepoDir); err != nil {
510 log.Printf("setTemplatesFromRepo(%s): %s", opts.RepoDir, err)
511 }
512
513 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
514 if err != nil {
515 return false, fmt.Errorf("expandBranches: %w", err)
516 }
517 for _, b := range branches {
518 commit, err := getCommit(repo, opts.BranchPrefix, b)
519 if err != nil {
520 if opts.AllowMissingBranch && err.Error() == "reference not found" {
521 continue
522 }
523
524 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
525 }
526
527 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
528 Name: b,
529 Version: commit.Hash.String(),
530 })
531
532 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
533 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
534 }
535 }
536
537 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
538 return false, nil
539 }
540
541 // branch => (path, sha1) => repo.
542 var repos map[fileKey]BlobLocation
543
544 // Branch => Repo => SHA1
545 var branchVersions map[string]map[string]plumbing.Hash
546
547 // set of file paths that have been changed or deleted since
548 // the last indexed commit
549 //
550 // These only have an effect on delta builds
551 var changedOrRemovedFiles []string
552
553 if opts.BuildOptions.IsDelta {
554 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
555 if err != nil {
556 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
557 opts.BuildOptions.IsDelta = false
558 }
559 }
560
561 if !opts.BuildOptions.IsDelta {
562 repos, branchVersions, err = prepareNormalBuild(opts, repo)
563 if err != nil {
564 return false, fmt.Errorf("preparing normal build: %w", err)
565 }
566 }
567
568 reposByPath := map[string]BlobLocation{}
569 for key, info := range repos {
570 reposByPath[key.SubRepoPath] = info
571 }
572
573 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
574 for path, info := range reposByPath {
575 tpl := opts.BuildOptions.RepositoryDescription
576 if path != "" {
577 tpl = zoekt.Repository{URL: info.URL.String()}
578 if info.URL.String() != "" {
579 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
580 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
581 }
582 }
583 if tpl.Name == "" {
584 tpl.Name = path
585 }
586 }
587 opts.BuildOptions.SubRepositories[path] = &tpl
588 }
589
590 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
591 for path, repo := range opts.BuildOptions.SubRepositories {
592 id := branchVersions[br.Name][path]
593 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
594 Name: br.Name,
595 Version: id.String(),
596 })
597 }
598 }
599
600 builder, err := index.NewBuilder(opts.BuildOptions)
601 if err != nil {
602 return false, fmt.Errorf("build.NewBuilder: %w", err)
603 }
604
605 // Preparing the build can consume substantial memory, so check usage before starting to index.
606 builder.CheckMemoryUsage()
607
608 // we don't need to check error, since we either already have an error, or
609 // we returning the first call to builder.Finish.
610 defer builder.Finish() // nolint:errcheck
611
612 for _, f := range changedOrRemovedFiles {
613 builder.MarkFileAsChangedOrRemoved(f)
614 }
615
616 var names []string
617 fileKeys := map[string][]fileKey{}
618 totalFiles := 0
619
620 for key := range repos {
621 n := key.FullPath()
622 fileKeys[n] = append(fileKeys[n], key)
623 names = append(names, n)
624 totalFiles++
625 }
626
627 sort.Strings(names)
628 names = uniq(names)
629
630 // Separate main-repo keys from submodule keys, collecting blob SHAs
631 // for the main repo so we can stream them via git cat-file --batch.
632 // ZOEKT_DISABLE_CATFILE_BATCH=true falls back to the go-git path for
633 // all files, useful as a kill switch if the cat-file path causes issues.
634 //
635 // 2026-04-02(keegan) we are regularly seeing git growing to over 9GB in
636 // memory usage in our production cluster. Disabling by default until the
637 // issue is resolved.
638 catfileBatchDisabled := cmp.Or(os.Getenv("ZOEKT_DISABLE_CATFILE_BATCH"), "true")
639 useCatfileBatch := true
640 if disabled, _ := strconv.ParseBool(catfileBatchDisabled); disabled {
641 useCatfileBatch = false
642 log.Printf("cat-file batch disabled via ZOEKT_DISABLE_CATFILE_BATCH, using go-git")
643 }
644
645 mainRepoKeys := make([]fileKey, 0, totalFiles)
646 mainRepoIDs := make([]plumbing.Hash, 0, totalFiles)
647 var submoduleKeys []fileKey
648
649 for _, name := range names {
650 for _, key := range fileKeys[name] {
651 if useCatfileBatch && key.SubRepoPath == "" {
652 mainRepoKeys = append(mainRepoKeys, key)
653 mainRepoIDs = append(mainRepoIDs, key.ID)
654 } else {
655 submoduleKeys = append(submoduleKeys, key)
656 }
657 }
658 }
659
660 log.Printf("attempting to index %d total files (%d via cat-file, %d via go-git)", totalFiles, len(mainRepoIDs), len(submoduleKeys))
661
662 // Stream main-repo blobs via pipelined cat-file --batch --buffer.
663 // Large blobs are skipped without reading content into memory.
664 if len(mainRepoIDs) > 0 {
665 crOpts := catfileReaderOptions{
666 filterSpec: catfileFilterSpec(opts),
667 }
668 cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs, crOpts)
669 if err != nil {
670 return false, fmt.Errorf("newCatfileReader: %w", err)
671 }
672
673 if err := indexCatfileBlobs(cr, mainRepoKeys, repos, opts, builder); err != nil {
674 return false, err
675 }
676 }
677
678 // Index submodule blobs via go-git.
679 for idx, key := range submoduleKeys {
680 doc, err := createDocument(key, repos, opts.BuildOptions)
681 if err != nil {
682 return false, err
683 }
684
685 if err := builder.Add(doc); err != nil {
686 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
687 }
688
689 if idx%10_000 == 0 {
690 builder.CheckMemoryUsage()
691 }
692 }
693
694 return true, builder.Finish()
695}
696
697// indexCatfileBlobs streams main-repo blobs from the catfileReader into the
698// builder. Large blobs are skipped without reading content into memory.
699// keys must correspond 1:1 (in order) with the ids passed to newCatfileReader.
700// The reader is always closed when this function returns.
701func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error {
702 defer cr.Close()
703
704 slab := newContentSlab(16 << 20) // 16 MB per slab
705
706 for idx, key := range keys {
707 size, missing, excluded, err := cr.Next()
708 if err != nil {
709 return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err)
710 }
711
712 branches := repos[key].Branches
713 var doc index.Document
714
715 if missing {
716 // Unexpected for local repos — may indicate corruption, shallow
717 // clone, or a race with git gc. Log a warning and skip.
718 log.Printf("warning: blob %s missing for %s", key.ID, key.FullPath())
719 doc = skippedDoc(key, branches, index.SkipReasonMissing)
720 } else if excluded {
721 doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
722 } else {
723 keyFullPath := key.FullPath()
724 if size > opts.BuildOptions.SizeMax && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) {
725 // Skip without reading content into memory.
726 doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
727 } else {
728 content := slab.alloc(size)
729 if _, err := io.ReadFull(cr, content); err != nil {
730 return fmt.Errorf("read blob %s: %w", keyFullPath, err)
731 }
732 doc = index.Document{
733 SubRepositoryPath: key.SubRepoPath,
734 Name: keyFullPath,
735 Content: content,
736 Branches: branches,
737 }
738 }
739 }
740
741 if err := builder.Add(doc); err != nil {
742 return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
743 }
744
745 if idx%10_000 == 0 {
746 builder.CheckMemoryUsage()
747 }
748 }
749
750 return nil
751}
752
753// openRepo opens a git repository in a way that's optimized for indexing.
754//
755// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options.
756func openRepo(repoDir string) (*git.Repository, io.Closer, error) {
757 fs := osfs.New(repoDir)
758
759 // Check if the root directory exists.
760 if _, err := fs.Stat(""); err != nil {
761 if os.IsNotExist(err) {
762 return nil, nil, git.ErrRepositoryNotExists
763 }
764 return nil, nil, err
765 }
766
767 fi, err := fs.Stat(git.GitDirName)
768 if err == nil && !fi.IsDir() {
769 return openCompatibleRepo(repoDir)
770 }
771
772 return openOptimizedRepo(repoDir)
773}
774
775func openCompatibleRepo(repoDir string) (*git.Repository, io.Closer, error) {
776 repo, err := plainOpenRepo(repoDir)
777 if err != nil {
778 return nil, nil, err
779 }
780
781 return repo, noopCloser{}, nil
782}
783
784func openOptimizedRepo(repoDir string) (*git.Repository, io.Closer, error) {
785 fs := osfs.New(repoDir)
786 wt := fs
787
788 // If there's a .git directory, use that as the new root.
789 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() {
790 if fs, err = fs.Chroot(git.GitDirName); err != nil {
791 return nil, nil, fmt.Errorf("fs.Chroot: %w", err)
792 }
793 }
794
795 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{
796 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access
797 KeepDescriptors: true,
798 })
799
800 // Because we're keeping descriptors open, we need to close the storage object when we're done.
801 repo, err := git.Open(s, wt)
802 return repo, s, err
803}
804
// noopCloser is a closer for resources that need no cleanup.
type noopCloser struct{}

// Close does nothing and always returns nil.
func (noopCloser) Close() error {
	return nil
}
808
809func catfileFilterSpec(opts Options) string {
810 // Can't filter by size if we have large file exceptions
811 if len(opts.BuildOptions.LargeFiles) > 0 {
812 return ""
813 }
814
815 if opts.BuildOptions.SizeMax <= 0 {
816 return ""
817 }
818
819 // Git's blob:limit filter excludes blobs whose size is >= the given limit,
820 // while zoekt indexes files up to and including SizeMax bytes.
821 return fmt.Sprintf("blob:limit=%d", int64(opts.BuildOptions.SizeMax)+1)
822}
823
824func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
825 ignoreFile, err := tree.File(ignore.IgnoreFile)
826 if err == object.ErrFileNotFound {
827 return &ignore.Matcher{}, nil
828 }
829 if err != nil {
830 return nil, err
831 }
832 content, err := ignoreFile.Contents()
833 if err != nil {
834 return nil, err
835 }
836 return ignore.ParseIgnoreFile(strings.NewReader(content))
837}
838
// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// an index.Builder instance for generating a delta build.
//
// Besides the per-file blob locations and the per-branch versions, it also
// returns the paths that were changed or deleted since the last indexed
// commit.
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
842
// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// an index.Builder instance for generating a normal build: the per-file blob
// locations and the per-branch versions.
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)
846
// gitIndexConfig lets callers of indexGitRepo substitute the functions used to
// prepare a build. The zero value selects the package defaults.
type gitIndexConfig struct {
	// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the index.Builder instance for generating a delta build.
	//
	// If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
	prepareDeltaBuild prepareDeltaBuildFunc

	// prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the index.Builder instance for generating a normal build.
	//
	// If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
	prepareNormalBuild prepareNormalBuildFunc
}
860
861func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
862 if options.Submodules {
863 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
864 }
865
866 // discover what commits we indexed during our last build
867 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
868 if err != nil {
869 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
870 }
871
872 if !ok {
873 return nil, nil, nil, fmt.Errorf("no existing shards found for repository")
874 }
875
876 if options.DeltaShardNumberFallbackThreshold > 0 {
877 // HACK: For our interim compaction strategy, we force a full normal index once
878 // the number of shards on disk for this repository exceeds the provided threshold.
879 //
880 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
881 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
882 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
883 // while we create a better compaction strategy).
884
885 oldShards := options.BuildOptions.FindAllShards()
886 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
887 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
888 }
889 }
890
891 // Check to see if the set of branch names is consistent with what we last indexed.
892 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
893 // normal one).
894
895 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
896 var existingBranchNames []string
897 for _, b := range existingRepository.Branches {
898 existingBranchNames = append(existingBranchNames, b.Name)
899 }
900
901 var optionsBranchNames []string
902 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
903 optionsBranchNames = append(optionsBranchNames, b.Name)
904 }
905
906 existingBranchList := strings.Join(existingBranchNames, ", ")
907 optionsBranchList := strings.Join(optionsBranchNames, ", ")
908
909 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
910 }
911
912 // Check if the build options hash does not match the repository metadata's hash
913 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build
914 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
915 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
916 }
917
918 // branch => (path, sha1) => repo.
919 repos = map[fileKey]BlobLocation{}
920
921 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
922 if err != nil {
923 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
924 }
925
926 // branch name -> git worktree at most current commit
927 branchToCurrentTree := make(map[string]*object.Tree, len(branches))
928
929 for _, b := range branches {
930 commit, err := getCommit(repository, options.BranchPrefix, b)
931 if err != nil {
932 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
933 }
934
935 tree, err := commit.Tree()
936 if err != nil {
937 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
938 }
939
940 branchToCurrentTree[b] = tree
941 }
942
943 rawURL := options.BuildOptions.RepositoryDescription.URL
944 u, err := url.Parse(rawURL)
945 if err != nil {
946 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
947 }
948
949 // TODO: Support repository submodules for delta builds
950
951 // loop over all branches, calculate the diff between our
952 // last indexed commit and the current commit, and add files mentioned in the diff
953 for _, branch := range existingRepository.Branches {
954 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
955 if err != nil {
956 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
957 }
958
959 lastIndexedTree, err := lastIndexedCommit.Tree()
960 if err != nil {
961 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
962 }
963
964 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
965 if err != nil {
966 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
967 }
968
969 for i, c := range changes {
970 oldFile, newFile, err := c.Files()
971 if err != nil {
972 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
973 }
974
975 if newFile != nil {
976 // note: newFile.Name could be a path that isn't relative to the repository root - using the
977 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
978 newFileRelativeRootPath := c.To.Name
979
980 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
981 if newFileRelativeRootPath == ignore.IgnoreFile {
982 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
983 }
984
985 // either file is added or renamed, so we need to add the new version to the build
986 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
987 if existing, ok := repos[file]; ok {
988 existing.Branches = append(existing.Branches, branch.Name)
989 repos[file] = existing
990 } else {
991 repos[file] = BlobLocation{
992 GitRepo: repository,
993 URL: u,
994 Branches: []string{branch.Name},
995 }
996 }
997 }
998
999 if oldFile == nil {
1000 // file added - nothing more to do
1001 continue
1002 }
1003
1004 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
1005 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
1006 oldFileRelativeRootPath := c.From.Name
1007
1008 if oldFileRelativeRootPath == ignore.IgnoreFile {
1009 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
1010 }
1011
1012 // The file is either modified or deleted. So, we need to add ALL versions
1013 // of the old file (across all branches) to the build.
1014 for b, currentTree := range branchToCurrentTree {
1015 f, err := currentTree.File(oldFileRelativeRootPath)
1016 if err != nil {
1017 // the file doesn't exist in this branch
1018 if errors.Is(err, object.ErrFileNotFound) {
1019 continue
1020 }
1021
1022 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
1023 }
1024
1025 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
1026 if existing, ok := repos[file]; ok {
1027 existing.Branches = append(existing.Branches, b)
1028 repos[file] = existing
1029 } else {
1030 repos[file] = BlobLocation{
1031 GitRepo: repository,
1032 URL: u,
1033 Branches: []string{b},
1034 }
1035 }
1036 }
1037
1038 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
1039 }
1040 }
1041
1042 // we need to de-duplicate the branch map before returning it - it's possible for the same
1043 // branch to have been added multiple times if a file has been modified across multiple commits
1044 for _, info := range repos {
1045 sort.Strings(info.Branches)
1046 info.Branches = uniq(info.Branches)
1047 }
1048
1049 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
1050 // for the same reasoning as above
1051 sort.Strings(changedOrDeletedPaths)
1052 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
1053
1054 return repos, nil, changedOrDeletedPaths, nil
1055}
1056
1057func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
1058 var repoCache *RepoCache
1059 if options.Submodules && options.RepoCacheDir != "" {
1060 repoCache = NewRepoCache(options.RepoCacheDir)
1061 }
1062 return prepareNormalBuildRecurse(options, repository, repoCache, false)
1063}
1064
1065func prepareNormalBuildRecurse(options Options, repository *git.Repository, repoCache *RepoCache, isSubrepo bool) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
1066 // Branch => Repo => SHA1
1067 branchVersions = map[string]map[string]plumbing.Hash{}
1068
1069 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
1070 if err != nil {
1071 return nil, nil, fmt.Errorf("expandBranches: %w", err)
1072 }
1073
1074 repoURL := options.BuildOptions.RepositoryDescription.URL
1075
1076 if isSubrepo {
1077 cfg, err := repository.Config()
1078 if err != nil {
1079 return nil, nil, fmt.Errorf("unable to get repository config: %w", err)
1080 }
1081
1082 u, err := normalizeSubmoduleRemoteURL(cfg)
1083 if err != nil {
1084 return nil, nil, fmt.Errorf("failed to identify subrepository URL: %w", err)
1085 }
1086 repoURL = u
1087 }
1088
1089 rw := NewRepoWalker(repository, repoURL, repoCache)
1090 for _, b := range branches {
1091 commit, err := getCommit(repository, options.BranchPrefix, b)
1092 if err != nil {
1093 if options.AllowMissingBranch && err.Error() == "reference not found" {
1094 continue
1095 }
1096
1097 return nil, nil, fmt.Errorf("getCommit: %w", err)
1098 }
1099
1100 tree, err := commit.Tree()
1101 if err != nil {
1102 return nil, nil, fmt.Errorf("commit.Tree: %w", err)
1103 }
1104
1105 ig, err := newIgnoreMatcher(tree)
1106 if err != nil {
1107 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
1108 }
1109
1110 subVersions, err := rw.CollectFiles(tree, b, ig)
1111 if err != nil {
1112 return nil, nil, fmt.Errorf("CollectFiles: %w", err)
1113 }
1114
1115 branchVersions[b] = subVersions
1116 }
1117
1118 // Index submodules using go-git if we didn't do so using the repo cache
1119 if options.Submodules && options.RepoCacheDir == "" {
1120 worktree, err := repository.Worktree()
1121 if err != nil {
1122 return nil, nil, fmt.Errorf("failed to get repository worktree: %w", err)
1123 }
1124
1125 submodules, err := worktree.Submodules()
1126 if err != nil {
1127 return nil, nil, fmt.Errorf("failed to get submodules: %w", err)
1128 }
1129
1130 for _, submodule := range submodules {
1131 subRepository, err := submodule.Repository()
1132 if err != nil {
1133 log.Printf("failed to open submodule repository: %s, %s", submodule.Config().Name, err)
1134 continue
1135 }
1136
1137 sw, subVersions, err := prepareNormalBuildRecurse(options, subRepository, repoCache, true)
1138 if err != nil {
1139 log.Printf("failed to index submodule repository: %s, %s", submodule.Config().Name, err)
1140 continue
1141 }
1142
1143 log.Printf("adding subrepository files from: %s", submodule.Config().Name)
1144
1145 for k, repo := range sw {
1146 rw.Files[fileKey{
1147 SubRepoPath: filepath.Join(submodule.Config().Path, k.SubRepoPath),
1148 Path: k.Path,
1149 ID: k.ID,
1150 }] = repo
1151 }
1152
1153 for k, v := range subVersions {
1154 branchVersions[filepath.Join(submodule.Config().Path, k)] = v
1155 }
1156 }
1157 }
1158
1159 return rw.Files, branchVersions, nil
1160}
1161
1162func createDocument(key fileKey,
1163 repos map[fileKey]BlobLocation,
1164 opts index.Options,
1165) (index.Document, error) {
1166 repo := repos[key]
1167 blob, err := repo.GitRepo.BlobObject(key.ID)
1168 branches := repos[key].Branches
1169
1170 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
1171 if errors.Is(err, plumbing.ErrObjectNotFound) {
1172 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil
1173 }
1174
1175 if err != nil {
1176 return index.Document{}, err
1177 }
1178
1179 keyFullPath := key.FullPath()
1180 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
1181 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil
1182 }
1183
1184 contents, err := blobContents(blob)
1185 if err != nil {
1186 return index.Document{}, err
1187 }
1188
1189 return index.Document{
1190 SubRepositoryPath: key.SubRepoPath,
1191 Name: keyFullPath,
1192 Content: contents,
1193 Branches: branches,
1194 }, nil
1195}
1196
1197// skippedDoc creates a Document placeholder for a blob that was not indexed.
1198func skippedDoc(key fileKey, branches []string, reason index.SkipReason) index.Document {
1199 return index.Document{
1200 SkipReason: reason,
1201 Name: key.FullPath(),
1202 Branches: branches,
1203 SubRepositoryPath: key.SubRepoPath,
1204 }
1205}
1206
1207func blobContents(blob *object.Blob) ([]byte, error) {
1208 r, err := blob.Reader()
1209 if err != nil {
1210 return nil, err
1211 }
1212 defer r.Close()
1213
1214 var buf bytes.Buffer
1215 buf.Grow(int(blob.Size))
1216 _, err = buf.ReadFrom(r)
1217 if err != nil {
1218 return nil, err
1219 }
1220 return buf.Bytes(), nil
1221}
1222
// uniq collapses runs of adjacent equal strings in ss down to a single
// element. Only neighbouring duplicates are removed, so callers wanting a
// fully de-duplicated result must sort ss first. The compaction is done in
// place: the returned slice shares ss's backing array.
func uniq(ss []string) []string {
	if len(ss) == 0 {
		return ss[:0]
	}

	// ss[:kept] always holds the compacted prefix; the first element is
	// kept unconditionally.
	kept := 1
	for _, s := range ss[1:] {
		if s == ss[kept-1] {
			continue
		}
		ss[kept] = s
		kept++
	}
	return ss[:kept]
}