fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
import (
	"bytes"
	"cmp"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"math"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/go-git/go-billy/v5/osfs"
	"github.com/go-git/go-git/v5/config"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/cache"
	"github.com/go-git/go-git/v5/plumbing/object"
	"github.com/go-git/go-git/v5/storage/filesystem"

	"github.com/sourcegraph/zoekt"
	"github.com/sourcegraph/zoekt/ignore"
	"github.com/sourcegraph/zoekt/index"

	git "github.com/go-git/go-git/v5"
)
48
// FindGitRepos walks the tree rooted at dir and collects every git
// repository it encounters. Two layouts are recognised:
//
//   - a directory containing a ".git" sub-directory (the ".git" path is reported);
//   - a directory whose name ends in ".git" and that contains an "objects"
//     sub-directory (the directory itself is reported).
//
// Returned paths are absolute. Once a repository is found the walk does not
// descend into it. Errors reported for individual entries during the walk are
// ignored (best effort).
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	// isDir reports whether p exists and is a directory; symlinks are not followed.
	isDir := func(p string) bool {
		st, statErr := os.Lstat(p)
		return statErr == nil && st.IsDir()
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best-effort: an unreadable entry is skipped rather than aborting the walk.
		if walkErr != nil {
			return nil
		}

		// Non-bare checkout: report its ".git" directory.
		if dotGit := filepath.Join(p, ".git"); isDir(dotGit) {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository: "<name>.git" directory holding an "objects" directory.
		if info.IsDir() && strings.HasSuffix(p, ".git") && isDir(filepath.Join(p, "objects")) {
			found = append(found, p)
			return filepath.SkipDir
		}

		return nil
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
87
88// setTemplates fills in URL templates for known git hosting
89// sites.
90func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
91 if u.Scheme == "ssh+git" {
92 u.Scheme = "https"
93 u.User = nil
94 }
95
96 // helper to generate u.JoinPath as a template
97 varVersion := ".Version"
98 varPath := ".Path"
99 urlJoinPath := func(elem ...string) string {
100 elem = append([]string{u.String()}, elem...)
101 var parts []string
102 for _, e := range elem {
103 if e == varVersion || e == varPath {
104 parts = append(parts, e)
105 } else {
106 parts = append(parts, strconv.Quote(e))
107 }
108 }
109 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " "))
110 }
111
112 repo.URL = u.String()
113 switch typ {
114 case "gitiles":
115 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
116 repo.CommitURLTemplate = urlJoinPath("+", varVersion)
117 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath)
118 repo.LineFragmentTemplate = "#{{.LineNumber}}"
119 case "github":
120 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
121 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
122 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath)
123 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
124 case "cgit":
125 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
126 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}"
127 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}"
128 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
129 case "gitweb":
130 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
131 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
132 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
133 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
134 case "source.bazel.build":
135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
136 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
137 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}"
138 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}"
139 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
140 case "bitbucket-server":
141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
142 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
143 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
144 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}"
145 repo.LineFragmentTemplate = "#{{.LineNumber}}"
146 case "gitlab":
147 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
148 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
149 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion)
150 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath)
151 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
152 case "gitea":
153 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
154 // NOTE The `display=source` query parameter is required to disable file rendering.
155 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
156 // a line without `display=source`. This is supported since gitea 1.17.0.
157 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
158 // but the query parameters are obmitted.
159 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source"
160 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
161 default:
162 return fmt.Errorf("URL scheme type %q unknown", typ)
163 }
164 return nil
165}
166
167// getCommit returns a tree object for the given reference.
168func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
169 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
170 // ref might be a branch name (e.g. "master") add branch prefix and try again.
171 if err != nil {
172 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
173 }
174 if err != nil {
175 return nil, err
176 }
177
178 commitObj, err := repo.CommitObject(*sha1)
179 if err != nil {
180 return nil, err
181 }
182 return commitObj, nil
183}
184
185func plainOpenRepo(repoDir string) (*git.Repository, error) {
186 return git.PlainOpenWithOptions(repoDir, &git.PlainOpenOptions{
187 DetectDotGit: true,
188 EnableDotGitCommonDir: true,
189 })
190}
191
192func configLookupRemoteURL(cfg *config.Config, key string) string {
193 rc := cfg.Remotes[key]
194 if rc == nil || len(rc.URLs) == 0 {
195 return ""
196 }
197 return rc.URLs[0]
198}
199
// sshRelativeURLRegexp matches remotes written as "user@host:path" and captures
// the user, host and path (in that order) as submatches 1 through 3.
var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
201
202func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
203 repo, err := plainOpenRepo(repoDir)
204 if err != nil {
205 return err
206 }
207
208 cfg, err := repo.Config()
209 if err != nil {
210 return err
211 }
212
213 return setTemplatesFromRepoConfig(desc, cfg)
214}
215
216func setTemplatesFromRepo(desc *zoekt.Repository, repo *git.Repository, repoDir string) error {
217 cfg, err := repo.Config()
218 if err == nil {
219 return setTemplatesFromRepoConfig(desc, cfg)
220 }
221
222 return setTemplatesFromConfig(desc, repoDir)
223}
224
225func setTemplatesFromRepoConfig(desc *zoekt.Repository, cfg *config.Config) error {
226 sec := cfg.Raw.Section("zoekt")
227
228 webURLStr := sec.Options.Get("web-url")
229 webURLType := sec.Options.Get("web-url-type")
230
231 if webURLType != "" && webURLStr != "" {
232 webURL, err := url.Parse(webURLStr)
233 if err != nil {
234 return err
235 }
236 if err := setTemplates(desc, webURL, webURLType); err != nil {
237 return err
238 }
239 } else if webURLStr != "" {
240 desc.URL = webURLStr
241 }
242
243 name := sec.Options.Get("name")
244 if name != "" {
245 desc.Name = name
246 } else {
247 remoteURL := configLookupRemoteURL(cfg, "origin")
248 if remoteURL == "" {
249 return nil
250 }
251 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
252 user := sm[1]
253 host := sm[2]
254 path := sm[3]
255
256 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
257 }
258
259 u, err := url.Parse(remoteURL)
260 if err != nil {
261 return err
262 }
263 if err := SetTemplatesFromOrigin(desc, u); err != nil {
264 return err
265 }
266 }
267
268 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
269 desc.ID = uint32(id)
270
271 desc.TenantID, _ = strconv.Atoi(sec.Options.Get("tenantID"))
272
273 if desc.RawConfig == nil {
274 desc.RawConfig = map[string]string{}
275 }
276 for _, o := range sec.Options {
277 desc.RawConfig[o.Key] = o.Value
278 }
279
280 // Ranking info.
281
282 // Github:
283 traction := 0
284 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
285 f, err := strconv.Atoi(sec.Options.Get(s))
286 if err == nil {
287 traction += f
288 }
289 }
290
291 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
292 // Pretend everything on googlesource.com has 1000
293 // github stars.
294 traction = 1000
295 }
296
297 if traction > 0 {
298 l := math.Log(float64(traction))
299 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
300 }
301
302 return nil
303}
304
305// This attempts to get a repo URL similar to the main repository template processing as in setTemplatesFromConfig()
306func normalizeSubmoduleRemoteURL(cfg *config.Config) (string, error) {
307 sec := cfg.Raw.Section("zoekt")
308 remoteURL := sec.Options.Get("web-url")
309 if remoteURL == "" {
310 // fall back to "origin" remote
311 remoteURL = configLookupRemoteURL(cfg, "origin")
312 if remoteURL == "" {
313 return "", nil
314 }
315 }
316
317 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
318 user := sm[1]
319 host := sm[2]
320 path := sm[3]
321
322 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
323 }
324
325 u, err := url.Parse(remoteURL)
326 if err != nil {
327 return "", fmt.Errorf("unable to parse remote URL %q: %w", remoteURL, err)
328 }
329
330 if u.Scheme == "ssh+git" {
331 u.Scheme = "https"
332 u.User = nil
333 }
334
335 // Assume we cannot build templates for this URL, leave it empty
336 if u.Scheme == "" {
337 return "", nil
338 }
339
340 return u.String(), nil
341}
342
343// SetTemplatesFromOrigin fills in templates based on the origin URL.
344func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
345 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
346
347 if strings.HasSuffix(u.Host, ".googlesource.com") {
348 return setTemplates(desc, u, "gitiles")
349 } else if u.Host == "github.com" {
350 u.Path = strings.TrimSuffix(u.Path, ".git")
351 return setTemplates(desc, u, "github")
352 } else {
353 return fmt.Errorf("unknown git hosting site %q", u)
354 }
355}
356
357// The Options structs controls details of the indexing process.
358type Options struct {
359 // The repository to be indexed.
360 RepoDir string
361
362 // If set, follow submodule links. This requires RepoCacheDir to be set.
363 Submodules bool
364
365 // If set, skip indexing if the existing index shard is newer
366 // than the refs in the repository.
367 Incremental bool
368
369 // Don't error out if some branch is missing
370 AllowMissingBranch bool
371
372 // Specifies the root of a Repository cache. Needed for submodule indexing.
373 RepoCacheDir string
374
375 // Indexing options.
376 BuildOptions index.Options
377
378 // Prefix of the branch to index, e.g. `remotes/origin`.
379 BranchPrefix string
380
381 // List of branch names to index, e.g. []string{"HEAD", "stable"}
382 Branches []string
383
384 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
385 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
386 // then a normal build will be performed instead.
387 //
388 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
389 // a delta build will always be performed regardless of the number of preexisting shards.
390 DeltaShardNumberFallbackThreshold uint64
391}
392
393func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
394 var result []string
395 for _, b := range bs {
396 // Sourcegraph: We disable resolving refs. We want to return the exact ref
397 // requested so we can match it up.
398 if b == "HEAD" && false {
399 ref, err := repo.Head()
400 if err != nil {
401 return nil, err
402 }
403
404 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
405 continue
406 }
407
408 if strings.Contains(b, "*") {
409 iter, err := repo.Branches()
410 if err != nil {
411 return nil, err
412 }
413
414 defer iter.Close()
415 for {
416 ref, err := iter.Next()
417 if err == io.EOF {
418 break
419 }
420 if err != nil {
421 return nil, err
422 }
423
424 name := ref.Name().Short()
425 if matched, err := filepath.Match(b, name); err != nil {
426 return nil, err
427 } else if !matched {
428 continue
429 }
430
431 result = append(result, strings.TrimPrefix(name, prefix))
432 }
433 continue
434 }
435
436 result = append(result, b)
437 }
438
439 return result, nil
440}
441
442// IndexGitRepo indexes the git repository as specified by the options.
443// The returned bool indicates whether the index was updated as a result. This
444// can be informative if doing incremental indexing.
445func IndexGitRepo(opts Options) (bool, error) {
446 return indexGitRepo(opts, gitIndexConfig{})
447}
448
449// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
450// The returned bool indicates whether the index was updated as a result. This
451// can be informative if doing incremental indexing.
452func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
453 prepareDeltaBuild := prepareDeltaBuild
454 if config.prepareDeltaBuild != nil {
455 prepareDeltaBuild = config.prepareDeltaBuild
456 }
457
458 prepareNormalBuild := prepareNormalBuild
459 if config.prepareNormalBuild != nil {
460 prepareNormalBuild = config.prepareNormalBuild
461 }
462
463 // Set max thresholds, since we use them in this function.
464 opts.BuildOptions.SetDefaults()
465 if opts.RepoDir == "" {
466 return false, fmt.Errorf("gitindex: must set RepoDir")
467 }
468
469 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
470
471 var repo *git.Repository
472 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false")
473 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil {
474 repo, err = plainOpenRepo(opts.RepoDir)
475 if err != nil {
476 return false, fmt.Errorf("plainOpenRepo: %w", err)
477 }
478 } else {
479 var repoCloser io.Closer
480 repo, repoCloser, err = openRepo(opts.RepoDir)
481 if err != nil {
482 return false, fmt.Errorf("openRepo: %w", err)
483 }
484 defer repoCloser.Close()
485 }
486
487 if err := setTemplatesFromRepo(&opts.BuildOptions.RepositoryDescription, repo, opts.RepoDir); err != nil {
488 log.Printf("setTemplatesFromRepo(%s): %s", opts.RepoDir, err)
489 }
490
491 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
492 if err != nil {
493 return false, fmt.Errorf("expandBranches: %w", err)
494 }
495 for _, b := range branches {
496 commit, err := getCommit(repo, opts.BranchPrefix, b)
497 if err != nil {
498 if opts.AllowMissingBranch && err.Error() == "reference not found" {
499 continue
500 }
501
502 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
503 }
504
505 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
506 Name: b,
507 Version: commit.Hash.String(),
508 })
509
510 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
511 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
512 }
513 }
514
515 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
516 return false, nil
517 }
518
519 // branch => (path, sha1) => repo.
520 var repos map[fileKey]BlobLocation
521
522 // Branch => Repo => SHA1
523 var branchVersions map[string]map[string]plumbing.Hash
524
525 // set of file paths that have been changed or deleted since
526 // the last indexed commit
527 //
528 // These only have an effect on delta builds
529 var changedOrRemovedFiles []string
530
531 if opts.BuildOptions.IsDelta {
532 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
533 if err != nil {
534 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
535 opts.BuildOptions.IsDelta = false
536 }
537 }
538
539 if !opts.BuildOptions.IsDelta {
540 repos, branchVersions, err = prepareNormalBuild(opts, repo)
541 if err != nil {
542 return false, fmt.Errorf("preparing normal build: %w", err)
543 }
544 }
545
546 reposByPath := map[string]BlobLocation{}
547 for key, info := range repos {
548 reposByPath[key.SubRepoPath] = info
549 }
550
551 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
552 for path, info := range reposByPath {
553 tpl := opts.BuildOptions.RepositoryDescription
554 if path != "" {
555 tpl = zoekt.Repository{URL: info.URL.String()}
556 if info.URL.String() != "" {
557 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
558 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
559 }
560 }
561 if tpl.Name == "" {
562 tpl.Name = path
563 }
564 }
565 opts.BuildOptions.SubRepositories[path] = &tpl
566 }
567
568 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
569 for path, repo := range opts.BuildOptions.SubRepositories {
570 id := branchVersions[br.Name][path]
571 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
572 Name: br.Name,
573 Version: id.String(),
574 })
575 }
576 }
577
578 builder, err := index.NewBuilder(opts.BuildOptions)
579 if err != nil {
580 return false, fmt.Errorf("build.NewBuilder: %w", err)
581 }
582
583 // Preparing the build can consume substantial memory, so check usage before starting to index.
584 builder.CheckMemoryUsage()
585
586 // we don't need to check error, since we either already have an error, or
587 // we returning the first call to builder.Finish.
588 defer builder.Finish() // nolint:errcheck
589
590 for _, f := range changedOrRemovedFiles {
591 builder.MarkFileAsChangedOrRemoved(f)
592 }
593
594 var names []string
595 fileKeys := map[string][]fileKey{}
596 totalFiles := 0
597
598 for key := range repos {
599 n := key.FullPath()
600 fileKeys[n] = append(fileKeys[n], key)
601 names = append(names, n)
602 totalFiles++
603 }
604
605 sort.Strings(names)
606 names = uniq(names)
607
608 // Separate main-repo keys from submodule keys, collecting blob SHAs
609 // for the main repo so we can stream them via git cat-file --batch.
610 // ZOEKT_DISABLE_CATFILE_BATCH=true falls back to the go-git path for
611 // all files, useful as a kill switch if the cat-file path causes issues.
612 catfileBatchDisabled := cmp.Or(os.Getenv("ZOEKT_DISABLE_CATFILE_BATCH"), "false")
613 useCatfileBatch := true
614 if disabled, _ := strconv.ParseBool(catfileBatchDisabled); disabled {
615 useCatfileBatch = false
616 log.Printf("cat-file batch disabled via ZOEKT_DISABLE_CATFILE_BATCH, using go-git")
617 }
618
619 mainRepoKeys := make([]fileKey, 0, totalFiles)
620 mainRepoIDs := make([]plumbing.Hash, 0, totalFiles)
621 var submoduleKeys []fileKey
622
623 for _, name := range names {
624 for _, key := range fileKeys[name] {
625 if useCatfileBatch && key.SubRepoPath == "" {
626 mainRepoKeys = append(mainRepoKeys, key)
627 mainRepoIDs = append(mainRepoIDs, key.ID)
628 } else {
629 submoduleKeys = append(submoduleKeys, key)
630 }
631 }
632 }
633
634 log.Printf("attempting to index %d total files (%d via cat-file, %d via go-git)", totalFiles, len(mainRepoIDs), len(submoduleKeys))
635
636 // Stream main-repo blobs via pipelined cat-file --batch --buffer.
637 // Large blobs are skipped without reading content into memory.
638 if len(mainRepoIDs) > 0 {
639 cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs)
640 if err != nil {
641 return false, fmt.Errorf("newCatfileReader: %w", err)
642 }
643
644 if err := indexCatfileBlobs(cr, mainRepoKeys, repos, opts, builder); err != nil {
645 return false, err
646 }
647 }
648
649 // Index submodule blobs via go-git.
650 for idx, key := range submoduleKeys {
651 doc, err := createDocument(key, repos, opts.BuildOptions)
652 if err != nil {
653 return false, err
654 }
655
656 if err := builder.Add(doc); err != nil {
657 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
658 }
659
660 if idx%10_000 == 0 {
661 builder.CheckMemoryUsage()
662 }
663 }
664
665 return true, builder.Finish()
666}
667
668// indexCatfileBlobs streams main-repo blobs from the catfileReader into the
669// builder. Large blobs are skipped without reading content into memory.
670// keys must correspond 1:1 (in order) with the ids passed to newCatfileReader.
671// The reader is always closed when this function returns.
672func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error {
673 defer cr.Close()
674
675 for idx, key := range keys {
676 size, missing, err := cr.Next()
677 if err != nil {
678 return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err)
679 }
680
681 branches := repos[key].Branches
682 var doc index.Document
683
684 if missing {
685 // Unexpected for local repos — may indicate corruption, shallow
686 // clone, or a race with git gc. Log a warning and skip.
687 log.Printf("warning: blob %s missing for %s", key.ID, key.FullPath())
688 doc = skippedDoc(key, branches, index.SkipReasonMissing)
689 } else {
690 keyFullPath := key.FullPath()
691 if size > opts.BuildOptions.SizeMax && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) {
692 // Skip without reading content into memory.
693 doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
694 } else {
695 // Pre-allocate and read the full blob content in one call.
696 // io.ReadFull is preferred over io.LimitedReader here as it
697 // avoids the intermediate allocation and the size is known.
698 content := make([]byte, size)
699 if _, err := io.ReadFull(cr, content); err != nil {
700 return fmt.Errorf("read blob %s: %w", keyFullPath, err)
701 }
702 doc = index.Document{
703 SubRepositoryPath: key.SubRepoPath,
704 Name: keyFullPath,
705 Content: content,
706 Branches: branches,
707 }
708 }
709 }
710
711 if err := builder.Add(doc); err != nil {
712 return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
713 }
714
715 if idx%10_000 == 0 {
716 builder.CheckMemoryUsage()
717 }
718 }
719
720 return nil
721}
722
723// openRepo opens a git repository in a way that's optimized for indexing.
724//
725// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options.
726func openRepo(repoDir string) (*git.Repository, io.Closer, error) {
727 fs := osfs.New(repoDir)
728
729 // Check if the root directory exists.
730 if _, err := fs.Stat(""); err != nil {
731 if os.IsNotExist(err) {
732 return nil, nil, git.ErrRepositoryNotExists
733 }
734 return nil, nil, err
735 }
736
737 fi, err := fs.Stat(git.GitDirName)
738 if err == nil && !fi.IsDir() {
739 return openCompatibleRepo(repoDir)
740 }
741
742 return openOptimizedRepo(repoDir)
743}
744
745func openCompatibleRepo(repoDir string) (*git.Repository, io.Closer, error) {
746 repo, err := plainOpenRepo(repoDir)
747 if err != nil {
748 return nil, nil, err
749 }
750
751 return repo, noopCloser{}, nil
752}
753
754func openOptimizedRepo(repoDir string) (*git.Repository, io.Closer, error) {
755 fs := osfs.New(repoDir)
756 wt := fs
757
758 // If there's a .git directory, use that as the new root.
759 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() {
760 if fs, err = fs.Chroot(git.GitDirName); err != nil {
761 return nil, nil, fmt.Errorf("fs.Chroot: %w", err)
762 }
763 }
764
765 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{
766 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access
767 KeepDescriptors: true,
768 })
769
770 // Because we're keeping descriptors open, we need to close the storage object when we're done.
771 repo, err := git.Open(s, wt)
772 return repo, s, err
773}
774
// noopCloser provides a Close method that does nothing, for repositories that
// hold no resources needing an explicit release.
type noopCloser struct{}

// Close does nothing and always returns nil.
func (noopCloser) Close() error {
	return nil
}
778
779func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
780 ignoreFile, err := tree.File(ignore.IgnoreFile)
781 if err == object.ErrFileNotFound {
782 return &ignore.Matcher{}, nil
783 }
784 if err != nil {
785 return nil, err
786 }
787 content, err := ignoreFile.Contents()
788 if err != nil {
789 return nil, err
790 }
791 return ignore.ParseIgnoreFile(strings.NewReader(content))
792}
793
794// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
795// a build.Builder instance for generating a delta build.
796type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
797
798// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
799// a build.Builder instance for generating a normal build.
800type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)
801
802type gitIndexConfig struct {
803 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
804 // prepare the build.Builder instance for generating a delta build.
805 //
806 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
807 prepareDeltaBuild prepareDeltaBuildFunc
808
809 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
810 // prepare the build.Builder instance for generating a normal build.
811 //
812 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
813 prepareNormalBuild prepareNormalBuildFunc
814}
815
816func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
817 if options.Submodules {
818 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
819 }
820
821 // discover what commits we indexed during our last build
822 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
823 if err != nil {
824 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
825 }
826
827 if !ok {
828 return nil, nil, nil, fmt.Errorf("no existing shards found for repository")
829 }
830
831 if options.DeltaShardNumberFallbackThreshold > 0 {
832 // HACK: For our interim compaction strategy, we force a full normal index once
833 // the number of shards on disk for this repository exceeds the provided threshold.
834 //
835 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
836 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
837 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
838 // while we create a better compaction strategy).
839
840 oldShards := options.BuildOptions.FindAllShards()
841 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
842 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
843 }
844 }
845
846 // Check to see if the set of branch names is consistent with what we last indexed.
847 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
848 // normal one).
849
850 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
851 var existingBranchNames []string
852 for _, b := range existingRepository.Branches {
853 existingBranchNames = append(existingBranchNames, b.Name)
854 }
855
856 var optionsBranchNames []string
857 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
858 optionsBranchNames = append(optionsBranchNames, b.Name)
859 }
860
861 existingBranchList := strings.Join(existingBranchNames, ", ")
862 optionsBranchList := strings.Join(optionsBranchNames, ", ")
863
864 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
865 }
866
867 // Check if the build options hash does not match the repository metadata's hash
868 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build
869 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
870 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
871 }
872
873 // branch => (path, sha1) => repo.
874 repos = map[fileKey]BlobLocation{}
875
876 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
877 if err != nil {
878 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
879 }
880
881 // branch name -> git worktree at most current commit
882 branchToCurrentTree := make(map[string]*object.Tree, len(branches))
883
884 for _, b := range branches {
885 commit, err := getCommit(repository, options.BranchPrefix, b)
886 if err != nil {
887 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
888 }
889
890 tree, err := commit.Tree()
891 if err != nil {
892 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
893 }
894
895 branchToCurrentTree[b] = tree
896 }
897
898 rawURL := options.BuildOptions.RepositoryDescription.URL
899 u, err := url.Parse(rawURL)
900 if err != nil {
901 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
902 }
903
904 // TODO: Support repository submodules for delta builds
905
906 // loop over all branches, calculate the diff between our
907 // last indexed commit and the current commit, and add files mentioned in the diff
908 for _, branch := range existingRepository.Branches {
909 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
910 if err != nil {
911 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
912 }
913
914 lastIndexedTree, err := lastIndexedCommit.Tree()
915 if err != nil {
916 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
917 }
918
919 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
920 if err != nil {
921 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
922 }
923
924 for i, c := range changes {
925 oldFile, newFile, err := c.Files()
926 if err != nil {
927 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
928 }
929
930 if newFile != nil {
931 // note: newFile.Name could be a path that isn't relative to the repository root - using the
932 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
933 newFileRelativeRootPath := c.To.Name
934
935 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
936 if newFileRelativeRootPath == ignore.IgnoreFile {
937 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
938 }
939
940 // either file is added or renamed, so we need to add the new version to the build
941 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
942 if existing, ok := repos[file]; ok {
943 existing.Branches = append(existing.Branches, branch.Name)
944 repos[file] = existing
945 } else {
946 repos[file] = BlobLocation{
947 GitRepo: repository,
948 URL: u,
949 Branches: []string{branch.Name},
950 }
951 }
952 }
953
954 if oldFile == nil {
955 // file added - nothing more to do
956 continue
957 }
958
959 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
960 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
961 oldFileRelativeRootPath := c.From.Name
962
963 if oldFileRelativeRootPath == ignore.IgnoreFile {
964 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
965 }
966
967 // The file is either modified or deleted. So, we need to add ALL versions
968 // of the old file (across all branches) to the build.
969 for b, currentTree := range branchToCurrentTree {
970 f, err := currentTree.File(oldFileRelativeRootPath)
971 if err != nil {
972 // the file doesn't exist in this branch
973 if errors.Is(err, object.ErrFileNotFound) {
974 continue
975 }
976
977 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
978 }
979
980 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
981 if existing, ok := repos[file]; ok {
982 existing.Branches = append(existing.Branches, b)
983 repos[file] = existing
984 } else {
985 repos[file] = BlobLocation{
986 GitRepo: repository,
987 URL: u,
988 Branches: []string{b},
989 }
990 }
991 }
992
993 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
994 }
995 }
996
997 // we need to de-duplicate the branch map before returning it - it's possible for the same
998 // branch to have been added multiple times if a file has been modified across multiple commits
999 for _, info := range repos {
1000 sort.Strings(info.Branches)
1001 info.Branches = uniq(info.Branches)
1002 }
1003
1004 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
1005 // for the same reasoning as above
1006 sort.Strings(changedOrDeletedPaths)
1007 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
1008
1009 return repos, nil, changedOrDeletedPaths, nil
1010}
1011
1012func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
1013 var repoCache *RepoCache
1014 if options.Submodules && options.RepoCacheDir != "" {
1015 repoCache = NewRepoCache(options.RepoCacheDir)
1016 }
1017 return prepareNormalBuildRecurse(options, repository, repoCache, false)
1018}
1019
1020func prepareNormalBuildRecurse(options Options, repository *git.Repository, repoCache *RepoCache, isSubrepo bool) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
1021 // Branch => Repo => SHA1
1022 branchVersions = map[string]map[string]plumbing.Hash{}
1023
1024 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
1025 if err != nil {
1026 return nil, nil, fmt.Errorf("expandBranches: %w", err)
1027 }
1028
1029 repoURL := options.BuildOptions.RepositoryDescription.URL
1030
1031 if isSubrepo {
1032 cfg, err := repository.Config()
1033 if err != nil {
1034 return nil, nil, fmt.Errorf("unable to get repository config: %w", err)
1035 }
1036
1037 u, err := normalizeSubmoduleRemoteURL(cfg)
1038 if err != nil {
1039 return nil, nil, fmt.Errorf("failed to identify subrepository URL: %w", err)
1040 }
1041 repoURL = u
1042 }
1043
1044 rw := NewRepoWalker(repository, repoURL, repoCache)
1045 for _, b := range branches {
1046 commit, err := getCommit(repository, options.BranchPrefix, b)
1047 if err != nil {
1048 if options.AllowMissingBranch && err.Error() == "reference not found" {
1049 continue
1050 }
1051
1052 return nil, nil, fmt.Errorf("getCommit: %w", err)
1053 }
1054
1055 tree, err := commit.Tree()
1056 if err != nil {
1057 return nil, nil, fmt.Errorf("commit.Tree: %w", err)
1058 }
1059
1060 ig, err := newIgnoreMatcher(tree)
1061 if err != nil {
1062 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
1063 }
1064
1065 subVersions, err := rw.CollectFiles(tree, b, ig)
1066 if err != nil {
1067 return nil, nil, fmt.Errorf("CollectFiles: %w", err)
1068 }
1069
1070 branchVersions[b] = subVersions
1071 }
1072
1073 // Index submodules using go-git if we didn't do so using the repo cache
1074 if options.Submodules && options.RepoCacheDir == "" {
1075 worktree, err := repository.Worktree()
1076 if err != nil {
1077 return nil, nil, fmt.Errorf("failed to get repository worktree: %w", err)
1078 }
1079
1080 submodules, err := worktree.Submodules()
1081 if err != nil {
1082 return nil, nil, fmt.Errorf("failed to get submodules: %w", err)
1083 }
1084
1085 for _, submodule := range submodules {
1086 subRepository, err := submodule.Repository()
1087 if err != nil {
1088 log.Printf("failed to open submodule repository: %s, %s", submodule.Config().Name, err)
1089 continue
1090 }
1091
1092 sw, subVersions, err := prepareNormalBuildRecurse(options, subRepository, repoCache, true)
1093 if err != nil {
1094 log.Printf("failed to index submodule repository: %s, %s", submodule.Config().Name, err)
1095 continue
1096 }
1097
1098 log.Printf("adding subrepository files from: %s", submodule.Config().Name)
1099
1100 for k, repo := range sw {
1101 rw.Files[fileKey{
1102 SubRepoPath: filepath.Join(submodule.Config().Path, k.SubRepoPath),
1103 Path: k.Path,
1104 ID: k.ID,
1105 }] = repo
1106 }
1107
1108 for k, v := range subVersions {
1109 branchVersions[filepath.Join(submodule.Config().Path, k)] = v
1110 }
1111 }
1112 }
1113
1114 return rw.Files, branchVersions, nil
1115}
1116
1117func createDocument(key fileKey,
1118 repos map[fileKey]BlobLocation,
1119 opts index.Options,
1120) (index.Document, error) {
1121 repo := repos[key]
1122 blob, err := repo.GitRepo.BlobObject(key.ID)
1123 branches := repos[key].Branches
1124
1125 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
1126 if errors.Is(err, plumbing.ErrObjectNotFound) {
1127 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil
1128 }
1129
1130 if err != nil {
1131 return index.Document{}, err
1132 }
1133
1134 keyFullPath := key.FullPath()
1135 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
1136 return skippedDoc(key, branches, index.SkipReasonTooLarge), nil
1137 }
1138
1139 contents, err := blobContents(blob)
1140 if err != nil {
1141 return index.Document{}, err
1142 }
1143
1144 return index.Document{
1145 SubRepositoryPath: key.SubRepoPath,
1146 Name: keyFullPath,
1147 Content: contents,
1148 Branches: branches,
1149 }, nil
1150}
1151
1152// skippedDoc creates a Document placeholder for a blob that was not indexed.
1153func skippedDoc(key fileKey, branches []string, reason index.SkipReason) index.Document {
1154 return index.Document{
1155 SkipReason: reason,
1156 Name: key.FullPath(),
1157 Branches: branches,
1158 SubRepositoryPath: key.SubRepoPath,
1159 }
1160}
1161
1162func blobContents(blob *object.Blob) ([]byte, error) {
1163 r, err := blob.Reader()
1164 if err != nil {
1165 return nil, err
1166 }
1167 defer r.Close()
1168
1169 var buf bytes.Buffer
1170 buf.Grow(int(blob.Size))
1171 _, err = buf.ReadFrom(r)
1172 if err != nil {
1173 return nil, err
1174 }
1175 return buf.Bytes(), nil
1176}
1177
// uniq collapses runs of equal adjacent strings in ss down to a single
// element. The result is built in place, so it shares (and overwrites) the
// backing array of ss; callers that want all duplicates removed must sort ss
// first.
func uniq(ss []string) []string {
	kept := ss[:0]
	for _, s := range ss {
		// The last kept element always equals the previous input element,
		// so comparing against it detects adjacent repeats.
		if n := len(kept); n > 0 && kept[n-1] == s {
			continue
		}
		kept = append(kept, s)
	}
	return kept
}
1188}