fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "cmp"
21 "context"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34
35 "github.com/go-git/go-billy/v5/osfs"
36 "github.com/go-git/go-git/v5/config"
37 "github.com/go-git/go-git/v5/plumbing"
38 "github.com/go-git/go-git/v5/plumbing/cache"
39 "github.com/go-git/go-git/v5/plumbing/object"
40 "github.com/go-git/go-git/v5/storage/filesystem"
41 "github.com/sourcegraph/zoekt"
42 "github.com/sourcegraph/zoekt/ignore"
43 "github.com/sourcegraph/zoekt/index"
44
45 git "github.com/go-git/go-git/v5"
46)
47
// FindGitRepos returns the git directories located at or below dir.
//
// Two layouts are recognised:
//   - a directory that contains a ".git" directory (non-bare checkout); the
//     ".git" directory itself is reported, and
//   - a directory whose name ends in ".git" and that contains an "objects"
//     directory (bare repository); the directory is reported.
//
// Returned paths are absolute and include dir. Once a repository has been
// found the walk does not descend into it. Entries that cannot be read are
// skipped silently, so the search is best-effort.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	// isDir reports whether path exists and is a directory. os.Lstat is used,
	// so a symlink at path itself is not followed.
	isDir := func(path string) bool {
		fi, err := os.Lstat(path)
		return err == nil && fi.IsDir()
	}

	var dirs []string
	// filepath.WalkDir is preferred over filepath.Walk because it does not
	// need to lstat every visited entry.
	err = filepath.WalkDir(root, func(name string, d os.DirEntry, err error) error {
		// Best-effort: ignore entries the walk failed to read.
		if err != nil {
			return nil
		}

		// Non-bare checkout: name/.git is a directory.
		if gitDir := filepath.Join(name, ".git"); isDir(gitDir) {
			dirs = append(dirs, gitDir)
			return filepath.SkipDir
		}

		// Bare repository: a "*.git" directory holding an "objects" directory.
		if !strings.HasSuffix(name, ".git") || !d.IsDir() {
			return nil
		}
		if !isDir(filepath.Join(name, "objects")) {
			return nil
		}

		dirs = append(dirs, name)
		return filepath.SkipDir
	})
	if err != nil {
		return nil, err
	}

	return dirs, nil
}
86
87// setTemplates fills in URL templates for known git hosting
88// sites.
89func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
90 if u.Scheme == "ssh+git" {
91 u.Scheme = "https"
92 u.User = nil
93 }
94
95 // helper to generate u.JoinPath as a template
96 varVersion := ".Version"
97 varPath := ".Path"
98 urlJoinPath := func(elem ...string) string {
99 elem = append([]string{u.String()}, elem...)
100 var parts []string
101 for _, e := range elem {
102 if e == varVersion || e == varPath {
103 parts = append(parts, e)
104 } else {
105 parts = append(parts, strconv.Quote(e))
106 }
107 }
108 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " "))
109 }
110
111 repo.URL = u.String()
112 switch typ {
113 case "gitiles":
114 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
115 repo.CommitURLTemplate = urlJoinPath("+", varVersion)
116 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath)
117 repo.LineFragmentTemplate = "#{{.LineNumber}}"
118 case "github":
119 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
120 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
121 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath)
122 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
123 case "cgit":
124 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
125 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}"
126 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}"
127 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
128 case "gitweb":
129 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
130 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
131 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
132 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
133 case "source.bazel.build":
134 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
136 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}"
137 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}"
138 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
139 case "bitbucket-server":
140 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
142 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
143 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}"
144 repo.LineFragmentTemplate = "#{{.LineNumber}}"
145 case "gitlab":
146 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
147 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
148 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion)
149 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath)
150 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
151 case "gitea":
152 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
153 // NOTE The `display=source` query parameter is required to disable file rendering.
154 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
155 // a line without `display=source`. This is supported since gitea 1.17.0.
156 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
157 // but the query parameters are obmitted.
158 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source"
159 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
160 default:
161 return fmt.Errorf("URL scheme type %q unknown", typ)
162 }
163 return nil
164}
165
166// getCommit returns a tree object for the given reference.
167func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
168 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
169 // ref might be a branch name (e.g. "master") add branch prefix and try again.
170 if err != nil {
171 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
172 }
173 if err != nil {
174 return nil, err
175 }
176
177 commitObj, err := repo.CommitObject(*sha1)
178 if err != nil {
179 return nil, err
180 }
181 return commitObj, nil
182}
183
184func configLookupRemoteURL(cfg *config.Config, key string) string {
185 rc := cfg.Remotes[key]
186 if rc == nil || len(rc.URLs) == 0 {
187 return ""
188 }
189 return rc.URLs[0]
190}
191
192var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
193
194func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
195 repo, err := git.PlainOpen(repoDir)
196 if err != nil {
197 return err
198 }
199
200 cfg, err := repo.Config()
201 if err != nil {
202 return err
203 }
204
205 sec := cfg.Raw.Section("zoekt")
206
207 webURLStr := sec.Options.Get("web-url")
208 webURLType := sec.Options.Get("web-url-type")
209
210 if webURLType != "" && webURLStr != "" {
211 webURL, err := url.Parse(webURLStr)
212 if err != nil {
213 return err
214 }
215 if err := setTemplates(desc, webURL, webURLType); err != nil {
216 return err
217 }
218 } else if webURLStr != "" {
219 desc.URL = webURLStr
220 }
221
222 name := sec.Options.Get("name")
223 if name != "" {
224 desc.Name = name
225 } else {
226 remoteURL := configLookupRemoteURL(cfg, "origin")
227 if remoteURL == "" {
228 return nil
229 }
230 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
231 user := sm[1]
232 host := sm[2]
233 path := sm[3]
234
235 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
236 }
237
238 u, err := url.Parse(remoteURL)
239 if err != nil {
240 return err
241 }
242 if err := SetTemplatesFromOrigin(desc, u); err != nil {
243 return err
244 }
245 }
246
247 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
248 desc.ID = uint32(id)
249
250 if desc.RawConfig == nil {
251 desc.RawConfig = map[string]string{}
252 }
253 for _, o := range sec.Options {
254 desc.RawConfig[o.Key] = o.Value
255 }
256
257 // Ranking info.
258
259 // Github:
260 traction := 0
261 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
262 f, err := strconv.Atoi(sec.Options.Get(s))
263 if err == nil {
264 traction += f
265 }
266 }
267
268 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
269 // Pretend everything on googlesource.com has 1000
270 // github stars.
271 traction = 1000
272 }
273
274 if traction > 0 {
275 l := math.Log(float64(traction))
276 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
277 }
278
279 return nil
280}
281
282// SetTemplatesFromOrigin fills in templates based on the origin URL.
283func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
284 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
285
286 if strings.HasSuffix(u.Host, ".googlesource.com") {
287 return setTemplates(desc, u, "gitiles")
288 } else if u.Host == "github.com" {
289 u.Path = strings.TrimSuffix(u.Path, ".git")
290 return setTemplates(desc, u, "github")
291 } else {
292 return fmt.Errorf("unknown git hosting site %q", u)
293 }
294}
295
296// The Options structs controls details of the indexing process.
297type Options struct {
298 // The repository to be indexed.
299 RepoDir string
300
301 // If set, follow submodule links. This requires RepoCacheDir to be set.
302 Submodules bool
303
304 // If set, skip indexing if the existing index shard is newer
305 // than the refs in the repository.
306 Incremental bool
307
308 // Don't error out if some branch is missing
309 AllowMissingBranch bool
310
311 // Specifies the root of a Repository cache. Needed for submodule indexing.
312 RepoCacheDir string
313
314 // Indexing options.
315 BuildOptions index.Options
316
317 // Prefix of the branch to index, e.g. `remotes/origin`.
318 BranchPrefix string
319
320 // List of branch names to index, e.g. []string{"HEAD", "stable"}
321 Branches []string
322
323 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
324 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
325 // then a normal build will be performed instead.
326 //
327 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
328 // a delta build will always be performed regardless of the number of preexisting shards.
329 DeltaShardNumberFallbackThreshold uint64
330}
331
332func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
333 var result []string
334 for _, b := range bs {
335 // Sourcegraph: We disable resolving refs. We want to return the exact ref
336 // requested so we can match it up.
337 if b == "HEAD" && false {
338 ref, err := repo.Head()
339 if err != nil {
340 return nil, err
341 }
342
343 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
344 continue
345 }
346
347 if strings.Contains(b, "*") {
348 iter, err := repo.Branches()
349 if err != nil {
350 return nil, err
351 }
352
353 defer iter.Close()
354 for {
355 ref, err := iter.Next()
356 if err == io.EOF {
357 break
358 }
359 if err != nil {
360 return nil, err
361 }
362
363 name := ref.Name().Short()
364 if matched, err := filepath.Match(b, name); err != nil {
365 return nil, err
366 } else if !matched {
367 continue
368 }
369
370 result = append(result, strings.TrimPrefix(name, prefix))
371 }
372 continue
373 }
374
375 result = append(result, b)
376 }
377
378 return result, nil
379}
380
381// IndexGitRepo indexes the git repository as specified by the options.
382// The returned bool indicates whether the index was updated as a result. This
383// can be informative if doing incremental indexing.
384func IndexGitRepo(opts Options) (bool, error) {
385 return indexGitRepo(opts, gitIndexConfig{})
386}
387
388// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
389// The returned bool indicates whether the index was updated as a result. This
390// can be informative if doing incremental indexing.
391func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
392 prepareDeltaBuild := prepareDeltaBuild
393 if config.prepareDeltaBuild != nil {
394 prepareDeltaBuild = config.prepareDeltaBuild
395 }
396
397 prepareNormalBuild := prepareNormalBuild
398 if config.prepareNormalBuild != nil {
399 prepareNormalBuild = config.prepareNormalBuild
400 }
401
402 // Set max thresholds, since we use them in this function.
403 opts.BuildOptions.SetDefaults()
404 if opts.RepoDir == "" {
405 return false, fmt.Errorf("gitindex: must set RepoDir")
406 }
407
408 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
409
410 var repo *git.Repository
411 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false")
412 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil {
413 repo, err = git.PlainOpen(opts.RepoDir)
414 if err != nil {
415 return false, fmt.Errorf("git.PlainOpen: %w", err)
416 }
417 } else {
418 var repoCloser io.Closer
419 repo, repoCloser, err = openRepo(opts.RepoDir)
420 if err != nil {
421 return false, fmt.Errorf("openRepo: %w", err)
422 }
423 defer repoCloser.Close()
424 }
425
426 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
427 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
428 }
429
430 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
431 if err != nil {
432 return false, fmt.Errorf("expandBranches: %w", err)
433 }
434 for _, b := range branches {
435 commit, err := getCommit(repo, opts.BranchPrefix, b)
436 if err != nil {
437 if opts.AllowMissingBranch && err.Error() == "reference not found" {
438 continue
439 }
440
441 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
442 }
443
444 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
445 Name: b,
446 Version: commit.Hash.String(),
447 })
448
449 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
450 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
451 }
452 }
453
454 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
455 return false, nil
456 }
457
458 // branch => (path, sha1) => repo.
459 var repos map[fileKey]BlobLocation
460
461 // Branch => Repo => SHA1
462 var branchVersions map[string]map[string]plumbing.Hash
463
464 // set of file paths that have been changed or deleted since
465 // the last indexed commit
466 //
467 // These only have an effect on delta builds
468 var changedOrRemovedFiles []string
469
470 if opts.BuildOptions.IsDelta {
471 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
472 if err != nil {
473 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
474 opts.BuildOptions.IsDelta = false
475 }
476 }
477
478 if !opts.BuildOptions.IsDelta {
479 repos, branchVersions, err = prepareNormalBuild(opts, repo)
480 if err != nil {
481 return false, fmt.Errorf("preparing normal build: %w", err)
482 }
483 }
484
485 reposByPath := map[string]BlobLocation{}
486 for key, info := range repos {
487 reposByPath[key.SubRepoPath] = info
488 }
489
490 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
491 for path, info := range reposByPath {
492 tpl := opts.BuildOptions.RepositoryDescription
493 if path != "" {
494 tpl = zoekt.Repository{URL: info.URL.String()}
495 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
496 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
497 }
498 }
499 opts.BuildOptions.SubRepositories[path] = &tpl
500 }
501
502 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
503 for path, repo := range opts.BuildOptions.SubRepositories {
504 id := branchVersions[br.Name][path]
505 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
506 Name: br.Name,
507 Version: id.String(),
508 })
509 }
510 }
511
512 builder, err := index.NewBuilder(opts.BuildOptions)
513 if err != nil {
514 return false, fmt.Errorf("build.NewBuilder: %w", err)
515 }
516
517 // Preparing the build can consume substantial memory, so check usage before starting to index.
518 builder.CheckMemoryUsage()
519
520 // we don't need to check error, since we either already have an error, or
521 // we returning the first call to builder.Finish.
522 defer builder.Finish() // nolint:errcheck
523
524 for _, f := range changedOrRemovedFiles {
525 builder.MarkFileAsChangedOrRemoved(f)
526 }
527
528 var names []string
529 fileKeys := map[string][]fileKey{}
530 totalFiles := 0
531
532 for key := range repos {
533 n := key.FullPath()
534 fileKeys[n] = append(fileKeys[n], key)
535 names = append(names, n)
536 totalFiles++
537 }
538
539 sort.Strings(names)
540 names = uniq(names)
541
542 log.Printf("attempting to index %d total files", totalFiles)
543 for idx, name := range names {
544 keys := fileKeys[name]
545
546 for _, key := range keys {
547 doc, err := createDocument(key, repos, opts.BuildOptions)
548 if err != nil {
549 return false, err
550 }
551
552 if err := builder.Add(doc); err != nil {
553 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
554 }
555
556 if idx%10_000 == 0 {
557 builder.CheckMemoryUsage()
558 }
559 }
560 }
561 return true, builder.Finish()
562}
563
564// openRepo opens a git repository in a way that's optimized for indexing.
565//
566// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options.
567func openRepo(repoDir string) (*git.Repository, io.Closer, error) {
568 fs := osfs.New(repoDir)
569
570 // Check if the root directory exists.
571 if _, err := fs.Stat(""); err != nil {
572 if os.IsNotExist(err) {
573 return nil, nil, git.ErrRepositoryNotExists
574 }
575 return nil, nil, err
576 }
577
578 // If there's a .git directory, use that as the new root.
579 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() {
580 if fs, err = fs.Chroot(git.GitDirName); err != nil {
581 return nil, nil, fmt.Errorf("fs.Chroot: %w", err)
582 }
583 }
584
585 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{
586 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access
587 KeepDescriptors: true,
588 })
589
590 // Because we're keeping descriptors open, we need to close the storage object when we're done.
591 repo, err := git.Open(s, fs)
592 return repo, s, err
593}
594
595func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
596 ignoreFile, err := tree.File(ignore.IgnoreFile)
597 if err == object.ErrFileNotFound {
598 return &ignore.Matcher{}, nil
599 }
600 if err != nil {
601 return nil, err
602 }
603 content, err := ignoreFile.Contents()
604 if err != nil {
605 return nil, err
606 }
607 return ignore.ParseIgnoreFile(strings.NewReader(content))
608}
609
610// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
611// a build.Builder instance for generating a delta build.
612type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
613
614// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
615// a build.Builder instance for generating a normal build.
616type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)
617
618type gitIndexConfig struct {
619 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
620 // prepare the build.Builder instance for generating a delta build.
621 //
622 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
623 prepareDeltaBuild prepareDeltaBuildFunc
624
625 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
626 // prepare the build.Builder instance for generating a normal build.
627 //
628 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
629 prepareNormalBuild prepareNormalBuildFunc
630}
631
632func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
633 if options.Submodules {
634 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
635 }
636
637 // discover what commits we indexed during our last build
638 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
639 if err != nil {
640 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
641 }
642
643 if !ok {
644 return nil, nil, nil, fmt.Errorf("no existing shards found for repository")
645 }
646
647 if options.DeltaShardNumberFallbackThreshold > 0 {
648 // HACK: For our interim compaction strategy, we force a full normal index once
649 // the number of shards on disk for this repository exceeds the provided threshold.
650 //
651 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
652 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
653 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
654 // while we create a better compaction strategy).
655
656 oldShards := options.BuildOptions.FindAllShards()
657 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
658 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
659 }
660 }
661
662 // Check to see if the set of branch names is consistent with what we last indexed.
663 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
664 // normal one).
665
666 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
667 var existingBranchNames []string
668 for _, b := range existingRepository.Branches {
669 existingBranchNames = append(existingBranchNames, b.Name)
670 }
671
672 var optionsBranchNames []string
673 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
674 optionsBranchNames = append(optionsBranchNames, b.Name)
675 }
676
677 existingBranchList := strings.Join(existingBranchNames, ", ")
678 optionsBranchList := strings.Join(optionsBranchNames, ", ")
679
680 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
681 }
682
683 // Check if the build options hash does not match the repository metadata's hash
684 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build
685 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
686 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
687 }
688
689 // branch => (path, sha1) => repo.
690 repos = map[fileKey]BlobLocation{}
691
692 // branch name -> git worktree at most current commit
693 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))
694
695 for _, b := range options.Branches {
696 commit, err := getCommit(repository, options.BranchPrefix, b)
697 if err != nil {
698 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
699 }
700
701 tree, err := commit.Tree()
702 if err != nil {
703 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
704 }
705
706 branchToCurrentTree[b] = tree
707 }
708
709 rawURL := options.BuildOptions.RepositoryDescription.URL
710 u, err := url.Parse(rawURL)
711 if err != nil {
712 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
713 }
714
715 // TODO: Support repository submodules for delta builds
716
717 // loop over all branches, calculate the diff between our
718 // last indexed commit and the current commit, and add files mentioned in the diff
719 for _, branch := range existingRepository.Branches {
720 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
721 if err != nil {
722 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
723 }
724
725 lastIndexedTree, err := lastIndexedCommit.Tree()
726 if err != nil {
727 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
728 }
729
730 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
731 if err != nil {
732 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
733 }
734
735 for i, c := range changes {
736 oldFile, newFile, err := c.Files()
737 if err != nil {
738 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
739 }
740
741 if newFile != nil {
742 // note: newFile.Name could be a path that isn't relative to the repository root - using the
743 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
744 newFileRelativeRootPath := c.To.Name
745
746 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
747 if newFileRelativeRootPath == ignore.IgnoreFile {
748 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
749 }
750
751 // either file is added or renamed, so we need to add the new version to the build
752 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
753 if existing, ok := repos[file]; ok {
754 existing.Branches = append(existing.Branches, branch.Name)
755 repos[file] = existing
756 } else {
757 repos[file] = BlobLocation{
758 GitRepo: repository,
759 URL: u,
760 Branches: []string{branch.Name},
761 }
762 }
763 }
764
765 if oldFile == nil {
766 // file added - nothing more to do
767 continue
768 }
769
770 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
771 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
772 oldFileRelativeRootPath := c.From.Name
773
774 if oldFileRelativeRootPath == ignore.IgnoreFile {
775 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
776 }
777
778 // The file is either modified or deleted. So, we need to add ALL versions
779 // of the old file (across all branches) to the build.
780 for b, currentTree := range branchToCurrentTree {
781 f, err := currentTree.File(oldFileRelativeRootPath)
782 if err != nil {
783 // the file doesn't exist in this branch
784 if errors.Is(err, object.ErrFileNotFound) {
785 continue
786 }
787
788 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
789 }
790
791 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
792 if existing, ok := repos[file]; ok {
793 existing.Branches = append(existing.Branches, b)
794 repos[file] = existing
795 } else {
796 repos[file] = BlobLocation{
797 GitRepo: repository,
798 URL: u,
799 Branches: []string{b},
800 }
801 }
802 }
803
804 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
805 }
806 }
807
808 // we need to de-duplicate the branch map before returning it - it's possible for the same
809 // branch to have been added multiple times if a file has been modified across multiple commits
810 for _, info := range repos {
811 sort.Strings(info.Branches)
812 info.Branches = uniq(info.Branches)
813 }
814
815 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
816 // for the same reasoning as above
817 sort.Strings(changedOrDeletedPaths)
818 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
819
820 return repos, nil, changedOrDeletedPaths, nil
821}
822
823func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
824 var repoCache *RepoCache
825 if options.Submodules {
826 repoCache = NewRepoCache(options.RepoCacheDir)
827 }
828
829 // Branch => Repo => SHA1
830 branchVersions = map[string]map[string]plumbing.Hash{}
831
832 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
833 if err != nil {
834 return nil, nil, fmt.Errorf("expandBranches: %w", err)
835 }
836
837 rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache)
838 for _, b := range branches {
839 commit, err := getCommit(repository, options.BranchPrefix, b)
840 if err != nil {
841 if options.AllowMissingBranch && err.Error() == "reference not found" {
842 continue
843 }
844
845 return nil, nil, fmt.Errorf("getCommit: %w", err)
846 }
847
848 tree, err := commit.Tree()
849 if err != nil {
850 return nil, nil, fmt.Errorf("commit.Tree: %w", err)
851 }
852
853 ig, err := newIgnoreMatcher(tree)
854 if err != nil {
855 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
856 }
857
858 subVersions, err := rw.CollectFiles(tree, b, ig)
859 if err != nil {
860 return nil, nil, fmt.Errorf("CollectFiles: %w", err)
861 }
862
863 branchVersions[b] = subVersions
864 }
865
866 return rw.Files, branchVersions, nil
867}
868
869func createDocument(key fileKey,
870 repos map[fileKey]BlobLocation,
871 opts index.Options,
872) (index.Document, error) {
873 repo := repos[key]
874 blob, err := repo.GitRepo.BlobObject(key.ID)
875 branches := repos[key].Branches
876
877 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
878 if errors.Is(err, plumbing.ErrObjectNotFound) {
879 return skippedLargeDoc(key, branches), nil
880 }
881
882 if err != nil {
883 return index.Document{}, err
884 }
885
886 keyFullPath := key.FullPath()
887 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
888 return skippedLargeDoc(key, branches), nil
889 }
890
891 contents, err := blobContents(blob)
892 if err != nil {
893 return index.Document{}, err
894 }
895
896 return index.Document{
897 SubRepositoryPath: key.SubRepoPath,
898 Name: keyFullPath,
899 Content: contents,
900 Branches: branches,
901 }, nil
902}
903
904func skippedLargeDoc(key fileKey, branches []string) index.Document {
905 return index.Document{
906 SkipReason: index.SkipReasonTooLarge,
907 Name: key.FullPath(),
908 Branches: branches,
909 SubRepositoryPath: key.SubRepoPath,
910 }
911}
912
913func blobContents(blob *object.Blob) ([]byte, error) {
914 r, err := blob.Reader()
915 if err != nil {
916 return nil, err
917 }
918 defer r.Close()
919
920 var buf bytes.Buffer
921 buf.Grow(int(blob.Size))
922 _, err = buf.ReadFrom(r)
923 if err != nil {
924 return nil, err
925 }
926 return buf.Bytes(), nil
927}
928
// uniq collapses runs of equal adjacent strings in ss into a single element.
// It works in place: the result shares the backing array of ss, and elements
// past the returned length are left as they were. Equal strings that are not
// adjacent are kept, so callers sort ss first to get a duplicate-free result.
func uniq(ss []string) []string {
	kept := 0
	for i, cur := range ss {
		// The last kept element always equals the previous input element, so
		// comparing against it detects adjacent duplicates.
		if i > 0 && cur == ss[kept-1] {
			continue
		}
		ss[kept] = cur
		kept++
	}
	return ss[:kept]
}
939}