fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "cmp"
21 "context"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34
35 "github.com/go-git/go-billy/v5/osfs"
36 "github.com/go-git/go-git/v5/config"
37 "github.com/go-git/go-git/v5/plumbing"
38 "github.com/go-git/go-git/v5/plumbing/cache"
39 "github.com/go-git/go-git/v5/plumbing/object"
40 "github.com/go-git/go-git/v5/storage/filesystem"
41
42 "github.com/sourcegraph/zoekt"
43 "github.com/sourcegraph/zoekt/ignore"
44 "github.com/sourcegraph/zoekt/index"
45
46 git "github.com/go-git/go-git/v5"
47)
48
// FindGitRepos finds directories holding git repositories below the
// given directory. It will find both bare and the ".git" dirs in
// non-bare repositories. It returns the full path including the dir
// passed in.
//
// The walk is best-effort: entries that cannot be read are skipped instead
// of aborting the walk. Once a repository is found it is not descended into.
// A non-directory entry (for example a symlink) whose target contains a
// ".git" directory is reported as well, without hiding its siblings.
func FindGitRepos(dir string) ([]string, error) {
	arg, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	var dirs []string
	walkFn := func(name string, fi os.FileInfo, err error) error {
		// Best-effort, ignore filepath.Walk failing
		if err != nil {
			return nil
		}

		// Non-bare repository: the entry contains a ".git" directory.
		gitDir := filepath.Join(name, ".git")
		if st, err := os.Lstat(gitDir); err == nil && st.IsDir() {
			dirs = append(dirs, gitDir)
			if !fi.IsDir() {
				// name is not a real directory (e.g. a symlink to a
				// checkout). Returning SkipDir for a non-directory makes
				// filepath.Walk skip the remaining entries of the parent
				// directory, which would hide sibling repositories.
				return nil
			}
			return filepath.SkipDir
		}

		if !fi.IsDir() || !strings.HasSuffix(name, ".git") {
			return nil
		}

		// Bare repository: a directory named "*.git" holding an "objects"
		// directory.
		if st, err := os.Lstat(filepath.Join(name, "objects")); err != nil || !st.IsDir() {
			return nil
		}

		dirs = append(dirs, name)
		return filepath.SkipDir
	}

	if err := filepath.Walk(arg, walkFn); err != nil {
		return nil, err
	}

	return dirs, nil
}
87
88// setTemplates fills in URL templates for known git hosting
89// sites.
90func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
91 if u.Scheme == "ssh+git" {
92 u.Scheme = "https"
93 u.User = nil
94 }
95
96 // helper to generate u.JoinPath as a template
97 varVersion := ".Version"
98 varPath := ".Path"
99 urlJoinPath := func(elem ...string) string {
100 elem = append([]string{u.String()}, elem...)
101 var parts []string
102 for _, e := range elem {
103 if e == varVersion || e == varPath {
104 parts = append(parts, e)
105 } else {
106 parts = append(parts, strconv.Quote(e))
107 }
108 }
109 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " "))
110 }
111
112 repo.URL = u.String()
113 switch typ {
114 case "gitiles":
115 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
116 repo.CommitURLTemplate = urlJoinPath("+", varVersion)
117 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath)
118 repo.LineFragmentTemplate = "#{{.LineNumber}}"
119 case "github":
120 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
121 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
122 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath)
123 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
124 case "cgit":
125 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
126 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}"
127 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}"
128 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
129 case "gitweb":
130 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
131 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
132 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
133 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
134 case "source.bazel.build":
135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
136 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
137 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}"
138 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}"
139 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
140 case "bitbucket-server":
141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
142 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
143 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
144 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}"
145 repo.LineFragmentTemplate = "#{{.LineNumber}}"
146 case "gitlab":
147 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
148 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
149 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion)
150 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath)
151 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
152 case "gitea":
153 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
154 // NOTE The `display=source` query parameter is required to disable file rendering.
155 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
156 // a line without `display=source`. This is supported since gitea 1.17.0.
157 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
158 // but the query parameters are obmitted.
159 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source"
160 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
161 default:
162 return fmt.Errorf("URL scheme type %q unknown", typ)
163 }
164 return nil
165}
166
167// getCommit returns a tree object for the given reference.
168func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
169 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
170 // ref might be a branch name (e.g. "master") add branch prefix and try again.
171 if err != nil {
172 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
173 }
174 if err != nil {
175 return nil, err
176 }
177
178 commitObj, err := repo.CommitObject(*sha1)
179 if err != nil {
180 return nil, err
181 }
182 return commitObj, nil
183}
184
185func configLookupRemoteURL(cfg *config.Config, key string) string {
186 rc := cfg.Remotes[key]
187 if rc == nil || len(rc.URLs) == 0 {
188 return ""
189 }
190 return rc.URLs[0]
191}
192
// sshRelativeURLRegexp matches scp-style remote URLs of the form
// "user@host:path". Submatches 1, 2 and 3 are the user, host and path; the
// callers in this file use them to rebuild the remote as an "ssh+git://" URL.
var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
194
195func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
196 repo, err := git.PlainOpen(repoDir)
197 if err != nil {
198 return err
199 }
200
201 cfg, err := repo.Config()
202 if err != nil {
203 return err
204 }
205
206 sec := cfg.Raw.Section("zoekt")
207
208 webURLStr := sec.Options.Get("web-url")
209 webURLType := sec.Options.Get("web-url-type")
210
211 if webURLType != "" && webURLStr != "" {
212 webURL, err := url.Parse(webURLStr)
213 if err != nil {
214 return err
215 }
216 if err := setTemplates(desc, webURL, webURLType); err != nil {
217 return err
218 }
219 } else if webURLStr != "" {
220 desc.URL = webURLStr
221 }
222
223 name := sec.Options.Get("name")
224 if name != "" {
225 desc.Name = name
226 } else {
227 remoteURL := configLookupRemoteURL(cfg, "origin")
228 if remoteURL == "" {
229 return nil
230 }
231 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
232 user := sm[1]
233 host := sm[2]
234 path := sm[3]
235
236 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
237 }
238
239 u, err := url.Parse(remoteURL)
240 if err != nil {
241 return err
242 }
243 if err := SetTemplatesFromOrigin(desc, u); err != nil {
244 return err
245 }
246 }
247
248 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
249 desc.ID = uint32(id)
250
251 desc.TenantID, _ = strconv.Atoi(sec.Options.Get("tenantID"))
252
253 if desc.RawConfig == nil {
254 desc.RawConfig = map[string]string{}
255 }
256 for _, o := range sec.Options {
257 desc.RawConfig[o.Key] = o.Value
258 }
259
260 // Ranking info.
261
262 // Github:
263 traction := 0
264 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
265 f, err := strconv.Atoi(sec.Options.Get(s))
266 if err == nil {
267 traction += f
268 }
269 }
270
271 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
272 // Pretend everything on googlesource.com has 1000
273 // github stars.
274 traction = 1000
275 }
276
277 if traction > 0 {
278 l := math.Log(float64(traction))
279 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
280 }
281
282 return nil
283}
284
285// This attempts to get a repo URL similar to the main repository template processing as in setTemplatesFromConfig()
286func normalizeSubmoduleRemoteURL(cfg *config.Config) (string, error) {
287 sec := cfg.Raw.Section("zoekt")
288 remoteURL := sec.Options.Get("web-url")
289 if remoteURL == "" {
290 // fall back to "origin" remote
291 remoteURL = configLookupRemoteURL(cfg, "origin")
292 if remoteURL == "" {
293 return "", nil
294 }
295 }
296
297 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
298 user := sm[1]
299 host := sm[2]
300 path := sm[3]
301
302 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
303 }
304
305 u, err := url.Parse(remoteURL)
306 if err != nil {
307 return "", fmt.Errorf("unable to parse remote URL %q: %w", remoteURL, err)
308 }
309
310 if u.Scheme == "ssh+git" {
311 u.Scheme = "https"
312 u.User = nil
313 }
314
315 // Assume we cannot build templates for this URL, leave it empty
316 if u.Scheme == "" {
317 return "", nil
318 }
319
320 return u.String(), nil
321}
322
323// SetTemplatesFromOrigin fills in templates based on the origin URL.
324func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
325 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
326
327 if strings.HasSuffix(u.Host, ".googlesource.com") {
328 return setTemplates(desc, u, "gitiles")
329 } else if u.Host == "github.com" {
330 u.Path = strings.TrimSuffix(u.Path, ".git")
331 return setTemplates(desc, u, "github")
332 } else {
333 return fmt.Errorf("unknown git hosting site %q", u)
334 }
335}
336
// Options controls the details of the indexing process.
type Options struct {
	// RepoDir is the path of the git repository to be indexed. It must be
	// set; indexing fails otherwise.
	RepoDir string

	// Submodules, if set, makes the indexer follow submodule links. When
	// RepoCacheDir is set, a repository cache rooted there is used for the
	// submodules; when it is empty, the submodules of the repository's
	// worktree are opened through go-git instead. Delta builds do not
	// support submodule indexing.
	Submodules bool

	// Incremental, if set, skips indexing if the existing index shard is
	// newer than the refs in the repository (as decided by
	// BuildOptions.IncrementalSkipIndexing).
	Incremental bool

	// AllowMissingBranch makes indexing skip, rather than fail on, a branch
	// whose lookup fails with "reference not found".
	AllowMissingBranch bool

	// RepoCacheDir specifies the root of a repository cache. It is only
	// consulted when Submodules is set.
	RepoCacheDir string

	// BuildOptions holds the indexing options.
	BuildOptions index.Options

	// BranchPrefix is the prefix of the branches to index, e.g.
	// `remotes/origin`. It is prepended to a branch name that does not
	// resolve on its own.
	BranchPrefix string

	// Branches is the list of branch names to index, e.g.
	// []string{"HEAD", "stable"}. Entries containing "*" are treated as glob
	// patterns and matched against the repository's branches.
	Branches []string

	// DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
	// that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
	// then a normal build will be performed instead.
	//
	// If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
	// a delta build will always be performed regardless of the number of preexisting shards.
	DeltaShardNumberFallbackThreshold uint64
}
372
373func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
374 var result []string
375 for _, b := range bs {
376 // Sourcegraph: We disable resolving refs. We want to return the exact ref
377 // requested so we can match it up.
378 if b == "HEAD" && false {
379 ref, err := repo.Head()
380 if err != nil {
381 return nil, err
382 }
383
384 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
385 continue
386 }
387
388 if strings.Contains(b, "*") {
389 iter, err := repo.Branches()
390 if err != nil {
391 return nil, err
392 }
393
394 defer iter.Close()
395 for {
396 ref, err := iter.Next()
397 if err == io.EOF {
398 break
399 }
400 if err != nil {
401 return nil, err
402 }
403
404 name := ref.Name().Short()
405 if matched, err := filepath.Match(b, name); err != nil {
406 return nil, err
407 } else if !matched {
408 continue
409 }
410
411 result = append(result, strings.TrimPrefix(name, prefix))
412 }
413 continue
414 }
415
416 result = append(result, b)
417 }
418
419 return result, nil
420}
421
422// IndexGitRepo indexes the git repository as specified by the options.
423// The returned bool indicates whether the index was updated as a result. This
424// can be informative if doing incremental indexing.
425func IndexGitRepo(opts Options) (bool, error) {
426 return indexGitRepo(opts, gitIndexConfig{})
427}
428
429// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
430// The returned bool indicates whether the index was updated as a result. This
431// can be informative if doing incremental indexing.
432func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
433 prepareDeltaBuild := prepareDeltaBuild
434 if config.prepareDeltaBuild != nil {
435 prepareDeltaBuild = config.prepareDeltaBuild
436 }
437
438 prepareNormalBuild := prepareNormalBuild
439 if config.prepareNormalBuild != nil {
440 prepareNormalBuild = config.prepareNormalBuild
441 }
442
443 // Set max thresholds, since we use them in this function.
444 opts.BuildOptions.SetDefaults()
445 if opts.RepoDir == "" {
446 return false, fmt.Errorf("gitindex: must set RepoDir")
447 }
448
449 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
450
451 var repo *git.Repository
452 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false")
453 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil {
454 repo, err = git.PlainOpen(opts.RepoDir)
455 if err != nil {
456 return false, fmt.Errorf("git.PlainOpen: %w", err)
457 }
458 } else {
459 var repoCloser io.Closer
460 repo, repoCloser, err = openRepo(opts.RepoDir)
461 if err != nil {
462 return false, fmt.Errorf("openRepo: %w", err)
463 }
464 defer repoCloser.Close()
465 }
466
467 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
468 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
469 }
470
471 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
472 if err != nil {
473 return false, fmt.Errorf("expandBranches: %w", err)
474 }
475 for _, b := range branches {
476 commit, err := getCommit(repo, opts.BranchPrefix, b)
477 if err != nil {
478 if opts.AllowMissingBranch && err.Error() == "reference not found" {
479 continue
480 }
481
482 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
483 }
484
485 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
486 Name: b,
487 Version: commit.Hash.String(),
488 })
489
490 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
491 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
492 }
493 }
494
495 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
496 return false, nil
497 }
498
499 // branch => (path, sha1) => repo.
500 var repos map[fileKey]BlobLocation
501
502 // Branch => Repo => SHA1
503 var branchVersions map[string]map[string]plumbing.Hash
504
505 // set of file paths that have been changed or deleted since
506 // the last indexed commit
507 //
508 // These only have an effect on delta builds
509 var changedOrRemovedFiles []string
510
511 if opts.BuildOptions.IsDelta {
512 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
513 if err != nil {
514 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
515 opts.BuildOptions.IsDelta = false
516 }
517 }
518
519 if !opts.BuildOptions.IsDelta {
520 repos, branchVersions, err = prepareNormalBuild(opts, repo)
521 if err != nil {
522 return false, fmt.Errorf("preparing normal build: %w", err)
523 }
524 }
525
526 reposByPath := map[string]BlobLocation{}
527 for key, info := range repos {
528 reposByPath[key.SubRepoPath] = info
529 }
530
531 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
532 for path, info := range reposByPath {
533 tpl := opts.BuildOptions.RepositoryDescription
534 if path != "" {
535 tpl = zoekt.Repository{URL: info.URL.String()}
536 if info.URL.String() != "" {
537 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
538 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
539 }
540 }
541 if tpl.Name == "" {
542 tpl.Name = path
543 }
544 }
545 opts.BuildOptions.SubRepositories[path] = &tpl
546 }
547
548 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
549 for path, repo := range opts.BuildOptions.SubRepositories {
550 id := branchVersions[br.Name][path]
551 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
552 Name: br.Name,
553 Version: id.String(),
554 })
555 }
556 }
557
558 builder, err := index.NewBuilder(opts.BuildOptions)
559 if err != nil {
560 return false, fmt.Errorf("build.NewBuilder: %w", err)
561 }
562
563 // Preparing the build can consume substantial memory, so check usage before starting to index.
564 builder.CheckMemoryUsage()
565
566 // we don't need to check error, since we either already have an error, or
567 // we returning the first call to builder.Finish.
568 defer builder.Finish() // nolint:errcheck
569
570 for _, f := range changedOrRemovedFiles {
571 builder.MarkFileAsChangedOrRemoved(f)
572 }
573
574 var names []string
575 fileKeys := map[string][]fileKey{}
576 totalFiles := 0
577
578 for key := range repos {
579 n := key.FullPath()
580 fileKeys[n] = append(fileKeys[n], key)
581 names = append(names, n)
582 totalFiles++
583 }
584
585 sort.Strings(names)
586 names = uniq(names)
587
588 log.Printf("attempting to index %d total files", totalFiles)
589 for idx, name := range names {
590 keys := fileKeys[name]
591
592 for _, key := range keys {
593 doc, err := createDocument(key, repos, opts.BuildOptions)
594 if err != nil {
595 return false, err
596 }
597
598 if err := builder.Add(doc); err != nil {
599 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
600 }
601
602 if idx%10_000 == 0 {
603 builder.CheckMemoryUsage()
604 }
605 }
606 }
607 return true, builder.Finish()
608}
609
610// openRepo opens a git repository in a way that's optimized for indexing.
611//
612// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options.
613func openRepo(repoDir string) (*git.Repository, io.Closer, error) {
614 fs := osfs.New(repoDir)
615 wt := fs
616
617 // Check if the root directory exists.
618 if _, err := fs.Stat(""); err != nil {
619 if os.IsNotExist(err) {
620 return nil, nil, git.ErrRepositoryNotExists
621 }
622 return nil, nil, err
623 }
624
625 // If there's a .git directory, use that as the new root.
626 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() {
627 if fs, err = fs.Chroot(git.GitDirName); err != nil {
628 return nil, nil, fmt.Errorf("fs.Chroot: %w", err)
629 }
630 }
631
632 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{
633 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access
634 KeepDescriptors: true,
635 })
636
637 // Because we're keeping descriptors open, we need to close the storage object when we're done.
638 repo, err := git.Open(s, wt)
639 return repo, s, err
640}
641
642func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
643 ignoreFile, err := tree.File(ignore.IgnoreFile)
644 if err == object.ErrFileNotFound {
645 return &ignore.Matcher{}, nil
646 }
647 if err != nil {
648 return nil, err
649 }
650 content, err := ignoreFile.Contents()
651 if err != nil {
652 return nil, err
653 }
654 return ignore.ParseIgnoreFile(strings.NewReader(content))
655}
656
// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// the builder created by index.NewBuilder for generating a delta build.
//
// It returns the blobs to index keyed by file, the per-branch versions, and
// the paths that changed or were deleted since the last indexed commit.
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
660
// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// the builder created by index.NewBuilder for generating a normal build.
//
// It returns the blobs to index keyed by file and the per-branch versions.
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)
664
// gitIndexConfig lets callers of indexGitRepo replace the functions that
// prepare a build. The zero value selects the package defaults.
type gitIndexConfig struct {
	// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the builder created by index.NewBuilder for generating a delta build.
	//
	// If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
	prepareDeltaBuild prepareDeltaBuildFunc

	// prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the builder created by index.NewBuilder for generating a normal build.
	//
	// If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
	prepareNormalBuild prepareNormalBuildFunc
}
678
679func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
680 if options.Submodules {
681 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
682 }
683
684 // discover what commits we indexed during our last build
685 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
686 if err != nil {
687 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
688 }
689
690 if !ok {
691 return nil, nil, nil, fmt.Errorf("no existing shards found for repository")
692 }
693
694 if options.DeltaShardNumberFallbackThreshold > 0 {
695 // HACK: For our interim compaction strategy, we force a full normal index once
696 // the number of shards on disk for this repository exceeds the provided threshold.
697 //
698 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
699 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
700 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
701 // while we create a better compaction strategy).
702
703 oldShards := options.BuildOptions.FindAllShards()
704 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
705 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
706 }
707 }
708
709 // Check to see if the set of branch names is consistent with what we last indexed.
710 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
711 // normal one).
712
713 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
714 var existingBranchNames []string
715 for _, b := range existingRepository.Branches {
716 existingBranchNames = append(existingBranchNames, b.Name)
717 }
718
719 var optionsBranchNames []string
720 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
721 optionsBranchNames = append(optionsBranchNames, b.Name)
722 }
723
724 existingBranchList := strings.Join(existingBranchNames, ", ")
725 optionsBranchList := strings.Join(optionsBranchNames, ", ")
726
727 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
728 }
729
730 // Check if the build options hash does not match the repository metadata's hash
731 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build
732 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
733 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
734 }
735
736 // branch => (path, sha1) => repo.
737 repos = map[fileKey]BlobLocation{}
738
739 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
740 if err != nil {
741 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
742 }
743
744 // branch name -> git worktree at most current commit
745 branchToCurrentTree := make(map[string]*object.Tree, len(branches))
746
747 for _, b := range branches {
748 commit, err := getCommit(repository, options.BranchPrefix, b)
749 if err != nil {
750 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
751 }
752
753 tree, err := commit.Tree()
754 if err != nil {
755 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
756 }
757
758 branchToCurrentTree[b] = tree
759 }
760
761 rawURL := options.BuildOptions.RepositoryDescription.URL
762 u, err := url.Parse(rawURL)
763 if err != nil {
764 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
765 }
766
767 // TODO: Support repository submodules for delta builds
768
769 // loop over all branches, calculate the diff between our
770 // last indexed commit and the current commit, and add files mentioned in the diff
771 for _, branch := range existingRepository.Branches {
772 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
773 if err != nil {
774 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
775 }
776
777 lastIndexedTree, err := lastIndexedCommit.Tree()
778 if err != nil {
779 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
780 }
781
782 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
783 if err != nil {
784 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
785 }
786
787 for i, c := range changes {
788 oldFile, newFile, err := c.Files()
789 if err != nil {
790 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
791 }
792
793 if newFile != nil {
794 // note: newFile.Name could be a path that isn't relative to the repository root - using the
795 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
796 newFileRelativeRootPath := c.To.Name
797
798 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
799 if newFileRelativeRootPath == ignore.IgnoreFile {
800 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
801 }
802
803 // either file is added or renamed, so we need to add the new version to the build
804 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
805 if existing, ok := repos[file]; ok {
806 existing.Branches = append(existing.Branches, branch.Name)
807 repos[file] = existing
808 } else {
809 repos[file] = BlobLocation{
810 GitRepo: repository,
811 URL: u,
812 Branches: []string{branch.Name},
813 }
814 }
815 }
816
817 if oldFile == nil {
818 // file added - nothing more to do
819 continue
820 }
821
822 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
823 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
824 oldFileRelativeRootPath := c.From.Name
825
826 if oldFileRelativeRootPath == ignore.IgnoreFile {
827 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
828 }
829
830 // The file is either modified or deleted. So, we need to add ALL versions
831 // of the old file (across all branches) to the build.
832 for b, currentTree := range branchToCurrentTree {
833 f, err := currentTree.File(oldFileRelativeRootPath)
834 if err != nil {
835 // the file doesn't exist in this branch
836 if errors.Is(err, object.ErrFileNotFound) {
837 continue
838 }
839
840 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
841 }
842
843 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
844 if existing, ok := repos[file]; ok {
845 existing.Branches = append(existing.Branches, b)
846 repos[file] = existing
847 } else {
848 repos[file] = BlobLocation{
849 GitRepo: repository,
850 URL: u,
851 Branches: []string{b},
852 }
853 }
854 }
855
856 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
857 }
858 }
859
860 // we need to de-duplicate the branch map before returning it - it's possible for the same
861 // branch to have been added multiple times if a file has been modified across multiple commits
862 for _, info := range repos {
863 sort.Strings(info.Branches)
864 info.Branches = uniq(info.Branches)
865 }
866
867 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
868 // for the same reasoning as above
869 sort.Strings(changedOrDeletedPaths)
870 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
871
872 return repos, nil, changedOrDeletedPaths, nil
873}
874
875func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
876 var repoCache *RepoCache
877 if options.Submodules && options.RepoCacheDir != "" {
878 repoCache = NewRepoCache(options.RepoCacheDir)
879 }
880 return prepareNormalBuildRecurse(options, repository, repoCache, false)
881}
882
883func prepareNormalBuildRecurse(options Options, repository *git.Repository, repoCache *RepoCache, isSubrepo bool) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
884 // Branch => Repo => SHA1
885 branchVersions = map[string]map[string]plumbing.Hash{}
886
887 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
888 if err != nil {
889 return nil, nil, fmt.Errorf("expandBranches: %w", err)
890 }
891
892 repoURL := options.BuildOptions.RepositoryDescription.URL
893
894 if isSubrepo {
895 cfg, err := repository.Config()
896 if err != nil {
897 return nil, nil, fmt.Errorf("unable to get repository config: %w", err)
898 }
899
900 u, err := normalizeSubmoduleRemoteURL(cfg)
901 if err != nil {
902 return nil, nil, fmt.Errorf("failed to identify subrepository URL: %w", err)
903 }
904 repoURL = u
905 }
906
907 rw := NewRepoWalker(repository, repoURL, repoCache)
908 for _, b := range branches {
909 commit, err := getCommit(repository, options.BranchPrefix, b)
910 if err != nil {
911 if options.AllowMissingBranch && err.Error() == "reference not found" {
912 continue
913 }
914
915 return nil, nil, fmt.Errorf("getCommit: %w", err)
916 }
917
918 tree, err := commit.Tree()
919 if err != nil {
920 return nil, nil, fmt.Errorf("commit.Tree: %w", err)
921 }
922
923 ig, err := newIgnoreMatcher(tree)
924 if err != nil {
925 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
926 }
927
928 subVersions, err := rw.CollectFiles(tree, b, ig)
929 if err != nil {
930 return nil, nil, fmt.Errorf("CollectFiles: %w", err)
931 }
932
933 branchVersions[b] = subVersions
934 }
935
936 // Index submodules using go-git if we didn't do so using the repo cache
937 if options.Submodules && options.RepoCacheDir == "" {
938 worktree, err := repository.Worktree()
939 if err != nil {
940 return nil, nil, fmt.Errorf("failed to get repository worktree: %w", err)
941 }
942
943 submodules, err := worktree.Submodules()
944 if err != nil {
945 return nil, nil, fmt.Errorf("failed to get submodules: %w", err)
946 }
947
948 for _, submodule := range submodules {
949 subRepository, err := submodule.Repository()
950 if err != nil {
951 log.Printf("failed to open submodule repository: %s, %s", submodule.Config().Name, err)
952 continue
953 }
954
955 sw, subVersions, err := prepareNormalBuildRecurse(options, subRepository, repoCache, true)
956 if err != nil {
957 log.Printf("failed to index submodule repository: %s, %s", submodule.Config().Name, err)
958 continue
959 }
960
961 log.Printf("adding subrepository files from: %s", submodule.Config().Name)
962
963 for k, repo := range sw {
964 rw.Files[fileKey{
965 SubRepoPath: filepath.Join(submodule.Config().Path, k.SubRepoPath),
966 Path: k.Path,
967 ID: k.ID,
968 }] = repo
969 }
970
971 for k, v := range subVersions {
972 branchVersions[filepath.Join(submodule.Config().Path, k)] = v
973 }
974 }
975 }
976
977 return rw.Files, branchVersions, nil
978}
979
980func createDocument(key fileKey,
981 repos map[fileKey]BlobLocation,
982 opts index.Options,
983) (index.Document, error) {
984 repo := repos[key]
985 blob, err := repo.GitRepo.BlobObject(key.ID)
986 branches := repos[key].Branches
987
988 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
989 if errors.Is(err, plumbing.ErrObjectNotFound) {
990 return skippedLargeDoc(key, branches), nil
991 }
992
993 if err != nil {
994 return index.Document{}, err
995 }
996
997 keyFullPath := key.FullPath()
998 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
999 return skippedLargeDoc(key, branches), nil
1000 }
1001
1002 contents, err := blobContents(blob)
1003 if err != nil {
1004 return index.Document{}, err
1005 }
1006
1007 return index.Document{
1008 SubRepositoryPath: key.SubRepoPath,
1009 Name: keyFullPath,
1010 Content: contents,
1011 Branches: branches,
1012 }, nil
1013}
1014
1015func skippedLargeDoc(key fileKey, branches []string) index.Document {
1016 return index.Document{
1017 SkipReason: index.SkipReasonTooLarge,
1018 Name: key.FullPath(),
1019 Branches: branches,
1020 SubRepositoryPath: key.SubRepoPath,
1021 }
1022}
1023
1024func blobContents(blob *object.Blob) ([]byte, error) {
1025 r, err := blob.Reader()
1026 if err != nil {
1027 return nil, err
1028 }
1029 defer r.Close()
1030
1031 var buf bytes.Buffer
1032 buf.Grow(int(blob.Size))
1033 _, err = buf.ReadFrom(r)
1034 if err != nil {
1035 return nil, err
1036 }
1037 return buf.Bytes(), nil
1038}
1039
// uniq collapses every run of adjacent equal strings in ss down to a single
// element. The compaction happens in place: the returned slice reuses the
// backing array of ss, so ss must not be relied upon afterwards. Only adjacent
// duplicates are removed, so callers that want global de-duplication need to
// sort ss first.
func uniq(ss []string) []string {
	if len(ss) == 0 {
		return ss[:0]
	}

	// ss[:kept] always holds the compacted prefix; the first element is kept
	// unconditionally.
	kept := 1
	for _, s := range ss[1:] {
		if s == ss[kept-1] {
			continue
		}
		ss[kept] = s
		kept++
	}
	return ss[:kept]
}