fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "context"
21 "encoding/json"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34
35 "github.com/sourcegraph/zoekt"
36 "github.com/sourcegraph/zoekt/build"
37 "github.com/sourcegraph/zoekt/ignore"
38
39 "github.com/go-git/go-git/v5/config"
40 "github.com/go-git/go-git/v5/plumbing"
41 "github.com/go-git/go-git/v5/plumbing/object"
42
43 git "github.com/go-git/go-git/v5"
44)
45
// FindGitRepos walks the tree rooted at dir and collects every git
// repository it encounters. Two layouts are recognised:
//
//   - a directory containing a ".git" subdirectory (non-bare checkout), for
//     which the path of the ".git" directory itself is reported;
//   - a directory whose name ends in ".git" and which contains an "objects"
//     subdirectory (bare repository), for which the directory is reported.
//
// Reported paths are absolute (dir is made absolute first). The walk does not
// descend into a directory once it has been reported. Errors met while
// walking individual entries are ignored on a best-effort basis.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	// isDir reports whether p exists and is a directory, without following
	// a symlink at p.
	isDir := func(p string) bool {
		st, statErr := os.Lstat(p)
		return statErr == nil && st.IsDir()
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best-effort: an entry that cannot be inspected is simply skipped.
		if walkErr != nil {
			return nil
		}

		// Non-bare checkout: record the nested .git directory.
		if dotGit := filepath.Join(p, ".git"); isDir(dotGit) {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository: "<something>.git" holding an objects directory.
		if info.IsDir() && strings.HasSuffix(p, ".git") && isDir(filepath.Join(p, "objects")) {
			found = append(found, p)
			return filepath.SkipDir
		}

		return nil
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
84
85// setTemplates fills in URL templates for known git hosting
86// sites.
87func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
88 if u.Scheme == "ssh+git" {
89 u.Scheme = "https"
90 u.User = nil
91 }
92
93 // helper to generate u.JoinPath as a template
94 varVersion := ".Version"
95 varPath := ".Path"
96 urlJoinPath := func(elem ...string) string {
97 elem = append([]string{u.String()}, elem...)
98 var parts []string
99 for _, e := range elem {
100 if e == varVersion || e == varPath {
101 parts = append(parts, e)
102 } else {
103 parts = append(parts, strconv.Quote(e))
104 }
105 }
106 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " "))
107 }
108
109 repo.URL = u.String()
110 switch typ {
111 case "gitiles":
112 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
113 repo.CommitURLTemplate = urlJoinPath("+", varVersion)
114 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath)
115 repo.LineFragmentTemplate = "#{{.LineNumber}}"
116 case "github":
117 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
118 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
119 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath)
120 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
121 case "cgit":
122 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
123 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}"
124 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}"
125 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
126 case "gitweb":
127 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
128 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
129 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
130 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
131 case "source.bazel.build":
132 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
133 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
134 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}"
135 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}"
136 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
137 case "bitbucket-server":
138 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
139 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
140 repo.CommitURLTemplate = urlJoinPath("commits", varVersion)
141 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}"
142 repo.LineFragmentTemplate = "#{{.LineNumber}}"
143 case "gitlab":
144 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
145 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
146 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion)
147 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath)
148 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
149 case "gitea":
150 repo.CommitURLTemplate = urlJoinPath("commit", varVersion)
151 // NOTE The `display=source` query parameter is required to disable file rendering.
152 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
153 // a line without `display=source`. This is supported since gitea 1.17.0.
154 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
155 // but the query parameters are obmitted.
156 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source"
157 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
158 default:
159 return fmt.Errorf("URL scheme type %q unknown", typ)
160 }
161 return nil
162}
163
164// getCommit returns a tree object for the given reference.
165func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
166 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
167 // ref might be a branch name (e.g. "master") add branch prefix and try again.
168 if err != nil {
169 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
170 }
171 if err != nil {
172 return nil, err
173 }
174
175 commitObj, err := repo.CommitObject(*sha1)
176 if err != nil {
177 return nil, err
178 }
179 return commitObj, nil
180}
181
182func configLookupRemoteURL(cfg *config.Config, key string) string {
183 rc := cfg.Remotes[key]
184 if rc == nil || len(rc.URLs) == 0 {
185 return ""
186 }
187 return rc.URLs[0]
188}
189
190var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
191
192func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
193 repo, err := git.PlainOpen(repoDir)
194 if err != nil {
195 return err
196 }
197
198 cfg, err := repo.Config()
199 if err != nil {
200 return err
201 }
202
203 sec := cfg.Raw.Section("zoekt")
204
205 webURLStr := sec.Options.Get("web-url")
206 webURLType := sec.Options.Get("web-url-type")
207
208 if webURLType != "" && webURLStr != "" {
209 webURL, err := url.Parse(webURLStr)
210 if err != nil {
211 return err
212 }
213 if err := setTemplates(desc, webURL, webURLType); err != nil {
214 return err
215 }
216 } else if webURLStr != "" {
217 desc.URL = webURLStr
218 }
219
220 name := sec.Options.Get("name")
221 if name != "" {
222 desc.Name = name
223 } else {
224 remoteURL := configLookupRemoteURL(cfg, "origin")
225 if remoteURL == "" {
226 return nil
227 }
228 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
229 user := sm[1]
230 host := sm[2]
231 path := sm[3]
232
233 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
234 }
235
236 u, err := url.Parse(remoteURL)
237 if err != nil {
238 return err
239 }
240 if err := SetTemplatesFromOrigin(desc, u); err != nil {
241 return err
242 }
243 }
244
245 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
246 desc.ID = uint32(id)
247
248 if desc.RawConfig == nil {
249 desc.RawConfig = map[string]string{}
250 }
251 for _, o := range sec.Options {
252 desc.RawConfig[o.Key] = o.Value
253 }
254
255 // Ranking info.
256
257 // Github:
258 traction := 0
259 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
260 f, err := strconv.Atoi(sec.Options.Get(s))
261 if err == nil {
262 traction += f
263 }
264 }
265
266 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
267 // Pretend everything on googlesource.com has 1000
268 // github stars.
269 traction = 1000
270 }
271
272 if traction > 0 {
273 l := math.Log(float64(traction))
274 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
275 }
276
277 return nil
278}
279
280// SetTemplatesFromOrigin fills in templates based on the origin URL.
281func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
282 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
283
284 if strings.HasSuffix(u.Host, ".googlesource.com") {
285 return setTemplates(desc, u, "gitiles")
286 } else if u.Host == "github.com" {
287 u.Path = strings.TrimSuffix(u.Path, ".git")
288 return setTemplates(desc, u, "github")
289 } else {
290 return fmt.Errorf("unknown git hosting site %q", u)
291 }
292}
293
294// The Options structs controls details of the indexing process.
295type Options struct {
296 // The repository to be indexed.
297 RepoDir string
298
299 // If set, follow submodule links. This requires RepoCacheDir to be set.
300 Submodules bool
301
302 // If set, skip indexing if the existing index shard is newer
303 // than the refs in the repository.
304 Incremental bool
305
306 // Don't error out if some branch is missing
307 AllowMissingBranch bool
308
309 // Specifies the root of a Repository cache. Needed for submodule indexing.
310 RepoCacheDir string
311
312 // Indexing options.
313 BuildOptions build.Options
314
315 // Prefix of the branch to index, e.g. `remotes/origin`.
316 BranchPrefix string
317
318 // List of branch names to index, e.g. []string{"HEAD", "stable"}
319 Branches []string
320
321 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
322 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
323 // then a normal build will be performed instead.
324 //
325 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
326 // a delta build will always be performed regardless of the number of preexisting shards.
327 DeltaShardNumberFallbackThreshold uint64
328}
329
330func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
331 var result []string
332 for _, b := range bs {
333 // Sourcegraph: We disable resolving refs. We want to return the exact ref
334 // requested so we can match it up.
335 if b == "HEAD" && false {
336 ref, err := repo.Head()
337 if err != nil {
338 return nil, err
339 }
340
341 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
342 continue
343 }
344
345 if strings.Contains(b, "*") {
346 iter, err := repo.Branches()
347 if err != nil {
348 return nil, err
349 }
350
351 defer iter.Close()
352 for {
353 ref, err := iter.Next()
354 if err == io.EOF {
355 break
356 }
357 if err != nil {
358 return nil, err
359 }
360
361 name := ref.Name().Short()
362 if matched, err := filepath.Match(b, name); err != nil {
363 return nil, err
364 } else if !matched {
365 continue
366 }
367
368 result = append(result, strings.TrimPrefix(name, prefix))
369 }
370 continue
371 }
372
373 result = append(result, b)
374 }
375
376 return result, nil
377}
378
379// IndexGitRepo indexes the git repository as specified by the options.
380// The returned bool indicates whether the index was updated as a result. This
381// can be informative if doing incremental indexing.
382func IndexGitRepo(opts Options) (bool, error) {
383 return indexGitRepo(opts, gitIndexConfig{})
384}
385
386// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
387// The returned bool indicates whether the index was updated as a result. This
388// can be informative if doing incremental indexing.
389func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
390 prepareDeltaBuild := prepareDeltaBuild
391 if config.prepareDeltaBuild != nil {
392 prepareDeltaBuild = config.prepareDeltaBuild
393 }
394
395 prepareNormalBuild := prepareNormalBuild
396 if config.prepareNormalBuild != nil {
397 prepareNormalBuild = config.prepareNormalBuild
398 }
399
400 // Set max thresholds, since we use them in this function.
401 opts.BuildOptions.SetDefaults()
402 if opts.RepoDir == "" {
403 return false, fmt.Errorf("gitindex: must set RepoDir")
404 }
405
406 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
407 repo, err := git.PlainOpen(opts.RepoDir)
408 if err != nil {
409 return false, fmt.Errorf("git.PlainOpen: %w", err)
410 }
411
412 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
413 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
414 }
415
416 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
417 if err != nil {
418 return false, fmt.Errorf("expandBranches: %w", err)
419 }
420 for _, b := range branches {
421 commit, err := getCommit(repo, opts.BranchPrefix, b)
422 if err != nil {
423 if opts.AllowMissingBranch && err.Error() == "reference not found" {
424 continue
425 }
426
427 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
428 }
429
430 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
431 Name: b,
432 Version: commit.Hash.String(),
433 })
434
435 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
436 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
437 }
438 }
439
440 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
441 return false, nil
442 }
443
444 // branch => (path, sha1) => repo.
445 var repos map[fileKey]BlobLocation
446
447 // Branch => Repo => SHA1
448 var branchVersions map[string]map[string]plumbing.Hash
449
450 // set of file paths that have been changed or deleted since
451 // the last indexed commit
452 //
453 // These only have an effect on delta builds
454 var changedOrRemovedFiles []string
455
456 if opts.BuildOptions.IsDelta {
457 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
458 if err != nil {
459 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
460 opts.BuildOptions.IsDelta = false
461 }
462 }
463
464 if !opts.BuildOptions.IsDelta {
465 repos, branchVersions, err = prepareNormalBuild(opts, repo)
466 if err != nil {
467 return false, fmt.Errorf("preparing normal build: %w", err)
468 }
469 }
470
471 reposByPath := map[string]BlobLocation{}
472 for key, info := range repos {
473 reposByPath[key.SubRepoPath] = info
474 }
475
476 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
477 for path, info := range reposByPath {
478 tpl := opts.BuildOptions.RepositoryDescription
479 if path != "" {
480 tpl = zoekt.Repository{URL: info.URL.String()}
481 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
482 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
483 }
484 }
485 opts.BuildOptions.SubRepositories[path] = &tpl
486 }
487
488 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
489 for path, repo := range opts.BuildOptions.SubRepositories {
490 id := branchVersions[br.Name][path]
491 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
492 Name: br.Name,
493 Version: id.String(),
494 })
495 }
496 }
497
498 builder, err := build.NewBuilder(opts.BuildOptions)
499 if err != nil {
500 return false, fmt.Errorf("build.NewBuilder: %w", err)
501 }
502
503 // Preparing the build can consume substantial memory, so check usage before starting to index.
504 builder.CheckMemoryUsage()
505
506 var ranks repoPathRanks
507 var meanRank float64
508 if opts.BuildOptions.DocumentRanksPath != "" {
509 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
510 if err != nil {
511 return false, err
512 }
513
514 err = json.Unmarshal(data, &ranks)
515 if err != nil {
516 return false, err
517 }
518
519 // Compute the mean rank for this repository. Note: we overwrite the rank
520 // mean that's stored in the document ranks file, since that currently
521 // represents a global mean rank across repos, which is not what we want.
522 numRanks := len(ranks.Paths)
523 if numRanks > 0 {
524 for _, rank := range ranks.Paths {
525 meanRank += rank
526 }
527 ranks.MeanRank = meanRank / float64(numRanks)
528 }
529 }
530
531 // we don't need to check error, since we either already have an error, or
532 // we returning the first call to builder.Finish.
533 defer builder.Finish() // nolint:errcheck
534
535 for _, f := range changedOrRemovedFiles {
536 builder.MarkFileAsChangedOrRemoved(f)
537 }
538
539 var names []string
540 fileKeys := map[string][]fileKey{}
541 totalFiles := 0
542
543 for key := range repos {
544 n := key.FullPath()
545 fileKeys[n] = append(fileKeys[n], key)
546 names = append(names, n)
547 totalFiles++
548 }
549
550 sort.Strings(names)
551 names = uniq(names)
552
553 log.Printf("attempting to index %d total files", totalFiles)
554 for idx, name := range names {
555 keys := fileKeys[name]
556
557 for _, key := range keys {
558 doc, err := createDocument(key, repos, ranks, opts.BuildOptions)
559 if err != nil {
560 return false, err
561 }
562
563 if err := builder.Add(doc); err != nil {
564 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
565 }
566
567 if idx%10_000 == 0 {
568 builder.CheckMemoryUsage()
569 }
570 }
571 }
572 return true, builder.Finish()
573}
574
575type repoPathRanks struct {
576 MeanRank float64 `json:"mean_reference_count"`
577 Paths map[string]float64 `json:"paths"`
578}
579
580// rank returns the rank for a given path. It uses these rules:
581// - If we have a concrete rank for this file, always use it
582// - If there's no rank, and it's a low priority file like a test, then use rank 0
583// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
584func (r repoPathRanks) rank(path string, content []byte) float64 {
585 if rank, ok := r.Paths[path]; ok {
586 return rank
587 } else if build.IsLowPriority(path, content) {
588 return 0.0
589 } else {
590 return r.MeanRank
591 }
592}
593
594func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
595 ignoreFile, err := tree.File(ignore.IgnoreFile)
596 if err == object.ErrFileNotFound {
597 return &ignore.Matcher{}, nil
598 }
599 if err != nil {
600 return nil, err
601 }
602 content, err := ignoreFile.Contents()
603 if err != nil {
604 return nil, err
605 }
606 return ignore.ParseIgnoreFile(strings.NewReader(content))
607}
608
609// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
610// a build.Builder instance for generating a delta build.
611type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
612
613// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
614// a build.Builder instance for generating a normal build.
615type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)
616
617type gitIndexConfig struct {
618 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
619 // prepare the build.Builder instance for generating a delta build.
620 //
621 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
622 prepareDeltaBuild prepareDeltaBuildFunc
623
624 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
625 // prepare the build.Builder instance for generating a normal build.
626 //
627 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
628 prepareNormalBuild prepareNormalBuildFunc
629}
630
631func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
632 if options.Submodules {
633 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
634 }
635
636 // discover what commits we indexed during our last build
637 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
638 if err != nil {
639 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
640 }
641
642 if !ok {
643 return nil, nil, nil, fmt.Errorf("no existing shards found for repository")
644 }
645
646 if options.DeltaShardNumberFallbackThreshold > 0 {
647 // HACK: For our interim compaction strategy, we force a full normal index once
648 // the number of shards on disk for this repository exceeds the provided threshold.
649 //
650 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
651 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
652 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
653 // while we create a better compaction strategy).
654
655 oldShards := options.BuildOptions.FindAllShards()
656 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
657 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
658 }
659 }
660
661 // Check to see if the set of branch names is consistent with what we last indexed.
662 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
663 // normal one).
664
665 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
666 var existingBranchNames []string
667 for _, b := range existingRepository.Branches {
668 existingBranchNames = append(existingBranchNames, b.Name)
669 }
670
671 var optionsBranchNames []string
672 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
673 optionsBranchNames = append(optionsBranchNames, b.Name)
674 }
675
676 existingBranchList := strings.Join(existingBranchNames, ", ")
677 optionsBranchList := strings.Join(optionsBranchNames, ", ")
678
679 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
680 }
681
682 // Check if the build options hash does not match the repository metadata's hash
683 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build
684 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
685 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
686 }
687
688 // branch => (path, sha1) => repo.
689 repos = map[fileKey]BlobLocation{}
690
691 // branch name -> git worktree at most current commit
692 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))
693
694 for _, b := range options.Branches {
695 commit, err := getCommit(repository, options.BranchPrefix, b)
696 if err != nil {
697 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
698 }
699
700 tree, err := commit.Tree()
701 if err != nil {
702 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
703 }
704
705 branchToCurrentTree[b] = tree
706 }
707
708 rawURL := options.BuildOptions.RepositoryDescription.URL
709 u, err := url.Parse(rawURL)
710 if err != nil {
711 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
712 }
713
714 // TODO: Support repository submodules for delta builds
715
716 // loop over all branches, calculate the diff between our
717 // last indexed commit and the current commit, and add files mentioned in the diff
718 for _, branch := range existingRepository.Branches {
719 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
720 if err != nil {
721 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
722 }
723
724 lastIndexedTree, err := lastIndexedCommit.Tree()
725 if err != nil {
726 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
727 }
728
729 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
730 if err != nil {
731 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
732 }
733
734 for i, c := range changes {
735 oldFile, newFile, err := c.Files()
736 if err != nil {
737 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
738 }
739
740 if newFile != nil {
741 // note: newFile.Name could be a path that isn't relative to the repository root - using the
742 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
743 newFileRelativeRootPath := c.To.Name
744
745 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
746 if newFileRelativeRootPath == ignore.IgnoreFile {
747 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
748 }
749
750 // either file is added or renamed, so we need to add the new version to the build
751 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
752 if existing, ok := repos[file]; ok {
753 existing.Branches = append(existing.Branches, branch.Name)
754 repos[file] = existing
755 } else {
756 repos[file] = BlobLocation{
757 GitRepo: repository,
758 URL: u,
759 Branches: []string{branch.Name},
760 }
761 }
762 }
763
764 if oldFile == nil {
765 // file added - nothing more to do
766 continue
767 }
768
769 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
770 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
771 oldFileRelativeRootPath := c.From.Name
772
773 if oldFileRelativeRootPath == ignore.IgnoreFile {
774 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
775 }
776
777 // The file is either modified or deleted. So, we need to add ALL versions
778 // of the old file (across all branches) to the build.
779 for b, currentTree := range branchToCurrentTree {
780 f, err := currentTree.File(oldFileRelativeRootPath)
781 if err != nil {
782 // the file doesn't exist in this branch
783 if errors.Is(err, object.ErrFileNotFound) {
784 continue
785 }
786
787 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
788 }
789
790 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
791 if existing, ok := repos[file]; ok {
792 existing.Branches = append(existing.Branches, b)
793 repos[file] = existing
794 } else {
795 repos[file] = BlobLocation{
796 GitRepo: repository,
797 URL: u,
798 Branches: []string{b},
799 }
800 }
801 }
802
803 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
804 }
805 }
806
807 // we need to de-duplicate the branch map before returning it - it's possible for the same
808 // branch to have been added multiple times if a file has been modified across multiple commits
809 for _, info := range repos {
810 sort.Strings(info.Branches)
811 info.Branches = uniq(info.Branches)
812 }
813
814 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
815 // for the same reasoning as above
816 sort.Strings(changedOrDeletedPaths)
817 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
818
819 return repos, nil, changedOrDeletedPaths, nil
820}
821
822func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
823 var repoCache *RepoCache
824 if options.Submodules {
825 repoCache = NewRepoCache(options.RepoCacheDir)
826 }
827
828 // Branch => Repo => SHA1
829 branchVersions = map[string]map[string]plumbing.Hash{}
830
831 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
832 if err != nil {
833 return nil, nil, fmt.Errorf("expandBranches: %w", err)
834 }
835
836 rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache)
837 for _, b := range branches {
838 commit, err := getCommit(repository, options.BranchPrefix, b)
839 if err != nil {
840 if options.AllowMissingBranch && err.Error() == "reference not found" {
841 continue
842 }
843
844 return nil, nil, fmt.Errorf("getCommit: %w", err)
845 }
846
847 tree, err := commit.Tree()
848 if err != nil {
849 return nil, nil, fmt.Errorf("commit.Tree: %w", err)
850 }
851
852 ig, err := newIgnoreMatcher(tree)
853 if err != nil {
854 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
855 }
856
857 subVersions, err := rw.CollectFiles(tree, b, ig)
858 if err != nil {
859 return nil, nil, fmt.Errorf("CollectFiles: %w", err)
860 }
861
862 branchVersions[b] = subVersions
863 }
864
865 return rw.Files, branchVersions, nil
866}
867
868func createDocument(key fileKey,
869 repos map[fileKey]BlobLocation,
870 ranks repoPathRanks,
871 opts build.Options,
872) (zoekt.Document, error) {
873 repo := repos[key]
874 blob, err := repo.GitRepo.BlobObject(key.ID)
875 branches := repos[key].Branches
876
877 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
878 if errors.Is(err, plumbing.ErrObjectNotFound) {
879 return skippedLargeDoc(key, branches, opts), nil
880 }
881
882 if err != nil {
883 return zoekt.Document{}, err
884 }
885
886 keyFullPath := key.FullPath()
887 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
888 return skippedLargeDoc(key, branches, opts), nil
889 }
890
891 contents, err := blobContents(blob)
892 if err != nil {
893 return zoekt.Document{}, err
894 }
895
896 var pathRanks []float64
897 if len(ranks.Paths) > 0 {
898 // If the repository has ranking data, then store the file's rank.
899 pathRank := ranks.rank(keyFullPath, contents)
900 pathRanks = []float64{pathRank}
901 }
902
903 return zoekt.Document{
904 SubRepositoryPath: key.SubRepoPath,
905 Name: keyFullPath,
906 Content: contents,
907 Branches: branches,
908 Ranks: pathRanks,
909 }, nil
910}
911
912func skippedLargeDoc(key fileKey, branches []string, opts build.Options) zoekt.Document {
913 return zoekt.Document{
914 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax),
915 Name: key.FullPath(),
916 Branches: branches,
917 SubRepositoryPath: key.SubRepoPath,
918 }
919}
920
921func blobContents(blob *object.Blob) ([]byte, error) {
922 r, err := blob.Reader()
923 if err != nil {
924 return nil, err
925 }
926 defer r.Close()
927
928 var buf bytes.Buffer
929 buf.Grow(int(blob.Size))
930 _, err = buf.ReadFrom(r)
931 if err != nil {
932 return nil, err
933 }
934 return buf.Bytes(), nil
935}
936
// uniq collapses runs of equal adjacent strings in ss to a single element and
// returns the shortened slice. The compaction is done in place: the result
// shares the backing array of ss, whose contents are overwritten. Sort ss
// first to remove all duplicates.
func uniq(ss []string) []string {
	if len(ss) == 0 {
		return ss[:0]
	}

	// ss[:kept] is the de-duplicated prefix built so far; the first element
	// is always part of it.
	kept := 1
	for _, s := range ss[1:] {
		if s != ss[kept-1] {
			ss[kept] = s
			kept++
		}
	}
	return ss[:kept]
}
947}