fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "context"
21 "encoding/json"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "sort"
31 "strconv"
32 "strings"
33 "time"
34
35 "github.com/sourcegraph/zoekt"
36 "github.com/sourcegraph/zoekt/build"
37 "github.com/sourcegraph/zoekt/ignore"
38
39 "github.com/go-git/go-git/v5/config"
40 "github.com/go-git/go-git/v5/plumbing"
41 "github.com/go-git/go-git/v5/plumbing/object"
42
43 git "github.com/go-git/go-git/v5"
44)
45
// RepoModTime returns the time of last fetch of a git repository.
//
// The result is the newest modification time among the non-directory
// entries below dir/refs and the files dir/info/refs and dir/packed-refs.
// If none of these exist, the zero time.Time is returned with a nil error.
func RepoModTime(dir string) (time.Time, error) {
	var last time.Time
	refDir := filepath.Join(dir, "refs")
	if _, err := os.Lstat(refDir); err == nil {
		err := filepath.Walk(refDir, func(_ string, fi os.FileInfo, err error) error {
			// Best-effort: skip entries that could not be inspected.
			// filepath.Walk passes a nil FileInfo when lstat of an entry
			// fails (e.g. a ref deleted mid-walk, or an unsearchable
			// directory), so fi must not be dereferenced in that case.
			if err != nil || fi == nil {
				return nil
			}
			if !fi.IsDir() && last.Before(fi.ModTime()) {
				last = fi.ModTime()
			}
			return nil
		})
		if err != nil {
			return last, err
		}
	}

	// git gc compresses refs into the following files:
	for _, fn := range []string{"info/refs", "packed-refs"} {
		fi, err := os.Lstat(filepath.Join(dir, fn))
		if err != nil || fi.IsDir() {
			continue
		}
		if last.Before(fi.ModTime()) {
			last = fi.ModTime()
		}
	}

	return last, nil
}
71
// FindGitRepos walks the tree rooted at dir and collects every git
// repository it encounters: the ".git" directory of a non-bare checkout,
// and any directory whose name ends in ".git" and that contains an
// "objects" directory (a bare repository). Returned paths are absolute.
// The walk does not descend into a repository once it has been recorded,
// and errors reported for individual entries are ignored.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	// isDir reports whether p exists and is a directory (symlinks are not followed).
	isDir := func(p string) bool {
		st, statErr := os.Lstat(p)
		return statErr == nil && st.IsDir()
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best-effort, ignore filepath.Walk failing
		if walkErr != nil {
			return nil
		}

		// Non-bare checkout: record its ".git" directory.
		if dotGit := filepath.Join(p, ".git"); isDir(dotGit) {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository: "<name>.git" directory holding an object store.
		if strings.HasSuffix(p, ".git") && info.IsDir() && isDir(filepath.Join(p, "objects")) {
			found = append(found, p)
			return filepath.SkipDir
		}

		return nil
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
110
111// setTemplates fills in URL templates for known git hosting
112// sites.
113func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
114 repo.URL = u.String()
115 switch typ {
116 case "gitiles":
117 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
118 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
119 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}"
120 repo.LineFragmentTemplate = "#{{.LineNumber}}"
121 case "github":
122 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
123 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
124 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}"
125 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
126 case "cgit":
127 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
128 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}"
129 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}"
130 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
131 case "gitweb":
132 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
133 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
134 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
135 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
136 case "source.bazel.build":
137 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
138 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
139 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
140 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}"
141 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
142 case "bitbucket-server":
143 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
144 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
145 repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}"
146 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}"
147 repo.LineFragmentTemplate = "#{{.LineNumber}}"
148 case "gitlab":
149 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
150 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
151 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}"
152 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}"
153 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
154 case "gitea":
155 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
156 // NOTE The `display=source` query parameter is required to disable file rendering.
157 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
158 // a line without `display=source`. This is supported since gitea 1.17.0.
159 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
160 // but the query parameters are obmitted.
161 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source"
162 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
163 default:
164 return fmt.Errorf("URL scheme type %q unknown", typ)
165 }
166 return nil
167}
168
169// getCommit returns a tree object for the given reference.
170func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
171 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
172 // ref might be a branch name (e.g. "master") add branch prefix and try again.
173 if err != nil {
174 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
175 }
176 if err != nil {
177 return nil, err
178 }
179
180 commitObj, err := repo.CommitObject(*sha1)
181 if err != nil {
182 return nil, err
183 }
184 return commitObj, nil
185}
186
187func configLookupRemoteURL(cfg *config.Config, key string) string {
188 rc := cfg.Remotes[key]
189 if rc == nil || len(rc.URLs) == 0 {
190 return ""
191 }
192 return rc.URLs[0]
193}
194
195func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
196 repo, err := git.PlainOpen(repoDir)
197 if err != nil {
198 return err
199 }
200
201 cfg, err := repo.Config()
202 if err != nil {
203 return err
204 }
205
206 sec := cfg.Raw.Section("zoekt")
207
208 webURLStr := sec.Options.Get("web-url")
209 webURLType := sec.Options.Get("web-url-type")
210
211 if webURLType != "" && webURLStr != "" {
212 webURL, err := url.Parse(webURLStr)
213 if err != nil {
214 return err
215 }
216 if err := setTemplates(desc, webURL, webURLType); err != nil {
217 return err
218 }
219 } else if webURLStr != "" {
220 desc.URL = webURLStr
221 }
222
223 name := sec.Options.Get("name")
224 if name != "" {
225 desc.Name = name
226 } else {
227 remoteURL := configLookupRemoteURL(cfg, "origin")
228 if remoteURL == "" {
229 return nil
230 }
231 u, err := url.Parse(remoteURL)
232 if err != nil {
233 return err
234 }
235 if err := SetTemplatesFromOrigin(desc, u); err != nil {
236 return err
237 }
238 }
239
240 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
241 desc.ID = uint32(id)
242
243 if desc.RawConfig == nil {
244 desc.RawConfig = map[string]string{}
245 }
246 for _, o := range sec.Options {
247 desc.RawConfig[o.Key] = o.Value
248 }
249
250 // Ranking info.
251
252 // Github:
253 traction := 0
254 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
255 f, err := strconv.Atoi(sec.Options.Get(s))
256 if err == nil {
257 traction += f
258 }
259 }
260
261 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
262 // Pretend everything on googlesource.com has 1000
263 // github stars.
264 traction = 1000
265 }
266
267 if traction > 0 {
268 l := math.Log(float64(traction))
269 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
270 }
271
272 return nil
273}
274
275// SetTemplatesFromOrigin fills in templates based on the origin URL.
276func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
277 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
278
279 if strings.HasSuffix(u.Host, ".googlesource.com") {
280 return setTemplates(desc, u, "gitiles")
281 } else if u.Host == "github.com" {
282 u.Path = strings.TrimSuffix(u.Path, ".git")
283 return setTemplates(desc, u, "github")
284 } else {
285 return fmt.Errorf("unknown git hosting site %q", u)
286 }
287}
288
// Options controls the details of the indexing process performed by
// IndexGitRepo.
type Options struct {
	// RepoDir is the path of the git repository to be indexed. It is
	// required; indexing fails when it is empty.
	RepoDir string

	// If set, follow submodule links. This requires RepoCacheDir to be set.
	// Delta builds do not support submodule indexing.
	Submodules bool

	// If set, skip indexing if the existing index shard is newer
	// than the refs in the repository.
	Incremental bool

	// Don't error out if some branch is missing; the missing branch is
	// skipped instead.
	AllowMissingBranch bool

	// Specifies the root of a Repository cache. Needed for submodule indexing.
	RepoCacheDir string

	// Indexing options.
	BuildOptions build.Options

	// Prefix of the branch to index, e.g. `remotes/origin`. It is prepended
	// to a branch name when the name cannot be resolved as given.
	BranchPrefix string

	// List of branch names to index, e.g. []string{"HEAD", "stable"}.
	// Names containing "*" are treated as patterns and expanded against the
	// repository's branches.
	Branches []string

	// DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
	// that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
	// then a normal build will be performed instead.
	//
	// If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
	// a delta build will always be performed regardless of the number of preexisting shards.
	DeltaShardNumberFallbackThreshold uint64
}
324
325func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
326 var result []string
327 for _, b := range bs {
328 // Sourcegraph: We disable resolving refs. We want to return the exact ref
329 // requested so we can match it up.
330 if b == "HEAD" && false {
331 ref, err := repo.Head()
332 if err != nil {
333 return nil, err
334 }
335
336 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
337 continue
338 }
339
340 if strings.Contains(b, "*") {
341 iter, err := repo.Branches()
342 if err != nil {
343 return nil, err
344 }
345
346 defer iter.Close()
347 for {
348 ref, err := iter.Next()
349 if err == io.EOF {
350 break
351 }
352 if err != nil {
353 return nil, err
354 }
355
356 name := ref.Name().Short()
357 if matched, err := filepath.Match(b, name); err != nil {
358 return nil, err
359 } else if !matched {
360 continue
361 }
362
363 result = append(result, strings.TrimPrefix(name, prefix))
364 }
365 continue
366 }
367
368 result = append(result, b)
369 }
370
371 return result, nil
372}
373
// IndexGitRepo indexes the git repository as specified by the options.
//
// It delegates to indexGitRepo with a zero-value gitIndexConfig, so the
// package's default prepareDeltaBuild and prepareNormalBuild functions are
// used.
func IndexGitRepo(opts Options) error {
	return indexGitRepo(opts, gitIndexConfig{})
}
378
379// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
380func indexGitRepo(opts Options, config gitIndexConfig) error {
381 prepareDeltaBuild := prepareDeltaBuild
382 if config.prepareDeltaBuild != nil {
383 prepareDeltaBuild = config.prepareDeltaBuild
384 }
385
386 prepareNormalBuild := prepareNormalBuild
387 if config.prepareNormalBuild != nil {
388 prepareNormalBuild = config.prepareNormalBuild
389 }
390
391 // Set max thresholds, since we use them in this function.
392 opts.BuildOptions.SetDefaults()
393 if opts.RepoDir == "" {
394 return fmt.Errorf("gitindex: must set RepoDir")
395 }
396
397 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
398 repo, err := git.PlainOpen(opts.RepoDir)
399 if err != nil {
400 return fmt.Errorf("git.PlainOpen: %w", err)
401 }
402
403 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
404 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
405 }
406
407 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
408 if err != nil {
409 return fmt.Errorf("expandBranches: %w", err)
410 }
411 for _, b := range branches {
412 commit, err := getCommit(repo, opts.BranchPrefix, b)
413 if err != nil {
414 if opts.AllowMissingBranch && err.Error() == "reference not found" {
415 continue
416 }
417
418 return fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
419 }
420
421 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
422 Name: b,
423 Version: commit.Hash.String(),
424 })
425
426 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
427 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
428 }
429 }
430
431 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
432 return nil
433 }
434
435 // branch => (path, sha1) => repo.
436 var repos map[fileKey]BlobLocation
437
438 // fileKey => branches
439 var branchMap map[fileKey][]string
440
441 // Branch => Repo => SHA1
442 var branchVersions map[string]map[string]plumbing.Hash
443
444 // set of file paths that have been changed or deleted since
445 // the last indexed commit
446 //
447 // These only have an effect on delta builds
448 var changedOrRemovedFiles []string
449
450 if opts.BuildOptions.IsDelta {
451 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
452 if err != nil {
453 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
454 opts.BuildOptions.IsDelta = false
455 }
456 }
457
458 if !opts.BuildOptions.IsDelta {
459 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo)
460 if err != nil {
461 return fmt.Errorf("preparing normal build: %w", err)
462 }
463 }
464
465 reposByPath := map[string]BlobLocation{}
466 for key, location := range repos {
467 reposByPath[key.SubRepoPath] = location
468 }
469
470 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
471 for path, location := range reposByPath {
472 tpl := opts.BuildOptions.RepositoryDescription
473 if path != "" {
474 tpl = zoekt.Repository{URL: location.URL.String()}
475 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil {
476 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err)
477 }
478 }
479 opts.BuildOptions.SubRepositories[path] = &tpl
480 }
481
482 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
483 for path, repo := range opts.BuildOptions.SubRepositories {
484 id := branchVersions[br.Name][path]
485 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
486 Name: br.Name,
487 Version: id.String(),
488 })
489 }
490 }
491
492 builder, err := build.NewBuilder(opts.BuildOptions)
493 if err != nil {
494 return fmt.Errorf("build.NewBuilder: %w", err)
495 }
496
497 var ranks repoPathRanks
498 var meanRank float64
499 if opts.BuildOptions.DocumentRanksPath != "" {
500 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
501 if err != nil {
502 return err
503 }
504
505 err = json.Unmarshal(data, &ranks)
506 if err != nil {
507 return err
508 }
509
510 // Compute the mean rank for this repository. Note: we overwrite the rank
511 // mean that's stored in the document ranks file, since that currently
512 // represents a global mean rank across repos, which is not what we want.
513 numRanks := len(ranks.Paths)
514 if numRanks > 0 {
515 for _, rank := range ranks.Paths {
516 meanRank += rank
517 }
518 ranks.MeanRank = meanRank / float64(numRanks)
519 }
520 }
521
522 // we don't need to check error, since we either already have an error, or
523 // we returning the first call to builder.Finish.
524 defer builder.Finish() // nolint:errcheck
525
526 for _, f := range changedOrRemovedFiles {
527 builder.MarkFileAsChangedOrRemoved(f)
528 }
529
530 var names []string
531 fileKeys := map[string][]fileKey{}
532 totalFiles := 0
533
534 for key := range repos {
535 n := key.FullPath()
536 fileKeys[n] = append(fileKeys[n], key)
537 names = append(names, n)
538 totalFiles++
539 }
540
541 sort.Strings(names)
542 names = uniq(names)
543
544 log.Printf("attempting to index %d total files", totalFiles)
545 for _, name := range names {
546 keys := fileKeys[name]
547
548 for _, key := range keys {
549 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions)
550 if err != nil {
551 return err
552 }
553
554 if err := builder.Add(doc); err != nil {
555 return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
556 }
557 }
558 }
559
560 return builder.Finish()
561}
562
// repoPathRanks holds the document ranking data for one repository, as
// decoded from the JSON document ranks file.
type repoPathRanks struct {
	// MeanRank is the fallback rank for paths without an entry in Paths.
	// It is decoded from the "mean_reference_count" JSON field.
	MeanRank float64 `json:"mean_reference_count"`
	// Paths maps a file path to its rank ("paths" JSON field).
	Paths map[string]float64 `json:"paths"`
}
567
568// rank returns the rank for a given path. It uses these rules:
569// - If we have a concrete rank for this file, always use it
570// - If there's no rank, and it's a low priority file like a test, then use rank 0
571// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
572func (r repoPathRanks) rank(path string) float64 {
573 if rank, ok := r.Paths[path]; ok {
574 return rank
575 } else if build.IsLowPriority(path) {
576 return 0.0
577 } else {
578 return r.MeanRank
579 }
580}
581
582func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
583 ignoreFile, err := tree.File(ignore.IgnoreFile)
584 if err == object.ErrFileNotFound {
585 return &ignore.Matcher{}, nil
586 }
587 if err != nil {
588 return nil, err
589 }
590 content, err := ignoreFile.Contents()
591 if err != nil {
592 return nil, err
593 }
594 return ignore.ParseIgnoreFile(strings.NewReader(content))
595}
596
// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a delta build.
//
// In addition to the values returned by a prepareNormalBuildFunc it returns
// changedOrDeletedPaths, the paths that changed or were deleted since the
// last indexed commit.
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
600
// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a normal build.
//
// repos maps each file to the location of its blob, branchMap maps each
// file to the branches containing it, and branchVersions maps
// branch => repo => SHA1.
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error)
604
// gitIndexConfig lets callers of indexGitRepo replace the functions that
// prepare build metadata. The zero value selects the package defaults.
type gitIndexConfig struct {
	// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a delta build.
	//
	// If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
	prepareDeltaBuild prepareDeltaBuildFunc

	// prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a normal build.
	//
	// If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
	prepareNormalBuild prepareNormalBuildFunc
}
618
// prepareDeltaBuild calculates the metadata needed for a delta build: the
// files that were added or changed since the last indexed commit of each
// branch (repos, branchMap) and the paths that were changed or deleted
// (changedOrDeletedPaths). The returned branchVersions is always nil.
//
// An error is returned whenever a delta build is not possible, in which
// case the caller is expected to fall back to a normal build:
//   - submodule indexing is requested;
//   - no existing shards (or their metadata) can be found;
//   - the number of existing shards exceeds
//     options.DeltaShardNumberFallbackThreshold (when it is non-zero);
//   - the requested branch names differ from the ones last indexed;
//   - the index options hash differs from the one last indexed;
//   - the ignore file is among the changed files.
func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
	if options.Submodules {
		return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
	}

	// discover what commits we indexed during our last build
	existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
	if err != nil {
		return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
	}

	if !ok {
		return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository")
	}

	if options.DeltaShardNumberFallbackThreshold > 0 {
		// HACK: For our interim compaction strategy, we force a full normal index once
		// the number of shards on disk for this repository exceeds the provided threshold.
		//
		// This strategy obviously isn't optimal (as an example: we currently can't differentiate
		// between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
		// build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
		// while we create a better compaction strategy).

		oldShards := options.BuildOptions.FindAllShards()
		if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
			return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
		}
	}

	// Check to see if the set of branch names is consistent with what we last indexed.
	// If it isn't consistent, then we can't proceed with a delta build (and the caller should fall back to a
	// normal one).

	if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
		// Collect both name lists purely for the error message below.
		var existingBranchNames []string
		for _, b := range existingRepository.Branches {
			existingBranchNames = append(existingBranchNames, b.Name)
		}

		var optionsBranchNames []string
		for _, b := range options.BuildOptions.RepositoryDescription.Branches {
			optionsBranchNames = append(optionsBranchNames, b.Name)
		}

		existingBranchList := strings.Join(existingBranchNames, ", ")
		optionsBranchList := strings.Join(optionsBranchNames, ", ")

		return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
	}

	// Check if the build options hash does not match the repository metadata's hash
	// If it does not match then one or more index options has changed and will require a normal build instead of a delta build
	if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
		return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
	}

	// branch => (path, sha1) => repo.
	repos = map[fileKey]BlobLocation{}

	// fileKey => branches
	branchMap = map[fileKey][]string{}

	// branch name -> git tree at the branch's current commit
	branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))

	// NOTE(review): this iterates options.Branches as given, not the
	// expanded list produced by expandBranches - confirm that wildcard
	// branch patterns are not expected to reach this point.
	for _, b := range options.Branches {
		commit, err := getCommit(repository, options.BranchPrefix, b)
		if err != nil {
			return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
		}

		tree, err := commit.Tree()
		if err != nil {
			return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
		}

		branchToCurrentTree[b] = tree
	}

	rawURL := options.BuildOptions.RepositoryDescription.URL
	u, err := url.Parse(rawURL)
	if err != nil {
		return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
	}

	// TODO: Support repository submodules for delta builds
	// For this prototype, we are ignoring repository submodules, which means that we can use the same
	// blob location for all files
	hackSharedBlobLocation := BlobLocation{
		Repo: repository,
		URL:  u,
	}

	// loop over all branches, calculate the diff between our
	// last indexed commit and the current commit, and add files mentioned in the diff
	for _, branch := range existingRepository.Branches {
		// branch.Version is the commit hash recorded by the previous build,
		// so it is resolved without a branch prefix.
		lastIndexedCommit, err := getCommit(repository, "", branch.Version)
		if err != nil {
			return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
		}

		lastIndexedTree, err := lastIndexedCommit.Tree()
		if err != nil {
			return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
		}

		// Rename detection is disabled, so a rename shows up as a deletion
		// of the old path plus an addition of the new path.
		changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
		if err != nil {
			return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
		}

		for i, c := range changes {
			oldFile, newFile, err := c.Files()
			if err != nil {
				return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
			}

			if newFile != nil {
				// note: newFile.Name could be a path that isn't relative to the repository root - using the
				// change's Name field is the only way that @ggilmore saw to get the full path relative to the root
				newFileRelativeRootPath := c.To.Name

				// TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
				if newFileRelativeRootPath == ignore.IgnoreFile {
					return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
				}

				// either file is added or renamed, so we need to add the new version to the build
				file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
				repos[file] = hackSharedBlobLocation
				branchMap[file] = append(branchMap[file], branch.Name)
			}

			if oldFile == nil {
				// file added - nothing more to do
				continue
			}

			// Note: oldFile.Name could be a path that isn't relative to the repository root - using the
			// change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
			oldFileRelativeRootPath := c.From.Name

			if oldFileRelativeRootPath == ignore.IgnoreFile {
				return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
			}

			// The file is either modified or deleted. So, we need to add ALL versions
			// of the old file (across all branches) to the build.
			for b, currentTree := range branchToCurrentTree {
				f, err := currentTree.File(oldFileRelativeRootPath)
				if err != nil {
					// the file doesn't exist in this branch
					if errors.Is(err, object.ErrFileNotFound) {
						continue
					}

					return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
				}

				file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
				repos[file] = hackSharedBlobLocation
				branchMap[file] = append(branchMap[file], b)
			}

			changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
		}
	}

	// we need to de-duplicate the branch map before returning it - it's possible for the same
	// branch to have been added multiple times if a file has been modified across multiple commits

	for file, branches := range branchMap {
		// uniq only drops adjacent duplicates, hence the sort first.
		sort.Strings(branches)
		branchMap[file] = uniq(branches)
	}

	// we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
	// for the same reasoning as above

	sort.Strings(changedOrDeletedPaths)
	changedOrDeletedPaths = uniq(changedOrDeletedPaths)

	return repos, branchMap, nil, changedOrDeletedPaths, nil
}
804
805func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) {
806 var repoCache *RepoCache
807 if options.Submodules {
808 repoCache = NewRepoCache(options.RepoCacheDir)
809 }
810
811 // branch => (path, sha1) => repo.
812 repos = map[fileKey]BlobLocation{}
813
814 // fileKey => branches
815 branchMap = map[fileKey][]string{}
816
817 // Branch => Repo => SHA1
818 branchVersions = map[string]map[string]plumbing.Hash{}
819
820 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
821 if err != nil {
822 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
823 }
824
825 for _, b := range branches {
826 commit, err := getCommit(repository, options.BranchPrefix, b)
827 if err != nil {
828 if options.AllowMissingBranch && err.Error() == "reference not found" {
829 continue
830 }
831
832 return nil, nil, nil, fmt.Errorf("getCommit: %w", err)
833 }
834
835 tree, err := commit.Tree()
836 if err != nil {
837 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err)
838 }
839
840 ig, err := newIgnoreMatcher(tree)
841 if err != nil {
842 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
843 }
844
845 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache)
846 if err != nil {
847 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err)
848 }
849 for k, v := range files {
850 if ig.Match(k.Path) {
851 continue
852 }
853 repos[k] = v
854 branchMap[k] = append(branchMap[k], b)
855 }
856
857 branchVersions[b] = subVersions
858 }
859
860 return repos, branchMap, branchVersions, nil
861}
862
863func createDocument(key fileKey,
864 repos map[fileKey]BlobLocation,
865 branchMap map[fileKey][]string,
866 ranks repoPathRanks,
867 opts build.Options,
868) (zoekt.Document, error) {
869 blob, err := repos[key].Repo.BlobObject(key.ID)
870 if err != nil {
871 return zoekt.Document{}, err
872 }
873
874 keyFullPath := key.FullPath()
875 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
876 return zoekt.Document{
877 SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.SizeMax),
878 Name: key.FullPath(),
879 Branches: branchMap[key],
880 SubRepositoryPath: key.SubRepoPath,
881 }, nil
882 }
883
884 contents, err := blobContents(blob)
885 if err != nil {
886 return zoekt.Document{}, err
887 }
888
889 var pathRanks []float64
890 if len(ranks.Paths) > 0 {
891 // If the repository has ranking data, then store the file's rank.
892 pathRank := ranks.rank(keyFullPath)
893 pathRanks = []float64{pathRank}
894 }
895
896 return zoekt.Document{
897 SubRepositoryPath: key.SubRepoPath,
898 Name: keyFullPath,
899 Content: contents,
900 Branches: branchMap[key],
901 Ranks: pathRanks,
902 }, nil
903}
904
905func blobContents(blob *object.Blob) ([]byte, error) {
906 r, err := blob.Reader()
907 if err != nil {
908 return nil, err
909 }
910 defer r.Close()
911
912 var buf bytes.Buffer
913 buf.Grow(int(blob.Size))
914 _, err = buf.ReadFrom(r)
915 if err != nil {
916 return nil, err
917 }
918 return buf.Bytes(), nil
919}
920
// uniq collapses runs of equal adjacent strings in ss down to a single
// element and returns the shortened slice. The compaction happens in
// place, so the result shares (and overwrites) the backing array of ss.
// Sort the input first to remove all duplicates.
func uniq(ss []string) []string {
	kept := 0
	for i, s := range ss {
		// ss[kept-1] is the most recently kept element, which always equals
		// the element that preceded s in the input.
		if i > 0 && s == ss[kept-1] {
			continue
		}
		ss[kept] = s
		kept++
	}
	return ss[:kept]
}