fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "context"
21 "encoding/json"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34 "time"
35
36 "github.com/sourcegraph/zoekt"
37 "github.com/sourcegraph/zoekt/build"
38 "github.com/sourcegraph/zoekt/ignore"
39
40 "github.com/go-git/go-git/v5/config"
41 "github.com/go-git/go-git/v5/plumbing"
42 "github.com/go-git/go-git/v5/plumbing/object"
43
44 git "github.com/go-git/go-git/v5"
45)
46
// RepoModTime returns the time of last fetch of a git repository.
//
// The time is approximated by the newest modification time among the
// non-directory entries below dir/refs and the files "info/refs" and
// "packed-refs" directly below dir. If none of these exist, the zero
// time.Time is returned. Entries that cannot be inspected are skipped.
func RepoModTime(dir string) (time.Time, error) {
	var last time.Time
	refDir := filepath.Join(dir, "refs")
	if _, err := os.Lstat(refDir); err == nil {
		if err := filepath.Walk(refDir,
			func(_ string, fi os.FileInfo, err error) error {
				// filepath.Walk reports entries it failed to stat with a
				// nil FileInfo (e.g. a ref removed while we are walking).
				// Skip those instead of dereferencing nil.
				if err != nil || fi == nil {
					return nil
				}
				if !fi.IsDir() && last.Before(fi.ModTime()) {
					last = fi.ModTime()
				}
				return nil
			}); err != nil {
			return last, err
		}
	}

	// git gc compresses refs into the following file:
	for _, fn := range []string{"info/refs", "packed-refs"} {
		if fi, err := os.Lstat(filepath.Join(dir, fn)); err == nil && !fi.IsDir() && last.Before(fi.ModTime()) {
			last = fi.ModTime()
		}
	}

	return last, nil
}
72
// FindGitRepos walks the tree below dir and collects the directories that
// hold git repositories: the ".git" directory of non-bare checkouts, and
// directories whose name ends in ".git" and that contain an "objects"
// directory (bare repositories). The returned paths are absolute. Errors
// reported by the walk for individual entries are ignored.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	// isDir reports whether p exists and is a directory (symlinks are not followed).
	isDir := func(p string) bool {
		st, err := os.Lstat(p)
		return err == nil && st.IsDir()
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best-effort, ignore filepath.Walk failing
		if walkErr != nil {
			return nil
		}

		// Non-bare repository: record its ".git" directory and do not descend.
		if dotGit := filepath.Join(p, ".git"); isDir(dotGit) {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository: a "*.git" directory holding an "objects" directory.
		if info.IsDir() && strings.HasSuffix(p, ".git") && isDir(filepath.Join(p, "objects")) {
			found = append(found, p)
			return filepath.SkipDir
		}

		return nil
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
111
112// setTemplates fills in URL templates for known git hosting
113// sites.
114func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
115 if u.Scheme == "ssh+git" {
116 u.Scheme = "https"
117 u.User = nil
118 }
119
120 repo.URL = u.String()
121 switch typ {
122 case "gitiles":
123 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
124 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
125 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}"
126 repo.LineFragmentTemplate = "#{{.LineNumber}}"
127 case "github":
128 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
129 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
130 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}"
131 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
132 case "cgit":
133 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
134 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}"
135 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}"
136 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
137 case "gitweb":
138 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
139 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
140 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
141 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
142 case "source.bazel.build":
143 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
144 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
145 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
146 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}"
147 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
148 case "bitbucket-server":
149 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
150 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
151 repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}"
152 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}"
153 repo.LineFragmentTemplate = "#{{.LineNumber}}"
154 case "gitlab":
155 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
156 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
157 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}"
158 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}"
159 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
160 case "gitea":
161 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
162 // NOTE The `display=source` query parameter is required to disable file rendering.
163 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
164 // a line without `display=source`. This is supported since gitea 1.17.0.
165 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
166 // but the query parameters are obmitted.
167 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source"
168 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
169 default:
170 return fmt.Errorf("URL scheme type %q unknown", typ)
171 }
172 return nil
173}
174
175// getCommit returns a tree object for the given reference.
176func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
177 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
178 // ref might be a branch name (e.g. "master") add branch prefix and try again.
179 if err != nil {
180 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
181 }
182 if err != nil {
183 return nil, err
184 }
185
186 commitObj, err := repo.CommitObject(*sha1)
187 if err != nil {
188 return nil, err
189 }
190 return commitObj, nil
191}
192
193func configLookupRemoteURL(cfg *config.Config, key string) string {
194 rc := cfg.Remotes[key]
195 if rc == nil || len(rc.URLs) == 0 {
196 return ""
197 }
198 return rc.URLs[0]
199}
200
// sshRelativeURLRegexp matches remote addresses of the form "user@host:path".
// Its submatches are, in order, the user, the host and the path.
var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
202
203func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
204 repo, err := git.PlainOpen(repoDir)
205 if err != nil {
206 return err
207 }
208
209 cfg, err := repo.Config()
210 if err != nil {
211 return err
212 }
213
214 sec := cfg.Raw.Section("zoekt")
215
216 webURLStr := sec.Options.Get("web-url")
217 webURLType := sec.Options.Get("web-url-type")
218
219 if webURLType != "" && webURLStr != "" {
220 webURL, err := url.Parse(webURLStr)
221 if err != nil {
222 return err
223 }
224 if err := setTemplates(desc, webURL, webURLType); err != nil {
225 return err
226 }
227 } else if webURLStr != "" {
228 desc.URL = webURLStr
229 }
230
231 name := sec.Options.Get("name")
232 if name != "" {
233 desc.Name = name
234 } else {
235 remoteURL := configLookupRemoteURL(cfg, "origin")
236 if remoteURL == "" {
237 return nil
238 }
239 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
240 user := sm[1]
241 host := sm[2]
242 path := sm[3]
243
244 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
245 }
246
247 u, err := url.Parse(remoteURL)
248 if err != nil {
249 return err
250 }
251 if err := SetTemplatesFromOrigin(desc, u); err != nil {
252 return err
253 }
254 }
255
256 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
257 desc.ID = uint32(id)
258
259 if desc.RawConfig == nil {
260 desc.RawConfig = map[string]string{}
261 }
262 for _, o := range sec.Options {
263 desc.RawConfig[o.Key] = o.Value
264 }
265
266 // Ranking info.
267
268 // Github:
269 traction := 0
270 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
271 f, err := strconv.Atoi(sec.Options.Get(s))
272 if err == nil {
273 traction += f
274 }
275 }
276
277 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
278 // Pretend everything on googlesource.com has 1000
279 // github stars.
280 traction = 1000
281 }
282
283 if traction > 0 {
284 l := math.Log(float64(traction))
285 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
286 }
287
288 return nil
289}
290
291// SetTemplatesFromOrigin fills in templates based on the origin URL.
292func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
293 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
294
295 if strings.HasSuffix(u.Host, ".googlesource.com") {
296 return setTemplates(desc, u, "gitiles")
297 } else if u.Host == "github.com" {
298 u.Path = strings.TrimSuffix(u.Path, ".git")
299 return setTemplates(desc, u, "github")
300 } else {
301 return fmt.Errorf("unknown git hosting site %q", u)
302 }
303}
304
// Options controls the details of the indexing process.
type Options struct {
	// The repository to be indexed. Must be set.
	RepoDir string

	// If set, follow submodule links. This requires RepoCacheDir to be set.
	// Delta builds do not support submodule indexing.
	Submodules bool

	// If set, skip indexing if the existing index shard is newer
	// than the refs in the repository.
	Incremental bool

	// Don't error out if some branch is missing; missing branches are skipped.
	AllowMissingBranch bool

	// Specifies the root of a Repository cache. Needed for submodule indexing.
	RepoCacheDir string

	// Indexing options.
	BuildOptions build.Options

	// Prefix of the branch to index, e.g. `remotes/origin`.
	BranchPrefix string

	// List of branch names to index, e.g. []string{"HEAD", "stable"}.
	// Names containing "*" are treated as patterns and matched against the
	// repository's branches.
	Branches []string

	// DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
	// that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
	// then a normal build will be performed instead.
	//
	// If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
	// a delta build will always be performed regardless of the number of preexisting shards.
	DeltaShardNumberFallbackThreshold uint64
}
340
341func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
342 var result []string
343 for _, b := range bs {
344 // Sourcegraph: We disable resolving refs. We want to return the exact ref
345 // requested so we can match it up.
346 if b == "HEAD" && false {
347 ref, err := repo.Head()
348 if err != nil {
349 return nil, err
350 }
351
352 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
353 continue
354 }
355
356 if strings.Contains(b, "*") {
357 iter, err := repo.Branches()
358 if err != nil {
359 return nil, err
360 }
361
362 defer iter.Close()
363 for {
364 ref, err := iter.Next()
365 if err == io.EOF {
366 break
367 }
368 if err != nil {
369 return nil, err
370 }
371
372 name := ref.Name().Short()
373 if matched, err := filepath.Match(b, name); err != nil {
374 return nil, err
375 } else if !matched {
376 continue
377 }
378
379 result = append(result, strings.TrimPrefix(name, prefix))
380 }
381 continue
382 }
383
384 result = append(result, b)
385 }
386
387 return result, nil
388}
389
390// IndexGitRepo indexes the git repository as specified by the options.
391// The returned bool indicates whether the index was updated as a result. This
392// can be informative if doing incremental indexing.
393func IndexGitRepo(opts Options) (bool, error) {
394 return indexGitRepo(opts, gitIndexConfig{})
395}
396
397// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
398// The returned bool indicates whether the index was updated as a result. This
399// can be informative if doing incremental indexing.
400func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
401 prepareDeltaBuild := prepareDeltaBuild
402 if config.prepareDeltaBuild != nil {
403 prepareDeltaBuild = config.prepareDeltaBuild
404 }
405
406 prepareNormalBuild := prepareNormalBuild
407 if config.prepareNormalBuild != nil {
408 prepareNormalBuild = config.prepareNormalBuild
409 }
410
411 // Set max thresholds, since we use them in this function.
412 opts.BuildOptions.SetDefaults()
413 if opts.RepoDir == "" {
414 return false, fmt.Errorf("gitindex: must set RepoDir")
415 }
416
417 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
418 repo, err := git.PlainOpen(opts.RepoDir)
419 if err != nil {
420 return false, fmt.Errorf("git.PlainOpen: %w", err)
421 }
422
423 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
424 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
425 }
426
427 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
428 if err != nil {
429 return false, fmt.Errorf("expandBranches: %w", err)
430 }
431 for _, b := range branches {
432 commit, err := getCommit(repo, opts.BranchPrefix, b)
433 if err != nil {
434 if opts.AllowMissingBranch && err.Error() == "reference not found" {
435 continue
436 }
437
438 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
439 }
440
441 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
442 Name: b,
443 Version: commit.Hash.String(),
444 })
445
446 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
447 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
448 }
449 }
450
451 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
452 return false, nil
453 }
454
455 // branch => (path, sha1) => repo.
456 var repos map[fileKey]BlobLocation
457
458 // fileKey => branches
459 var branchMap map[fileKey][]string
460
461 // Branch => Repo => SHA1
462 var branchVersions map[string]map[string]plumbing.Hash
463
464 // set of file paths that have been changed or deleted since
465 // the last indexed commit
466 //
467 // These only have an effect on delta builds
468 var changedOrRemovedFiles []string
469
470 if opts.BuildOptions.IsDelta {
471 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
472 if err != nil {
473 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
474 opts.BuildOptions.IsDelta = false
475 }
476 }
477
478 if !opts.BuildOptions.IsDelta {
479 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo)
480 if err != nil {
481 return false, fmt.Errorf("preparing normal build: %w", err)
482 }
483 }
484
485 reposByPath := map[string]BlobLocation{}
486 for key, location := range repos {
487 reposByPath[key.SubRepoPath] = location
488 }
489
490 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
491 for path, location := range reposByPath {
492 tpl := opts.BuildOptions.RepositoryDescription
493 if path != "" {
494 tpl = zoekt.Repository{URL: location.URL.String()}
495 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil {
496 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err)
497 }
498 }
499 opts.BuildOptions.SubRepositories[path] = &tpl
500 }
501
502 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
503 for path, repo := range opts.BuildOptions.SubRepositories {
504 id := branchVersions[br.Name][path]
505 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
506 Name: br.Name,
507 Version: id.String(),
508 })
509 }
510 }
511
512 builder, err := build.NewBuilder(opts.BuildOptions)
513 if err != nil {
514 return false, fmt.Errorf("build.NewBuilder: %w", err)
515 }
516
517 var ranks repoPathRanks
518 var meanRank float64
519 if opts.BuildOptions.DocumentRanksPath != "" {
520 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
521 if err != nil {
522 return false, err
523 }
524
525 err = json.Unmarshal(data, &ranks)
526 if err != nil {
527 return false, err
528 }
529
530 // Compute the mean rank for this repository. Note: we overwrite the rank
531 // mean that's stored in the document ranks file, since that currently
532 // represents a global mean rank across repos, which is not what we want.
533 numRanks := len(ranks.Paths)
534 if numRanks > 0 {
535 for _, rank := range ranks.Paths {
536 meanRank += rank
537 }
538 ranks.MeanRank = meanRank / float64(numRanks)
539 }
540 }
541
542 // we don't need to check error, since we either already have an error, or
543 // we returning the first call to builder.Finish.
544 defer builder.Finish() // nolint:errcheck
545
546 for _, f := range changedOrRemovedFiles {
547 builder.MarkFileAsChangedOrRemoved(f)
548 }
549
550 var names []string
551 fileKeys := map[string][]fileKey{}
552 totalFiles := 0
553
554 for key := range repos {
555 n := key.FullPath()
556 fileKeys[n] = append(fileKeys[n], key)
557 names = append(names, n)
558 totalFiles++
559 }
560
561 sort.Strings(names)
562 names = uniq(names)
563
564 log.Printf("attempting to index %d total files", totalFiles)
565 for _, name := range names {
566 keys := fileKeys[name]
567
568 for _, key := range keys {
569 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions)
570 if err != nil {
571 return false, err
572 }
573
574 if err := builder.Add(doc); err != nil {
575 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
576 }
577 }
578 }
579
580 return true, builder.Finish()
581}
582
// repoPathRanks holds per-file ranking data as decoded from a JSON document
// ranks file.
type repoPathRanks struct {
	// MeanRank is the rank used for files that have no entry in Paths.
	MeanRank float64 `json:"mean_reference_count"`
	// Paths maps a file path to its rank.
	Paths map[string]float64 `json:"paths"`
}
587
588// rank returns the rank for a given path. It uses these rules:
589// - If we have a concrete rank for this file, always use it
590// - If there's no rank, and it's a low priority file like a test, then use rank 0
591// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
592func (r repoPathRanks) rank(path string, content []byte) float64 {
593 if rank, ok := r.Paths[path]; ok {
594 return rank
595 } else if build.IsLowPriority(path, content) {
596 return 0.0
597 } else {
598 return r.MeanRank
599 }
600}
601
602func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
603 ignoreFile, err := tree.File(ignore.IgnoreFile)
604 if err == object.ErrFileNotFound {
605 return &ignore.Matcher{}, nil
606 }
607 if err != nil {
608 return nil, err
609 }
610 content, err := ignoreFile.Contents()
611 if err != nil {
612 return nil, err
613 }
614 return ignore.ParseIgnoreFile(strings.NewReader(content))
615}
616
// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a delta build.
//
// Besides the blob locations, the branches per file and the versions per branch, it
// returns the paths that were changed or deleted since the last indexed commit.
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
620
// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a normal build.
//
// It returns the blob location of every file to index, the branches each file
// belongs to, and the versions recorded per branch.
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error)
624
// gitIndexConfig holds optional overrides for the build preparation steps.
// The zero value selects the package's default implementations.
type gitIndexConfig struct {
	// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a delta build.
	//
	// If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
	prepareDeltaBuild prepareDeltaBuildFunc

	// prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a normal build.
	//
	// If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
	prepareNormalBuild prepareNormalBuildFunc
}
638
639func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
640 if options.Submodules {
641 return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
642 }
643
644 // discover what commits we indexed during our last build
645 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
646 if err != nil {
647 return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
648 }
649
650 if !ok {
651 return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository")
652 }
653
654 if options.DeltaShardNumberFallbackThreshold > 0 {
655 // HACK: For our interim compaction strategy, we force a full normal index once
656 // the number of shards on disk for this repository exceeds the provided threshold.
657 //
658 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
659 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
660 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
661 // while we create a better compaction strategy).
662
663 oldShards := options.BuildOptions.FindAllShards()
664 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
665 return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
666 }
667 }
668
669 // Check to see if the set of branch names is consistent with what we last indexed.
670 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
671 // normal one).
672
673 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
674 var existingBranchNames []string
675 for _, b := range existingRepository.Branches {
676 existingBranchNames = append(existingBranchNames, b.Name)
677 }
678
679 var optionsBranchNames []string
680 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
681 optionsBranchNames = append(optionsBranchNames, b.Name)
682 }
683
684 existingBranchList := strings.Join(existingBranchNames, ", ")
685 optionsBranchList := strings.Join(optionsBranchNames, ", ")
686
687 return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
688 }
689
690 // Check if the build options hash does not match the repository metadata's hash
691 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build
692 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
693 return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
694 }
695
696 // branch => (path, sha1) => repo.
697 repos = map[fileKey]BlobLocation{}
698
699 // fileKey => branches
700 branchMap = map[fileKey][]string{}
701
702 // branch name -> git worktree at most current commit
703 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))
704
705 for _, b := range options.Branches {
706 commit, err := getCommit(repository, options.BranchPrefix, b)
707 if err != nil {
708 return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
709 }
710
711 tree, err := commit.Tree()
712 if err != nil {
713 return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
714 }
715
716 branchToCurrentTree[b] = tree
717 }
718
719 rawURL := options.BuildOptions.RepositoryDescription.URL
720 u, err := url.Parse(rawURL)
721 if err != nil {
722 return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
723 }
724
725 // TODO: Support repository submodules for delta builds
726 // For this prototype, we are ignoring repository submodules, which means that we can use the same
727 // blob location for all files
728 hackSharedBlobLocation := BlobLocation{
729 Repo: repository,
730 URL: u,
731 }
732
733 // loop over all branches, calculate the diff between our
734 // last indexed commit and the current commit, and add files mentioned in the diff
735 for _, branch := range existingRepository.Branches {
736 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
737 if err != nil {
738 return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
739 }
740
741 lastIndexedTree, err := lastIndexedCommit.Tree()
742 if err != nil {
743 return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
744 }
745
746 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
747 if err != nil {
748 return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
749 }
750
751 for i, c := range changes {
752 oldFile, newFile, err := c.Files()
753 if err != nil {
754 return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
755 }
756
757 if newFile != nil {
758 // note: newFile.Name could be a path that isn't relative to the repository root - using the
759 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
760 newFileRelativeRootPath := c.To.Name
761
762 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
763 if newFileRelativeRootPath == ignore.IgnoreFile {
764 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
765 }
766
767 // either file is added or renamed, so we need to add the new version to the build
768 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
769 repos[file] = hackSharedBlobLocation
770 branchMap[file] = append(branchMap[file], branch.Name)
771 }
772
773 if oldFile == nil {
774 // file added - nothing more to do
775 continue
776 }
777
778 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
779 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
780 oldFileRelativeRootPath := c.From.Name
781
782 if oldFileRelativeRootPath == ignore.IgnoreFile {
783 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
784 }
785
786 // The file is either modified or deleted. So, we need to add ALL versions
787 // of the old file (across all branches) to the build.
788 for b, currentTree := range branchToCurrentTree {
789 f, err := currentTree.File(oldFileRelativeRootPath)
790 if err != nil {
791 // the file doesn't exist in this branch
792 if errors.Is(err, object.ErrFileNotFound) {
793 continue
794 }
795
796 return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
797 }
798
799 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
800 repos[file] = hackSharedBlobLocation
801 branchMap[file] = append(branchMap[file], b)
802 }
803
804 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
805 }
806 }
807
808 // we need to de-duplicate the branch map before returning it - it's possible for the same
809 // branch to have been added multiple times if a file has been modified across multiple commits
810
811 for file, branches := range branchMap {
812 sort.Strings(branches)
813 branchMap[file] = uniq(branches)
814 }
815
816 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
817 // for the same reasoning as above
818
819 sort.Strings(changedOrDeletedPaths)
820 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
821
822 return repos, branchMap, nil, changedOrDeletedPaths, nil
823}
824
825func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) {
826 var repoCache *RepoCache
827 if options.Submodules {
828 repoCache = NewRepoCache(options.RepoCacheDir)
829 }
830
831 // branch => (path, sha1) => repo.
832 repos = map[fileKey]BlobLocation{}
833
834 // fileKey => branches
835 branchMap = map[fileKey][]string{}
836
837 // Branch => Repo => SHA1
838 branchVersions = map[string]map[string]plumbing.Hash{}
839
840 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
841 if err != nil {
842 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
843 }
844
845 for _, b := range branches {
846 commit, err := getCommit(repository, options.BranchPrefix, b)
847 if err != nil {
848 if options.AllowMissingBranch && err.Error() == "reference not found" {
849 continue
850 }
851
852 return nil, nil, nil, fmt.Errorf("getCommit: %w", err)
853 }
854
855 tree, err := commit.Tree()
856 if err != nil {
857 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err)
858 }
859
860 ig, err := newIgnoreMatcher(tree)
861 if err != nil {
862 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
863 }
864
865 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache)
866 if err != nil {
867 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err)
868 }
869 for k, v := range files {
870 if ig.Match(k.Path) {
871 continue
872 }
873 repos[k] = v
874 branchMap[k] = append(branchMap[k], b)
875 }
876
877 branchVersions[b] = subVersions
878 }
879
880 return repos, branchMap, branchVersions, nil
881}
882
883func createDocument(key fileKey,
884 repos map[fileKey]BlobLocation,
885 branchMap map[fileKey][]string,
886 ranks repoPathRanks,
887 opts build.Options,
888) (zoekt.Document, error) {
889 blob, err := repos[key].Repo.BlobObject(key.ID)
890
891 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
892 if errors.Is(err, plumbing.ErrObjectNotFound) {
893 return skippedLargeDoc(key, branchMap, opts), nil
894 }
895
896 if err != nil {
897 return zoekt.Document{}, err
898 }
899
900 keyFullPath := key.FullPath()
901 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
902 return skippedLargeDoc(key, branchMap, opts), nil
903 }
904
905 contents, err := blobContents(blob)
906 if err != nil {
907 return zoekt.Document{}, err
908 }
909
910 var pathRanks []float64
911 if len(ranks.Paths) > 0 {
912 // If the repository has ranking data, then store the file's rank.
913 pathRank := ranks.rank(keyFullPath, contents)
914 pathRanks = []float64{pathRank}
915 }
916
917 return zoekt.Document{
918 SubRepositoryPath: key.SubRepoPath,
919 Name: keyFullPath,
920 Content: contents,
921 Branches: branchMap[key],
922 Ranks: pathRanks,
923 }, nil
924}
925
926func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document {
927 return zoekt.Document{
928 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax),
929 Name: key.FullPath(),
930 Branches: branchMap[key],
931 SubRepositoryPath: key.SubRepoPath,
932 }
933}
934
935func blobContents(blob *object.Blob) ([]byte, error) {
936 r, err := blob.Reader()
937 if err != nil {
938 return nil, err
939 }
940 defer r.Close()
941
942 var buf bytes.Buffer
943 buf.Grow(int(blob.Size))
944 _, err = buf.ReadFrom(r)
945 if err != nil {
946 return nil, err
947 }
948 return buf.Bytes(), nil
949}
950
// uniq collapses runs of equal adjacent strings in ss into a single element.
// It works in place: the result reuses the backing array of ss, so ss must
// not be relied upon afterwards. Sort ss first to remove all duplicates.
func uniq(ss []string) []string {
	kept := ss[:0]
	for _, s := range ss {
		if n := len(kept); n > 0 && kept[n-1] == s {
			continue
		}
		kept = append(kept, s)
	}
	return kept
}
961}