fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package gitindex provides functions for indexing Git repositories.
16package gitindex
17
18import (
19 "bytes"
20 "context"
21 "encoding/json"
22 "errors"
23 "fmt"
24 "io"
25 "log"
26 "math"
27 "net/url"
28 "os"
29 "path/filepath"
30 "regexp"
31 "sort"
32 "strconv"
33 "strings"
34
35 "github.com/sourcegraph/zoekt"
36 "github.com/sourcegraph/zoekt/build"
37 "github.com/sourcegraph/zoekt/ignore"
38
39 "github.com/go-git/go-git/v5/config"
40 "github.com/go-git/go-git/v5/plumbing"
41 "github.com/go-git/go-git/v5/plumbing/object"
42
43 git "github.com/go-git/go-git/v5"
44)
45
// FindGitRepos walks the tree rooted at dir and collects every git
// repository it encounters: the ".git" directory of a non-bare checkout,
// or a bare repository (a directory whose name ends in ".git" and that
// contains an "objects" directory). The returned paths are absolute and
// include dir. Directories recognised as repositories are not descended
// into.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	var found []string
	visit := func(p string, info os.FileInfo, walkErr error) error {
		// Best-effort: entries filepath.Walk could not process are skipped.
		if walkErr != nil {
			return nil
		}

		// Non-bare checkout: p contains a ".git" directory.
		dotGit := filepath.Join(p, ".git")
		if st, statErr := os.Lstat(dotGit); statErr == nil && st.IsDir() {
			found = append(found, dotGit)
			return filepath.SkipDir
		}

		// Bare repository: a "*.git" directory that holds "objects".
		if info.IsDir() && strings.HasSuffix(p, ".git") {
			st, statErr := os.Lstat(filepath.Join(p, "objects"))
			if statErr == nil && st.IsDir() {
				found = append(found, p)
				return filepath.SkipDir
			}
		}
		return nil
	}

	if err := filepath.Walk(root, visit); err != nil {
		return nil, err
	}
	return found, nil
}
84
85// setTemplates fills in URL templates for known git hosting
86// sites.
87func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error {
88 if u.Scheme == "ssh+git" {
89 u.Scheme = "https"
90 u.User = nil
91 }
92
93 repo.URL = u.String()
94 switch typ {
95 case "gitiles":
96 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20
97 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
98 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}"
99 repo.LineFragmentTemplate = "#{{.LineNumber}}"
100 case "github":
101 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10
102 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
103 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}"
104 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
105 case "cgit":
106 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100
107 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}"
108 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}"
109 repo.LineFragmentTemplate = "#n{{.LineNumber}}"
110 case "gitweb":
111 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10
112 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}"
113 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}"
114 repo.LineFragmentTemplate = "#l{{.LineNumber}}"
115 case "source.bazel.build":
116 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9
117 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10
118 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}"
119 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}"
120 repo.LineFragmentTemplate = ";l={{.LineNumber}}"
121 case "bitbucket-server":
122 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc
123 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc
124 repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}"
125 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}"
126 repo.LineFragmentTemplate = "#{{.LineNumber}}"
127 case "gitlab":
128 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed
129 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template
130 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}"
131 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}"
132 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
133 case "gitea":
134 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}"
135 // NOTE The `display=source` query parameter is required to disable file rendering.
136 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to
137 // a line without `display=source`. This is supported since gitea 1.17.0.
138 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}},
139 // but the query parameters are obmitted.
140 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source"
141 repo.LineFragmentTemplate = "#L{{.LineNumber}}"
142 default:
143 return fmt.Errorf("URL scheme type %q unknown", typ)
144 }
145 return nil
146}
147
148// getCommit returns a tree object for the given reference.
149func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) {
150 sha1, err := repo.ResolveRevision(plumbing.Revision(ref))
151 // ref might be a branch name (e.g. "master") add branch prefix and try again.
152 if err != nil {
153 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref)))
154 }
155 if err != nil {
156 return nil, err
157 }
158
159 commitObj, err := repo.CommitObject(*sha1)
160 if err != nil {
161 return nil, err
162 }
163 return commitObj, nil
164}
165
166func configLookupRemoteURL(cfg *config.Config, key string) string {
167 rc := cfg.Remotes[key]
168 if rc == nil || len(rc.URLs) == 0 {
169 return ""
170 }
171 return rc.URLs[0]
172}
173
// sshRelativeURLRegexp matches remote addresses of the form
// "user@host:path" and captures user, host and path, in that order.
var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`)
175
176func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error {
177 repo, err := git.PlainOpen(repoDir)
178 if err != nil {
179 return err
180 }
181
182 cfg, err := repo.Config()
183 if err != nil {
184 return err
185 }
186
187 sec := cfg.Raw.Section("zoekt")
188
189 webURLStr := sec.Options.Get("web-url")
190 webURLType := sec.Options.Get("web-url-type")
191
192 if webURLType != "" && webURLStr != "" {
193 webURL, err := url.Parse(webURLStr)
194 if err != nil {
195 return err
196 }
197 if err := setTemplates(desc, webURL, webURLType); err != nil {
198 return err
199 }
200 } else if webURLStr != "" {
201 desc.URL = webURLStr
202 }
203
204 name := sec.Options.Get("name")
205 if name != "" {
206 desc.Name = name
207 } else {
208 remoteURL := configLookupRemoteURL(cfg, "origin")
209 if remoteURL == "" {
210 return nil
211 }
212 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil {
213 user := sm[1]
214 host := sm[2]
215 path := sm[3]
216
217 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path)
218 }
219
220 u, err := url.Parse(remoteURL)
221 if err != nil {
222 return err
223 }
224 if err := SetTemplatesFromOrigin(desc, u); err != nil {
225 return err
226 }
227 }
228
229 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32)
230 desc.ID = uint32(id)
231
232 if desc.RawConfig == nil {
233 desc.RawConfig = map[string]string{}
234 }
235 for _, o := range sec.Options {
236 desc.RawConfig[o.Key] = o.Value
237 }
238
239 // Ranking info.
240
241 // Github:
242 traction := 0
243 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} {
244 f, err := strconv.Atoi(sec.Options.Get(s))
245 if err == nil {
246 traction += f
247 }
248 }
249
250 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 {
251 // Pretend everything on googlesource.com has 1000
252 // github stars.
253 traction = 1000
254 }
255
256 if traction > 0 {
257 l := math.Log(float64(traction))
258 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
259 }
260
261 return nil
262}
263
264// SetTemplatesFromOrigin fills in templates based on the origin URL.
265func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error {
266 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git"))
267
268 if strings.HasSuffix(u.Host, ".googlesource.com") {
269 return setTemplates(desc, u, "gitiles")
270 } else if u.Host == "github.com" {
271 u.Path = strings.TrimSuffix(u.Path, ".git")
272 return setTemplates(desc, u, "github")
273 } else {
274 return fmt.Errorf("unknown git hosting site %q", u)
275 }
276}
277
// Options controls the details of the indexing process performed by
// IndexGitRepo.
type Options struct {
	// RepoDir is the path of the git repository to be indexed. It must be
	// set; indexing fails otherwise.
	RepoDir string

	// Submodules, if set, makes indexing follow submodule links. This
	// requires RepoCacheDir to be set. Delta builds do not support it.
	Submodules bool

	// Incremental, if set, skips indexing if the existing index shard is
	// newer than the refs in the repository.
	Incremental bool

	// AllowMissingBranch, if set, skips branches that cannot be found
	// instead of returning an error.
	AllowMissingBranch bool

	// RepoCacheDir specifies the root of a Repository cache. Needed for
	// submodule indexing.
	RepoCacheDir string

	// BuildOptions holds the indexing options handed to the builder.
	BuildOptions build.Options

	// BranchPrefix is the prefix of the branches to index, e.g.
	// `remotes/origin`. It is prepended when a branch name cannot be
	// resolved as given.
	BranchPrefix string

	// Branches is the list of branch names to index, e.g.
	// []string{"HEAD", "stable"}. Names containing "*" are expanded against
	// the repository's branches.
	Branches []string

	// DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards
	// that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold,
	// then a normal build will be performed instead.
	//
	// If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled:
	// a delta build will always be performed regardless of the number of preexisting shards.
	DeltaShardNumberFallbackThreshold uint64
}
313
314func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) {
315 var result []string
316 for _, b := range bs {
317 // Sourcegraph: We disable resolving refs. We want to return the exact ref
318 // requested so we can match it up.
319 if b == "HEAD" && false {
320 ref, err := repo.Head()
321 if err != nil {
322 return nil, err
323 }
324
325 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix))
326 continue
327 }
328
329 if strings.Contains(b, "*") {
330 iter, err := repo.Branches()
331 if err != nil {
332 return nil, err
333 }
334
335 defer iter.Close()
336 for {
337 ref, err := iter.Next()
338 if err == io.EOF {
339 break
340 }
341 if err != nil {
342 return nil, err
343 }
344
345 name := ref.Name().Short()
346 if matched, err := filepath.Match(b, name); err != nil {
347 return nil, err
348 } else if !matched {
349 continue
350 }
351
352 result = append(result, strings.TrimPrefix(name, prefix))
353 }
354 continue
355 }
356
357 result = append(result, b)
358 }
359
360 return result, nil
361}
362
363// IndexGitRepo indexes the git repository as specified by the options.
364// The returned bool indicates whether the index was updated as a result. This
365// can be informative if doing incremental indexing.
366func IndexGitRepo(opts Options) (bool, error) {
367 return indexGitRepo(opts, gitIndexConfig{})
368}
369
370// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig.
371// The returned bool indicates whether the index was updated as a result. This
372// can be informative if doing incremental indexing.
373func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
374 prepareDeltaBuild := prepareDeltaBuild
375 if config.prepareDeltaBuild != nil {
376 prepareDeltaBuild = config.prepareDeltaBuild
377 }
378
379 prepareNormalBuild := prepareNormalBuild
380 if config.prepareNormalBuild != nil {
381 prepareNormalBuild = config.prepareNormalBuild
382 }
383
384 // Set max thresholds, since we use them in this function.
385 opts.BuildOptions.SetDefaults()
386 if opts.RepoDir == "" {
387 return false, fmt.Errorf("gitindex: must set RepoDir")
388 }
389
390 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
391 repo, err := git.PlainOpen(opts.RepoDir)
392 if err != nil {
393 return false, fmt.Errorf("git.PlainOpen: %w", err)
394 }
395
396 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
397 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err)
398 }
399
400 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix)
401 if err != nil {
402 return false, fmt.Errorf("expandBranches: %w", err)
403 }
404 for _, b := range branches {
405 commit, err := getCommit(repo, opts.BranchPrefix, b)
406 if err != nil {
407 if opts.AllowMissingBranch && err.Error() == "reference not found" {
408 continue
409 }
410
411 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err)
412 }
413
414 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{
415 Name: b,
416 Version: commit.Hash.String(),
417 })
418
419 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) {
420 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when
421 }
422 }
423
424 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() {
425 return false, nil
426 }
427
428 // branch => (path, sha1) => repo.
429 var repos map[fileKey]BlobLocation
430
431 // fileKey => branches
432 var branchMap map[fileKey][]string
433
434 // Branch => Repo => SHA1
435 var branchVersions map[string]map[string]plumbing.Hash
436
437 // set of file paths that have been changed or deleted since
438 // the last indexed commit
439 //
440 // These only have an effect on delta builds
441 var changedOrRemovedFiles []string
442
443 if opts.BuildOptions.IsDelta {
444 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo)
445 if err != nil {
446 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err)
447 opts.BuildOptions.IsDelta = false
448 }
449 }
450
451 if !opts.BuildOptions.IsDelta {
452 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo)
453 if err != nil {
454 return false, fmt.Errorf("preparing normal build: %w", err)
455 }
456 }
457
458 reposByPath := map[string]BlobLocation{}
459 for key, location := range repos {
460 reposByPath[key.SubRepoPath] = location
461 }
462
463 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{}
464 for path, location := range reposByPath {
465 tpl := opts.BuildOptions.RepositoryDescription
466 if path != "" {
467 tpl = zoekt.Repository{URL: location.URL.String()}
468 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil {
469 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err)
470 }
471 }
472 opts.BuildOptions.SubRepositories[path] = &tpl
473 }
474
475 for _, br := range opts.BuildOptions.RepositoryDescription.Branches {
476 for path, repo := range opts.BuildOptions.SubRepositories {
477 id := branchVersions[br.Name][path]
478 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
479 Name: br.Name,
480 Version: id.String(),
481 })
482 }
483 }
484
485 builder, err := build.NewBuilder(opts.BuildOptions)
486 if err != nil {
487 return false, fmt.Errorf("build.NewBuilder: %w", err)
488 }
489
490 // Preparing the build can consume substantial memory, so check usage before starting to index.
491 builder.CheckMemoryUsage()
492
493 var ranks repoPathRanks
494 var meanRank float64
495 if opts.BuildOptions.DocumentRanksPath != "" {
496 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
497 if err != nil {
498 return false, err
499 }
500
501 err = json.Unmarshal(data, &ranks)
502 if err != nil {
503 return false, err
504 }
505
506 // Compute the mean rank for this repository. Note: we overwrite the rank
507 // mean that's stored in the document ranks file, since that currently
508 // represents a global mean rank across repos, which is not what we want.
509 numRanks := len(ranks.Paths)
510 if numRanks > 0 {
511 for _, rank := range ranks.Paths {
512 meanRank += rank
513 }
514 ranks.MeanRank = meanRank / float64(numRanks)
515 }
516 }
517
518 // we don't need to check error, since we either already have an error, or
519 // we returning the first call to builder.Finish.
520 defer builder.Finish() // nolint:errcheck
521
522 for _, f := range changedOrRemovedFiles {
523 builder.MarkFileAsChangedOrRemoved(f)
524 }
525
526 var names []string
527 fileKeys := map[string][]fileKey{}
528 totalFiles := 0
529
530 for key := range repos {
531 n := key.FullPath()
532 fileKeys[n] = append(fileKeys[n], key)
533 names = append(names, n)
534 totalFiles++
535 }
536
537 sort.Strings(names)
538 names = uniq(names)
539
540 log.Printf("attempting to index %d total files", totalFiles)
541 for idx, name := range names {
542 keys := fileKeys[name]
543
544 for _, key := range keys {
545 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions)
546 if err != nil {
547 return false, err
548 }
549
550 if err := builder.Add(doc); err != nil {
551 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
552 }
553
554 if idx%10_000 == 0 {
555 builder.CheckMemoryUsage()
556 }
557 }
558 }
559 return true, builder.Finish()
560}
561
// repoPathRanks is the ranking data decoded from the JSON file named by the
// build options' DocumentRanksPath.
type repoPathRanks struct {
	// MeanRank is read from the "mean_reference_count" key. indexGitRepo
	// overwrites it with the mean of Paths once the file is loaded.
	MeanRank float64 `json:"mean_reference_count"`
	// Paths maps a file path to its rank.
	Paths map[string]float64 `json:"paths"`
}
566
567// rank returns the rank for a given path. It uses these rules:
568// - If we have a concrete rank for this file, always use it
569// - If there's no rank, and it's a low priority file like a test, then use rank 0
570// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
571func (r repoPathRanks) rank(path string, content []byte) float64 {
572 if rank, ok := r.Paths[path]; ok {
573 return rank
574 } else if build.IsLowPriority(path, content) {
575 return 0.0
576 } else {
577 return r.MeanRank
578 }
579}
580
581func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
582 ignoreFile, err := tree.File(ignore.IgnoreFile)
583 if err == object.ErrFileNotFound {
584 return &ignore.Matcher{}, nil
585 }
586 if err != nil {
587 return nil, err
588 }
589 content, err := ignoreFile.Contents()
590 if err != nil {
591 return nil, err
592 }
593 return ignore.ParseIgnoreFile(strings.NewReader(content))
594}
595
// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a delta build.
//
// It returns the blob location of each file to index (repos), the branches
// each file belongs to (branchMap), the per-branch versions
// (branchVersions), and the paths changed or deleted since the last indexed
// commit (changedOrDeletedPaths).
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
599
// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a normal build.
//
// It returns the blob location of each file to index (repos), the branches
// each file belongs to (branchMap), and the per-branch versions
// (branchVersions).
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error)
603
// gitIndexConfig lets callers of indexGitRepo replace the functions that
// prepare a build. The zero value selects the package defaults.
type gitIndexConfig struct {
	// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a delta build.
	//
	// If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead.
	prepareDeltaBuild prepareDeltaBuildFunc

	// prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to
	// prepare the build.Builder instance for generating a normal build.
	//
	// If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead.
	prepareNormalBuild prepareNormalBuildFunc
}
617
618func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
619 if options.Submodules {
620 return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
621 }
622
623 // discover what commits we indexed during our last build
624 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata()
625 if err != nil {
626 return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err)
627 }
628
629 if !ok {
630 return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository")
631 }
632
633 if options.DeltaShardNumberFallbackThreshold > 0 {
634 // HACK: For our interim compaction strategy, we force a full normal index once
635 // the number of shards on disk for this repository exceeds the provided threshold.
636 //
637 // This strategy obviously isn't optimal (as an example: we currently can't differentiate
638 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per
639 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads
640 // while we create a better compaction strategy).
641
642 oldShards := options.BuildOptions.FindAllShards()
643 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold {
644 return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold)
645 }
646 }
647
648 // Check to see if the set of branch names is consistent with what we last indexed.
649 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a
650 // normal one).
651
652 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) {
653 var existingBranchNames []string
654 for _, b := range existingRepository.Branches {
655 existingBranchNames = append(existingBranchNames, b.Name)
656 }
657
658 var optionsBranchNames []string
659 for _, b := range options.BuildOptions.RepositoryDescription.Branches {
660 optionsBranchNames = append(optionsBranchNames, b.Name)
661 }
662
663 existingBranchList := strings.Join(existingBranchNames, ", ")
664 optionsBranchList := strings.Join(optionsBranchNames, ", ")
665
666 return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList)
667 }
668
669 // Check if the build options hash does not match the repository metadata's hash
670 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build
671 if options.BuildOptions.GetHash() != existingRepository.IndexOptions {
672 return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions())
673 }
674
675 // branch => (path, sha1) => repo.
676 repos = map[fileKey]BlobLocation{}
677
678 // fileKey => branches
679 branchMap = map[fileKey][]string{}
680
681 // branch name -> git worktree at most current commit
682 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))
683
684 for _, b := range options.Branches {
685 commit, err := getCommit(repository, options.BranchPrefix, b)
686 if err != nil {
687 return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err)
688 }
689
690 tree, err := commit.Tree()
691 if err != nil {
692 return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err)
693 }
694
695 branchToCurrentTree[b] = tree
696 }
697
698 rawURL := options.BuildOptions.RepositoryDescription.URL
699 u, err := url.Parse(rawURL)
700 if err != nil {
701 return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err)
702 }
703
704 // TODO: Support repository submodules for delta builds
705 // For this prototype, we are ignoring repository submodules, which means that we can use the same
706 // blob location for all files
707 hackSharedBlobLocation := BlobLocation{
708 Repo: repository,
709 URL: u,
710 }
711
712 // loop over all branches, calculate the diff between our
713 // last indexed commit and the current commit, and add files mentioned in the diff
714 for _, branch := range existingRepository.Branches {
715 lastIndexedCommit, err := getCommit(repository, "", branch.Version)
716 if err != nil {
717 return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err)
718 }
719
720 lastIndexedTree, err := lastIndexedCommit.Tree()
721 if err != nil {
722 return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err)
723 }
724
725 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false})
726 if err != nil {
727 return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err)
728 }
729
730 for i, c := range changes {
731 oldFile, newFile, err := c.Files()
732 if err != nil {
733 return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err)
734 }
735
736 if newFile != nil {
737 // note: newFile.Name could be a path that isn't relative to the repository root - using the
738 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root
739 newFileRelativeRootPath := c.To.Name
740
741 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds
742 if newFileRelativeRootPath == ignore.IgnoreFile {
743 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
744 }
745
746 // either file is added or renamed, so we need to add the new version to the build
747 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash}
748 repos[file] = hackSharedBlobLocation
749 branchMap[file] = append(branchMap[file], branch.Name)
750 }
751
752 if oldFile == nil {
753 // file added - nothing more to do
754 continue
755 }
756
757 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the
758 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root
759 oldFileRelativeRootPath := c.From.Name
760
761 if oldFileRelativeRootPath == ignore.IgnoreFile {
762 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile)
763 }
764
765 // The file is either modified or deleted. So, we need to add ALL versions
766 // of the old file (across all branches) to the build.
767 for b, currentTree := range branchToCurrentTree {
768 f, err := currentTree.File(oldFileRelativeRootPath)
769 if err != nil {
770 // the file doesn't exist in this branch
771 if errors.Is(err, object.ErrFileNotFound) {
772 continue
773 }
774
775 return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err)
776 }
777
778 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()}
779 repos[file] = hackSharedBlobLocation
780 branchMap[file] = append(branchMap[file], b)
781 }
782
783 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath)
784 }
785 }
786
787 // we need to de-duplicate the branch map before returning it - it's possible for the same
788 // branch to have been added multiple times if a file has been modified across multiple commits
789
790 for file, branches := range branchMap {
791 sort.Strings(branches)
792 branchMap[file] = uniq(branches)
793 }
794
795 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates
796 // for the same reasoning as above
797
798 sort.Strings(changedOrDeletedPaths)
799 changedOrDeletedPaths = uniq(changedOrDeletedPaths)
800
801 return repos, branchMap, nil, changedOrDeletedPaths, nil
802}
803
804func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) {
805 var repoCache *RepoCache
806 if options.Submodules {
807 repoCache = NewRepoCache(options.RepoCacheDir)
808 }
809
810 // branch => (path, sha1) => repo.
811 repos = map[fileKey]BlobLocation{}
812
813 // fileKey => branches
814 branchMap = map[fileKey][]string{}
815
816 // Branch => Repo => SHA1
817 branchVersions = map[string]map[string]plumbing.Hash{}
818
819 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix)
820 if err != nil {
821 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err)
822 }
823
824 for _, b := range branches {
825 commit, err := getCommit(repository, options.BranchPrefix, b)
826 if err != nil {
827 if options.AllowMissingBranch && err.Error() == "reference not found" {
828 continue
829 }
830
831 return nil, nil, nil, fmt.Errorf("getCommit: %w", err)
832 }
833
834 tree, err := commit.Tree()
835 if err != nil {
836 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err)
837 }
838
839 ig, err := newIgnoreMatcher(tree)
840 if err != nil {
841 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
842 }
843
844 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache)
845 if err != nil {
846 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err)
847 }
848 for k, v := range files {
849 if ig.Match(k.Path) {
850 continue
851 }
852 repos[k] = v
853 branchMap[k] = append(branchMap[k], b)
854 }
855
856 branchVersions[b] = subVersions
857 }
858
859 return repos, branchMap, branchVersions, nil
860}
861
862func createDocument(key fileKey,
863 repos map[fileKey]BlobLocation,
864 branchMap map[fileKey][]string,
865 ranks repoPathRanks,
866 opts build.Options,
867) (zoekt.Document, error) {
868 blob, err := repos[key].Repo.BlobObject(key.ID)
869
870 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
871 if errors.Is(err, plumbing.ErrObjectNotFound) {
872 return skippedLargeDoc(key, branchMap, opts), nil
873 }
874
875 if err != nil {
876 return zoekt.Document{}, err
877 }
878
879 keyFullPath := key.FullPath()
880 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
881 return skippedLargeDoc(key, branchMap, opts), nil
882 }
883
884 contents, err := blobContents(blob)
885 if err != nil {
886 return zoekt.Document{}, err
887 }
888
889 var pathRanks []float64
890 if len(ranks.Paths) > 0 {
891 // If the repository has ranking data, then store the file's rank.
892 pathRank := ranks.rank(keyFullPath, contents)
893 pathRanks = []float64{pathRank}
894 }
895
896 return zoekt.Document{
897 SubRepositoryPath: key.SubRepoPath,
898 Name: keyFullPath,
899 Content: contents,
900 Branches: branchMap[key],
901 Ranks: pathRanks,
902 }, nil
903}
904
905func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document {
906 return zoekt.Document{
907 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax),
908 Name: key.FullPath(),
909 Branches: branchMap[key],
910 SubRepositoryPath: key.SubRepoPath,
911 }
912}
913
914func blobContents(blob *object.Blob) ([]byte, error) {
915 r, err := blob.Reader()
916 if err != nil {
917 return nil, err
918 }
919 defer r.Close()
920
921 var buf bytes.Buffer
922 buf.Grow(int(blob.Size))
923 _, err = buf.ReadFrom(r)
924 if err != nil {
925 return nil, err
926 }
927 return buf.Bytes(), nil
928}
929
// uniq removes adjacent duplicates from ss in place and returns the
// shortened slice, which shares ss's backing array. Callers sort ss first
// so that all duplicates are adjacent.
func uniq(ss []string) []string {
	kept := 0
	for i := range ss {
		if i > 0 && ss[i] == ss[i-1] {
			continue
		}
		ss[kept] = ss[i]
		kept++
	}
	return ss[:kept]
}
940}