fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "context" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 35 "github.com/sourcegraph/zoekt" 36 "github.com/sourcegraph/zoekt/build" 37 "github.com/sourcegraph/zoekt/ignore" 38 39 "github.com/go-git/go-git/v5/config" 40 "github.com/go-git/go-git/v5/plumbing" 41 "github.com/go-git/go-git/v5/plumbing/object" 42 43 git "github.com/go-git/go-git/v5" 44) 45 46// FindGitRepos finds directories holding git repositories below the 47// given directory. It will find both bare and the ".git" dirs in 48// non-bare repositories. It returns the full path including the dir 49// passed in. 50func FindGitRepos(dir string) ([]string, error) { 51 arg, err := filepath.Abs(dir) 52 if err != nil { 53 return nil, err 54 } 55 var dirs []string 56 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 57 // Best-effort, ignore filepath.Walk failing 58 if err != nil { 59 return nil 60 } 61 62 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 63 dirs = append(dirs, filepath.Join(name, ".git")) 64 return filepath.SkipDir 65 } 66 67 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 68 return nil 69 } 70 71 fi, err = os.Lstat(filepath.Join(name, "objects")) 72 if err != nil || !fi.IsDir() { 73 return nil 74 } 75 76 dirs = append(dirs, name) 77 return filepath.SkipDir 78 }); err != nil { 79 return nil, err 80 } 81 82 return dirs, nil 83} 84 85// setTemplates fills in URL templates for known git hosting 86// sites. 87func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 88 if u.Scheme == "ssh+git" { 89 u.Scheme = "https" 90 u.User = nil 91 } 92 93 // helper to generate u.JoinPath as a template 94 varVersion := ".Version" 95 varPath := ".Path" 96 urlJoinPath := func(elem ...string) string { 97 elem = append([]string{u.String()}, elem...) 98 var parts []string 99 for _, e := range elem { 100 if e == varVersion || e == varPath { 101 parts = append(parts, e) 102 } else { 103 parts = append(parts, strconv.Quote(e)) 104 } 105 } 106 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " ")) 107 } 108 109 repo.URL = u.String() 110 switch typ { 111 case "gitiles": 112 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 113 repo.CommitURLTemplate = urlJoinPath("+", varVersion) 114 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath) 115 repo.LineFragmentTemplate = "#{{.LineNumber}}" 116 case "github": 117 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 118 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 119 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath) 120 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 121 case "cgit": 122 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 123 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}" 124 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}" 125 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 126 case "gitweb": 127 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 128 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 129 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 130 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 131 case "source.bazel.build": 132 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 133 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 134 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}" 135 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}" 136 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 137 case "bitbucket-server": 138 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 139 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 140 repo.CommitURLTemplate = urlJoinPath("commits", varVersion) 141 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}" 142 repo.LineFragmentTemplate = "#{{.LineNumber}}" 143 case "gitlab": 144 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 145 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 146 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion) 147 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath) 148 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 149 case "gitea": 150 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 151 // NOTE The `display=source` query parameter is required to disable file rendering. 152 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 153 // a line without `display=source`. This is supported since gitea 1.17.0. 154 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 155 // but the query parameters are obmitted. 156 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source" 157 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 158 default: 159 return fmt.Errorf("URL scheme type %q unknown", typ) 160 } 161 return nil 162} 163 164// getCommit returns a tree object for the given reference. 165func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 166 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 167 // ref might be a branch name (e.g. "master") add branch prefix and try again. 168 if err != nil { 169 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 170 } 171 if err != nil { 172 return nil, err 173 } 174 175 commitObj, err := repo.CommitObject(*sha1) 176 if err != nil { 177 return nil, err 178 } 179 return commitObj, nil 180} 181 182func configLookupRemoteURL(cfg *config.Config, key string) string { 183 rc := cfg.Remotes[key] 184 if rc == nil || len(rc.URLs) == 0 { 185 return "" 186 } 187 return rc.URLs[0] 188} 189 190var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 191 192func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 193 repo, err := git.PlainOpen(repoDir) 194 if err != nil { 195 return err 196 } 197 198 cfg, err := repo.Config() 199 if err != nil { 200 return err 201 } 202 203 sec := cfg.Raw.Section("zoekt") 204 205 webURLStr := sec.Options.Get("web-url") 206 webURLType := sec.Options.Get("web-url-type") 207 208 if webURLType != "" && webURLStr != "" { 209 webURL, err := url.Parse(webURLStr) 210 if err != nil { 211 return err 212 } 213 if err := setTemplates(desc, webURL, webURLType); err != nil { 214 return err 215 } 216 } else if webURLStr != "" { 217 desc.URL = webURLStr 218 } 219 220 name := sec.Options.Get("name") 221 if name != "" { 222 desc.Name = name 223 } else { 224 remoteURL := configLookupRemoteURL(cfg, "origin") 225 if remoteURL == "" { 226 return nil 227 } 228 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 229 user := sm[1] 230 host := sm[2] 231 path := sm[3] 232 233 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 234 } 235 236 u, err := url.Parse(remoteURL) 237 if err != nil { 238 return err 239 } 240 if err := SetTemplatesFromOrigin(desc, u); err != nil { 241 return err 242 } 243 } 244 245 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 246 desc.ID = uint32(id) 247 248 if desc.RawConfig == nil { 249 desc.RawConfig = map[string]string{} 250 } 251 for _, o := range sec.Options { 252 desc.RawConfig[o.Key] = o.Value 253 } 254 255 // Ranking info. 256 257 // Github: 258 traction := 0 259 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 260 f, err := strconv.Atoi(sec.Options.Get(s)) 261 if err == nil { 262 traction += f 263 } 264 } 265 266 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 267 // Pretend everything on googlesource.com has 1000 268 // github stars. 269 traction = 1000 270 } 271 272 if traction > 0 { 273 l := math.Log(float64(traction)) 274 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 275 } 276 277 return nil 278} 279 280// SetTemplatesFromOrigin fills in templates based on the origin URL. 281func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 282 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 283 284 if strings.HasSuffix(u.Host, ".googlesource.com") { 285 return setTemplates(desc, u, "gitiles") 286 } else if u.Host == "github.com" { 287 u.Path = strings.TrimSuffix(u.Path, ".git") 288 return setTemplates(desc, u, "github") 289 } else { 290 return fmt.Errorf("unknown git hosting site %q", u) 291 } 292} 293 294// The Options structs controls details of the indexing process. 295type Options struct { 296 // The repository to be indexed. 297 RepoDir string 298 299 // If set, follow submodule links. This requires RepoCacheDir to be set. 300 Submodules bool 301 302 // If set, skip indexing if the existing index shard is newer 303 // than the refs in the repository. 304 Incremental bool 305 306 // Don't error out if some branch is missing 307 AllowMissingBranch bool 308 309 // Specifies the root of a Repository cache. Needed for submodule indexing. 310 RepoCacheDir string 311 312 // Indexing options. 313 BuildOptions build.Options 314 315 // Prefix of the branch to index, e.g. `remotes/origin`. 316 BranchPrefix string 317 318 // List of branch names to index, e.g. []string{"HEAD", "stable"} 319 Branches []string 320 321 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 322 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 323 // then a normal build will be performed instead. 324 // 325 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 326 // a delta build will always be performed regardless of the number of preexisting shards. 327 DeltaShardNumberFallbackThreshold uint64 328} 329 330func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 331 var result []string 332 for _, b := range bs { 333 // Sourcegraph: We disable resolving refs. We want to return the exact ref 334 // requested so we can match it up. 335 if b == "HEAD" && false { 336 ref, err := repo.Head() 337 if err != nil { 338 return nil, err 339 } 340 341 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 342 continue 343 } 344 345 if strings.Contains(b, "*") { 346 iter, err := repo.Branches() 347 if err != nil { 348 return nil, err 349 } 350 351 defer iter.Close() 352 for { 353 ref, err := iter.Next() 354 if err == io.EOF { 355 break 356 } 357 if err != nil { 358 return nil, err 359 } 360 361 name := ref.Name().Short() 362 if matched, err := filepath.Match(b, name); err != nil { 363 return nil, err 364 } else if !matched { 365 continue 366 } 367 368 result = append(result, strings.TrimPrefix(name, prefix)) 369 } 370 continue 371 } 372 373 result = append(result, b) 374 } 375 376 return result, nil 377} 378 379// IndexGitRepo indexes the git repository as specified by the options. 380// The returned bool indicates whether the index was updated as a result. This 381// can be informative if doing incremental indexing. 382func IndexGitRepo(opts Options) (bool, error) { 383 return indexGitRepo(opts, gitIndexConfig{}) 384} 385 386// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 387// The returned bool indicates whether the index was updated as a result. This 388// can be informative if doing incremental indexing. 389func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 390 prepareDeltaBuild := prepareDeltaBuild 391 if config.prepareDeltaBuild != nil { 392 prepareDeltaBuild = config.prepareDeltaBuild 393 } 394 395 prepareNormalBuild := prepareNormalBuild 396 if config.prepareNormalBuild != nil { 397 prepareNormalBuild = config.prepareNormalBuild 398 } 399 400 // Set max thresholds, since we use them in this function. 401 opts.BuildOptions.SetDefaults() 402 if opts.RepoDir == "" { 403 return false, fmt.Errorf("gitindex: must set RepoDir") 404 } 405 406 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 407 repo, err := git.PlainOpen(opts.RepoDir) 408 if err != nil { 409 return false, fmt.Errorf("git.PlainOpen: %w", err) 410 } 411 412 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 413 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 414 } 415 416 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 417 if err != nil { 418 return false, fmt.Errorf("expandBranches: %w", err) 419 } 420 for _, b := range branches { 421 commit, err := getCommit(repo, opts.BranchPrefix, b) 422 if err != nil { 423 if opts.AllowMissingBranch && err.Error() == "reference not found" { 424 continue 425 } 426 427 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 428 } 429 430 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 431 Name: b, 432 Version: commit.Hash.String(), 433 }) 434 435 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 436 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 437 } 438 } 439 440 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 441 return false, nil 442 } 443 444 // branch => (path, sha1) => repo. 445 var repos map[fileKey]BlobLocation 446 447 // Branch => Repo => SHA1 448 var branchVersions map[string]map[string]plumbing.Hash 449 450 // set of file paths that have been changed or deleted since 451 // the last indexed commit 452 // 453 // These only have an effect on delta builds 454 var changedOrRemovedFiles []string 455 456 if opts.BuildOptions.IsDelta { 457 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 458 if err != nil { 459 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 460 opts.BuildOptions.IsDelta = false 461 } 462 } 463 464 if !opts.BuildOptions.IsDelta { 465 repos, branchVersions, err = prepareNormalBuild(opts, repo) 466 if err != nil { 467 return false, fmt.Errorf("preparing normal build: %w", err) 468 } 469 } 470 471 reposByPath := map[string]BlobLocation{} 472 for key, info := range repos { 473 reposByPath[key.SubRepoPath] = info 474 } 475 476 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 477 for path, info := range reposByPath { 478 tpl := opts.BuildOptions.RepositoryDescription 479 if path != "" { 480 tpl = zoekt.Repository{URL: info.URL.String()} 481 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { 482 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) 483 } 484 } 485 opts.BuildOptions.SubRepositories[path] = &tpl 486 } 487 488 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 489 for path, repo := range opts.BuildOptions.SubRepositories { 490 id := branchVersions[br.Name][path] 491 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 492 Name: br.Name, 493 Version: id.String(), 494 }) 495 } 496 } 497 498 builder, err := build.NewBuilder(opts.BuildOptions) 499 if err != nil { 500 return false, fmt.Errorf("build.NewBuilder: %w", err) 501 } 502 503 // Preparing the build can consume substantial memory, so check usage before starting to index. 504 builder.CheckMemoryUsage() 505 506 var ranks repoPathRanks 507 var meanRank float64 508 if opts.BuildOptions.DocumentRanksPath != "" { 509 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 510 if err != nil { 511 return false, err 512 } 513 514 err = json.Unmarshal(data, &ranks) 515 if err != nil { 516 return false, err 517 } 518 519 // Compute the mean rank for this repository. Note: we overwrite the rank 520 // mean that's stored in the document ranks file, since that currently 521 // represents a global mean rank across repos, which is not what we want. 522 numRanks := len(ranks.Paths) 523 if numRanks > 0 { 524 for _, rank := range ranks.Paths { 525 meanRank += rank 526 } 527 ranks.MeanRank = meanRank / float64(numRanks) 528 } 529 } 530 531 // we don't need to check error, since we either already have an error, or 532 // we returning the first call to builder.Finish. 533 defer builder.Finish() // nolint:errcheck 534 535 for _, f := range changedOrRemovedFiles { 536 builder.MarkFileAsChangedOrRemoved(f) 537 } 538 539 var names []string 540 fileKeys := map[string][]fileKey{} 541 totalFiles := 0 542 543 for key := range repos { 544 n := key.FullPath() 545 fileKeys[n] = append(fileKeys[n], key) 546 names = append(names, n) 547 totalFiles++ 548 } 549 550 sort.Strings(names) 551 names = uniq(names) 552 553 log.Printf("attempting to index %d total files", totalFiles) 554 for idx, name := range names { 555 keys := fileKeys[name] 556 557 for _, key := range keys { 558 doc, err := createDocument(key, repos, ranks, opts.BuildOptions) 559 if err != nil { 560 return false, err 561 } 562 563 if err := builder.Add(doc); err != nil { 564 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 565 } 566 567 if idx%10_000 == 0 { 568 builder.CheckMemoryUsage() 569 } 570 } 571 } 572 return true, builder.Finish() 573} 574 575type repoPathRanks struct { 576 MeanRank float64 `json:"mean_reference_count"` 577 Paths map[string]float64 `json:"paths"` 578} 579 580// rank returns the rank for a given path. It uses these rules: 581// - If we have a concrete rank for this file, always use it 582// - If there's no rank, and it's a low priority file like a test, then use rank 0 583// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 584func (r repoPathRanks) rank(path string, content []byte) float64 { 585 if rank, ok := r.Paths[path]; ok { 586 return rank 587 } else if build.IsLowPriority(path, content) { 588 return 0.0 589 } else { 590 return r.MeanRank 591 } 592} 593 594func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 595 ignoreFile, err := tree.File(ignore.IgnoreFile) 596 if err == object.ErrFileNotFound { 597 return &ignore.Matcher{}, nil 598 } 599 if err != nil { 600 return nil, err 601 } 602 content, err := ignoreFile.Contents() 603 if err != nil { 604 return nil, err 605 } 606 return ignore.ParseIgnoreFile(strings.NewReader(content)) 607} 608 609// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 610// a build.Builder instance for generating a delta build. 611type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 612 613// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 614// a build.Builder instance for generating a normal build. 615type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) 616 617type gitIndexConfig struct { 618 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 619 // prepare the build.Builder instance for generating a delta build. 620 // 621 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 622 prepareDeltaBuild prepareDeltaBuildFunc 623 624 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 625 // prepare the build.Builder instance for generating a normal build. 626 // 627 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 628 prepareNormalBuild prepareNormalBuildFunc 629} 630 631func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 632 if options.Submodules { 633 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 634 } 635 636 // discover what commits we indexed during our last build 637 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 638 if err != nil { 639 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 640 } 641 642 if !ok { 643 return nil, nil, nil, fmt.Errorf("no existing shards found for repository") 644 } 645 646 if options.DeltaShardNumberFallbackThreshold > 0 { 647 // HACK: For our interim compaction strategy, we force a full normal index once 648 // the number of shards on disk for this repository exceeds the provided threshold. 649 // 650 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 651 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 652 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 653 // while we create a better compaction strategy). 654 655 oldShards := options.BuildOptions.FindAllShards() 656 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 657 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 658 } 659 } 660 661 // Check to see if the set of branch names is consistent with what we last indexed. 662 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 663 // normal one). 664 665 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 666 var existingBranchNames []string 667 for _, b := range existingRepository.Branches { 668 existingBranchNames = append(existingBranchNames, b.Name) 669 } 670 671 var optionsBranchNames []string 672 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 673 optionsBranchNames = append(optionsBranchNames, b.Name) 674 } 675 676 existingBranchList := strings.Join(existingBranchNames, ", ") 677 optionsBranchList := strings.Join(optionsBranchNames, ", ") 678 679 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 680 } 681 682 // Check if the build options hash does not match the repository metadata's hash 683 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build 684 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 685 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 686 } 687 688 // branch => (path, sha1) => repo. 689 repos = map[fileKey]BlobLocation{} 690 691 // branch name -> git worktree at most current commit 692 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) 693 694 for _, b := range options.Branches { 695 commit, err := getCommit(repository, options.BranchPrefix, b) 696 if err != nil { 697 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 698 } 699 700 tree, err := commit.Tree() 701 if err != nil { 702 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 703 } 704 705 branchToCurrentTree[b] = tree 706 } 707 708 rawURL := options.BuildOptions.RepositoryDescription.URL 709 u, err := url.Parse(rawURL) 710 if err != nil { 711 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 712 } 713 714 // TODO: Support repository submodules for delta builds 715 716 // loop over all branches, calculate the diff between our 717 // last indexed commit and the current commit, and add files mentioned in the diff 718 for _, branch := range existingRepository.Branches { 719 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 720 if err != nil { 721 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 722 } 723 724 lastIndexedTree, err := lastIndexedCommit.Tree() 725 if err != nil { 726 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 727 } 728 729 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 730 if err != nil { 731 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 732 } 733 734 for i, c := range changes { 735 oldFile, newFile, err := c.Files() 736 if err != nil { 737 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 738 } 739 740 if newFile != nil { 741 // note: newFile.Name could be a path that isn't relative to the repository root - using the 742 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 743 newFileRelativeRootPath := c.To.Name 744 745 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 746 if newFileRelativeRootPath == ignore.IgnoreFile { 747 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 748 } 749 750 // either file is added or renamed, so we need to add the new version to the build 751 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 752 if existing, ok := repos[file]; ok { 753 existing.Branches = append(existing.Branches, branch.Name) 754 repos[file] = existing 755 } else { 756 repos[file] = BlobLocation{ 757 GitRepo: repository, 758 URL: u, 759 Branches: []string{branch.Name}, 760 } 761 } 762 } 763 764 if oldFile == nil { 765 // file added - nothing more to do 766 continue 767 } 768 769 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 770 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 771 oldFileRelativeRootPath := c.From.Name 772 773 if oldFileRelativeRootPath == ignore.IgnoreFile { 774 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 775 } 776 777 // The file is either modified or deleted. So, we need to add ALL versions 778 // of the old file (across all branches) to the build. 779 for b, currentTree := range branchToCurrentTree { 780 f, err := currentTree.File(oldFileRelativeRootPath) 781 if err != nil { 782 // the file doesn't exist in this branch 783 if errors.Is(err, object.ErrFileNotFound) { 784 continue 785 } 786 787 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 788 } 789 790 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 791 if existing, ok := repos[file]; ok { 792 existing.Branches = append(existing.Branches, b) 793 repos[file] = existing 794 } else { 795 repos[file] = BlobLocation{ 796 GitRepo: repository, 797 URL: u, 798 Branches: []string{b}, 799 } 800 } 801 } 802 803 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 804 } 805 } 806 807 // we need to de-duplicate the branch map before returning it - it's possible for the same 808 // branch to have been added multiple times if a file has been modified across multiple commits 809 for _, info := range repos { 810 sort.Strings(info.Branches) 811 info.Branches = uniq(info.Branches) 812 } 813 814 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 815 // for the same reasoning as above 816 sort.Strings(changedOrDeletedPaths) 817 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 818 819 return repos, nil, changedOrDeletedPaths, nil 820} 821 822func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 823 var repoCache *RepoCache 824 if options.Submodules { 825 repoCache = NewRepoCache(options.RepoCacheDir) 826 } 827 828 // Branch => Repo => SHA1 829 branchVersions = map[string]map[string]plumbing.Hash{} 830 831 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 832 if err != nil { 833 return nil, nil, fmt.Errorf("expandBranches: %w", err) 834 } 835 836 rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache) 837 for _, b := range branches { 838 commit, err := getCommit(repository, options.BranchPrefix, b) 839 if err != nil { 840 if options.AllowMissingBranch && err.Error() == "reference not found" { 841 continue 842 } 843 844 return nil, nil, fmt.Errorf("getCommit: %w", err) 845 } 846 847 tree, err := commit.Tree() 848 if err != nil { 849 return nil, nil, fmt.Errorf("commit.Tree: %w", err) 850 } 851 852 ig, err := newIgnoreMatcher(tree) 853 if err != nil { 854 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 855 } 856 857 subVersions, err := rw.CollectFiles(tree, b, ig) 858 if err != nil { 859 return nil, nil, fmt.Errorf("CollectFiles: %w", err) 860 } 861 862 branchVersions[b] = subVersions 863 } 864 865 return rw.Files, branchVersions, nil 866} 867 868func createDocument(key fileKey, 869 repos map[fileKey]BlobLocation, 870 ranks repoPathRanks, 871 opts build.Options, 872) (zoekt.Document, error) { 873 repo := repos[key] 874 blob, err := repo.GitRepo.BlobObject(key.ID) 875 branches := repos[key].Branches 876 877 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 878 if errors.Is(err, plumbing.ErrObjectNotFound) { 879 return skippedLargeDoc(key, branches, opts), nil 880 } 881 882 if err != nil { 883 return zoekt.Document{}, err 884 } 885 886 keyFullPath := key.FullPath() 887 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 888 return skippedLargeDoc(key, branches, opts), nil 889 } 890 891 contents, err := blobContents(blob) 892 if err != nil { 893 return zoekt.Document{}, err 894 } 895 896 var pathRanks []float64 897 if len(ranks.Paths) > 0 { 898 // If the repository has ranking data, then store the file's rank. 899 pathRank := ranks.rank(keyFullPath, contents) 900 pathRanks = []float64{pathRank} 901 } 902 903 return zoekt.Document{ 904 SubRepositoryPath: key.SubRepoPath, 905 Name: keyFullPath, 906 Content: contents, 907 Branches: branches, 908 Ranks: pathRanks, 909 }, nil 910} 911 912func skippedLargeDoc(key fileKey, branches []string, opts build.Options) zoekt.Document { 913 return zoekt.Document{ 914 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 915 Name: key.FullPath(), 916 Branches: branches, 917 SubRepositoryPath: key.SubRepoPath, 918 } 919} 920 921func blobContents(blob *object.Blob) ([]byte, error) { 922 r, err := blob.Reader() 923 if err != nil { 924 return nil, err 925 } 926 defer r.Close() 927 928 var buf bytes.Buffer 929 buf.Grow(int(blob.Size)) 930 _, err = buf.ReadFrom(r) 931 if err != nil { 932 return nil, err 933 } 934 return buf.Bytes(), nil 935} 936 937func uniq(ss []string) []string { 938 result := ss[:0] 939 var last string 940 for i, s := range ss { 941 if i == 0 || s != last { 942 result = append(result, s) 943 } 944 last = s 945 } 946 return result 947}