fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "context" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 35 "github.com/sourcegraph/zoekt" 36 "github.com/sourcegraph/zoekt/build" 37 "github.com/sourcegraph/zoekt/ignore" 38 39 "github.com/go-git/go-git/v5/config" 40 "github.com/go-git/go-git/v5/plumbing" 41 "github.com/go-git/go-git/v5/plumbing/object" 42 43 git "github.com/go-git/go-git/v5" 44) 45 46// FindGitRepos finds directories holding git repositories below the 47// given directory. It will find both bare and the ".git" dirs in 48// non-bare repositories. It returns the full path including the dir 49// passed in. 50func FindGitRepos(dir string) ([]string, error) { 51 arg, err := filepath.Abs(dir) 52 if err != nil { 53 return nil, err 54 } 55 var dirs []string 56 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 57 // Best-effort, ignore filepath.Walk failing 58 if err != nil { 59 return nil 60 } 61 62 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 63 dirs = append(dirs, filepath.Join(name, ".git")) 64 return filepath.SkipDir 65 } 66 67 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 68 return nil 69 } 70 71 fi, err = os.Lstat(filepath.Join(name, "objects")) 72 if err != nil || !fi.IsDir() { 73 return nil 74 } 75 76 dirs = append(dirs, name) 77 return filepath.SkipDir 78 }); err != nil { 79 return nil, err 80 } 81 82 return dirs, nil 83} 84 85// setTemplates fills in URL templates for known git hosting 86// sites. 87func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 88 if u.Scheme == "ssh+git" { 89 u.Scheme = "https" 90 u.User = nil 91 } 92 93 repo.URL = u.String() 94 switch typ { 95 case "gitiles": 96 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 97 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 98 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" 99 repo.LineFragmentTemplate = "#{{.LineNumber}}" 100 case "github": 101 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 102 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 103 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" 104 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 105 case "cgit": 106 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 107 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}" 108 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}" 109 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 110 case "gitweb": 111 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 112 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 113 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 114 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 115 case "source.bazel.build": 116 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 117 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 118 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 119 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}" 120 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 121 case "bitbucket-server": 122 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 123 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 124 repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}" 125 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}" 126 repo.LineFragmentTemplate = "#{{.LineNumber}}" 127 case "gitlab": 128 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 129 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 130 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}" 131 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}" 132 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 133 case "gitea": 134 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 135 // NOTE The `display=source` query parameter is required to disable file rendering. 136 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 137 // a line without `display=source`. This is supported since gitea 1.17.0. 138 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 139 // but the query parameters are obmitted. 140 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source" 141 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 142 default: 143 return fmt.Errorf("URL scheme type %q unknown", typ) 144 } 145 return nil 146} 147 148// getCommit returns a tree object for the given reference. 149func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 150 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 151 // ref might be a branch name (e.g. "master") add branch prefix and try again. 152 if err != nil { 153 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 154 } 155 if err != nil { 156 return nil, err 157 } 158 159 commitObj, err := repo.CommitObject(*sha1) 160 if err != nil { 161 return nil, err 162 } 163 return commitObj, nil 164} 165 166func configLookupRemoteURL(cfg *config.Config, key string) string { 167 rc := cfg.Remotes[key] 168 if rc == nil || len(rc.URLs) == 0 { 169 return "" 170 } 171 return rc.URLs[0] 172} 173 174var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 175 176func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 177 repo, err := git.PlainOpen(repoDir) 178 if err != nil { 179 return err 180 } 181 182 cfg, err := repo.Config() 183 if err != nil { 184 return err 185 } 186 187 sec := cfg.Raw.Section("zoekt") 188 189 webURLStr := sec.Options.Get("web-url") 190 webURLType := sec.Options.Get("web-url-type") 191 192 if webURLType != "" && webURLStr != "" { 193 webURL, err := url.Parse(webURLStr) 194 if err != nil { 195 return err 196 } 197 if err := setTemplates(desc, webURL, webURLType); err != nil { 198 return err 199 } 200 } else if webURLStr != "" { 201 desc.URL = webURLStr 202 } 203 204 name := sec.Options.Get("name") 205 if name != "" { 206 desc.Name = name 207 } else { 208 remoteURL := configLookupRemoteURL(cfg, "origin") 209 if remoteURL == "" { 210 return nil 211 } 212 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 213 user := sm[1] 214 host := sm[2] 215 path := sm[3] 216 217 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 218 } 219 220 u, err := url.Parse(remoteURL) 221 if err != nil { 222 return err 223 } 224 if err := SetTemplatesFromOrigin(desc, u); err != nil { 225 return err 226 } 227 } 228 229 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 230 desc.ID = uint32(id) 231 232 if desc.RawConfig == nil { 233 desc.RawConfig = map[string]string{} 234 } 235 for _, o := range sec.Options { 236 desc.RawConfig[o.Key] = o.Value 237 } 238 239 // Ranking info. 240 241 // Github: 242 traction := 0 243 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 244 f, err := strconv.Atoi(sec.Options.Get(s)) 245 if err == nil { 246 traction += f 247 } 248 } 249 250 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 251 // Pretend everything on googlesource.com has 1000 252 // github stars. 253 traction = 1000 254 } 255 256 if traction > 0 { 257 l := math.Log(float64(traction)) 258 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 259 } 260 261 return nil 262} 263 264// SetTemplatesFromOrigin fills in templates based on the origin URL. 265func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 266 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 267 268 if strings.HasSuffix(u.Host, ".googlesource.com") { 269 return setTemplates(desc, u, "gitiles") 270 } else if u.Host == "github.com" { 271 u.Path = strings.TrimSuffix(u.Path, ".git") 272 return setTemplates(desc, u, "github") 273 } else { 274 return fmt.Errorf("unknown git hosting site %q", u) 275 } 276} 277 278// The Options structs controls details of the indexing process. 279type Options struct { 280 // The repository to be indexed. 281 RepoDir string 282 283 // If set, follow submodule links. This requires RepoCacheDir to be set. 284 Submodules bool 285 286 // If set, skip indexing if the existing index shard is newer 287 // than the refs in the repository. 288 Incremental bool 289 290 // Don't error out if some branch is missing 291 AllowMissingBranch bool 292 293 // Specifies the root of a Repository cache. Needed for submodule indexing. 294 RepoCacheDir string 295 296 // Indexing options. 297 BuildOptions build.Options 298 299 // Prefix of the branch to index, e.g. `remotes/origin`. 300 BranchPrefix string 301 302 // List of branch names to index, e.g. []string{"HEAD", "stable"} 303 Branches []string 304 305 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 306 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 307 // then a normal build will be performed instead. 308 // 309 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 310 // a delta build will always be performed regardless of the number of preexisting shards. 311 DeltaShardNumberFallbackThreshold uint64 312} 313 314func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 315 var result []string 316 for _, b := range bs { 317 // Sourcegraph: We disable resolving refs. We want to return the exact ref 318 // requested so we can match it up. 319 if b == "HEAD" && false { 320 ref, err := repo.Head() 321 if err != nil { 322 return nil, err 323 } 324 325 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 326 continue 327 } 328 329 if strings.Contains(b, "*") { 330 iter, err := repo.Branches() 331 if err != nil { 332 return nil, err 333 } 334 335 defer iter.Close() 336 for { 337 ref, err := iter.Next() 338 if err == io.EOF { 339 break 340 } 341 if err != nil { 342 return nil, err 343 } 344 345 name := ref.Name().Short() 346 if matched, err := filepath.Match(b, name); err != nil { 347 return nil, err 348 } else if !matched { 349 continue 350 } 351 352 result = append(result, strings.TrimPrefix(name, prefix)) 353 } 354 continue 355 } 356 357 result = append(result, b) 358 } 359 360 return result, nil 361} 362 363// IndexGitRepo indexes the git repository as specified by the options. 364// The returned bool indicates whether the index was updated as a result. This 365// can be informative if doing incremental indexing. 366func IndexGitRepo(opts Options) (bool, error) { 367 return indexGitRepo(opts, gitIndexConfig{}) 368} 369 370// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 371// The returned bool indicates whether the index was updated as a result. This 372// can be informative if doing incremental indexing. 373func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 374 prepareDeltaBuild := prepareDeltaBuild 375 if config.prepareDeltaBuild != nil { 376 prepareDeltaBuild = config.prepareDeltaBuild 377 } 378 379 prepareNormalBuild := prepareNormalBuild 380 if config.prepareNormalBuild != nil { 381 prepareNormalBuild = config.prepareNormalBuild 382 } 383 384 // Set max thresholds, since we use them in this function. 385 opts.BuildOptions.SetDefaults() 386 if opts.RepoDir == "" { 387 return false, fmt.Errorf("gitindex: must set RepoDir") 388 } 389 390 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 391 repo, err := git.PlainOpen(opts.RepoDir) 392 if err != nil { 393 return false, fmt.Errorf("git.PlainOpen: %w", err) 394 } 395 396 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 397 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 398 } 399 400 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 401 if err != nil { 402 return false, fmt.Errorf("expandBranches: %w", err) 403 } 404 for _, b := range branches { 405 commit, err := getCommit(repo, opts.BranchPrefix, b) 406 if err != nil { 407 if opts.AllowMissingBranch && err.Error() == "reference not found" { 408 continue 409 } 410 411 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 412 } 413 414 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 415 Name: b, 416 Version: commit.Hash.String(), 417 }) 418 419 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 420 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 421 } 422 } 423 424 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 425 return false, nil 426 } 427 428 // branch => (path, sha1) => repo. 429 var repos map[fileKey]BlobLocation 430 431 // fileKey => branches 432 var branchMap map[fileKey][]string 433 434 // Branch => Repo => SHA1 435 var branchVersions map[string]map[string]plumbing.Hash 436 437 // set of file paths that have been changed or deleted since 438 // the last indexed commit 439 // 440 // These only have an effect on delta builds 441 var changedOrRemovedFiles []string 442 443 if opts.BuildOptions.IsDelta { 444 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 445 if err != nil { 446 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 447 opts.BuildOptions.IsDelta = false 448 } 449 } 450 451 if !opts.BuildOptions.IsDelta { 452 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo) 453 if err != nil { 454 return false, fmt.Errorf("preparing normal build: %w", err) 455 } 456 } 457 458 reposByPath := map[string]BlobLocation{} 459 for key, location := range repos { 460 reposByPath[key.SubRepoPath] = location 461 } 462 463 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 464 for path, location := range reposByPath { 465 tpl := opts.BuildOptions.RepositoryDescription 466 if path != "" { 467 tpl = zoekt.Repository{URL: location.URL.String()} 468 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil { 469 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err) 470 } 471 } 472 opts.BuildOptions.SubRepositories[path] = &tpl 473 } 474 475 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 476 for path, repo := range opts.BuildOptions.SubRepositories { 477 id := branchVersions[br.Name][path] 478 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 479 Name: br.Name, 480 Version: id.String(), 481 }) 482 } 483 } 484 485 builder, err := build.NewBuilder(opts.BuildOptions) 486 if err != nil { 487 return false, fmt.Errorf("build.NewBuilder: %w", err) 488 } 489 490 // Preparing the build can consume substantial memory, so check usage before starting to index. 491 builder.CheckMemoryUsage() 492 493 var ranks repoPathRanks 494 var meanRank float64 495 if opts.BuildOptions.DocumentRanksPath != "" { 496 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 497 if err != nil { 498 return false, err 499 } 500 501 err = json.Unmarshal(data, &ranks) 502 if err != nil { 503 return false, err 504 } 505 506 // Compute the mean rank for this repository. Note: we overwrite the rank 507 // mean that's stored in the document ranks file, since that currently 508 // represents a global mean rank across repos, which is not what we want. 509 numRanks := len(ranks.Paths) 510 if numRanks > 0 { 511 for _, rank := range ranks.Paths { 512 meanRank += rank 513 } 514 ranks.MeanRank = meanRank / float64(numRanks) 515 } 516 } 517 518 // we don't need to check error, since we either already have an error, or 519 // we returning the first call to builder.Finish. 520 defer builder.Finish() // nolint:errcheck 521 522 for _, f := range changedOrRemovedFiles { 523 builder.MarkFileAsChangedOrRemoved(f) 524 } 525 526 var names []string 527 fileKeys := map[string][]fileKey{} 528 totalFiles := 0 529 530 for key := range repos { 531 n := key.FullPath() 532 fileKeys[n] = append(fileKeys[n], key) 533 names = append(names, n) 534 totalFiles++ 535 } 536 537 sort.Strings(names) 538 names = uniq(names) 539 540 log.Printf("attempting to index %d total files", totalFiles) 541 for idx, name := range names { 542 keys := fileKeys[name] 543 544 for _, key := range keys { 545 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions) 546 if err != nil { 547 return false, err 548 } 549 550 if err := builder.Add(doc); err != nil { 551 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 552 } 553 554 if idx%10_000 == 0 { 555 builder.CheckMemoryUsage() 556 } 557 } 558 } 559 return true, builder.Finish() 560} 561 562type repoPathRanks struct { 563 MeanRank float64 `json:"mean_reference_count"` 564 Paths map[string]float64 `json:"paths"` 565} 566 567// rank returns the rank for a given path. It uses these rules: 568// - If we have a concrete rank for this file, always use it 569// - If there's no rank, and it's a low priority file like a test, then use rank 0 570// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 571func (r repoPathRanks) rank(path string, content []byte) float64 { 572 if rank, ok := r.Paths[path]; ok { 573 return rank 574 } else if build.IsLowPriority(path, content) { 575 return 0.0 576 } else { 577 return r.MeanRank 578 } 579} 580 581func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 582 ignoreFile, err := tree.File(ignore.IgnoreFile) 583 if err == object.ErrFileNotFound { 584 return &ignore.Matcher{}, nil 585 } 586 if err != nil { 587 return nil, err 588 } 589 content, err := ignoreFile.Contents() 590 if err != nil { 591 return nil, err 592 } 593 return ignore.ParseIgnoreFile(strings.NewReader(content)) 594} 595 596// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 597// a build.Builder instance for generating a delta build. 598type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 599 600// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 601// a build.Builder instance for generating a normal build. 602type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) 603 604type gitIndexConfig struct { 605 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 606 // prepare the build.Builder instance for generating a delta build. 607 // 608 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 609 prepareDeltaBuild prepareDeltaBuildFunc 610 611 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 612 // prepare the build.Builder instance for generating a normal build. 613 // 614 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 615 prepareNormalBuild prepareNormalBuildFunc 616} 617 618func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 619 if options.Submodules { 620 return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 621 } 622 623 // discover what commits we indexed during our last build 624 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 625 if err != nil { 626 return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 627 } 628 629 if !ok { 630 return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository") 631 } 632 633 if options.DeltaShardNumberFallbackThreshold > 0 { 634 // HACK: For our interim compaction strategy, we force a full normal index once 635 // the number of shards on disk for this repository exceeds the provided threshold. 636 // 637 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 638 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 639 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 640 // while we create a better compaction strategy). 641 642 oldShards := options.BuildOptions.FindAllShards() 643 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 644 return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 645 } 646 } 647 648 // Check to see if the set of branch names is consistent with what we last indexed. 649 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 650 // normal one). 651 652 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 653 var existingBranchNames []string 654 for _, b := range existingRepository.Branches { 655 existingBranchNames = append(existingBranchNames, b.Name) 656 } 657 658 var optionsBranchNames []string 659 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 660 optionsBranchNames = append(optionsBranchNames, b.Name) 661 } 662 663 existingBranchList := strings.Join(existingBranchNames, ", ") 664 optionsBranchList := strings.Join(optionsBranchNames, ", ") 665 666 return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 667 } 668 669 // Check if the build options hash does not match the repository metadata's hash 670 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build 671 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 672 return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 673 } 674 675 // branch => (path, sha1) => repo. 676 repos = map[fileKey]BlobLocation{} 677 678 // fileKey => branches 679 branchMap = map[fileKey][]string{} 680 681 // branch name -> git worktree at most current commit 682 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) 683 684 for _, b := range options.Branches { 685 commit, err := getCommit(repository, options.BranchPrefix, b) 686 if err != nil { 687 return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 688 } 689 690 tree, err := commit.Tree() 691 if err != nil { 692 return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 693 } 694 695 branchToCurrentTree[b] = tree 696 } 697 698 rawURL := options.BuildOptions.RepositoryDescription.URL 699 u, err := url.Parse(rawURL) 700 if err != nil { 701 return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 702 } 703 704 // TODO: Support repository submodules for delta builds 705 // For this prototype, we are ignoring repository submodules, which means that we can use the same 706 // blob location for all files 707 hackSharedBlobLocation := BlobLocation{ 708 Repo: repository, 709 URL: u, 710 } 711 712 // loop over all branches, calculate the diff between our 713 // last indexed commit and the current commit, and add files mentioned in the diff 714 for _, branch := range existingRepository.Branches { 715 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 716 if err != nil { 717 return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 718 } 719 720 lastIndexedTree, err := lastIndexedCommit.Tree() 721 if err != nil { 722 return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 723 } 724 725 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 726 if err != nil { 727 return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 728 } 729 730 for i, c := range changes { 731 oldFile, newFile, err := c.Files() 732 if err != nil { 733 return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 734 } 735 736 if newFile != nil { 737 // note: newFile.Name could be a path that isn't relative to the repository root - using the 738 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 739 newFileRelativeRootPath := c.To.Name 740 741 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 742 if newFileRelativeRootPath == ignore.IgnoreFile { 743 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 744 } 745 746 // either file is added or renamed, so we need to add the new version to the build 747 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 748 repos[file] = hackSharedBlobLocation 749 branchMap[file] = append(branchMap[file], branch.Name) 750 } 751 752 if oldFile == nil { 753 // file added - nothing more to do 754 continue 755 } 756 757 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 758 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 759 oldFileRelativeRootPath := c.From.Name 760 761 if oldFileRelativeRootPath == ignore.IgnoreFile { 762 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 763 } 764 765 // The file is either modified or deleted. So, we need to add ALL versions 766 // of the old file (across all branches) to the build. 767 for b, currentTree := range branchToCurrentTree { 768 f, err := currentTree.File(oldFileRelativeRootPath) 769 if err != nil { 770 // the file doesn't exist in this branch 771 if errors.Is(err, object.ErrFileNotFound) { 772 continue 773 } 774 775 return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 776 } 777 778 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 779 repos[file] = hackSharedBlobLocation 780 branchMap[file] = append(branchMap[file], b) 781 } 782 783 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 784 } 785 } 786 787 // we need to de-duplicate the branch map before returning it - it's possible for the same 788 // branch to have been added multiple times if a file has been modified across multiple commits 789 790 for file, branches := range branchMap { 791 sort.Strings(branches) 792 branchMap[file] = uniq(branches) 793 } 794 795 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 796 // for the same reasoning as above 797 798 sort.Strings(changedOrDeletedPaths) 799 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 800 801 return repos, branchMap, nil, changedOrDeletedPaths, nil 802} 803 804func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) { 805 var repoCache *RepoCache 806 if options.Submodules { 807 repoCache = NewRepoCache(options.RepoCacheDir) 808 } 809 810 // branch => (path, sha1) => repo. 811 repos = map[fileKey]BlobLocation{} 812 813 // fileKey => branches 814 branchMap = map[fileKey][]string{} 815 816 // Branch => Repo => SHA1 817 branchVersions = map[string]map[string]plumbing.Hash{} 818 819 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 820 if err != nil { 821 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 822 } 823 824 for _, b := range branches { 825 commit, err := getCommit(repository, options.BranchPrefix, b) 826 if err != nil { 827 if options.AllowMissingBranch && err.Error() == "reference not found" { 828 continue 829 } 830 831 return nil, nil, nil, fmt.Errorf("getCommit: %w", err) 832 } 833 834 tree, err := commit.Tree() 835 if err != nil { 836 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err) 837 } 838 839 ig, err := newIgnoreMatcher(tree) 840 if err != nil { 841 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 842 } 843 844 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache) 845 if err != nil { 846 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err) 847 } 848 for k, v := range files { 849 if ig.Match(k.Path) { 850 continue 851 } 852 repos[k] = v 853 branchMap[k] = append(branchMap[k], b) 854 } 855 856 branchVersions[b] = subVersions 857 } 858 859 return repos, branchMap, branchVersions, nil 860} 861 862func createDocument(key fileKey, 863 repos map[fileKey]BlobLocation, 864 branchMap map[fileKey][]string, 865 ranks repoPathRanks, 866 opts build.Options, 867) (zoekt.Document, error) { 868 blob, err := repos[key].Repo.BlobObject(key.ID) 869 870 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 871 if errors.Is(err, plumbing.ErrObjectNotFound) { 872 return skippedLargeDoc(key, branchMap, opts), nil 873 } 874 875 if err != nil { 876 return zoekt.Document{}, err 877 } 878 879 keyFullPath := key.FullPath() 880 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 881 return skippedLargeDoc(key, branchMap, opts), nil 882 } 883 884 contents, err := blobContents(blob) 885 if err != nil { 886 return zoekt.Document{}, err 887 } 888 889 var pathRanks []float64 890 if len(ranks.Paths) > 0 { 891 // If the repository has ranking data, then store the file's rank. 892 pathRank := ranks.rank(keyFullPath, contents) 893 pathRanks = []float64{pathRank} 894 } 895 896 return zoekt.Document{ 897 SubRepositoryPath: key.SubRepoPath, 898 Name: keyFullPath, 899 Content: contents, 900 Branches: branchMap[key], 901 Ranks: pathRanks, 902 }, nil 903} 904 905func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document { 906 return zoekt.Document{ 907 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 908 Name: key.FullPath(), 909 Branches: branchMap[key], 910 SubRepositoryPath: key.SubRepoPath, 911 } 912} 913 914func blobContents(blob *object.Blob) ([]byte, error) { 915 r, err := blob.Reader() 916 if err != nil { 917 return nil, err 918 } 919 defer r.Close() 920 921 var buf bytes.Buffer 922 buf.Grow(int(blob.Size)) 923 _, err = buf.ReadFrom(r) 924 if err != nil { 925 return nil, err 926 } 927 return buf.Bytes(), nil 928} 929 930func uniq(ss []string) []string { 931 result := ss[:0] 932 var last string 933 for i, s := range ss { 934 if i == 0 || s != last { 935 result = append(result, s) 936 } 937 last = s 938 } 939 return result 940}