fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "cmp" 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 35 "github.com/go-git/go-billy/v5/osfs" 36 "github.com/go-git/go-git/v5/config" 37 "github.com/go-git/go-git/v5/plumbing" 38 "github.com/go-git/go-git/v5/plumbing/cache" 39 "github.com/go-git/go-git/v5/plumbing/object" 40 "github.com/go-git/go-git/v5/storage/filesystem" 41 "github.com/sourcegraph/zoekt" 42 "github.com/sourcegraph/zoekt/ignore" 43 "github.com/sourcegraph/zoekt/index" 44 45 git "github.com/go-git/go-git/v5" 46) 47 48// FindGitRepos finds directories holding git repositories below the 49// given directory. It will find both bare and the ".git" dirs in 50// non-bare repositories. It returns the full path including the dir 51// passed in. 52func FindGitRepos(dir string) ([]string, error) { 53 arg, err := filepath.Abs(dir) 54 if err != nil { 55 return nil, err 56 } 57 var dirs []string 58 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 59 // Best-effort, ignore filepath.Walk failing 60 if err != nil { 61 return nil 62 } 63 64 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 65 dirs = append(dirs, filepath.Join(name, ".git")) 66 return filepath.SkipDir 67 } 68 69 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 70 return nil 71 } 72 73 fi, err = os.Lstat(filepath.Join(name, "objects")) 74 if err != nil || !fi.IsDir() { 75 return nil 76 } 77 78 dirs = append(dirs, name) 79 return filepath.SkipDir 80 }); err != nil { 81 return nil, err 82 } 83 84 return dirs, nil 85} 86 87// setTemplates fills in URL templates for known git hosting 88// sites. 89func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 90 if u.Scheme == "ssh+git" { 91 u.Scheme = "https" 92 u.User = nil 93 } 94 95 // helper to generate u.JoinPath as a template 96 varVersion := ".Version" 97 varPath := ".Path" 98 urlJoinPath := func(elem ...string) string { 99 elem = append([]string{u.String()}, elem...) 100 var parts []string 101 for _, e := range elem { 102 if e == varVersion || e == varPath { 103 parts = append(parts, e) 104 } else { 105 parts = append(parts, strconv.Quote(e)) 106 } 107 } 108 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " ")) 109 } 110 111 repo.URL = u.String() 112 switch typ { 113 case "gitiles": 114 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 115 repo.CommitURLTemplate = urlJoinPath("+", varVersion) 116 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath) 117 repo.LineFragmentTemplate = "#{{.LineNumber}}" 118 case "github": 119 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 120 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 121 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath) 122 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 123 case "cgit": 124 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 125 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}" 126 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}" 127 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 128 case "gitweb": 129 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 130 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 131 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 132 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 133 case "source.bazel.build": 134 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 136 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}" 137 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}" 138 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 139 case "bitbucket-server": 140 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 142 repo.CommitURLTemplate = urlJoinPath("commits", varVersion) 143 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}" 144 repo.LineFragmentTemplate = "#{{.LineNumber}}" 145 case "gitlab": 146 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 147 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 148 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion) 149 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath) 150 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 151 case "gitea": 152 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 153 // NOTE The `display=source` query parameter is required to disable file rendering. 154 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 155 // a line without `display=source`. This is supported since gitea 1.17.0. 156 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 157 // but the query parameters are obmitted. 158 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source" 159 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 160 default: 161 return fmt.Errorf("URL scheme type %q unknown", typ) 162 } 163 return nil 164} 165 166// getCommit returns a tree object for the given reference. 167func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 168 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 169 // ref might be a branch name (e.g. "master") add branch prefix and try again. 170 if err != nil { 171 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 172 } 173 if err != nil { 174 return nil, err 175 } 176 177 commitObj, err := repo.CommitObject(*sha1) 178 if err != nil { 179 return nil, err 180 } 181 return commitObj, nil 182} 183 184func configLookupRemoteURL(cfg *config.Config, key string) string { 185 rc := cfg.Remotes[key] 186 if rc == nil || len(rc.URLs) == 0 { 187 return "" 188 } 189 return rc.URLs[0] 190} 191 192var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 193 194func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 195 repo, err := git.PlainOpen(repoDir) 196 if err != nil { 197 return err 198 } 199 200 cfg, err := repo.Config() 201 if err != nil { 202 return err 203 } 204 205 sec := cfg.Raw.Section("zoekt") 206 207 webURLStr := sec.Options.Get("web-url") 208 webURLType := sec.Options.Get("web-url-type") 209 210 if webURLType != "" && webURLStr != "" { 211 webURL, err := url.Parse(webURLStr) 212 if err != nil { 213 return err 214 } 215 if err := setTemplates(desc, webURL, webURLType); err != nil { 216 return err 217 } 218 } else if webURLStr != "" { 219 desc.URL = webURLStr 220 } 221 222 name := sec.Options.Get("name") 223 if name != "" { 224 desc.Name = name 225 } else { 226 remoteURL := configLookupRemoteURL(cfg, "origin") 227 if remoteURL == "" { 228 return nil 229 } 230 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 231 user := sm[1] 232 host := sm[2] 233 path := sm[3] 234 235 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 236 } 237 238 u, err := url.Parse(remoteURL) 239 if err != nil { 240 return err 241 } 242 if err := SetTemplatesFromOrigin(desc, u); err != nil { 243 return err 244 } 245 } 246 247 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 248 desc.ID = uint32(id) 249 250 if desc.RawConfig == nil { 251 desc.RawConfig = map[string]string{} 252 } 253 for _, o := range sec.Options { 254 desc.RawConfig[o.Key] = o.Value 255 } 256 257 // Ranking info. 258 259 // Github: 260 traction := 0 261 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 262 f, err := strconv.Atoi(sec.Options.Get(s)) 263 if err == nil { 264 traction += f 265 } 266 } 267 268 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 269 // Pretend everything on googlesource.com has 1000 270 // github stars. 271 traction = 1000 272 } 273 274 if traction > 0 { 275 l := math.Log(float64(traction)) 276 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 277 } 278 279 return nil 280} 281 282// SetTemplatesFromOrigin fills in templates based on the origin URL. 283func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 284 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 285 286 if strings.HasSuffix(u.Host, ".googlesource.com") { 287 return setTemplates(desc, u, "gitiles") 288 } else if u.Host == "github.com" { 289 u.Path = strings.TrimSuffix(u.Path, ".git") 290 return setTemplates(desc, u, "github") 291 } else { 292 return fmt.Errorf("unknown git hosting site %q", u) 293 } 294} 295 296// The Options structs controls details of the indexing process. 297type Options struct { 298 // The repository to be indexed. 299 RepoDir string 300 301 // If set, follow submodule links. This requires RepoCacheDir to be set. 302 Submodules bool 303 304 // If set, skip indexing if the existing index shard is newer 305 // than the refs in the repository. 306 Incremental bool 307 308 // Don't error out if some branch is missing 309 AllowMissingBranch bool 310 311 // Specifies the root of a Repository cache. Needed for submodule indexing. 312 RepoCacheDir string 313 314 // Indexing options. 315 BuildOptions index.Options 316 317 // Prefix of the branch to index, e.g. `remotes/origin`. 318 BranchPrefix string 319 320 // List of branch names to index, e.g. []string{"HEAD", "stable"} 321 Branches []string 322 323 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 324 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 325 // then a normal build will be performed instead. 326 // 327 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 328 // a delta build will always be performed regardless of the number of preexisting shards. 329 DeltaShardNumberFallbackThreshold uint64 330} 331 332func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 333 var result []string 334 for _, b := range bs { 335 // Sourcegraph: We disable resolving refs. We want to return the exact ref 336 // requested so we can match it up. 337 if b == "HEAD" && false { 338 ref, err := repo.Head() 339 if err != nil { 340 return nil, err 341 } 342 343 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 344 continue 345 } 346 347 if strings.Contains(b, "*") { 348 iter, err := repo.Branches() 349 if err != nil { 350 return nil, err 351 } 352 353 defer iter.Close() 354 for { 355 ref, err := iter.Next() 356 if err == io.EOF { 357 break 358 } 359 if err != nil { 360 return nil, err 361 } 362 363 name := ref.Name().Short() 364 if matched, err := filepath.Match(b, name); err != nil { 365 return nil, err 366 } else if !matched { 367 continue 368 } 369 370 result = append(result, strings.TrimPrefix(name, prefix)) 371 } 372 continue 373 } 374 375 result = append(result, b) 376 } 377 378 return result, nil 379} 380 381// IndexGitRepo indexes the git repository as specified by the options. 382// The returned bool indicates whether the index was updated as a result. This 383// can be informative if doing incremental indexing. 384func IndexGitRepo(opts Options) (bool, error) { 385 return indexGitRepo(opts, gitIndexConfig{}) 386} 387 388// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 389// The returned bool indicates whether the index was updated as a result. This 390// can be informative if doing incremental indexing. 391func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 392 prepareDeltaBuild := prepareDeltaBuild 393 if config.prepareDeltaBuild != nil { 394 prepareDeltaBuild = config.prepareDeltaBuild 395 } 396 397 prepareNormalBuild := prepareNormalBuild 398 if config.prepareNormalBuild != nil { 399 prepareNormalBuild = config.prepareNormalBuild 400 } 401 402 // Set max thresholds, since we use them in this function. 403 opts.BuildOptions.SetDefaults() 404 if opts.RepoDir == "" { 405 return false, fmt.Errorf("gitindex: must set RepoDir") 406 } 407 408 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 409 410 var repo *git.Repository 411 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false") 412 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil { 413 repo, err = git.PlainOpen(opts.RepoDir) 414 if err != nil { 415 return false, fmt.Errorf("git.PlainOpen: %w", err) 416 } 417 } else { 418 var repoCloser io.Closer 419 repo, repoCloser, err = openRepo(opts.RepoDir) 420 if err != nil { 421 return false, fmt.Errorf("openRepo: %w", err) 422 } 423 defer repoCloser.Close() 424 } 425 426 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 427 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 428 } 429 430 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 431 if err != nil { 432 return false, fmt.Errorf("expandBranches: %w", err) 433 } 434 for _, b := range branches { 435 commit, err := getCommit(repo, opts.BranchPrefix, b) 436 if err != nil { 437 if opts.AllowMissingBranch && err.Error() == "reference not found" { 438 continue 439 } 440 441 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 442 } 443 444 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 445 Name: b, 446 Version: commit.Hash.String(), 447 }) 448 449 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 450 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 451 } 452 } 453 454 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 455 return false, nil 456 } 457 458 // branch => (path, sha1) => repo. 459 var repos map[fileKey]BlobLocation 460 461 // Branch => Repo => SHA1 462 var branchVersions map[string]map[string]plumbing.Hash 463 464 // set of file paths that have been changed or deleted since 465 // the last indexed commit 466 // 467 // These only have an effect on delta builds 468 var changedOrRemovedFiles []string 469 470 if opts.BuildOptions.IsDelta { 471 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 472 if err != nil { 473 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 474 opts.BuildOptions.IsDelta = false 475 } 476 } 477 478 if !opts.BuildOptions.IsDelta { 479 repos, branchVersions, err = prepareNormalBuild(opts, repo) 480 if err != nil { 481 return false, fmt.Errorf("preparing normal build: %w", err) 482 } 483 } 484 485 reposByPath := map[string]BlobLocation{} 486 for key, info := range repos { 487 reposByPath[key.SubRepoPath] = info 488 } 489 490 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 491 for path, info := range reposByPath { 492 tpl := opts.BuildOptions.RepositoryDescription 493 if path != "" { 494 tpl = zoekt.Repository{URL: info.URL.String()} 495 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { 496 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) 497 } 498 } 499 opts.BuildOptions.SubRepositories[path] = &tpl 500 } 501 502 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 503 for path, repo := range opts.BuildOptions.SubRepositories { 504 id := branchVersions[br.Name][path] 505 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 506 Name: br.Name, 507 Version: id.String(), 508 }) 509 } 510 } 511 512 builder, err := index.NewBuilder(opts.BuildOptions) 513 if err != nil { 514 return false, fmt.Errorf("build.NewBuilder: %w", err) 515 } 516 517 // Preparing the build can consume substantial memory, so check usage before starting to index. 518 builder.CheckMemoryUsage() 519 520 // we don't need to check error, since we either already have an error, or 521 // we returning the first call to builder.Finish. 522 defer builder.Finish() // nolint:errcheck 523 524 for _, f := range changedOrRemovedFiles { 525 builder.MarkFileAsChangedOrRemoved(f) 526 } 527 528 var names []string 529 fileKeys := map[string][]fileKey{} 530 totalFiles := 0 531 532 for key := range repos { 533 n := key.FullPath() 534 fileKeys[n] = append(fileKeys[n], key) 535 names = append(names, n) 536 totalFiles++ 537 } 538 539 sort.Strings(names) 540 names = uniq(names) 541 542 log.Printf("attempting to index %d total files", totalFiles) 543 for idx, name := range names { 544 keys := fileKeys[name] 545 546 for _, key := range keys { 547 doc, err := createDocument(key, repos, opts.BuildOptions) 548 if err != nil { 549 return false, err 550 } 551 552 if err := builder.Add(doc); err != nil { 553 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 554 } 555 556 if idx%10_000 == 0 { 557 builder.CheckMemoryUsage() 558 } 559 } 560 } 561 return true, builder.Finish() 562} 563 564// openRepo opens a git repository in a way that's optimized for indexing. 565// 566// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options. 567func openRepo(repoDir string) (*git.Repository, io.Closer, error) { 568 fs := osfs.New(repoDir) 569 570 // Check if the root directory exists. 571 if _, err := fs.Stat(""); err != nil { 572 if os.IsNotExist(err) { 573 return nil, nil, git.ErrRepositoryNotExists 574 } 575 return nil, nil, err 576 } 577 578 // If there's a .git directory, use that as the new root. 579 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() { 580 if fs, err = fs.Chroot(git.GitDirName); err != nil { 581 return nil, nil, fmt.Errorf("fs.Chroot: %w", err) 582 } 583 } 584 585 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ 586 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access 587 KeepDescriptors: true, 588 }) 589 590 // Because we're keeping descriptors open, we need to close the storage object when we're done. 591 repo, err := git.Open(s, fs) 592 return repo, s, err 593} 594 595func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 596 ignoreFile, err := tree.File(ignore.IgnoreFile) 597 if err == object.ErrFileNotFound { 598 return &ignore.Matcher{}, nil 599 } 600 if err != nil { 601 return nil, err 602 } 603 content, err := ignoreFile.Contents() 604 if err != nil { 605 return nil, err 606 } 607 return ignore.ParseIgnoreFile(strings.NewReader(content)) 608} 609 610// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 611// a build.Builder instance for generating a delta build. 612type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 613 614// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 615// a build.Builder instance for generating a normal build. 616type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) 617 618type gitIndexConfig struct { 619 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 620 // prepare the build.Builder instance for generating a delta build. 621 // 622 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 623 prepareDeltaBuild prepareDeltaBuildFunc 624 625 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 626 // prepare the build.Builder instance for generating a normal build. 627 // 628 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 629 prepareNormalBuild prepareNormalBuildFunc 630} 631 632func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 633 if options.Submodules { 634 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 635 } 636 637 // discover what commits we indexed during our last build 638 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 639 if err != nil { 640 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 641 } 642 643 if !ok { 644 return nil, nil, nil, fmt.Errorf("no existing shards found for repository") 645 } 646 647 if options.DeltaShardNumberFallbackThreshold > 0 { 648 // HACK: For our interim compaction strategy, we force a full normal index once 649 // the number of shards on disk for this repository exceeds the provided threshold. 650 // 651 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 652 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 653 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 654 // while we create a better compaction strategy). 655 656 oldShards := options.BuildOptions.FindAllShards() 657 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 658 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 659 } 660 } 661 662 // Check to see if the set of branch names is consistent with what we last indexed. 663 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 664 // normal one). 665 666 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 667 var existingBranchNames []string 668 for _, b := range existingRepository.Branches { 669 existingBranchNames = append(existingBranchNames, b.Name) 670 } 671 672 var optionsBranchNames []string 673 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 674 optionsBranchNames = append(optionsBranchNames, b.Name) 675 } 676 677 existingBranchList := strings.Join(existingBranchNames, ", ") 678 optionsBranchList := strings.Join(optionsBranchNames, ", ") 679 680 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 681 } 682 683 // Check if the build options hash does not match the repository metadata's hash 684 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build 685 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 686 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 687 } 688 689 // branch => (path, sha1) => repo. 690 repos = map[fileKey]BlobLocation{} 691 692 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 693 if err != nil { 694 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 695 } 696 697 // branch name -> git worktree at most current commit 698 branchToCurrentTree := make(map[string]*object.Tree, len(branches)) 699 700 for _, b := range branches { 701 commit, err := getCommit(repository, options.BranchPrefix, b) 702 if err != nil { 703 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 704 } 705 706 tree, err := commit.Tree() 707 if err != nil { 708 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 709 } 710 711 branchToCurrentTree[b] = tree 712 } 713 714 rawURL := options.BuildOptions.RepositoryDescription.URL 715 u, err := url.Parse(rawURL) 716 if err != nil { 717 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 718 } 719 720 // TODO: Support repository submodules for delta builds 721 722 // loop over all branches, calculate the diff between our 723 // last indexed commit and the current commit, and add files mentioned in the diff 724 for _, branch := range existingRepository.Branches { 725 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 726 if err != nil { 727 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 728 } 729 730 lastIndexedTree, err := lastIndexedCommit.Tree() 731 if err != nil { 732 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 733 } 734 735 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 736 if err != nil { 737 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 738 } 739 740 for i, c := range changes { 741 oldFile, newFile, err := c.Files() 742 if err != nil { 743 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 744 } 745 746 if newFile != nil { 747 // note: newFile.Name could be a path that isn't relative to the repository root - using the 748 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 749 newFileRelativeRootPath := c.To.Name 750 751 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 752 if newFileRelativeRootPath == ignore.IgnoreFile { 753 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 754 } 755 756 // either file is added or renamed, so we need to add the new version to the build 757 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 758 if existing, ok := repos[file]; ok { 759 existing.Branches = append(existing.Branches, branch.Name) 760 repos[file] = existing 761 } else { 762 repos[file] = BlobLocation{ 763 GitRepo: repository, 764 URL: u, 765 Branches: []string{branch.Name}, 766 } 767 } 768 } 769 770 if oldFile == nil { 771 // file added - nothing more to do 772 continue 773 } 774 775 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 776 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 777 oldFileRelativeRootPath := c.From.Name 778 779 if oldFileRelativeRootPath == ignore.IgnoreFile { 780 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 781 } 782 783 // The file is either modified or deleted. So, we need to add ALL versions 784 // of the old file (across all branches) to the build. 785 for b, currentTree := range branchToCurrentTree { 786 f, err := currentTree.File(oldFileRelativeRootPath) 787 if err != nil { 788 // the file doesn't exist in this branch 789 if errors.Is(err, object.ErrFileNotFound) { 790 continue 791 } 792 793 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 794 } 795 796 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 797 if existing, ok := repos[file]; ok { 798 existing.Branches = append(existing.Branches, b) 799 repos[file] = existing 800 } else { 801 repos[file] = BlobLocation{ 802 GitRepo: repository, 803 URL: u, 804 Branches: []string{b}, 805 } 806 } 807 } 808 809 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 810 } 811 } 812 813 // we need to de-duplicate the branch map before returning it - it's possible for the same 814 // branch to have been added multiple times if a file has been modified across multiple commits 815 for _, info := range repos { 816 sort.Strings(info.Branches) 817 info.Branches = uniq(info.Branches) 818 } 819 820 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 821 // for the same reasoning as above 822 sort.Strings(changedOrDeletedPaths) 823 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 824 825 return repos, nil, changedOrDeletedPaths, nil 826} 827 828func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 829 var repoCache *RepoCache 830 if options.Submodules { 831 repoCache = NewRepoCache(options.RepoCacheDir) 832 } 833 834 // Branch => Repo => SHA1 835 branchVersions = map[string]map[string]plumbing.Hash{} 836 837 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 838 if err != nil { 839 return nil, nil, fmt.Errorf("expandBranches: %w", err) 840 } 841 842 rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache) 843 for _, b := range branches { 844 commit, err := getCommit(repository, options.BranchPrefix, b) 845 if err != nil { 846 if options.AllowMissingBranch && err.Error() == "reference not found" { 847 continue 848 } 849 850 return nil, nil, fmt.Errorf("getCommit: %w", err) 851 } 852 853 tree, err := commit.Tree() 854 if err != nil { 855 return nil, nil, fmt.Errorf("commit.Tree: %w", err) 856 } 857 858 ig, err := newIgnoreMatcher(tree) 859 if err != nil { 860 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 861 } 862 863 subVersions, err := rw.CollectFiles(tree, b, ig) 864 if err != nil { 865 return nil, nil, fmt.Errorf("CollectFiles: %w", err) 866 } 867 868 branchVersions[b] = subVersions 869 } 870 871 return rw.Files, branchVersions, nil 872} 873 874func createDocument(key fileKey, 875 repos map[fileKey]BlobLocation, 876 opts index.Options, 877) (index.Document, error) { 878 repo := repos[key] 879 blob, err := repo.GitRepo.BlobObject(key.ID) 880 branches := repos[key].Branches 881 882 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 883 if errors.Is(err, plumbing.ErrObjectNotFound) { 884 return skippedLargeDoc(key, branches), nil 885 } 886 887 if err != nil { 888 return index.Document{}, err 889 } 890 891 keyFullPath := key.FullPath() 892 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 893 return skippedLargeDoc(key, branches), nil 894 } 895 896 contents, err := blobContents(blob) 897 if err != nil { 898 return index.Document{}, err 899 } 900 901 return index.Document{ 902 SubRepositoryPath: key.SubRepoPath, 903 Name: keyFullPath, 904 Content: contents, 905 Branches: branches, 906 }, nil 907} 908 909func skippedLargeDoc(key fileKey, branches []string) index.Document { 910 return index.Document{ 911 SkipReason: index.SkipReasonTooLarge, 912 Name: key.FullPath(), 913 Branches: branches, 914 SubRepositoryPath: key.SubRepoPath, 915 } 916} 917 918func blobContents(blob *object.Blob) ([]byte, error) { 919 r, err := blob.Reader() 920 if err != nil { 921 return nil, err 922 } 923 defer r.Close() 924 925 var buf bytes.Buffer 926 buf.Grow(int(blob.Size)) 927 _, err = buf.ReadFrom(r) 928 if err != nil { 929 return nil, err 930 } 931 return buf.Bytes(), nil 932} 933 934func uniq(ss []string) []string { 935 result := ss[:0] 936 var last string 937 for i, s := range ss { 938 if i == 0 || s != last { 939 result = append(result, s) 940 } 941 last = s 942 } 943 return result 944}