fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "context" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 "time" 35 36 "github.com/sourcegraph/zoekt" 37 "github.com/sourcegraph/zoekt/build" 38 "github.com/sourcegraph/zoekt/ignore" 39 40 "github.com/go-git/go-git/v5/config" 41 "github.com/go-git/go-git/v5/plumbing" 42 "github.com/go-git/go-git/v5/plumbing/object" 43 44 git "github.com/go-git/go-git/v5" 45) 46 47// RepoModTime returns the time of last fetch of a git repository. 
// The result is the newest modification time found among the non-directory
// entries below <dir>/refs and the "info/refs" and "packed-refs" files; it is
// the zero time.Time when none of them exist. Entries that cannot be
// inspected are skipped rather than reported.
func RepoModTime(dir string) (time.Time, error) {
	var last time.Time

	// bump advances last when fi describes a non-directory with a newer mtime.
	bump := func(fi os.FileInfo) {
		if !fi.IsDir() && last.Before(fi.ModTime()) {
			last = fi.ModTime()
		}
	}

	refDir := filepath.Join(dir, "refs")
	if _, err := os.Lstat(refDir); err == nil {
		walkErr := filepath.Walk(refDir, func(_ string, fi os.FileInfo, err error) error {
			// filepath.Walk hands us a nil FileInfo when it could not lstat
			// an entry (for example a ref that vanished mid-walk, or one
			// inside a directory we may not search). Dereferencing it would
			// panic, so such entries are skipped: this is a best-effort scan.
			if err != nil || fi == nil {
				return nil
			}
			bump(fi)
			return nil
		})
		if walkErr != nil {
			return last, walkErr
		}
	}

	// git gc compresses refs into the following files:
	for _, fn := range []string{"info/refs", "packed-refs"} {
		if fi, err := os.Lstat(filepath.Join(dir, fn)); err == nil {
			bump(fi)
		}
	}

	return last, nil
}

// FindGitRepos finds directories holding git repositories below the
// given directory. It will find both bare and the ".git" dirs in
// non-bare repositories. It returns the full path including the dir
// passed in.
//
// A directory counts as a non-bare repository when it contains a ".git"
// directory, and as a bare repository when its name ends in ".git" and it
// contains an "objects" directory. The walk does not descend into a
// repository once it has been recorded.
func FindGitRepos(dir string) ([]string, error) {
	root, err := filepath.Abs(dir)
	if err != nil {
		return nil, err
	}

	var dirs []string
	walkErr := filepath.Walk(root, func(name string, fi os.FileInfo, err error) error {
		// Best-effort, ignore filepath.Walk failing
		if err != nil {
			return nil
		}

		// Non-bare repository: record its ".git" directory.
		dotGit := filepath.Join(name, ".git")
		if st, err := os.Lstat(dotGit); err == nil && st.IsDir() {
			dirs = append(dirs, dotGit)
			return filepath.SkipDir
		}

		// Bare repository candidates are directories named "*.git".
		if !strings.HasSuffix(name, ".git") || !fi.IsDir() {
			return nil
		}

		// Only accept the candidate if it has an object store.
		objects, err := os.Lstat(filepath.Join(name, "objects"))
		if err != nil || !objects.IsDir() {
			return nil
		}

		dirs = append(dirs, name)
		return filepath.SkipDir
	})
	if walkErr != nil {
		return nil, walkErr
	}

	return dirs, nil
}

// setTemplates fills in URL templates for known git hosting
// sites.
114func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 115 if u.Scheme == "ssh+git" { 116 u.Scheme = "https" 117 u.User = nil 118 } 119 120 repo.URL = u.String() 121 switch typ { 122 case "gitiles": 123 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 124 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 125 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" 126 repo.LineFragmentTemplate = "#{{.LineNumber}}" 127 case "github": 128 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 129 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 130 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" 131 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 132 case "cgit": 133 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 134 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}" 135 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}" 136 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 137 case "gitweb": 138 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 139 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 140 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 141 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 142 case "source.bazel.build": 143 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 144 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 145 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 146 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}" 147 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 148 case "bitbucket-server": 149 // 
https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 150 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 151 repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}" 152 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}" 153 repo.LineFragmentTemplate = "#{{.LineNumber}}" 154 case "gitlab": 155 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 156 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 157 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}" 158 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}" 159 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 160 case "gitea": 161 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 162 // NOTE The `display=source` query parameter is required to disable file rendering. 163 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 164 // a line without `display=source`. This is supported since gitea 1.17.0. 165 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 166 // but the query parameters are obmitted. 167 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source" 168 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 169 default: 170 return fmt.Errorf("URL scheme type %q unknown", typ) 171 } 172 return nil 173} 174 175// getCommit returns a tree object for the given reference. 176func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 177 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 178 // ref might be a branch name (e.g. "master") add branch prefix and try again. 
179 if err != nil { 180 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 181 } 182 if err != nil { 183 return nil, err 184 } 185 186 commitObj, err := repo.CommitObject(*sha1) 187 if err != nil { 188 return nil, err 189 } 190 return commitObj, nil 191} 192 193func configLookupRemoteURL(cfg *config.Config, key string) string { 194 rc := cfg.Remotes[key] 195 if rc == nil || len(rc.URLs) == 0 { 196 return "" 197 } 198 return rc.URLs[0] 199} 200 201var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 202 203func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 204 repo, err := git.PlainOpen(repoDir) 205 if err != nil { 206 return err 207 } 208 209 cfg, err := repo.Config() 210 if err != nil { 211 return err 212 } 213 214 sec := cfg.Raw.Section("zoekt") 215 216 webURLStr := sec.Options.Get("web-url") 217 webURLType := sec.Options.Get("web-url-type") 218 219 if webURLType != "" && webURLStr != "" { 220 webURL, err := url.Parse(webURLStr) 221 if err != nil { 222 return err 223 } 224 if err := setTemplates(desc, webURL, webURLType); err != nil { 225 return err 226 } 227 } else if webURLStr != "" { 228 desc.URL = webURLStr 229 } 230 231 name := sec.Options.Get("name") 232 if name != "" { 233 desc.Name = name 234 } else { 235 remoteURL := configLookupRemoteURL(cfg, "origin") 236 if remoteURL == "" { 237 return nil 238 } 239 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 240 user := sm[1] 241 host := sm[2] 242 path := sm[3] 243 244 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 245 } 246 247 u, err := url.Parse(remoteURL) 248 if err != nil { 249 return err 250 } 251 if err := SetTemplatesFromOrigin(desc, u); err != nil { 252 return err 253 } 254 } 255 256 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 257 desc.ID = uint32(id) 258 259 if desc.RawConfig == nil { 260 desc.RawConfig = map[string]string{} 261 } 262 for _, o := range sec.Options { 263 
desc.RawConfig[o.Key] = o.Value 264 } 265 266 // Ranking info. 267 268 // Github: 269 traction := 0 270 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 271 f, err := strconv.Atoi(sec.Options.Get(s)) 272 if err == nil { 273 traction += f 274 } 275 } 276 277 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 278 // Pretend everything on googlesource.com has 1000 279 // github stars. 280 traction = 1000 281 } 282 283 if traction > 0 { 284 l := math.Log(float64(traction)) 285 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 286 } 287 288 return nil 289} 290 291// SetTemplatesFromOrigin fills in templates based on the origin URL. 292func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 293 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 294 295 if strings.HasSuffix(u.Host, ".googlesource.com") { 296 return setTemplates(desc, u, "gitiles") 297 } else if u.Host == "github.com" { 298 u.Path = strings.TrimSuffix(u.Path, ".git") 299 return setTemplates(desc, u, "github") 300 } else { 301 return fmt.Errorf("unknown git hosting site %q", u) 302 } 303} 304 305// The Options structs controls details of the indexing process. 306type Options struct { 307 // The repository to be indexed. 308 RepoDir string 309 310 // If set, follow submodule links. This requires RepoCacheDir to be set. 311 Submodules bool 312 313 // If set, skip indexing if the existing index shard is newer 314 // than the refs in the repository. 315 Incremental bool 316 317 // Don't error out if some branch is missing 318 AllowMissingBranch bool 319 320 // Specifies the root of a Repository cache. Needed for submodule indexing. 321 RepoCacheDir string 322 323 // Indexing options. 324 BuildOptions build.Options 325 326 // Prefix of the branch to index, e.g. `remotes/origin`. 327 BranchPrefix string 328 329 // List of branch names to index, e.g. 
[]string{"HEAD", "stable"} 330 Branches []string 331 332 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 333 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 334 // then a normal build will be performed instead. 335 // 336 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 337 // a delta build will always be performed regardless of the number of preexisting shards. 338 DeltaShardNumberFallbackThreshold uint64 339} 340 341func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 342 var result []string 343 for _, b := range bs { 344 // Sourcegraph: We disable resolving refs. We want to return the exact ref 345 // requested so we can match it up. 346 if b == "HEAD" && false { 347 ref, err := repo.Head() 348 if err != nil { 349 return nil, err 350 } 351 352 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 353 continue 354 } 355 356 if strings.Contains(b, "*") { 357 iter, err := repo.Branches() 358 if err != nil { 359 return nil, err 360 } 361 362 defer iter.Close() 363 for { 364 ref, err := iter.Next() 365 if err == io.EOF { 366 break 367 } 368 if err != nil { 369 return nil, err 370 } 371 372 name := ref.Name().Short() 373 if matched, err := filepath.Match(b, name); err != nil { 374 return nil, err 375 } else if !matched { 376 continue 377 } 378 379 result = append(result, strings.TrimPrefix(name, prefix)) 380 } 381 continue 382 } 383 384 result = append(result, b) 385 } 386 387 return result, nil 388} 389 390// IndexGitRepo indexes the git repository as specified by the options. 391// The returned bool indicates whether the index was updated as a result. This 392// can be informative if doing incremental indexing. 
393func IndexGitRepo(opts Options) (bool, error) { 394 return indexGitRepo(opts, gitIndexConfig{}) 395} 396 397// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 398// The returned bool indicates whether the index was updated as a result. This 399// can be informative if doing incremental indexing. 400func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 401 prepareDeltaBuild := prepareDeltaBuild 402 if config.prepareDeltaBuild != nil { 403 prepareDeltaBuild = config.prepareDeltaBuild 404 } 405 406 prepareNormalBuild := prepareNormalBuild 407 if config.prepareNormalBuild != nil { 408 prepareNormalBuild = config.prepareNormalBuild 409 } 410 411 // Set max thresholds, since we use them in this function. 412 opts.BuildOptions.SetDefaults() 413 if opts.RepoDir == "" { 414 return false, fmt.Errorf("gitindex: must set RepoDir") 415 } 416 417 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 418 repo, err := git.PlainOpen(opts.RepoDir) 419 if err != nil { 420 return false, fmt.Errorf("git.PlainOpen: %w", err) 421 } 422 423 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 424 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 425 } 426 427 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 428 if err != nil { 429 return false, fmt.Errorf("expandBranches: %w", err) 430 } 431 for _, b := range branches { 432 commit, err := getCommit(repo, opts.BranchPrefix, b) 433 if err != nil { 434 if opts.AllowMissingBranch && err.Error() == "reference not found" { 435 continue 436 } 437 438 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 439 } 440 441 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 442 Name: b, 443 Version: commit.Hash.String(), 444 }) 445 446 if when := commit.Committer.When; 
when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 447 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 448 } 449 } 450 451 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 452 return false, nil 453 } 454 455 // branch => (path, sha1) => repo. 456 var repos map[fileKey]BlobLocation 457 458 // fileKey => branches 459 var branchMap map[fileKey][]string 460 461 // Branch => Repo => SHA1 462 var branchVersions map[string]map[string]plumbing.Hash 463 464 // set of file paths that have been changed or deleted since 465 // the last indexed commit 466 // 467 // These only have an effect on delta builds 468 var changedOrRemovedFiles []string 469 470 if opts.BuildOptions.IsDelta { 471 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 472 if err != nil { 473 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 474 opts.BuildOptions.IsDelta = false 475 } 476 } 477 478 if !opts.BuildOptions.IsDelta { 479 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo) 480 if err != nil { 481 return false, fmt.Errorf("preparing normal build: %w", err) 482 } 483 } 484 485 reposByPath := map[string]BlobLocation{} 486 for key, location := range repos { 487 reposByPath[key.SubRepoPath] = location 488 } 489 490 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 491 for path, location := range reposByPath { 492 tpl := opts.BuildOptions.RepositoryDescription 493 if path != "" { 494 tpl = zoekt.Repository{URL: location.URL.String()} 495 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil { 496 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err) 497 } 498 } 499 opts.BuildOptions.SubRepositories[path] = &tpl 500 } 501 502 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 503 for path, repo := range 
opts.BuildOptions.SubRepositories { 504 id := branchVersions[br.Name][path] 505 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 506 Name: br.Name, 507 Version: id.String(), 508 }) 509 } 510 } 511 512 builder, err := build.NewBuilder(opts.BuildOptions) 513 if err != nil { 514 return false, fmt.Errorf("build.NewBuilder: %w", err) 515 } 516 517 var ranks repoPathRanks 518 var meanRank float64 519 if opts.BuildOptions.DocumentRanksPath != "" { 520 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 521 if err != nil { 522 return false, err 523 } 524 525 err = json.Unmarshal(data, &ranks) 526 if err != nil { 527 return false, err 528 } 529 530 // Compute the mean rank for this repository. Note: we overwrite the rank 531 // mean that's stored in the document ranks file, since that currently 532 // represents a global mean rank across repos, which is not what we want. 533 numRanks := len(ranks.Paths) 534 if numRanks > 0 { 535 for _, rank := range ranks.Paths { 536 meanRank += rank 537 } 538 ranks.MeanRank = meanRank / float64(numRanks) 539 } 540 } 541 542 // we don't need to check error, since we either already have an error, or 543 // we returning the first call to builder.Finish. 
544 defer builder.Finish() // nolint:errcheck 545 546 for _, f := range changedOrRemovedFiles { 547 builder.MarkFileAsChangedOrRemoved(f) 548 } 549 550 var names []string 551 fileKeys := map[string][]fileKey{} 552 totalFiles := 0 553 554 for key := range repos { 555 n := key.FullPath() 556 fileKeys[n] = append(fileKeys[n], key) 557 names = append(names, n) 558 totalFiles++ 559 } 560 561 sort.Strings(names) 562 names = uniq(names) 563 564 log.Printf("attempting to index %d total files", totalFiles) 565 for _, name := range names { 566 keys := fileKeys[name] 567 568 for _, key := range keys { 569 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions) 570 if err != nil { 571 return false, err 572 } 573 574 if err := builder.Add(doc); err != nil { 575 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 576 } 577 } 578 } 579 580 return true, builder.Finish() 581} 582 583type repoPathRanks struct { 584 MeanRank float64 `json:"mean_reference_count"` 585 Paths map[string]float64 `json:"paths"` 586} 587 588// rank returns the rank for a given path. 
It uses these rules: 589// - If we have a concrete rank for this file, always use it 590// - If there's no rank, and it's a low priority file like a test, then use rank 0 591// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 592func (r repoPathRanks) rank(path string) float64 { 593 if rank, ok := r.Paths[path]; ok { 594 return rank 595 } else if build.IsLowPriority(path) { 596 return 0.0 597 } else { 598 return r.MeanRank 599 } 600} 601 602func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 603 ignoreFile, err := tree.File(ignore.IgnoreFile) 604 if err == object.ErrFileNotFound { 605 return &ignore.Matcher{}, nil 606 } 607 if err != nil { 608 return nil, err 609 } 610 content, err := ignoreFile.Contents() 611 if err != nil { 612 return nil, err 613 } 614 return ignore.ParseIgnoreFile(strings.NewReader(content)) 615} 616 617// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 618// a build.Builder instance for generating a delta build. 619type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 620 621// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 622// a build.Builder instance for generating a normal build. 623type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) 624 625type gitIndexConfig struct { 626 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 627 // prepare the build.Builder instance for generating a delta build. 628 // 629 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 
630 prepareDeltaBuild prepareDeltaBuildFunc 631 632 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 633 // prepare the build.Builder instance for generating a normal build. 634 // 635 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 636 prepareNormalBuild prepareNormalBuildFunc 637} 638 639func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 640 if options.Submodules { 641 return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 642 } 643 644 // discover what commits we indexed during our last build 645 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 646 if err != nil { 647 return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 648 } 649 650 if !ok { 651 return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository") 652 } 653 654 if options.DeltaShardNumberFallbackThreshold > 0 { 655 // HACK: For our interim compaction strategy, we force a full normal index once 656 // the number of shards on disk for this repository exceeds the provided threshold. 657 // 658 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 659 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 660 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 661 // while we create a better compaction strategy). 
662 663 oldShards := options.BuildOptions.FindAllShards() 664 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 665 return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 666 } 667 } 668 669 // Check to see if the set of branch names is consistent with what we last indexed. 670 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 671 // normal one). 672 673 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 674 var existingBranchNames []string 675 for _, b := range existingRepository.Branches { 676 existingBranchNames = append(existingBranchNames, b.Name) 677 } 678 679 var optionsBranchNames []string 680 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 681 optionsBranchNames = append(optionsBranchNames, b.Name) 682 } 683 684 existingBranchList := strings.Join(existingBranchNames, ", ") 685 optionsBranchList := strings.Join(optionsBranchNames, ", ") 686 687 return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 688 } 689 690 // Check if the build options hash does not match the repository metadata's hash 691 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build 692 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 693 return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. 
new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 694 } 695 696 // branch => (path, sha1) => repo. 697 repos = map[fileKey]BlobLocation{} 698 699 // fileKey => branches 700 branchMap = map[fileKey][]string{} 701 702 // branch name -> git worktree at most current commit 703 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) 704 705 for _, b := range options.Branches { 706 commit, err := getCommit(repository, options.BranchPrefix, b) 707 if err != nil { 708 return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 709 } 710 711 tree, err := commit.Tree() 712 if err != nil { 713 return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 714 } 715 716 branchToCurrentTree[b] = tree 717 } 718 719 rawURL := options.BuildOptions.RepositoryDescription.URL 720 u, err := url.Parse(rawURL) 721 if err != nil { 722 return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 723 } 724 725 // TODO: Support repository submodules for delta builds 726 // For this prototype, we are ignoring repository submodules, which means that we can use the same 727 // blob location for all files 728 hackSharedBlobLocation := BlobLocation{ 729 Repo: repository, 730 URL: u, 731 } 732 733 // loop over all branches, calculate the diff between our 734 // last indexed commit and the current commit, and add files mentioned in the diff 735 for _, branch := range existingRepository.Branches { 736 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 737 if err != nil { 738 return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 739 } 740 741 lastIndexedTree, err := lastIndexedCommit.Tree() 742 if err != nil { 743 return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 744 } 745 746 changes, err := 
object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 747 if err != nil { 748 return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 749 } 750 751 for i, c := range changes { 752 oldFile, newFile, err := c.Files() 753 if err != nil { 754 return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 755 } 756 757 if newFile != nil { 758 // note: newFile.Name could be a path that isn't relative to the repository root - using the 759 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 760 newFileRelativeRootPath := c.To.Name 761 762 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 763 if newFileRelativeRootPath == ignore.IgnoreFile { 764 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 765 } 766 767 // either file is added or renamed, so we need to add the new version to the build 768 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 769 repos[file] = hackSharedBlobLocation 770 branchMap[file] = append(branchMap[file], branch.Name) 771 } 772 773 if oldFile == nil { 774 // file added - nothing more to do 775 continue 776 } 777 778 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 779 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 780 oldFileRelativeRootPath := c.From.Name 781 782 if oldFileRelativeRootPath == ignore.IgnoreFile { 783 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 784 } 785 786 // The file is either modified or deleted. So, we need to add ALL versions 787 // of the old file (across all branches) to the build. 
788 for b, currentTree := range branchToCurrentTree { 789 f, err := currentTree.File(oldFileRelativeRootPath) 790 if err != nil { 791 // the file doesn't exist in this branch 792 if errors.Is(err, object.ErrFileNotFound) { 793 continue 794 } 795 796 return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 797 } 798 799 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 800 repos[file] = hackSharedBlobLocation 801 branchMap[file] = append(branchMap[file], b) 802 } 803 804 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 805 } 806 } 807 808 // we need to de-duplicate the branch map before returning it - it's possible for the same 809 // branch to have been added multiple times if a file has been modified across multiple commits 810 811 for file, branches := range branchMap { 812 sort.Strings(branches) 813 branchMap[file] = uniq(branches) 814 } 815 816 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 817 // for the same reasoning as above 818 819 sort.Strings(changedOrDeletedPaths) 820 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 821 822 return repos, branchMap, nil, changedOrDeletedPaths, nil 823} 824 825func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) { 826 var repoCache *RepoCache 827 if options.Submodules { 828 repoCache = NewRepoCache(options.RepoCacheDir) 829 } 830 831 // branch => (path, sha1) => repo. 
832 repos = map[fileKey]BlobLocation{} 833 834 // fileKey => branches 835 branchMap = map[fileKey][]string{} 836 837 // Branch => Repo => SHA1 838 branchVersions = map[string]map[string]plumbing.Hash{} 839 840 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 841 if err != nil { 842 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 843 } 844 845 for _, b := range branches { 846 commit, err := getCommit(repository, options.BranchPrefix, b) 847 if err != nil { 848 if options.AllowMissingBranch && err.Error() == "reference not found" { 849 continue 850 } 851 852 return nil, nil, nil, fmt.Errorf("getCommit: %w", err) 853 } 854 855 tree, err := commit.Tree() 856 if err != nil { 857 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err) 858 } 859 860 ig, err := newIgnoreMatcher(tree) 861 if err != nil { 862 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 863 } 864 865 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache) 866 if err != nil { 867 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err) 868 } 869 for k, v := range files { 870 if ig.Match(k.Path) { 871 continue 872 } 873 repos[k] = v 874 branchMap[k] = append(branchMap[k], b) 875 } 876 877 branchVersions[b] = subVersions 878 } 879 880 return repos, branchMap, branchVersions, nil 881} 882 883func createDocument(key fileKey, 884 repos map[fileKey]BlobLocation, 885 branchMap map[fileKey][]string, 886 ranks repoPathRanks, 887 opts build.Options, 888) (zoekt.Document, error) { 889 blob, err := repos[key].Repo.BlobObject(key.ID) 890 891 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 
892 if errors.Is(err, plumbing.ErrObjectNotFound) { 893 return skippedLargeDoc(key, branchMap, opts), nil 894 } 895 896 if err != nil { 897 return zoekt.Document{}, err 898 } 899 900 keyFullPath := key.FullPath() 901 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 902 return skippedLargeDoc(key, branchMap, opts), nil 903 } 904 905 contents, err := blobContents(blob) 906 if err != nil { 907 return zoekt.Document{}, err 908 } 909 910 var pathRanks []float64 911 if len(ranks.Paths) > 0 { 912 // If the repository has ranking data, then store the file's rank. 913 pathRank := ranks.rank(keyFullPath) 914 pathRanks = []float64{pathRank} 915 } 916 917 return zoekt.Document{ 918 SubRepositoryPath: key.SubRepoPath, 919 Name: keyFullPath, 920 Content: contents, 921 Branches: branchMap[key], 922 Ranks: pathRanks, 923 }, nil 924} 925 926func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document { 927 return zoekt.Document{ 928 SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 929 Name: key.FullPath(), 930 Branches: branchMap[key], 931 SubRepositoryPath: key.SubRepoPath, 932 } 933} 934 935func blobContents(blob *object.Blob) ([]byte, error) { 936 r, err := blob.Reader() 937 if err != nil { 938 return nil, err 939 } 940 defer r.Close() 941 942 var buf bytes.Buffer 943 buf.Grow(int(blob.Size)) 944 _, err = buf.ReadFrom(r) 945 if err != nil { 946 return nil, err 947 } 948 return buf.Bytes(), nil 949} 950 951func uniq(ss []string) []string { 952 result := ss[:0] 953 var last string 954 for i, s := range ss { 955 if i == 0 || s != last { 956 result = append(result, s) 957 } 958 last = s 959 } 960 return result 961}