fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package gitindex provides functions for indexing Git repositories.
package gitindex

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"math"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt"
	"github.com/sourcegraph/zoekt/build"
	"github.com/sourcegraph/zoekt/ignore"

	"github.com/go-git/go-git/v5/config"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/object"

	git "github.com/go-git/go-git/v5"
)

// RepoModTime returns the time of last fetch of a git repository.
47func RepoModTime(dir string) (time.Time, error) { 48 var last time.Time 49 refDir := filepath.Join(dir, "refs") 50 if _, err := os.Lstat(refDir); err == nil { 51 if err := filepath.Walk(refDir, 52 func(_ string, fi os.FileInfo, _ error) error { 53 if !fi.IsDir() && last.Before(fi.ModTime()) { 54 last = fi.ModTime() 55 } 56 return nil 57 }); err != nil { 58 return last, err 59 } 60 } 61 62 // git gc compresses refs into the following file: 63 for _, fn := range []string{"info/refs", "packed-refs"} { 64 if fi, err := os.Lstat(filepath.Join(dir, fn)); err == nil && !fi.IsDir() && last.Before(fi.ModTime()) { 65 last = fi.ModTime() 66 } 67 } 68 69 return last, nil 70} 71 72// FindGitRepos finds directories holding git repositories below the 73// given directory. It will find both bare and the ".git" dirs in 74// non-bare repositories. It returns the full path including the dir 75// passed in. 76func FindGitRepos(dir string) ([]string, error) { 77 arg, err := filepath.Abs(dir) 78 if err != nil { 79 return nil, err 80 } 81 var dirs []string 82 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 83 // Best-effort, ignore filepath.Walk failing 84 if err != nil { 85 return nil 86 } 87 88 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 89 dirs = append(dirs, filepath.Join(name, ".git")) 90 return filepath.SkipDir 91 } 92 93 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 94 return nil 95 } 96 97 fi, err = os.Lstat(filepath.Join(name, "objects")) 98 if err != nil || !fi.IsDir() { 99 return nil 100 } 101 102 dirs = append(dirs, name) 103 return filepath.SkipDir 104 }); err != nil { 105 return nil, err 106 } 107 108 return dirs, nil 109} 110 111// setTemplates fills in URL templates for known git hosting 112// sites. 113func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 114 repo.URL = u.String() 115 switch typ { 116 case "gitiles": 117 // eg. 
https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 118 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 119 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" 120 repo.LineFragmentTemplate = "#{{.LineNumber}}" 121 case "github": 122 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 123 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 124 repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" 125 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 126 case "cgit": 127 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 128 repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}" 129 repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}" 130 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 131 case "gitweb": 132 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 133 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 134 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 135 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 136 case "source.bazel.build": 137 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 138 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 139 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 140 repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}" 141 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 142 case "bitbucket-server": 143 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 144 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 145 repo.CommitURLTemplate = u.String() + 
"/commits/{{.Version}}" 146 repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}" 147 repo.LineFragmentTemplate = "#{{.LineNumber}}" 148 case "gitlab": 149 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 150 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 151 repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}" 152 repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}" 153 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 154 case "gitea": 155 repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" 156 // NOTE The `display=source` query parameter is required to disable file rendering. 157 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 158 // a line without `display=source`. This is supported since gitea 1.17.0. 159 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 160 // but the query parameters are obmitted. 161 repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source" 162 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 163 default: 164 return fmt.Errorf("URL scheme type %q unknown", typ) 165 } 166 return nil 167} 168 169// getCommit returns a tree object for the given reference. 170func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 171 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 172 // ref might be a branch name (e.g. "master") add branch prefix and try again. 
173 if err != nil { 174 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 175 } 176 if err != nil { 177 return nil, err 178 } 179 180 commitObj, err := repo.CommitObject(*sha1) 181 if err != nil { 182 return nil, err 183 } 184 return commitObj, nil 185} 186 187func configLookupRemoteURL(cfg *config.Config, key string) string { 188 rc := cfg.Remotes[key] 189 if rc == nil || len(rc.URLs) == 0 { 190 return "" 191 } 192 return rc.URLs[0] 193} 194 195func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 196 repo, err := git.PlainOpen(repoDir) 197 if err != nil { 198 return err 199 } 200 201 cfg, err := repo.Config() 202 if err != nil { 203 return err 204 } 205 206 sec := cfg.Raw.Section("zoekt") 207 208 webURLStr := sec.Options.Get("web-url") 209 webURLType := sec.Options.Get("web-url-type") 210 211 if webURLType != "" && webURLStr != "" { 212 webURL, err := url.Parse(webURLStr) 213 if err != nil { 214 return err 215 } 216 if err := setTemplates(desc, webURL, webURLType); err != nil { 217 return err 218 } 219 } else if webURLStr != "" { 220 desc.URL = webURLStr 221 } 222 223 name := sec.Options.Get("name") 224 if name != "" { 225 desc.Name = name 226 } else { 227 remoteURL := configLookupRemoteURL(cfg, "origin") 228 if remoteURL == "" { 229 return nil 230 } 231 u, err := url.Parse(remoteURL) 232 if err != nil { 233 return err 234 } 235 if err := SetTemplatesFromOrigin(desc, u); err != nil { 236 return err 237 } 238 } 239 240 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 241 desc.ID = uint32(id) 242 243 if desc.RawConfig == nil { 244 desc.RawConfig = map[string]string{} 245 } 246 for _, o := range sec.Options { 247 desc.RawConfig[o.Key] = o.Value 248 } 249 250 // Ranking info. 
251 252 // Github: 253 traction := 0 254 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 255 f, err := strconv.Atoi(sec.Options.Get(s)) 256 if err == nil { 257 traction += f 258 } 259 } 260 261 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 262 // Pretend everything on googlesource.com has 1000 263 // github stars. 264 traction = 1000 265 } 266 267 if traction > 0 { 268 l := math.Log(float64(traction)) 269 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 270 } 271 272 return nil 273} 274 275// SetTemplatesFromOrigin fills in templates based on the origin URL. 276func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 277 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 278 279 if strings.HasSuffix(u.Host, ".googlesource.com") { 280 return setTemplates(desc, u, "gitiles") 281 } else if u.Host == "github.com" { 282 u.Path = strings.TrimSuffix(u.Path, ".git") 283 return setTemplates(desc, u, "github") 284 } else { 285 return fmt.Errorf("unknown git hosting site %q", u) 286 } 287} 288 289// The Options structs controls details of the indexing process. 290type Options struct { 291 // The repository to be indexed. 292 RepoDir string 293 294 // If set, follow submodule links. This requires RepoCacheDir to be set. 295 Submodules bool 296 297 // If set, skip indexing if the existing index shard is newer 298 // than the refs in the repository. 299 Incremental bool 300 301 // Don't error out if some branch is missing 302 AllowMissingBranch bool 303 304 // Specifies the root of a Repository cache. Needed for submodule indexing. 305 RepoCacheDir string 306 307 // Indexing options. 308 BuildOptions build.Options 309 310 // Prefix of the branch to index, e.g. `remotes/origin`. 311 BranchPrefix string 312 313 // List of branch names to index, e.g. 
[]string{"HEAD", "stable"} 314 Branches []string 315 316 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 317 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 318 // then a normal build will be performed instead. 319 // 320 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 321 // a delta build will always be performed regardless of the number of preexisting shards. 322 DeltaShardNumberFallbackThreshold uint64 323} 324 325func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 326 var result []string 327 for _, b := range bs { 328 // Sourcegraph: We disable resolving refs. We want to return the exact ref 329 // requested so we can match it up. 330 if b == "HEAD" && false { 331 ref, err := repo.Head() 332 if err != nil { 333 return nil, err 334 } 335 336 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 337 continue 338 } 339 340 if strings.Contains(b, "*") { 341 iter, err := repo.Branches() 342 if err != nil { 343 return nil, err 344 } 345 346 defer iter.Close() 347 for { 348 ref, err := iter.Next() 349 if err == io.EOF { 350 break 351 } 352 if err != nil { 353 return nil, err 354 } 355 356 name := ref.Name().Short() 357 if matched, err := filepath.Match(b, name); err != nil { 358 return nil, err 359 } else if !matched { 360 continue 361 } 362 363 result = append(result, strings.TrimPrefix(name, prefix)) 364 } 365 continue 366 } 367 368 result = append(result, b) 369 } 370 371 return result, nil 372} 373 374// IndexGitRepo indexes the git repository as specified by the options. 375func IndexGitRepo(opts Options) error { 376 return indexGitRepo(opts, gitIndexConfig{}) 377} 378 379// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 
380func indexGitRepo(opts Options, config gitIndexConfig) error { 381 prepareDeltaBuild := prepareDeltaBuild 382 if config.prepareDeltaBuild != nil { 383 prepareDeltaBuild = config.prepareDeltaBuild 384 } 385 386 prepareNormalBuild := prepareNormalBuild 387 if config.prepareNormalBuild != nil { 388 prepareNormalBuild = config.prepareNormalBuild 389 } 390 391 // Set max thresholds, since we use them in this function. 392 opts.BuildOptions.SetDefaults() 393 if opts.RepoDir == "" { 394 return fmt.Errorf("gitindex: must set RepoDir") 395 } 396 397 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 398 repo, err := git.PlainOpen(opts.RepoDir) 399 if err != nil { 400 return fmt.Errorf("git.PlainOpen: %w", err) 401 } 402 403 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 404 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 405 } 406 407 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 408 if err != nil { 409 return fmt.Errorf("expandBranches: %w", err) 410 } 411 for _, b := range branches { 412 commit, err := getCommit(repo, opts.BranchPrefix, b) 413 if err != nil { 414 if opts.AllowMissingBranch && err.Error() == "reference not found" { 415 continue 416 } 417 418 return fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 419 } 420 421 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 422 Name: b, 423 Version: commit.Hash.String(), 424 }) 425 426 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 427 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 428 } 429 } 430 431 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 432 return nil 433 } 434 435 // branch => (path, sha1) => repo. 
436 var repos map[fileKey]BlobLocation 437 438 // fileKey => branches 439 var branchMap map[fileKey][]string 440 441 // Branch => Repo => SHA1 442 var branchVersions map[string]map[string]plumbing.Hash 443 444 // set of file paths that have been changed or deleted since 445 // the last indexed commit 446 // 447 // These only have an effect on delta builds 448 var changedOrRemovedFiles []string 449 450 if opts.BuildOptions.IsDelta { 451 repos, branchMap, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 452 if err != nil { 453 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 454 opts.BuildOptions.IsDelta = false 455 } 456 } 457 458 if !opts.BuildOptions.IsDelta { 459 repos, branchMap, branchVersions, err = prepareNormalBuild(opts, repo) 460 if err != nil { 461 return fmt.Errorf("preparing normal build: %w", err) 462 } 463 } 464 465 reposByPath := map[string]BlobLocation{} 466 for key, location := range repos { 467 reposByPath[key.SubRepoPath] = location 468 } 469 470 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 471 for path, location := range reposByPath { 472 tpl := opts.BuildOptions.RepositoryDescription 473 if path != "" { 474 tpl = zoekt.Repository{URL: location.URL.String()} 475 if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil { 476 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err) 477 } 478 } 479 opts.BuildOptions.SubRepositories[path] = &tpl 480 } 481 482 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 483 for path, repo := range opts.BuildOptions.SubRepositories { 484 id := branchVersions[br.Name][path] 485 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 486 Name: br.Name, 487 Version: id.String(), 488 }) 489 } 490 } 491 492 builder, err := build.NewBuilder(opts.BuildOptions) 493 if err != nil { 494 return 
fmt.Errorf("build.NewBuilder: %w", err) 495 } 496 497 var ranks repoPathRanks 498 var meanRank float64 499 if opts.BuildOptions.DocumentRanksPath != "" { 500 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 501 if err != nil { 502 return err 503 } 504 505 err = json.Unmarshal(data, &ranks) 506 if err != nil { 507 return err 508 } 509 510 // Compute the mean rank for this repository. Note: we overwrite the rank 511 // mean that's stored in the document ranks file, since that currently 512 // represents a global mean rank across repos, which is not what we want. 513 numRanks := len(ranks.Paths) 514 if numRanks > 0 { 515 for _, rank := range ranks.Paths { 516 meanRank += rank 517 } 518 ranks.MeanRank = meanRank / float64(numRanks) 519 } 520 } 521 522 // we don't need to check error, since we either already have an error, or 523 // we returning the first call to builder.Finish. 524 defer builder.Finish() // nolint:errcheck 525 526 for _, f := range changedOrRemovedFiles { 527 builder.MarkFileAsChangedOrRemoved(f) 528 } 529 530 var names []string 531 fileKeys := map[string][]fileKey{} 532 totalFiles := 0 533 534 for key := range repos { 535 n := key.FullPath() 536 fileKeys[n] = append(fileKeys[n], key) 537 names = append(names, n) 538 totalFiles++ 539 } 540 541 sort.Strings(names) 542 names = uniq(names) 543 544 log.Printf("attempting to index %d total files", totalFiles) 545 for _, name := range names { 546 keys := fileKeys[name] 547 548 for _, key := range keys { 549 doc, err := createDocument(key, repos, branchMap, ranks, opts.BuildOptions) 550 if err != nil { 551 return err 552 } 553 554 if err := builder.Add(doc); err != nil { 555 return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 556 } 557 } 558 } 559 560 return builder.Finish() 561} 562 563type repoPathRanks struct { 564 MeanRank float64 `json:"mean_reference_count"` 565 Paths map[string]float64 `json:"paths"` 566} 567 568// rank returns the rank for a given path. 
It uses these rules: 569// - If we have a concrete rank for this file, always use it 570// - If there's no rank, and it's a low priority file like a test, then use rank 0 571// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 572func (r repoPathRanks) rank(path string) float64 { 573 if rank, ok := r.Paths[path]; ok { 574 return rank 575 } else if build.IsLowPriority(path) { 576 return 0.0 577 } else { 578 return r.MeanRank 579 } 580} 581 582func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 583 ignoreFile, err := tree.File(ignore.IgnoreFile) 584 if err == object.ErrFileNotFound { 585 return &ignore.Matcher{}, nil 586 } 587 if err != nil { 588 return nil, err 589 } 590 content, err := ignoreFile.Contents() 591 if err != nil { 592 return nil, err 593 } 594 return ignore.ParseIgnoreFile(strings.NewReader(content)) 595} 596 597// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 598// a build.Builder instance for generating a delta build. 599type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 600 601// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 602// a build.Builder instance for generating a normal build. 603type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) 604 605type gitIndexConfig struct { 606 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 607 // prepare the build.Builder instance for generating a delta build. 608 // 609 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 
610 prepareDeltaBuild prepareDeltaBuildFunc 611 612 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 613 // prepare the build.Builder instance for generating a normal build. 614 // 615 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 616 prepareNormalBuild prepareNormalBuildFunc 617} 618 619func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 620 if options.Submodules { 621 return nil, nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 622 } 623 624 // discover what commits we indexed during our last build 625 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 626 if err != nil { 627 return nil, nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 628 } 629 630 if !ok { 631 return nil, nil, nil, nil, fmt.Errorf("no existing shards found for repository") 632 } 633 634 if options.DeltaShardNumberFallbackThreshold > 0 { 635 // HACK: For our interim compaction strategy, we force a full normal index once 636 // the number of shards on disk for this repository exceeds the provided threshold. 637 // 638 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 639 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 640 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 641 // while we create a better compaction strategy). 
642 643 oldShards := options.BuildOptions.FindAllShards() 644 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 645 return nil, nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 646 } 647 } 648 649 // Check to see if the set of branch names is consistent with what we last indexed. 650 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 651 // normal one). 652 653 if !build.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 654 var existingBranchNames []string 655 for _, b := range existingRepository.Branches { 656 existingBranchNames = append(existingBranchNames, b.Name) 657 } 658 659 var optionsBranchNames []string 660 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 661 optionsBranchNames = append(optionsBranchNames, b.Name) 662 } 663 664 existingBranchList := strings.Join(existingBranchNames, ", ") 665 optionsBranchList := strings.Join(optionsBranchNames, ", ") 666 667 return nil, nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 668 } 669 670 // Check if the build options hash does not match the repository metadata's hash 671 // If it does not match then one or more index options has changed and will require a normal build instead of a delta build 672 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 673 return nil, nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. 
new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 674 } 675 676 // branch => (path, sha1) => repo. 677 repos = map[fileKey]BlobLocation{} 678 679 // fileKey => branches 680 branchMap = map[fileKey][]string{} 681 682 // branch name -> git worktree at most current commit 683 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) 684 685 for _, b := range options.Branches { 686 commit, err := getCommit(repository, options.BranchPrefix, b) 687 if err != nil { 688 return nil, nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 689 } 690 691 tree, err := commit.Tree() 692 if err != nil { 693 return nil, nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 694 } 695 696 branchToCurrentTree[b] = tree 697 } 698 699 rawURL := options.BuildOptions.RepositoryDescription.URL 700 u, err := url.Parse(rawURL) 701 if err != nil { 702 return nil, nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 703 } 704 705 // TODO: Support repository submodules for delta builds 706 // For this prototype, we are ignoring repository submodules, which means that we can use the same 707 // blob location for all files 708 hackSharedBlobLocation := BlobLocation{ 709 Repo: repository, 710 URL: u, 711 } 712 713 // loop over all branches, calculate the diff between our 714 // last indexed commit and the current commit, and add files mentioned in the diff 715 for _, branch := range existingRepository.Branches { 716 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 717 if err != nil { 718 return nil, nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 719 } 720 721 lastIndexedTree, err := lastIndexedCommit.Tree() 722 if err != nil { 723 return nil, nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 724 } 725 726 changes, err := 
object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 727 if err != nil { 728 return nil, nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 729 } 730 731 for i, c := range changes { 732 oldFile, newFile, err := c.Files() 733 if err != nil { 734 return nil, nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 735 } 736 737 if newFile != nil { 738 // note: newFile.Name could be a path that isn't relative to the repository root - using the 739 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 740 newFileRelativeRootPath := c.To.Name 741 742 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 743 if newFileRelativeRootPath == ignore.IgnoreFile { 744 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 745 } 746 747 // either file is added or renamed, so we need to add the new version to the build 748 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 749 repos[file] = hackSharedBlobLocation 750 branchMap[file] = append(branchMap[file], branch.Name) 751 } 752 753 if oldFile == nil { 754 // file added - nothing more to do 755 continue 756 } 757 758 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 759 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 760 oldFileRelativeRootPath := c.From.Name 761 762 if oldFileRelativeRootPath == ignore.IgnoreFile { 763 return nil, nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 764 } 765 766 // The file is either modified or deleted. So, we need to add ALL versions 767 // of the old file (across all branches) to the build. 
768 for b, currentTree := range branchToCurrentTree { 769 f, err := currentTree.File(oldFileRelativeRootPath) 770 if err != nil { 771 // the file doesn't exist in this branch 772 if errors.Is(err, object.ErrFileNotFound) { 773 continue 774 } 775 776 return nil, nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 777 } 778 779 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 780 repos[file] = hackSharedBlobLocation 781 branchMap[file] = append(branchMap[file], b) 782 } 783 784 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 785 } 786 } 787 788 // we need to de-duplicate the branch map before returning it - it's possible for the same 789 // branch to have been added multiple times if a file has been modified across multiple commits 790 791 for file, branches := range branchMap { 792 sort.Strings(branches) 793 branchMap[file] = uniq(branches) 794 } 795 796 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 797 // for the same reasoning as above 798 799 sort.Strings(changedOrDeletedPaths) 800 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 801 802 return repos, branchMap, nil, changedOrDeletedPaths, nil 803} 804 805func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchMap map[fileKey][]string, branchVersions map[string]map[string]plumbing.Hash, err error) { 806 var repoCache *RepoCache 807 if options.Submodules { 808 repoCache = NewRepoCache(options.RepoCacheDir) 809 } 810 811 // branch => (path, sha1) => repo. 
812 repos = map[fileKey]BlobLocation{} 813 814 // fileKey => branches 815 branchMap = map[fileKey][]string{} 816 817 // Branch => Repo => SHA1 818 branchVersions = map[string]map[string]plumbing.Hash{} 819 820 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 821 if err != nil { 822 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 823 } 824 825 for _, b := range branches { 826 commit, err := getCommit(repository, options.BranchPrefix, b) 827 if err != nil { 828 if options.AllowMissingBranch && err.Error() == "reference not found" { 829 continue 830 } 831 832 return nil, nil, nil, fmt.Errorf("getCommit: %w", err) 833 } 834 835 tree, err := commit.Tree() 836 if err != nil { 837 return nil, nil, nil, fmt.Errorf("commit.Tree: %w", err) 838 } 839 840 ig, err := newIgnoreMatcher(tree) 841 if err != nil { 842 return nil, nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 843 } 844 845 files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache) 846 if err != nil { 847 return nil, nil, nil, fmt.Errorf("TreeToFiles: %w", err) 848 } 849 for k, v := range files { 850 if ig.Match(k.Path) { 851 continue 852 } 853 repos[k] = v 854 branchMap[k] = append(branchMap[k], b) 855 } 856 857 branchVersions[b] = subVersions 858 } 859 860 return repos, branchMap, branchVersions, nil 861} 862 863func createDocument(key fileKey, 864 repos map[fileKey]BlobLocation, 865 branchMap map[fileKey][]string, 866 ranks repoPathRanks, 867 opts build.Options, 868) (zoekt.Document, error) { 869 blob, err := repos[key].Repo.BlobObject(key.ID) 870 if err != nil { 871 return zoekt.Document{}, err 872 } 873 874 keyFullPath := key.FullPath() 875 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 876 return zoekt.Document{ 877 SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.SizeMax), 878 Name: key.FullPath(), 879 Branches: branchMap[key], 880 
SubRepositoryPath: key.SubRepoPath, 881 }, nil 882 } 883 884 contents, err := blobContents(blob) 885 if err != nil { 886 return zoekt.Document{}, err 887 } 888 889 var pathRanks []float64 890 if len(ranks.Paths) > 0 { 891 // If the repository has ranking data, then store the file's rank. 892 pathRank := ranks.rank(keyFullPath) 893 pathRanks = []float64{pathRank} 894 } 895 896 return zoekt.Document{ 897 SubRepositoryPath: key.SubRepoPath, 898 Name: keyFullPath, 899 Content: contents, 900 Branches: branchMap[key], 901 Ranks: pathRanks, 902 }, nil 903} 904 905func blobContents(blob *object.Blob) ([]byte, error) { 906 r, err := blob.Reader() 907 if err != nil { 908 return nil, err 909 } 910 defer r.Close() 911 912 var buf bytes.Buffer 913 buf.Grow(int(blob.Size)) 914 _, err = buf.ReadFrom(r) 915 if err != nil { 916 return nil, err 917 } 918 return buf.Bytes(), nil 919} 920 921func uniq(ss []string) []string { 922 result := ss[:0] 923 var last string 924 for i, s := range ss { 925 if i == 0 || s != last { 926 result = append(result, s) 927 } 928 last = s 929 } 930 return result 931}