fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package gitindex provides functions for indexing Git repositories. 16package gitindex 17 18import ( 19 "bytes" 20 "cmp" 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "math" 27 "net/url" 28 "os" 29 "path/filepath" 30 "regexp" 31 "sort" 32 "strconv" 33 "strings" 34 35 "github.com/go-git/go-billy/v5/osfs" 36 "github.com/go-git/go-git/v5/config" 37 "github.com/go-git/go-git/v5/plumbing" 38 "github.com/go-git/go-git/v5/plumbing/cache" 39 "github.com/go-git/go-git/v5/plumbing/object" 40 "github.com/go-git/go-git/v5/storage/filesystem" 41 42 "github.com/sourcegraph/zoekt" 43 "github.com/sourcegraph/zoekt/ignore" 44 "github.com/sourcegraph/zoekt/index" 45 46 git "github.com/go-git/go-git/v5" 47) 48 49// FindGitRepos finds directories holding git repositories below the 50// given directory. It will find both bare and the ".git" dirs in 51// non-bare repositories. It returns the full path including the dir 52// passed in. 53func FindGitRepos(dir string) ([]string, error) { 54 arg, err := filepath.Abs(dir) 55 if err != nil { 56 return nil, err 57 } 58 var dirs []string 59 if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { 60 // Best-effort, ignore filepath.Walk failing 61 if err != nil { 62 return nil 63 } 64 65 if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { 66 dirs = append(dirs, filepath.Join(name, ".git")) 67 return filepath.SkipDir 68 } 69 70 if !strings.HasSuffix(name, ".git") || !fi.IsDir() { 71 return nil 72 } 73 74 fi, err = os.Lstat(filepath.Join(name, "objects")) 75 if err != nil || !fi.IsDir() { 76 return nil 77 } 78 79 dirs = append(dirs, name) 80 return filepath.SkipDir 81 }); err != nil { 82 return nil, err 83 } 84 85 return dirs, nil 86} 87 88// setTemplates fills in URL templates for known git hosting 89// sites. 90func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { 91 if u.Scheme == "ssh+git" { 92 u.Scheme = "https" 93 u.User = nil 94 } 95 96 // helper to generate u.JoinPath as a template 97 varVersion := ".Version" 98 varPath := ".Path" 99 urlJoinPath := func(elem ...string) string { 100 elem = append([]string{u.String()}, elem...) 101 var parts []string 102 for _, e := range elem { 103 if e == varVersion || e == varPath { 104 parts = append(parts, e) 105 } else { 106 parts = append(parts, strconv.Quote(e)) 107 } 108 } 109 return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " ")) 110 } 111 112 repo.URL = u.String() 113 switch typ { 114 case "gitiles": 115 // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 116 repo.CommitURLTemplate = urlJoinPath("+", varVersion) 117 repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath) 118 repo.LineFragmentTemplate = "#{{.LineNumber}}" 119 case "github": 120 // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 121 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 122 repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath) 123 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 124 case "cgit": 125 // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 126 repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}" 127 repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}" 128 repo.LineFragmentTemplate = "#n{{.LineNumber}}" 129 case "gitweb": 130 // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 131 repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" 132 repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" 133 repo.LineFragmentTemplate = "#l{{.LineNumber}}" 134 case "source.bazel.build": 135 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 136 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 137 repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}" 138 repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}" 139 repo.LineFragmentTemplate = ";l={{.LineNumber}}" 140 case "bitbucket-server": 141 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc 142 // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc 143 repo.CommitURLTemplate = urlJoinPath("commits", varVersion) 144 repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}" 145 repo.LineFragmentTemplate = "#{{.LineNumber}}" 146 case "gitlab": 147 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed 148 // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template 149 repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion) 150 repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath) 151 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 152 case "gitea": 153 repo.CommitURLTemplate = urlJoinPath("commit", varVersion) 154 // NOTE The `display=source` query parameter is required to disable file rendering. 155 // Since line numbers are disabled in rendered files, you wouldn't be able to jump to 156 // a line without `display=source`. This is supported since gitea 1.17.0. 157 // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, 158 // but the query parameters are obmitted. 159 repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source" 160 repo.LineFragmentTemplate = "#L{{.LineNumber}}" 161 default: 162 return fmt.Errorf("URL scheme type %q unknown", typ) 163 } 164 return nil 165} 166 167// getCommit returns a tree object for the given reference. 168func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { 169 sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) 170 // ref might be a branch name (e.g. "master") add branch prefix and try again. 171 if err != nil { 172 sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) 173 } 174 if err != nil { 175 return nil, err 176 } 177 178 commitObj, err := repo.CommitObject(*sha1) 179 if err != nil { 180 return nil, err 181 } 182 return commitObj, nil 183} 184 185func configLookupRemoteURL(cfg *config.Config, key string) string { 186 rc := cfg.Remotes[key] 187 if rc == nil || len(rc.URLs) == 0 { 188 return "" 189 } 190 return rc.URLs[0] 191} 192 193var sshRelativeURLRegexp = regexp.MustCompile(`^([^@]+)@([^:]+):(.*)$`) 194 195func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { 196 repo, err := git.PlainOpen(repoDir) 197 if err != nil { 198 return err 199 } 200 201 cfg, err := repo.Config() 202 if err != nil { 203 return err 204 } 205 206 sec := cfg.Raw.Section("zoekt") 207 208 webURLStr := sec.Options.Get("web-url") 209 webURLType := sec.Options.Get("web-url-type") 210 211 if webURLType != "" && webURLStr != "" { 212 webURL, err := url.Parse(webURLStr) 213 if err != nil { 214 return err 215 } 216 if err := setTemplates(desc, webURL, webURLType); err != nil { 217 return err 218 } 219 } else if webURLStr != "" { 220 desc.URL = webURLStr 221 } 222 223 name := sec.Options.Get("name") 224 if name != "" { 225 desc.Name = name 226 } else { 227 remoteURL := configLookupRemoteURL(cfg, "origin") 228 if remoteURL == "" { 229 return nil 230 } 231 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 232 user := sm[1] 233 host := sm[2] 234 path := sm[3] 235 236 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 237 } 238 239 u, err := url.Parse(remoteURL) 240 if err != nil { 241 return err 242 } 243 if err := SetTemplatesFromOrigin(desc, u); err != nil { 244 return err 245 } 246 } 247 248 id, _ := strconv.ParseUint(sec.Options.Get("repoid"), 10, 32) 249 desc.ID = uint32(id) 250 251 desc.TenantID, _ = strconv.Atoi(sec.Options.Get("tenantID")) 252 253 if desc.RawConfig == nil { 254 desc.RawConfig = map[string]string{} 255 } 256 for _, o := range sec.Options { 257 desc.RawConfig[o.Key] = o.Value 258 } 259 260 // Ranking info. 261 262 // Github: 263 traction := 0 264 for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { 265 f, err := strconv.Atoi(sec.Options.Get(s)) 266 if err == nil { 267 traction += f 268 } 269 } 270 271 if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { 272 // Pretend everything on googlesource.com has 1000 273 // github stars. 274 traction = 1000 275 } 276 277 if traction > 0 { 278 l := math.Log(float64(traction)) 279 desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) 280 } 281 282 return nil 283} 284 285// This attempts to get a repo URL similar to the main repository template processing as in setTemplatesFromConfig() 286func normalizeSubmoduleRemoteURL(cfg *config.Config) (string, error) { 287 sec := cfg.Raw.Section("zoekt") 288 remoteURL := sec.Options.Get("web-url") 289 if remoteURL == "" { 290 // fall back to "origin" remote 291 remoteURL = configLookupRemoteURL(cfg, "origin") 292 if remoteURL == "" { 293 return "", nil 294 } 295 } 296 297 if sm := sshRelativeURLRegexp.FindStringSubmatch(remoteURL); sm != nil { 298 user := sm[1] 299 host := sm[2] 300 path := sm[3] 301 302 remoteURL = fmt.Sprintf("ssh+git://%s@%s/%s", user, host, path) 303 } 304 305 u, err := url.Parse(remoteURL) 306 if err != nil { 307 return "", fmt.Errorf("unable to parse remote URL %q: %w", remoteURL, err) 308 } 309 310 if u.Scheme == "ssh+git" { 311 u.Scheme = "https" 312 u.User = nil 313 } 314 315 // Assume we cannot build templates for this URL, leave it empty 316 if u.Scheme == "" { 317 return "", nil 318 } 319 320 return u.String(), nil 321} 322 323// SetTemplatesFromOrigin fills in templates based on the origin URL. 324func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { 325 desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) 326 327 if strings.HasSuffix(u.Host, ".googlesource.com") { 328 return setTemplates(desc, u, "gitiles") 329 } else if u.Host == "github.com" { 330 u.Path = strings.TrimSuffix(u.Path, ".git") 331 return setTemplates(desc, u, "github") 332 } else { 333 return fmt.Errorf("unknown git hosting site %q", u) 334 } 335} 336 337// The Options structs controls details of the indexing process. 338type Options struct { 339 // The repository to be indexed. 340 RepoDir string 341 342 // If set, follow submodule links. This requires RepoCacheDir to be set. 343 Submodules bool 344 345 // If set, skip indexing if the existing index shard is newer 346 // than the refs in the repository. 347 Incremental bool 348 349 // Don't error out if some branch is missing 350 AllowMissingBranch bool 351 352 // Specifies the root of a Repository cache. Needed for submodule indexing. 353 RepoCacheDir string 354 355 // Indexing options. 356 BuildOptions index.Options 357 358 // Prefix of the branch to index, e.g. `remotes/origin`. 359 BranchPrefix string 360 361 // List of branch names to index, e.g. []string{"HEAD", "stable"} 362 Branches []string 363 364 // DeltaShardNumberFallbackThreshold defines an upper limit (inclusive) on the number of preexisting shards 365 // that can exist before attempting another delta build. If the number of preexisting shards exceeds this threshold, 366 // then a normal build will be performed instead. 367 // 368 // If DeltaShardNumberFallbackThreshold is 0, then this fallback behavior is disabled: 369 // a delta build will always be performed regardless of the number of preexisting shards. 370 DeltaShardNumberFallbackThreshold uint64 371} 372 373func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { 374 var result []string 375 for _, b := range bs { 376 // Sourcegraph: We disable resolving refs. We want to return the exact ref 377 // requested so we can match it up. 378 if b == "HEAD" && false { 379 ref, err := repo.Head() 380 if err != nil { 381 return nil, err 382 } 383 384 result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) 385 continue 386 } 387 388 if strings.Contains(b, "*") { 389 iter, err := repo.Branches() 390 if err != nil { 391 return nil, err 392 } 393 394 defer iter.Close() 395 for { 396 ref, err := iter.Next() 397 if err == io.EOF { 398 break 399 } 400 if err != nil { 401 return nil, err 402 } 403 404 name := ref.Name().Short() 405 if matched, err := filepath.Match(b, name); err != nil { 406 return nil, err 407 } else if !matched { 408 continue 409 } 410 411 result = append(result, strings.TrimPrefix(name, prefix)) 412 } 413 continue 414 } 415 416 result = append(result, b) 417 } 418 419 return result, nil 420} 421 422// IndexGitRepo indexes the git repository as specified by the options. 423// The returned bool indicates whether the index was updated as a result. This 424// can be informative if doing incremental indexing. 425func IndexGitRepo(opts Options) (bool, error) { 426 return indexGitRepo(opts, gitIndexConfig{}) 427} 428 429// indexGitRepo indexes the git repository as specified by the options and the provided gitIndexConfig. 430// The returned bool indicates whether the index was updated as a result. This 431// can be informative if doing incremental indexing. 432func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { 433 prepareDeltaBuild := prepareDeltaBuild 434 if config.prepareDeltaBuild != nil { 435 prepareDeltaBuild = config.prepareDeltaBuild 436 } 437 438 prepareNormalBuild := prepareNormalBuild 439 if config.prepareNormalBuild != nil { 440 prepareNormalBuild = config.prepareNormalBuild 441 } 442 443 // Set max thresholds, since we use them in this function. 444 opts.BuildOptions.SetDefaults() 445 if opts.RepoDir == "" { 446 return false, fmt.Errorf("gitindex: must set RepoDir") 447 } 448 449 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 450 451 var repo *git.Repository 452 legacyRepoOpen := cmp.Or(os.Getenv("ZOEKT_DISABLE_GOGIT_OPTIMIZATION"), "false") 453 if b, err := strconv.ParseBool(legacyRepoOpen); b || err != nil { 454 repo, err = git.PlainOpen(opts.RepoDir) 455 if err != nil { 456 return false, fmt.Errorf("git.PlainOpen: %w", err) 457 } 458 } else { 459 var repoCloser io.Closer 460 repo, repoCloser, err = openRepo(opts.RepoDir) 461 if err != nil { 462 return false, fmt.Errorf("openRepo: %w", err) 463 } 464 defer repoCloser.Close() 465 } 466 467 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { 468 log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) 469 } 470 471 branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) 472 if err != nil { 473 return false, fmt.Errorf("expandBranches: %w", err) 474 } 475 for _, b := range branches { 476 commit, err := getCommit(repo, opts.BranchPrefix, b) 477 if err != nil { 478 if opts.AllowMissingBranch && err.Error() == "reference not found" { 479 continue 480 } 481 482 return false, fmt.Errorf("getCommit(%q, %q): %w", opts.BranchPrefix, b, err) 483 } 484 485 opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 486 Name: b, 487 Version: commit.Hash.String(), 488 }) 489 490 if when := commit.Committer.When; when.After(opts.BuildOptions.RepositoryDescription.LatestCommitDate) { 491 opts.BuildOptions.RepositoryDescription.LatestCommitDate = when 492 } 493 } 494 495 if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { 496 return false, nil 497 } 498 499 // branch => (path, sha1) => repo. 500 var repos map[fileKey]BlobLocation 501 502 // Branch => Repo => SHA1 503 var branchVersions map[string]map[string]plumbing.Hash 504 505 // set of file paths that have been changed or deleted since 506 // the last indexed commit 507 // 508 // These only have an effect on delta builds 509 var changedOrRemovedFiles []string 510 511 if opts.BuildOptions.IsDelta { 512 repos, branchVersions, changedOrRemovedFiles, err = prepareDeltaBuild(opts, repo) 513 if err != nil { 514 log.Printf("delta build: falling back to normal build since delta build failed, repository=%q, err=%s", opts.BuildOptions.RepositoryDescription.Name, err) 515 opts.BuildOptions.IsDelta = false 516 } 517 } 518 519 if !opts.BuildOptions.IsDelta { 520 repos, branchVersions, err = prepareNormalBuild(opts, repo) 521 if err != nil { 522 return false, fmt.Errorf("preparing normal build: %w", err) 523 } 524 } 525 526 reposByPath := map[string]BlobLocation{} 527 for key, info := range repos { 528 reposByPath[key.SubRepoPath] = info 529 } 530 531 opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} 532 for path, info := range reposByPath { 533 tpl := opts.BuildOptions.RepositoryDescription 534 if path != "" { 535 tpl = zoekt.Repository{URL: info.URL.String()} 536 if info.URL.String() != "" { 537 if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { 538 log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) 539 } 540 } 541 if tpl.Name == "" { 542 tpl.Name = path 543 } 544 } 545 opts.BuildOptions.SubRepositories[path] = &tpl 546 } 547 548 for _, br := range opts.BuildOptions.RepositoryDescription.Branches { 549 for path, repo := range opts.BuildOptions.SubRepositories { 550 id := branchVersions[br.Name][path] 551 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 552 Name: br.Name, 553 Version: id.String(), 554 }) 555 } 556 } 557 558 builder, err := index.NewBuilder(opts.BuildOptions) 559 if err != nil { 560 return false, fmt.Errorf("build.NewBuilder: %w", err) 561 } 562 563 // Preparing the build can consume substantial memory, so check usage before starting to index. 564 builder.CheckMemoryUsage() 565 566 // we don't need to check error, since we either already have an error, or 567 // we returning the first call to builder.Finish. 568 defer builder.Finish() // nolint:errcheck 569 570 for _, f := range changedOrRemovedFiles { 571 builder.MarkFileAsChangedOrRemoved(f) 572 } 573 574 var names []string 575 fileKeys := map[string][]fileKey{} 576 totalFiles := 0 577 578 for key := range repos { 579 n := key.FullPath() 580 fileKeys[n] = append(fileKeys[n], key) 581 names = append(names, n) 582 totalFiles++ 583 } 584 585 sort.Strings(names) 586 names = uniq(names) 587 588 log.Printf("attempting to index %d total files", totalFiles) 589 for idx, name := range names { 590 keys := fileKeys[name] 591 592 for _, key := range keys { 593 doc, err := createDocument(key, repos, opts.BuildOptions) 594 if err != nil { 595 return false, err 596 } 597 598 if err := builder.Add(doc); err != nil { 599 return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 600 } 601 602 if idx%10_000 == 0 { 603 builder.CheckMemoryUsage() 604 } 605 } 606 } 607 return true, builder.Finish() 608} 609 610// openRepo opens a git repository in a way that's optimized for indexing. 611// 612// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options. 613func openRepo(repoDir string) (*git.Repository, io.Closer, error) { 614 fs := osfs.New(repoDir) 615 wt := fs 616 617 // Check if the root directory exists. 618 if _, err := fs.Stat(""); err != nil { 619 if os.IsNotExist(err) { 620 return nil, nil, git.ErrRepositoryNotExists 621 } 622 return nil, nil, err 623 } 624 625 // If there's a .git directory, use that as the new root. 626 if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() { 627 if fs, err = fs.Chroot(git.GitDirName); err != nil { 628 return nil, nil, fmt.Errorf("fs.Chroot: %w", err) 629 } 630 } 631 632 s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ 633 // Cache the packfile handles, preventing the packfile from being opened then closed on every object access 634 KeepDescriptors: true, 635 }) 636 637 // Because we're keeping descriptors open, we need to close the storage object when we're done. 638 repo, err := git.Open(s, wt) 639 return repo, s, err 640} 641 642func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 643 ignoreFile, err := tree.File(ignore.IgnoreFile) 644 if err == object.ErrFileNotFound { 645 return &ignore.Matcher{}, nil 646 } 647 if err != nil { 648 return nil, err 649 } 650 content, err := ignoreFile.Contents() 651 if err != nil { 652 return nil, err 653 } 654 return ignore.ParseIgnoreFile(strings.NewReader(content)) 655} 656 657// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 658// a build.Builder instance for generating a delta build. 659type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 660 661// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 662// a build.Builder instance for generating a normal build. 663type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) 664 665type gitIndexConfig struct { 666 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to 667 // prepare the build.Builder instance for generating a delta build. 668 // 669 // If prepareDeltaBuild is nil, gitindex.prepareDeltaBuild will be used instead. 670 prepareDeltaBuild prepareDeltaBuildFunc 671 672 // prepareNormalBuild, if not nil, is the function that is used to calculate the metadata that will be used to 673 // prepare the build.Builder instance for generating a normal build. 674 // 675 // If prepareNormalBuild is nil, gitindex.prepareNormalBuild will be used instead. 676 prepareNormalBuild prepareNormalBuildFunc 677} 678 679func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 680 if options.Submodules { 681 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 682 } 683 684 // discover what commits we indexed during our last build 685 existingRepository, _, ok, err := options.BuildOptions.FindRepositoryMetadata() 686 if err != nil { 687 return nil, nil, nil, fmt.Errorf("failed to get repository metadata: %w", err) 688 } 689 690 if !ok { 691 return nil, nil, nil, fmt.Errorf("no existing shards found for repository") 692 } 693 694 if options.DeltaShardNumberFallbackThreshold > 0 { 695 // HACK: For our interim compaction strategy, we force a full normal index once 696 // the number of shards on disk for this repository exceeds the provided threshold. 697 // 698 // This strategy obviously isn't optimal (as an example: we currently can't differentiate 699 // between "normal" and "delta" shards, so repositories like the gigarepo that generate a large number of shards per 700 // build would be disproportionately affected by this), but it'll allow us to continue experimenting on real workloads 701 // while we create a better compaction strategy). 702 703 oldShards := options.BuildOptions.FindAllShards() 704 if uint64(len(oldShards)) > options.DeltaShardNumberFallbackThreshold { 705 return nil, nil, nil, fmt.Errorf("number of existing shards (%d) > requested shard threshold (%d)", len(oldShards), options.DeltaShardNumberFallbackThreshold) 706 } 707 } 708 709 // Check to see if the set of branch names is consistent with what we last indexed. 710 // If it isn't consistent, that we can't proceed with a delta build (and the caller should fall back to a 711 // normal one). 712 713 if !index.BranchNamesEqual(existingRepository.Branches, options.BuildOptions.RepositoryDescription.Branches) { 714 var existingBranchNames []string 715 for _, b := range existingRepository.Branches { 716 existingBranchNames = append(existingBranchNames, b.Name) 717 } 718 719 var optionsBranchNames []string 720 for _, b := range options.BuildOptions.RepositoryDescription.Branches { 721 optionsBranchNames = append(optionsBranchNames, b.Name) 722 } 723 724 existingBranchList := strings.Join(existingBranchNames, ", ") 725 optionsBranchList := strings.Join(optionsBranchNames, ", ") 726 727 return nil, nil, nil, fmt.Errorf("requested branch set in build options (%q) != branch set found on disk (%q) - branch set must be the same for delta shards", optionsBranchList, existingBranchList) 728 } 729 730 // Check if the build options hash does not match the repository metadata's hash 731 // If it does not index then one or more index options has changed and will require a normal build instead of a delta build 732 if options.BuildOptions.GetHash() != existingRepository.IndexOptions { 733 return nil, nil, nil, fmt.Errorf("one or more index options previously stored for repository %s (ID: %d) does not match the index options for this requested build; These index option updates are incompatible with delta build. new index options: %+v", existingRepository.Name, existingRepository.ID, options.BuildOptions.HashOptions()) 734 } 735 736 // branch => (path, sha1) => repo. 737 repos = map[fileKey]BlobLocation{} 738 739 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 740 if err != nil { 741 return nil, nil, nil, fmt.Errorf("expandBranches: %w", err) 742 } 743 744 // branch name -> git worktree at most current commit 745 branchToCurrentTree := make(map[string]*object.Tree, len(branches)) 746 747 for _, b := range branches { 748 commit, err := getCommit(repository, options.BranchPrefix, b) 749 if err != nil { 750 return nil, nil, nil, fmt.Errorf("getting last current commit for branch %q: %w", b, err) 751 } 752 753 tree, err := commit.Tree() 754 if err != nil { 755 return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) 756 } 757 758 branchToCurrentTree[b] = tree 759 } 760 761 rawURL := options.BuildOptions.RepositoryDescription.URL 762 u, err := url.Parse(rawURL) 763 if err != nil { 764 return nil, nil, nil, fmt.Errorf("parsing repository URL %q: %w", rawURL, err) 765 } 766 767 // TODO: Support repository submodules for delta builds 768 769 // loop over all branches, calculate the diff between our 770 // last indexed commit and the current commit, and add files mentioned in the diff 771 for _, branch := range existingRepository.Branches { 772 lastIndexedCommit, err := getCommit(repository, "", branch.Version) 773 if err != nil { 774 return nil, nil, nil, fmt.Errorf("getting last indexed commit for branch %q: %w", branch.Name, err) 775 } 776 777 lastIndexedTree, err := lastIndexedCommit.Tree() 778 if err != nil { 779 return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) 780 } 781 782 changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) 783 if err != nil { 784 return nil, nil, nil, fmt.Errorf("generating changeset for branch %q: %w", branch.Name, err) 785 } 786 787 for i, c := range changes { 788 oldFile, newFile, err := c.Files() 789 if err != nil { 790 return nil, nil, nil, fmt.Errorf("change #%d: getting files before and after change: %w", i, err) 791 } 792 793 if newFile != nil { 794 // note: newFile.Name could be a path that isn't relative to the repository root - using the 795 // change's Name field is the only way that @ggilmore saw to get the full path relative to the root 796 newFileRelativeRootPath := c.To.Name 797 798 // TODO@ggilmore: HACK - remove once ignore files are supported in delta builds 799 if newFileRelativeRootPath == ignore.IgnoreFile { 800 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 801 } 802 803 // either file is added or renamed, so we need to add the new version to the build 804 file := fileKey{Path: newFileRelativeRootPath, ID: newFile.Hash} 805 if existing, ok := repos[file]; ok { 806 existing.Branches = append(existing.Branches, branch.Name) 807 repos[file] = existing 808 } else { 809 repos[file] = BlobLocation{ 810 GitRepo: repository, 811 URL: u, 812 Branches: []string{branch.Name}, 813 } 814 } 815 } 816 817 if oldFile == nil { 818 // file added - nothing more to do 819 continue 820 } 821 822 // Note: oldFile.Name could be a path that isn't relative to the repository root - using the 823 // change's "Name" field is the only way that ggilmore saw to get the full path relative to the root 824 oldFileRelativeRootPath := c.From.Name 825 826 if oldFileRelativeRootPath == ignore.IgnoreFile { 827 return nil, nil, nil, fmt.Errorf("%q file is not yet supported in delta builds", ignore.IgnoreFile) 828 } 829 830 // The file is either modified or deleted. So, we need to add ALL versions 831 // of the old file (across all branches) to the build. 832 for b, currentTree := range branchToCurrentTree { 833 f, err := currentTree.File(oldFileRelativeRootPath) 834 if err != nil { 835 // the file doesn't exist in this branch 836 if errors.Is(err, object.ErrFileNotFound) { 837 continue 838 } 839 840 return nil, nil, nil, fmt.Errorf("getting hash for file %q in branch %q: %w", oldFile.Name, b, err) 841 } 842 843 file := fileKey{Path: oldFileRelativeRootPath, ID: f.ID()} 844 if existing, ok := repos[file]; ok { 845 existing.Branches = append(existing.Branches, b) 846 repos[file] = existing 847 } else { 848 repos[file] = BlobLocation{ 849 GitRepo: repository, 850 URL: u, 851 Branches: []string{b}, 852 } 853 } 854 } 855 856 changedOrDeletedPaths = append(changedOrDeletedPaths, oldFileRelativeRootPath) 857 } 858 } 859 860 // we need to de-duplicate the branch map before returning it - it's possible for the same 861 // branch to have been added multiple times if a file has been modified across multiple commits 862 for _, info := range repos { 863 sort.Strings(info.Branches) 864 info.Branches = uniq(info.Branches) 865 } 866 867 // we also need to de-duplicate the list of changed or deleted file paths, it's also possible to have duplicates 868 // for the same reasoning as above 869 sort.Strings(changedOrDeletedPaths) 870 changedOrDeletedPaths = uniq(changedOrDeletedPaths) 871 872 return repos, nil, changedOrDeletedPaths, nil 873} 874 875func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 876 var repoCache *RepoCache 877 if options.Submodules && options.RepoCacheDir != "" { 878 repoCache = NewRepoCache(options.RepoCacheDir) 879 } 880 return prepareNormalBuildRecurse(options, repository, repoCache, false) 881} 882 883func prepareNormalBuildRecurse(options Options, repository *git.Repository, repoCache *RepoCache, isSubrepo bool) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 884 // Branch => Repo => SHA1 885 branchVersions = map[string]map[string]plumbing.Hash{} 886 887 branches, err := expandBranches(repository, options.Branches, options.BranchPrefix) 888 if err != nil { 889 return nil, nil, fmt.Errorf("expandBranches: %w", err) 890 } 891 892 repoURL := options.BuildOptions.RepositoryDescription.URL 893 894 if isSubrepo { 895 cfg, err := repository.Config() 896 if err != nil { 897 return nil, nil, fmt.Errorf("unable to get repository config: %w", err) 898 } 899 900 u, err := normalizeSubmoduleRemoteURL(cfg) 901 if err != nil { 902 return nil, nil, fmt.Errorf("failed to identify subrepository URL: %w", err) 903 } 904 repoURL = u 905 } 906 907 rw := NewRepoWalker(repository, repoURL, repoCache) 908 for _, b := range branches { 909 commit, err := getCommit(repository, options.BranchPrefix, b) 910 if err != nil { 911 if options.AllowMissingBranch && err.Error() == "reference not found" { 912 continue 913 } 914 915 return nil, nil, fmt.Errorf("getCommit: %w", err) 916 } 917 918 tree, err := commit.Tree() 919 if err != nil { 920 return nil, nil, fmt.Errorf("commit.Tree: %w", err) 921 } 922 923 ig, err := newIgnoreMatcher(tree) 924 if err != nil { 925 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 926 } 927 928 subVersions, err := rw.CollectFiles(tree, b, ig) 929 if err != nil { 930 return nil, nil, fmt.Errorf("CollectFiles: %w", err) 931 } 932 933 branchVersions[b] = subVersions 934 } 935 936 // Index submodules using go-git if we didn't do so using the repo cache 937 if options.Submodules && options.RepoCacheDir == "" { 938 worktree, err := repository.Worktree() 939 if err != nil { 940 return nil, nil, fmt.Errorf("failed to get repository worktree: %w", err) 941 } 942 943 submodules, err := worktree.Submodules() 944 if err != nil { 945 return nil, nil, fmt.Errorf("failed to get submodules: %w", err) 946 } 947 948 for _, submodule := range submodules { 949 subRepository, err := submodule.Repository() 950 if err != nil { 951 log.Printf("failed to open submodule repository: %s, %s", submodule.Config().Name, err) 952 continue 953 } 954 955 sw, subVersions, err := prepareNormalBuildRecurse(options, subRepository, repoCache, true) 956 if err != nil { 957 log.Printf("failed to index submodule repository: %s, %s", submodule.Config().Name, err) 958 continue 959 } 960 961 log.Printf("adding subrepository files from: %s", submodule.Config().Name) 962 963 for k, repo := range sw { 964 rw.Files[fileKey{ 965 SubRepoPath: filepath.Join(submodule.Config().Path, k.SubRepoPath), 966 Path: k.Path, 967 ID: k.ID, 968 }] = repo 969 } 970 971 for k, v := range subVersions { 972 branchVersions[filepath.Join(submodule.Config().Path, k)] = v 973 } 974 } 975 } 976 977 return rw.Files, branchVersions, nil 978} 979 980func createDocument(key fileKey, 981 repos map[fileKey]BlobLocation, 982 opts index.Options, 983) (index.Document, error) { 984 repo := repos[key] 985 blob, err := repo.GitRepo.BlobObject(key.ID) 986 branches := repos[key].Branches 987 988 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 989 if errors.Is(err, plumbing.ErrObjectNotFound) { 990 return skippedLargeDoc(key, branches), nil 991 } 992 993 if err != nil { 994 return index.Document{}, err 995 } 996 997 keyFullPath := key.FullPath() 998 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 999 return skippedLargeDoc(key, branches), nil 1000 } 1001 1002 contents, err := blobContents(blob) 1003 if err != nil { 1004 return index.Document{}, err 1005 } 1006 1007 return index.Document{ 1008 SubRepositoryPath: key.SubRepoPath, 1009 Name: keyFullPath, 1010 Content: contents, 1011 Branches: branches, 1012 }, nil 1013} 1014 1015func skippedLargeDoc(key fileKey, branches []string) index.Document { 1016 return index.Document{ 1017 SkipReason: index.SkipReasonTooLarge, 1018 Name: key.FullPath(), 1019 Branches: branches, 1020 SubRepositoryPath: key.SubRepoPath, 1021 } 1022} 1023 1024func blobContents(blob *object.Blob) ([]byte, error) { 1025 r, err := blob.Reader() 1026 if err != nil { 1027 return nil, err 1028 } 1029 defer r.Close() 1030 1031 var buf bytes.Buffer 1032 buf.Grow(int(blob.Size)) 1033 _, err = buf.ReadFrom(r) 1034 if err != nil { 1035 return nil, err 1036 } 1037 return buf.Bytes(), nil 1038} 1039 1040func uniq(ss []string) []string { 1041 result := ss[:0] 1042 var last string 1043 for i, s := range ss { 1044 if i == 0 || s != last { 1045 result = append(result, s) 1046 } 1047 last = s 1048 } 1049 return result 1050}