fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Use single map for collecting files across branches (#839)

When looking at large `inuse_space` profiles on dot com, I noticed that the
filename maps in `prepareNormalBuild` were taking up a lot of memory. This PR
avoids allocating a separate map per branch; instead, `RepoWalker` collects all
the entries in a single map field (`Files`) that is shared across branches.

+110 -109
+8 -6
cmd/zoekt-repo-index/main.go
··· 43 43 "github.com/sourcegraph/zoekt" 44 44 "github.com/sourcegraph/zoekt/build" 45 45 "github.com/sourcegraph/zoekt/gitindex" 46 + "github.com/sourcegraph/zoekt/ignore" 46 47 "go.uber.org/automaxprocs/maxprocs" 47 48 48 49 git "github.com/go-git/go-git/v5" ··· 180 181 } 181 182 } 182 183 183 - perBranch := map[string]map[fileKey]gitindex.BlobRepo{} 184 + perBranch := map[string]map[fileKey]gitindex.BlobLocation{} 184 185 opts.SubRepositories = map[string]*zoekt.Repository{} 185 186 186 187 // branch => repo => version ··· 325 326 func iterateManifest(mf *manifest.Manifest, 326 327 baseURL url.URL, revPrefix string, 327 328 cache *gitindex.RepoCache, 328 - ) (map[fileKey]gitindex.BlobRepo, map[string]plumbing.Hash, error) { 329 - allFiles := map[fileKey]gitindex.BlobRepo{} 329 + ) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { 330 + allFiles := map[fileKey]gitindex.BlobLocation{} 330 331 allVersions := map[string]plumbing.Hash{} 331 332 for _, p := range mf.Project { 332 333 rev := mf.ProjectRevision(&p) ··· 359 360 return nil, nil, err 360 361 } 361 362 362 - files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache) 363 + rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache) 364 + subVersions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{}) 363 365 if err != nil { 364 366 return nil, nil, err 365 367 } 366 368 367 - for key, repo := range files { 369 + for key, repo := range rw.Files { 368 370 allFiles[fileKey{ 369 371 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), 370 372 Path: key.Path, ··· 372 374 }] = repo 373 375 } 374 376 375 - for path, version := range versions { 377 + for path, version := range subVersions { 376 378 allVersions[filepath.Join(p.GetPath(), path)] = version 377 379 } 378 380 }
+23 -40
gitindex/index.go
··· 426 426 } 427 427 428 428 // branch => (path, sha1) => repo. 429 - var repos map[fileKey]BlobIndexInfo 429 + var repos map[fileKey]BlobLocation 430 430 431 431 // Branch => Repo => SHA1 432 432 var branchVersions map[string]map[string]plumbing.Hash ··· 452 452 } 453 453 } 454 454 455 - reposByPath := map[string]BlobIndexInfo{} 455 + reposByPath := map[string]BlobLocation{} 456 456 for key, info := range repos { 457 457 reposByPath[key.SubRepoPath] = info 458 458 } ··· 461 461 for path, info := range reposByPath { 462 462 tpl := opts.BuildOptions.RepositoryDescription 463 463 if path != "" { 464 - tpl = zoekt.Repository{URL: info.Repo.URL.String()} 465 - if err := SetTemplatesFromOrigin(&tpl, info.Repo.URL); err != nil { 466 - log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.Repo.URL, err) 464 + tpl = zoekt.Repository{URL: info.URL.String()} 465 + if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { 466 + log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) 467 467 } 468 468 } 469 469 opts.BuildOptions.SubRepositories[path] = &tpl ··· 592 592 593 593 // prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing 594 594 // a build.Builder instance for generating a delta build. 595 - type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 595 + type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) 596 596 597 597 // prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing 598 598 // a build.Builder instance for generating a normal build. 
599 - type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) 599 + type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) 600 600 601 601 type gitIndexConfig struct { 602 602 // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to ··· 612 612 prepareNormalBuild prepareNormalBuildFunc 613 613 } 614 614 615 - func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 615 + func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 616 616 if options.Submodules { 617 617 return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") 618 618 } ··· 670 670 } 671 671 672 672 // branch => (path, sha1) => repo. 
673 - repos = map[fileKey]BlobIndexInfo{} 673 + repos = map[fileKey]BlobLocation{} 674 674 675 675 // branch name -> git worktree at most current commit 676 676 branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) ··· 696 696 } 697 697 698 698 // TODO: Support repository submodules for delta builds 699 - // For this prototype, we are ignoring repository submodules, which means that we can use the same 700 - // blob location for all files 701 - hackSharedBlobLocation := BlobRepo{ 702 - GitRepo: repository, 703 - URL: u, 704 - } 699 + 705 700 // loop over all branches, calculate the diff between our 706 701 // last indexed commit and the current commit, and add files mentioned in the diff 707 702 for _, branch := range existingRepository.Branches { ··· 742 737 existing.Branches = append(existing.Branches, branch.Name) 743 738 repos[file] = existing 744 739 } else { 745 - repos[file] = BlobIndexInfo{ 746 - Repo: hackSharedBlobLocation, 740 + repos[file] = BlobLocation{ 741 + GitRepo: repository, 742 + URL: u, 747 743 Branches: []string{branch.Name}, 748 744 } 749 745 } ··· 780 776 existing.Branches = append(existing.Branches, b) 781 777 repos[file] = existing 782 778 } else { 783 - repos[file] = BlobIndexInfo{ 784 - Repo: hackSharedBlobLocation, 779 + repos[file] = BlobLocation{ 780 + GitRepo: repository, 781 + URL: u, 785 782 Branches: []string{b}, 786 783 } 787 784 } ··· 806 803 return repos, nil, changedOrDeletedPaths, nil 807 804 } 808 805 809 - func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) { 806 + func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 810 807 var repoCache *RepoCache 811 808 if options.Submodules { 812 809 repoCache = NewRepoCache(options.RepoCacheDir) 813 810 } 814 - 815 - // branch => (path, 
sha1) => metadata. 816 - repos = map[fileKey]BlobIndexInfo{} 817 811 818 812 // Branch => Repo => SHA1 819 813 branchVersions = map[string]map[string]plumbing.Hash{} ··· 823 817 return nil, nil, fmt.Errorf("expandBranches: %w", err) 824 818 } 825 819 820 + rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache) 826 821 for _, b := range branches { 827 822 commit, err := getCommit(repository, options.BranchPrefix, b) 828 823 if err != nil { ··· 843 838 return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) 844 839 } 845 840 846 - files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache) 841 + subVersions, err := rw.CollectFiles(tree, b, ig) 847 842 if err != nil { 848 - return nil, nil, fmt.Errorf("TreeToFiles: %w", err) 849 - } 850 - for k, v := range files { 851 - if ig.Match(k.Path) { 852 - continue 853 - } 854 - 855 - if existing, ok := repos[k]; ok { 856 - existing.Branches = append(existing.Branches, b) 857 - repos[k] = existing 858 - } else { 859 - repos[k] = BlobIndexInfo{Repo: v, Branches: []string{b}} 860 - } 843 + return nil, nil, fmt.Errorf("CollectFiles: %w", err) 861 844 } 862 845 863 846 branchVersions[b] = subVersions 864 847 } 865 848 866 - return repos, branchVersions, nil 849 + return rw.Files, branchVersions, nil 867 850 } 868 851 869 852 func createDocument(key fileKey, 870 - repos map[fileKey]BlobIndexInfo, 853 + repos map[fileKey]BlobLocation, 871 854 ranks repoPathRanks, 872 855 opts build.Options, 873 856 ) (zoekt.Document, error) { 874 - repo := repos[key].Repo 857 + repo := repos[key] 875 858 blob, err := repo.GitRepo.BlobObject(key.ID) 876 859 branches := repos[key].Branches 877 860
+2 -2
gitindex/index_test.go
··· 608 608 // setup: prepare spy versions of prepare delta / normal build so that we can observe 609 609 // whether they were called appropriately 610 610 deltaBuildCalled := false 611 - prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 611 + prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { 612 612 deltaBuildCalled = true 613 613 return prepareDeltaBuild(options, repository) 614 614 } 615 615 616 616 normalBuildCalled := false 617 - prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) { 617 + prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { 618 618 normalBuildCalled = true 619 619 return prepareNormalBuild(options, repository) 620 620 }
+71 -57
gitindex/tree.go
··· 26 26 "github.com/go-git/go-git/v5/plumbing" 27 27 "github.com/go-git/go-git/v5/plumbing/filemode" 28 28 "github.com/go-git/go-git/v5/plumbing/object" 29 + "github.com/sourcegraph/zoekt/ignore" 29 30 30 - git "github.com/go-git/go-git/v5" 31 + "github.com/go-git/go-git/v5" 31 32 ) 32 33 33 - // repoWalker walks a tree, recursing into submodules. 34 - type repoWalker struct { 35 - repo *git.Repository 34 + // RepoWalker walks one or more commit trees, collecting the files to index in its Files map. 35 + // 36 + // It also recurses into submodules if Options.Submodules is enabled. 37 + type RepoWalker struct { 38 + Files map[fileKey]BlobLocation 36 39 40 + repo *git.Repository 37 41 repoURL *url.URL 38 - tree map[fileKey]BlobRepo 39 42 40 43 // Path => SubmoduleEntry 41 44 submodules map[string]*SubmoduleEntry 42 - 43 - // Path => commit SHA1 44 - subRepoVersions map[string]plumbing.Hash 45 - repoCache *RepoCache 45 + repoCache *RepoCache 46 46 } 47 47 48 48 // subURL returns the URL for a submodule. 49 - func (w *repoWalker) subURL(relURL string) (*url.URL, error) { 50 - if w.repoURL == nil { 49 + func (rw *RepoWalker) subURL(relURL string) (*url.URL, error) { 50 + if rw.repoURL == nil { 51 51 return nil, fmt.Errorf("no URL for base repo") 52 52 } 53 53 if strings.HasPrefix(relURL, "../") { 54 - u := *w.repoURL 54 + u := *rw.repoURL 55 55 u.Path = path.Join(u.Path, relURL) 56 56 return &u, nil 57 57 } ··· 59 59 return url.Parse(relURL) 60 60 } 61 61 62 - // newRepoWalker creates a new repoWalker. 63 - func newRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *repoWalker { 62 + // NewRepoWalker creates a new RepoWalker. 
63 + func NewRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *RepoWalker { 64 64 u, _ := url.Parse(repoURL) 65 - return &repoWalker{ 66 - repo: r, 67 - repoURL: u, 68 - tree: map[fileKey]BlobRepo{}, 69 - repoCache: repoCache, 70 - subRepoVersions: map[string]plumbing.Hash{}, 65 + return &RepoWalker{ 66 + repo: r, 67 + repoURL: u, 68 + Files: map[fileKey]BlobLocation{}, 69 + repoCache: repoCache, 71 70 } 72 71 } 73 72 74 73 // parseModuleMap initializes rw.submodules. 75 - func (rw *repoWalker) parseModuleMap(t *object.Tree) error { 74 + func (rw *RepoWalker) parseModuleMap(t *object.Tree) error { 76 75 if rw.repoCache == nil { 77 76 return nil 78 77 } ··· 94 93 return nil 95 94 } 96 95 97 - // TreeToFiles fetches the blob SHA1s for a tree. If repoCache is 96 + // CollectFiles fetches the blob SHA1s for the tree. If repoCache is 98 97 // non-nil, recurse into submodules. In addition, it returns a mapping 99 98 // that indicates in which repo each SHA1 can be found. 100 - func TreeToFiles(r *git.Repository, t *object.Tree, repoURL string, repoCache *RepoCache) (map[fileKey]BlobRepo, map[string]plumbing.Hash, error) { 101 - rw := newRepoWalker(r, repoURL, repoCache) 102 - 99 + // 100 + // The collected files are available through the RepoWalker.Files map. 
101 + func (rw *RepoWalker) CollectFiles(t *object.Tree, branch string, ig *ignore.Matcher) (map[string]plumbing.Hash, error) { 103 102 if err := rw.parseModuleMap(t); err != nil { 104 - return nil, nil, fmt.Errorf("parseModuleMap: %w", err) 103 + return nil, fmt.Errorf("parseModuleMap: %w", err) 104 + } 105 + 106 + ig, err := newIgnoreMatcher(t) 107 + if err != nil { 108 + return nil, fmt.Errorf("newIgnoreMatcher: %w", err) 105 109 } 106 110 107 111 tw := object.NewTreeWalker(t, true, make(map[plumbing.Hash]bool)) 108 112 defer tw.Close() 113 + 114 + // Path => commit SHA1 115 + subRepoVersions := make(map[string]plumbing.Hash) 109 116 for { 110 117 name, entry, err := tw.Next() 111 118 if err == io.EOF { 112 119 break 113 120 } 114 - if err := rw.handleEntry(name, &entry); err != nil { 115 - return nil, nil, fmt.Errorf("handleEntry: %w", err) 121 + if err := rw.handleEntry(name, &entry, branch, subRepoVersions, ig); err != nil { 122 + return nil, fmt.Errorf("handleEntry: %w", err) 116 123 } 117 124 } 118 - return rw.tree, rw.subRepoVersions, nil 125 + return subRepoVersions, nil 119 126 } 120 127 121 - func (r *repoWalker) tryHandleSubmodule(p string, id *plumbing.Hash) error { 122 - if err := r.handleSubmodule(p, id); err != nil { 128 + func (rw *RepoWalker) tryHandleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { 129 + if err := rw.handleSubmodule(p, id, branch, subRepoVersions, ig); err != nil { 123 130 log.Printf("submodule %s: ignoring error %v", p, err) 124 131 } 125 132 return nil 126 133 } 127 134 128 - func (r *repoWalker) handleSubmodule(p string, id *plumbing.Hash) error { 129 - submod := r.submodules[p] 135 + func (rw *RepoWalker) handleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { 136 + submod := rw.submodules[p] 130 137 if submod == nil { 131 - return fmt.Errorf("no entry for submodule path %q", 
r.repoURL) 138 + return fmt.Errorf("no entry for submodule path %q", rw.repoURL) 132 139 } 133 140 134 - subURL, err := r.subURL(submod.URL) 141 + subURL, err := rw.subURL(submod.URL) 135 142 if err != nil { 136 143 return err 137 144 } 138 145 139 - subRepo, err := r.repoCache.Open(subURL) 146 + subRepo, err := rw.repoCache.Open(subURL) 140 147 if err != nil { 141 148 return err 142 149 } ··· 150 157 return err 151 158 } 152 159 153 - r.subRepoVersions[p] = *id 160 + subRepoVersions[p] = *id 154 161 155 - subTree, subVersions, err := TreeToFiles(subRepo, tree, subURL.String(), r.repoCache) 162 + sw := NewRepoWalker(subRepo, subURL.String(), rw.repoCache) 163 + subVersions, err := sw.CollectFiles(tree, branch, ig) 156 164 if err != nil { 157 165 return err 158 166 } 159 - for k, repo := range subTree { 160 - r.tree[fileKey{ 167 + for k, repo := range sw.Files { 168 + rw.Files[fileKey{ 161 169 SubRepoPath: filepath.Join(p, k.SubRepoPath), 162 170 Path: k.Path, 163 171 ID: k.ID, 164 172 }] = repo 165 173 } 166 174 for k, v := range subVersions { 167 - r.subRepoVersions[filepath.Join(p, k)] = v 175 + subRepoVersions[filepath.Join(p, k)] = v 168 176 } 169 177 return nil 170 178 } 171 179 172 - func (r *repoWalker) handleEntry(p string, e *object.TreeEntry) error { 173 - if e.Mode == filemode.Submodule && r.repoCache != nil { 174 - if err := r.tryHandleSubmodule(p, &e.Hash); err != nil { 180 + func (rw *RepoWalker) handleEntry(p string, e *object.TreeEntry, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { 181 + if e.Mode == filemode.Submodule && rw.repoCache != nil { 182 + if err := rw.tryHandleSubmodule(p, &e.Hash, branch, subRepoVersions, ig); err != nil { 175 183 return fmt.Errorf("submodule %s: %v", p, err) 176 184 } 177 185 } ··· 182 190 return nil 183 191 } 184 192 185 - r.tree[fileKey{Path: p, ID: e.Hash}] = BlobRepo{ 186 - GitRepo: r.repo, 187 - URL: r.repoURL, 193 + // Skip ignored files 194 + if ig.Match(p) { 195 + return 
nil 196 + } 197 + 198 + key := fileKey{Path: p, ID: e.Hash} 199 + if existing, ok := rw.Files[key]; ok { 200 + existing.Branches = append(existing.Branches, branch) 201 + rw.Files[key] = existing 202 + } else { 203 + rw.Files[key] = BlobLocation{GitRepo: rw.repo, URL: rw.repoURL, Branches: []string{branch}} 188 204 } 205 + 189 206 return nil 190 207 } 191 208 ··· 201 218 return filepath.Join(k.SubRepoPath, k.Path) 202 219 } 203 220 204 - // BlobIndexInfo contains information about the blob that's needed for indexing. 205 - type BlobIndexInfo struct { 206 - Repo BlobRepo 207 - // Branches is the list of branches that contain the blob. 208 - Branches []string 209 - } 210 - 211 - // BlobRepo holds the repo where the blob can be found. 212 - type BlobRepo struct { 221 + // BlobLocation holds the repo where the blob can be found, plus other information 222 + // needed for indexing like its branches. 223 + type BlobLocation struct { 213 224 GitRepo *git.Repository 214 225 URL *url.URL 226 + 227 + // Branches is the list of branches that contain the blob. 228 + Branches []string 215 229 } 216 230 217 - func (l *BlobRepo) Blob(id *plumbing.Hash) ([]byte, error) { 231 + func (l *BlobLocation) Blob(id *plumbing.Hash) ([]byte, error) { 218 232 blob, err := l.GitRepo.BlobObject(*id) 219 233 if err != nil { 220 234 return nil, err
+6 -4
gitindex/tree_test.go
··· 29 29 30 30 "github.com/google/go-cmp/cmp" 31 31 "github.com/grafana/regexp" 32 + "github.com/sourcegraph/zoekt/ignore" 32 33 33 34 "github.com/sourcegraph/zoekt" 34 35 "github.com/sourcegraph/zoekt/build" ··· 139 140 } 140 141 } 141 142 142 - func TestTreeToFiles(t *testing.T) { 143 + func TestCollectFiles(t *testing.T) { 143 144 dir := t.TempDir() 144 145 145 146 if err := createSubmoduleRepo(dir); err != nil { ··· 168 169 t.Fatalf("AsTree: %v", err) 169 170 } 170 171 171 - files, versions, err := TreeToFiles(repo, tree, aURL.String(), cache) 172 + rw := NewRepoWalker(repo, aURL.String(), cache) 173 + versions, err := rw.CollectFiles(tree, "main", &ignore.Matcher{}) 172 174 if err != nil { 173 - t.Fatalf("TreeToFiles: %v", err) 175 + t.Fatalf("CollectFiles: %v", err) 174 176 } 175 177 176 178 bnameHash := versions["bname"] ··· 181 183 } 182 184 183 185 var paths []string 184 - for k := range files { 186 + for k := range rw.Files { 185 187 paths = append(paths, k.FullPath()) 186 188 } 187 189 sort.Strings(paths)