fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Avoid reopening packfile on every object access (#852)

By default, the `go-git` library will open the packfile on every call to `Repository.BlobObject`, then close it. During indexing, we collect the list of files to index, then iterate through each one calling `Repository.BlobObject`. So on every object access the packfile is reopened, and `go-git` reallocates some in-memory buffers.

This PR bypasses `git.PlainOpen` to allow us to enable the `KeepDescriptors` option. This option keeps packfiles open, and caches wrappers for them. The files then need to be explicitly closed once we are done with the repo.

Benefits:
* Avoid reallocating the memory buffers on every object access (see benchmark results below)
* (Highly speculative) I suspect this could improve OS decisions around when to cache portions of the packfile. Maybe constantly reopening and seeking within the file makes it harder for the OS to determine the true access pattern, which is roughly random access. This can affect decisions like readahead and whether to consider pages 'active'.

+129 -3
+51 -3
gitindex/index.go
··· 32 32 "strconv" 33 33 "strings" 34 34 35 + "github.com/go-git/go-billy/v5/osfs" 36 + "github.com/go-git/go-git/v5/plumbing/cache" 37 + "github.com/go-git/go-git/v5/storage/filesystem" 35 38 "github.com/sourcegraph/zoekt" 36 39 "github.com/sourcegraph/zoekt/build" 37 40 "github.com/sourcegraph/zoekt/ignore" ··· 404 407 } 405 408 406 409 opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir 407 - repo, err := git.PlainOpen(opts.RepoDir) 408 - if err != nil { 409 - return false, fmt.Errorf("git.PlainOpen: %w", err) 410 + 411 + var repo *git.Repository 412 + 413 + // TODO: remove this feature flag once we test this on a large-scale instance. 414 + optimizeRepoOpen := os.Getenv("ZOEKT_ENABLE_GOGIT_OPTIMIZATION") 415 + if b, err := strconv.ParseBool(optimizeRepoOpen); b && err == nil { 416 + var repoCloser io.Closer 417 + repo, repoCloser, err = openRepo(opts.RepoDir) 418 + if err != nil { 419 + return false, fmt.Errorf("openRepo: %w", err) 420 + } 421 + defer repoCloser.Close() 422 + } else { 423 + repo, err = git.PlainOpen(opts.RepoDir) 424 + if err != nil { 425 + return false, fmt.Errorf("git.PlainOpen: %w", err) 426 + } 410 427 } 411 428 412 429 if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { ··· 570 587 } 571 588 } 572 589 return true, builder.Finish() 590 + } 591 + 592 + // openRepo opens a git repository in a way that's optimized for indexing. 593 + // 594 + // It copies the relevant logic from git.PlainOpen, and enables the filesystem KeepDescriptors option. This 595 + // caches the packfile handles, preventing the packfile from being opened then closed on every object access. 596 + func openRepo(repoDir string) (*git.Repository, io.Closer, error) { 597 + fs := osfs.New(repoDir) 598 + 599 + // Check if the root directory exists. 
600 + if _, err := fs.Stat(""); err != nil { 601 + if os.IsNotExist(err) { 602 + return nil, nil, git.ErrRepositoryNotExists 603 + } 604 + return nil, nil, err 605 + } 606 + 607 + // If there's a .git directory, use that as the new root. 608 + if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() { 609 + if fs, err = fs.Chroot(git.GitDirName); err != nil { 610 + return nil, nil, fmt.Errorf("fs.Chroot: %w", err) 611 + } 612 + } 613 + 614 + s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ 615 + KeepDescriptors: true, 616 + }) 617 + 618 + // Because we're keeping descriptors open, we need to close the storage object when we're done. 619 + repo, err := git.Open(s, fs) 620 + return repo, s, err 573 621 } 574 622 575 623 type repoPathRanks struct {
+78
gitindex/index_test.go
··· 17 17 import ( 18 18 "bytes" 19 19 "context" 20 + "errors" 20 21 "fmt" 21 22 "net/url" 22 23 "os" ··· 62 63 if _, err := IndexGitRepo(opts); err != nil { 63 64 t.Fatalf("IndexGitRepo: %v", err) 64 65 } 66 + } 67 + 68 + func TestIndexNonexistentRepo(t *testing.T) { 69 + dir := t.TempDir() 70 + desc := zoekt.Repository{ 71 + Name: "nonexistent", 72 + } 73 + opts := Options{ 74 + RepoDir: "does/not/exist", 75 + Branches: []string{"main"}, 76 + BuildOptions: build.Options{ 77 + RepositoryDescription: desc, 78 + IndexDir: dir, 79 + }, 80 + } 81 + 82 + if _, err := IndexGitRepo(opts); err == nil { 83 + t.Fatal("expected error, got none") 84 + } else if !errors.Is(err, git.ErrRepositoryNotExists) { 85 + t.Fatalf("expected git.ErrRepositoryNotExists, got %v", err) 86 + } 87 + } 88 + 89 + func TestIndexTinyRepo(t *testing.T) { 90 + // Create a repo with one file in it. 91 + dir := t.TempDir() 92 + executeCommand(t, dir, exec.Command("git", "init", "-b", "main", "repo")) 93 + 94 + repoDir := filepath.Join(dir, "repo") 95 + executeCommand(t, repoDir, exec.Command("git", "config", "user.name", "Thomas")) 96 + executeCommand(t, repoDir, exec.Command("git", "config", "user.email", "thomas@google.com")) 97 + 98 + if err := os.WriteFile(filepath.Join(repoDir, "file1.go"), []byte("package main\n\nfunc main() {}\n"), 0644); err != nil { 99 + t.Fatalf("WriteFile: %v", err) 100 + } 101 + executeCommand(t, repoDir, exec.Command("git", "add", ".")) 102 + executeCommand(t, repoDir, exec.Command("git", "commit", "-m", "initial commit")) 103 + 104 + // Test that indexing accepts both the repo directory, and the .git subdirectory. 
105 + for _, testDir := range []string{"repo", "repo/.git"} { 106 + opts := Options{ 107 + RepoDir: filepath.Join(dir, testDir), 108 + Branches: []string{"main"}, 109 + BuildOptions: build.Options{ 110 + RepositoryDescription: zoekt.Repository{Name: "repo"}, 111 + IndexDir: dir, 112 + }, 113 + } 114 + 115 + if _, err := IndexGitRepo(opts); err != nil { 116 + t.Fatalf("unexpected error %v", err) 117 + } 118 + 119 + searcher, err := shards.NewDirectorySearcher(dir) 120 + if err != nil { 121 + t.Fatal("NewDirectorySearcher", err) 122 + } 123 + 124 + results, err := searcher.Search(context.Background(), &query.Const{Value: true}, &zoekt.SearchOptions{}) 125 + searcher.Close() 126 + 127 + if err != nil { 128 + t.Fatal("search failed", err) 129 + } 130 + 131 + if len(results.Files) != 1 { 132 + t.Fatalf("got search result %v, want 1 file", results.Files) 133 + } 134 + } 135 + } 136 + 137 + func executeCommand(t *testing.T, dir string, cmd *exec.Cmd) *exec.Cmd { 138 + cmd.Dir = dir 139 + if err := cmd.Run(); err != nil { 140 + t.Fatalf("cmd.Run: %v", err) 141 + } 142 + return cmd 65 143 } 66 144 67 145 func TestIndexDeltaBasic(t *testing.T) {