fork of https://github.com/sourcegraph/zoekt
1package index
2
3import (
4 "bytes"
5 "fmt"
6 "io/fs"
7 "os"
8 "path/filepath"
9 "testing"
10)
11
12// Set ZOEKT_BENCH_REPO to a source tree (e.g. a kubernetes checkout) to enable.
13//
14// git clone --depth=1 https://github.com/kubernetes/kubernetes /tmp/k8s
15// ZOEKT_BENCH_REPO=/tmp/k8s go test ./index/ -bench=BenchmarkPostings -benchmem -count=5 -timeout=600s
16
// requireBenchRepo returns the directory named by the ZOEKT_BENCH_REPO
// environment variable. When the variable is unset or empty the calling
// benchmark is skipped.
func requireBenchRepo(b *testing.B) string {
	b.Helper()
	repo := os.Getenv("ZOEKT_BENCH_REPO")
	if repo != "" {
		return repo
	}
	// b.Skip does not return; the trailing return only satisfies the compiler.
	b.Skip("ZOEKT_BENCH_REPO not set")
	return repo
}
25
// loadRepoFiles walks dir and returns the contents of the regular files it
// finds, collecting at most maxFiles entries.
//
// The walk is best effort: entries that cannot be listed, stat'ed or read are
// skipped instead of failing the benchmark. The following are excluded:
//   - the .git, vendor and node_modules directories (not descended into),
//   - anything that is not a regular file (symlinks, pipes, devices, ...),
//   - empty files and files larger than 1 MB,
//   - binary files, detected by the presence of a NUL byte.
//
// The benchmark is failed if the walk itself returns an error or if no file
// qualifies.
func loadRepoFiles(b *testing.B, dir string, maxFiles int) [][]byte {
	b.Helper()

	const maxFileSize = 1 << 20 // 1 MB

	var files [][]byte
	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			// Best effort: ignore entries that could not be visited.
			return nil
		}
		if d.IsDir() {
			switch d.Name() {
			case ".git", "vendor", "node_modules":
				return filepath.SkipDir
			}
			return nil
		}
		if len(files) >= maxFiles {
			return filepath.SkipAll
		}
		// WalkDir does not follow symlinks, so for a symlink d.Info() reports
		// the size of the link itself while os.ReadFile would read the link's
		// target. That would let oversized targets slip past the size check
		// and load symlinked files twice, so only regular files are read.
		if !d.Type().IsRegular() {
			return nil
		}
		info, err := d.Info()
		if err != nil || info.Size() == 0 || info.Size() > maxFileSize {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return nil
		}
		if bytes.IndexByte(data, 0) >= 0 {
			return nil // binary
		}
		files = append(files, data)
		return nil
	})
	if err != nil {
		b.Fatalf("walking repo: %v", err)
	}
	if len(files) == 0 {
		b.Fatal("no files found in repo")
	}
	return files
}
67
// totalSize returns the combined length, in bytes, of every entry in files.
func totalSize(files [][]byte) int64 {
	sum := int64(0)
	for i := range files {
		sum += int64(len(files[i]))
	}
	return sum
}
75
76// BenchmarkPostings_NewSearchableString measures the core hot path: trigram
77// extraction, map lookups, delta encoding, and per-trigram slice growth.
78// Sub-benchmarks vary corpus size to show scaling with map size.
79func BenchmarkPostings_NewSearchableString(b *testing.B) {
80 dir := requireBenchRepo(b)
81 allFiles := loadRepoFiles(b, dir, 50_000)
82 b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(totalSize(allFiles))/(1<<20))
83
84 for _, n := range []int{1_000, 5_000, len(allFiles)} {
85 n = min(n, len(allFiles))
86 files := allFiles[:n]
87 size := totalSize(files)
88
89 b.Run(fmt.Sprintf("files=%d", n), func(b *testing.B) {
90 b.ReportAllocs()
91 for b.Loop() {
92 pb := newPostingsBuilder(defaultShardMax)
93 for _, data := range files {
94 _, _, _ = pb.newSearchableString(data, nil)
95 }
96 }
97 b.ReportMetric(float64(size), "input-bytes/op")
98 })
99 }
100}
101
102// BenchmarkPostings_Reuse measures the warm path: building postings with a
103// reset (pooled) postingsBuilder that retains its map and slice allocations
104// from a previous shard build.
105func BenchmarkPostings_Reuse(b *testing.B) {
106 dir := requireBenchRepo(b)
107 allFiles := loadRepoFiles(b, dir, 50_000)
108 size := totalSize(allFiles)
109 b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(size)/(1<<20))
110
111 // Warm up the builder so it has allocated map entries and slices.
112 pb := newPostingsBuilder(defaultShardMax)
113 for _, data := range allFiles {
114 _, _, _ = pb.newSearchableString(data, nil)
115 }
116
117 b.ResetTimer()
118 b.ReportAllocs()
119 for b.Loop() {
120 pb.reset()
121 for _, data := range allFiles {
122 _, _, _ = pb.newSearchableString(data, nil)
123 }
124 }
125 b.ReportMetric(float64(size), "input-bytes/op")
126}
127
128// BenchmarkPostings_WritePostings measures the marshaling path: sorting ngram
129// keys and writing varint-encoded posting lists.
130func BenchmarkPostings_WritePostings(b *testing.B) {
131 dir := requireBenchRepo(b)
132 allFiles := loadRepoFiles(b, dir, 50_000)
133
134 pb := newPostingsBuilder(defaultShardMax)
135 for _, data := range allFiles {
136 _, _, _ = pb.newSearchableString(data, nil)
137 }
138 b.Logf("built %d unique ngrams from %d files, %.1f MB", len(pb.postings), len(allFiles), float64(totalSize(allFiles))/(1<<20))
139
140 buf := &bytes.Buffer{}
141 b.ResetTimer()
142 b.ReportAllocs()
143 for b.Loop() {
144 buf.Reset()
145 w := &writer{w: buf}
146 var ngramText, charOffsets, endRunes simpleSection
147 var postings compoundSection
148 writePostings(w, pb, &ngramText, &charOffsets, &postings, &endRunes)
149 }
150}