// Fork of https://github.com/sourcegraph/zoekt
1package e2e
2
3import (
4 "bytes"
5 "context"
6 "flag"
7 "fmt"
8 "io"
9 "net/url"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "strings"
14 "testing"
15 "time"
16
17 "github.com/google/go-cmp/cmp"
18 "github.com/sourcegraph/zoekt"
19 "github.com/sourcegraph/zoekt/index"
20 "github.com/sourcegraph/zoekt/internal/archive"
21 "github.com/sourcegraph/zoekt/internal/shards"
22 "github.com/sourcegraph/zoekt/query"
23)
24
// Command-line flags understood by this test package.
var (
	// update makes assertGolden overwrite the golden file with the
	// freshly computed output before comparing.
	update = flag.Bool("update", false, "update golden file")

	// useShardCache makes TestRanking index into the shared shardCache
	// directory instead of a per-run temporary directory.
	useShardCache = flag.Bool("shard_cache", false, "cache computed shards for faster test runs")

	// debugScore enables far more verbose output. It exists purely for
	// debugging in a local environment: golden files regenerated with it
	// set must not be committed.
	debugScore = flag.Bool("debug_score", false, "include debug output in golden files.")
)
33
34func TestRanking(t *testing.T) {
35 if testing.Short() {
36 t.Skip("skipping due to short flag")
37 }
38
39 requireCTags(t)
40
41 archiveURLs := []string{
42 "https://github.com/sourcegraph/sourcegraph-public-snapshot/tree/v5.2.2", // Nov 1 2023
43 "https://github.com/golang/go/tree/go1.21.4", // Nov 7 2023
44 "https://github.com/sourcegraph/cody/tree/vscode-v0.14.5", // Nov 8 2023
45 // The commit before ranking e2e tests were added to avoid matching
46 // content inside our golden files.
47 "https://github.com/sourcegraph/zoekt/commit/ef907c2371176aa3f97713d5bf182983ef090c6a", // Nov 17 2023
48 "https://github.com/sourcegraph/conc/tree/5f936abd7ae87036af1f75c95fb9d0daaf00116b", // Jan 21 2024
49 }
50 q := func(query, target string) rankingQuery {
51 return rankingQuery{Query: query, Target: target}
52 }
53 queries := []rankingQuery{
54 // golang/go
55 q("test server", "github.com/golang/go/src/net/http/httptest/server.go"),
56 q("bytes buffer", "github.com/golang/go/src/bytes/buffer.go"),
57 q("bufio buffer", "github.com/golang/go/src/bufio/scan.go"),
58 q("time compare\\(", "github.com/golang/go/src/time/time.go"),
59
60 // sourcegraph/sourcegraph
61 q("graphql type User", "github.com/sourcegraph/sourcegraph-public-snapshot/cmd/frontend/graphqlbackend/schema.graphql"),
62 q("Get database/user", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/database/users.go"),
63 q("InternalDoer", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/httpcli/client.go"),
64 q("Repository metadata Write rbac", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/rbac/constants.go"), // unsure if this is the best doc?
65
66 // cody
67 q("generate unit test", "github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts"),
68 q("r:cody sourcegraph url", "github.com/sourcegraph/cody/lib/shared/src/sourcegraph-api/graphql/client.ts"),
69
70 // zoekt
71 q("zoekt searcher", "github.com/sourcegraph/zoekt/api.go"),
72
73 // exact phrases
74 q("assets are not configured for this binary", "github.com/sourcegraph/sourcegraph-public-snapshot/ui/assets/assets.go"),
75 q("sourcegraph/server docker image build", "github.com/sourcegraph/sourcegraph-public-snapshot/dev/tools.go"),
76
77 // symbols split up
78 q("bufio flush writer", "github.com/golang/go/src/net/http/transfer.go"), // bufioFlushWriter
79 q("coverage data writer", "github.com/golang/go/src/internal/coverage/encodecounter/encode.go"), // CoverageDataWriter
80
81 // sourcegraph/conc vs golang/go
82 q("WaitGroup", "github.com/sourcegraph/conc/waitgroup.go"),
83 }
84
85 var indexDir string
86 if *useShardCache {
87 t.Logf("reusing index dir to speed up testing. If you have unexpected results remove %s", shardCache)
88 indexDir = shardCache
89 } else {
90 indexDir = t.TempDir()
91 }
92
93 for _, u := range archiveURLs {
94 if err := indexURL(indexDir, u); err != nil {
95 t.Fatal(err)
96 }
97 }
98
99 ss, err := shards.NewDirectorySearcher(indexDir)
100 if err != nil {
101 t.Fatalf("NewDirectorySearcher(%s): %v", indexDir, err)
102 }
103 defer ss.Close()
104
105 var ranks []int
106 for _, rq := range queries {
107 // normalise queryStr for writing to fs
108 name := strings.Map(func(r rune) rune {
109 if strings.ContainsRune(" :", r) {
110 return '_'
111 }
112 if '0' <= r && r <= '9' ||
113 'a' <= r && r <= 'z' ||
114 'A' <= r && r <= 'Z' {
115 return r
116 }
117 return -1
118 }, rq.Query)
119
120 t.Run(name, func(t *testing.T) {
121 q, err := query.Parse(rq.Query)
122 if err != nil {
123 t.Fatal(err)
124 }
125
126 sOpts := zoekt.SearchOptions{
127 // Use the same options sourcegraph has by default
128 ChunkMatches: true,
129 MaxWallTime: 20 * time.Second,
130 ShardMaxMatchCount: 10_000 * 10,
131 TotalMaxMatchCount: 100_000 * 10,
132 MaxDocDisplayCount: 500,
133
134 DebugScore: *debugScore,
135 }
136 result, err := ss.Search(context.Background(), q, &sOpts)
137 if err != nil {
138 t.Fatal(err)
139 }
140
141 ranks = append(ranks, targetRank(rq, result.Files))
142
143 var gotBuf bytes.Buffer
144 marshalMatches(&gotBuf, rq, q, result.Files)
145 assertGolden(t, name, gotBuf.Bytes())
146 })
147 }
148
149 t.Run("rank_stats", func(t *testing.T) {
150 if len(ranks) != len(queries) {
151 t.Skip("not computing rank stats since not all query cases ran")
152 }
153
154 var gotBuf bytes.Buffer
155 printf := func(format string, a ...any) {
156 _, _ = fmt.Fprintf(&gotBuf, format, a...)
157 }
158
159 printf("queries: %d\n", len(ranks))
160
161 for _, recallThreshold := range []int{1, 5} {
162 count := 0
163 for _, rank := range ranks {
164 if rank <= recallThreshold && rank > 0 {
165 count++
166 }
167 }
168 countp := float64(count) * 100 / float64(len(ranks))
169 printf("recall@%d: %d (%.0f%%)\n", recallThreshold, count, countp)
170 }
171
172 // Mean reciprocal rank
173 mrr := float64(0)
174 for _, rank := range ranks {
175 if rank > 0 {
176 mrr += 1 / float64(rank)
177 }
178 }
179 mrr /= float64(len(ranks))
180 printf("mrr: %f\n", mrr)
181
182 assertGolden(t, "rank_stats", gotBuf.Bytes())
183 })
184}
185
186func assertGolden(t *testing.T, name string, got []byte) {
187 t.Helper()
188
189 wantPath := filepath.Join("testdata", name+".txt")
190 if *update {
191 if err := os.WriteFile(wantPath, got, 0o600); err != nil {
192 t.Fatal(err)
193 }
194 }
195 want, err := os.ReadFile(wantPath)
196 if err != nil {
197 t.Fatal(err)
198 }
199
200 if d := cmp.Diff(string(want), string(got)); d != "" {
201 t.Fatalf("unexpected (-want, +got):\n%s", d)
202 }
203}
204
// rankingQuery pairs a search query with the document that query is
// expected to surface.
type rankingQuery struct {
	// Query is the raw query string; TestRanking hands it to query.Parse.
	Query string

	// Target is the expected document, written as the repository name and
	// the file name joined by "/", which is the form docName produces.
	Target string
}
209
// Cache directories under /tmp. Each name is suffixed with the value of the
// USER environment variable.
var (
	// tarballCache is where indexURL stores downloaded archives.
	tarballCache = "/tmp/zoekt-test-ranking-tarballs-" + os.Getenv("USER")

	// shardCache is the index directory TestRanking uses when the
	// -shard_cache flag is set.
	shardCache = "/tmp/zoekt-test-ranking-shards-" + os.Getenv("USER")
)
214
215func indexURL(indexDir, u string) error {
216 if err := os.MkdirAll(tarballCache, 0o700); err != nil {
217 return err
218 }
219
220 opts := archive.Options{
221 Archive: u,
222 Incremental: true,
223 }
224 opts.SetDefaults() // sets metadata like Name and the codeload URL
225 u = opts.Archive
226
227 // if opts.Commit is set but opts.Branch is not, then we just need to give
228 // the commit a name for testing.
229 if opts.Commit != "" && opts.Branch == "" {
230 opts.Branch = "test"
231 }
232
233 // update Archive location to cached location
234 cacheBase := fmt.Sprintf("%s-%s%s.tar.gz", url.QueryEscape(opts.Name), opts.Branch, opts.Commit) // assume .tar.gz
235 path := filepath.Join(tarballCache, cacheBase)
236 opts.Archive = path
237
238 if _, err := os.Stat(path); os.IsNotExist(err) {
239 if err := download(u, path); err != nil {
240 return err
241 }
242 }
243
244 // TODO scip
245 // languageMap := make(ctags.LanguageMap)
246 // for _, lang := range []string{"kotlin", "rust", "ruby", "go", "python", "javascript", "c_sharp", "scala", "typescript", "zig"} {
247 // languageMap[lang] = ctags.ScipCTags
248 // }
249
250 err := archive.Index(opts, index.Options{
251 IndexDir: indexDir,
252 CTagsMustSucceed: true,
253 RepositoryDescription: zoekt.Repository{
254 // Use the latest commit date to calculate the repo rank when loading the shard.
255 // This is the same setting we use in production.
256 RawConfig: map[string]string{"latestCommitDate": "1"},
257 },
258 })
259 if err != nil {
260 return fmt.Errorf("failed to index %s: %w", opts.Archive, err)
261 }
262
263 return nil
264}
265
266func download(url, dst string) error {
267 tmpPath := dst + ".part"
268
269 rc, err := archive.OpenReader(url)
270 if err != nil {
271 return err
272 }
273 defer rc.Close()
274
275 f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
276 if err != nil {
277 return err
278 }
279 defer f.Close()
280
281 _, err = io.Copy(f, rc)
282 if err != nil {
283 return err
284 }
285
286 err = f.Close()
287 if err != nil {
288 return err
289 }
290
291 return os.Rename(tmpPath, dst)
292}
293
// Limits on how much of a search result marshalMatches writes out.
const (
	// chunkMatchesPerFile is the number of chunk matches written per file;
	// the rest are summarised as a hidden count.
	chunkMatchesPerFile = 3

	// fileMatchesPerSearch is the number of file matches written per
	// search; the rest are summarised as a hidden count.
	fileMatchesPerSearch = 6
)
298
299func docName(f zoekt.FileMatch) string {
300 return f.Repository + "/" + f.FileName
301}
302
303func marshalMatches(w io.Writer, rq rankingQuery, q query.Q, files []zoekt.FileMatch) {
304 _, _ = fmt.Fprintf(w, "queryString: %s\n", rq.Query)
305 _, _ = fmt.Fprintf(w, "query: %s\n", q)
306 _, _ = fmt.Fprintf(w, "targetRank: %d\n\n", targetRank(rq, files))
307
308 files, hiddenFiles := splitAtIndex(files, fileMatchesPerSearch)
309 for _, f := range files {
310 doc := docName(f)
311 if doc == rq.Target {
312 doc = "**" + doc + "**"
313 }
314 _, _ = fmt.Fprintf(w, "%s%s\n", doc, addTabIfNonEmpty(f.Debug))
315
316 chunks, hidden := splitAtIndex(f.ChunkMatches, chunkMatchesPerFile)
317
318 for _, m := range chunks {
319 _, _ = fmt.Fprintf(w, "%d:%s%s\n", m.ContentStart.LineNumber, strings.TrimRight(string(m.Content), "\n"), addTabIfNonEmpty(m.DebugScore))
320 }
321
322 if len(hidden) > 0 {
323 _, _ = fmt.Fprintf(w, "hidden %d more line matches\n", len(hidden))
324 }
325 _, _ = fmt.Fprintln(w)
326 }
327
328 if len(hiddenFiles) > 0 {
329 fmt.Fprintf(w, "hidden %d more file matches\n", len(hiddenFiles))
330 }
331}
332
333func targetRank(rq rankingQuery, files []zoekt.FileMatch) int {
334 for i, f := range files {
335 if docName(f) == rq.Target {
336 return i + 1
337 }
338 }
339 return -1
340}
341
// splitAtIndex cuts s into the elements before idx and the elements from idx
// onwards. When idx is at or past the end of s, the whole of s is returned
// as the first part and the second part is nil. Both parts are sub-slices of
// s, not copies.
func splitAtIndex[E any](s []E, idx int) ([]E, []E) {
	if idx >= len(s) {
		return s, nil
	}
	head, tail := s[:idx], s[idx:]
	return head, tail
}
348
// addTabIfNonEmpty returns s prefixed with a tab character, or the empty
// string unchanged when s is empty.
func addTabIfNonEmpty(s string) string {
	if s == "" {
		return s
	}
	return "\t" + s
}
355
// requireCTags returns normally when ctags looks available: either the
// CTAGS_COMMAND environment variable is non-empty or a "universal-ctags"
// executable is found on PATH. Otherwise it fails the test when the CI
// environment variable is non-empty and skips it when CI is unset.
func requireCTags(tb testing.TB) {
	tb.Helper()

	available := os.Getenv("CTAGS_COMMAND") != ""
	if !available {
		_, err := exec.LookPath("universal-ctags")
		available = err == nil
	}
	if available {
		return
	}

	// On CI we require ctags to be available. Otherwise we skip
	if os.Getenv("CI") == "" {
		tb.Skip("universal-ctags is missing")
		return
	}
	tb.Fatal("universal-ctags is missing")
}
373
// checkScipCTags returns the scip-ctags command to use: the value of the
// SCIP_CTAGS_COMMAND environment variable when it is non-empty, otherwise
// the path of a "scip-ctags" executable found on PATH. It returns the empty
// string when neither is available.
func checkScipCTags() string {
	if fromEnv := os.Getenv("SCIP_CTAGS_COMMAND"); fromEnv != "" {
		return fromEnv
	}

	resolved, err := exec.LookPath("scip-ctags")
	if err != nil {
		return ""
	}
	return resolved
}