fork of https://github.com/sourcegraph/zoekt
1package e2e
2
3import (
4 "bytes"
5 "context"
6 "flag"
7 "fmt"
8 "io"
9 "net/url"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "strings"
14 "testing"
15 "time"
16
17 "github.com/google/go-cmp/cmp"
18
19 "github.com/sourcegraph/zoekt"
20 "github.com/sourcegraph/zoekt/index"
21 "github.com/sourcegraph/zoekt/internal/archive"
22 "github.com/sourcegraph/zoekt/query"
23 "github.com/sourcegraph/zoekt/search"
24)
25
// Command-line flags controlling how the ranking tests run.
var (
	// update makes assertGolden overwrite the golden files with the output
	// produced by the current run before comparing against them.
	update = flag.Bool("update", false, "update golden file")

	// useShardCache makes TestRanking index into the shared shardCache
	// directory instead of a fresh temporary directory.
	useShardCache = flag.Bool("shard_cache", false, "cache computed shards for faster test runs")

	// debugScore can be set to include much more output. Do not commit the
	// updated golden files, this is purely used for debugging in a local
	// environment.
	debugScore = flag.Bool("debug_score", false, "include debug output in golden files.")
)
34
35func TestRanking(t *testing.T) {
36 if testing.Short() {
37 t.Skip("skipping due to short flag")
38 }
39
40 requireCTags(t)
41
42 archiveURLs := []string{
43 "https://github.com/sourcegraph/sourcegraph-public-snapshot/tree/v5.2.2", // Nov 1 2023
44 "https://github.com/golang/go/tree/go1.21.4", // Nov 7 2023
45 "https://github.com/sourcegraph/cody-public-snapshot/tree/vscode-v0.14.5", // Nov 8 2023
46 // The commit before ranking e2e tests were added to avoid matching
47 // content inside our golden files.
48 "https://github.com/sourcegraph/zoekt/commit/ef907c2371176aa3f97713d5bf182983ef090c6a", // Nov 17 2023
49 "https://github.com/sourcegraph/conc/tree/5f936abd7ae87036af1f75c95fb9d0daaf00116b", // Jan 21 2024
50 }
51 q := func(query, target string) rankingQuery {
52 return rankingQuery{Query: query, Target: target}
53 }
54 queries := []rankingQuery{
55 // golang/go
56 q("test server", "github.com/golang/go/src/net/http/httptest/server.go"),
57 q("bytes buffer", "github.com/golang/go/src/bytes/buffer.go"),
58 q("bufio buffer", "github.com/golang/go/src/bufio/scan.go"),
59 q("time compare\\(", "github.com/golang/go/src/time/time.go"),
60
61 // sourcegraph/sourcegraph
62 q("graphql type User", "github.com/sourcegraph/sourcegraph-public-snapshot/cmd/frontend/graphqlbackend/schema.graphql"),
63 q("Get database/user", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/database/users.go"),
64 q("InternalDoer", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/httpcli/client.go"),
65 q("Repository metadata Write rbac", "github.com/sourcegraph/sourcegraph-public-snapshot/internal/rbac/constants.go"), // unsure if this is the best doc?
66
67 // cody
68 q("generate unit test", "github.com/sourcegraph/cody-public-snapshot/lib/shared/src/chat/recipes/generate-test.ts"),
69 q("r:cody sourcegraph url", "github.com/sourcegraph/cody-public-snapshot/lib/shared/src/sourcegraph-api/graphql/client.ts"),
70
71 // zoekt
72 q("zoekt searcher", "github.com/sourcegraph/zoekt/api.go"),
73
74 // exact phrases
75 q("assets are not configured for this binary", "github.com/sourcegraph/sourcegraph-public-snapshot/ui/assets/assets.go"),
76 q("sourcegraph/server docker image build", "github.com/sourcegraph/sourcegraph-public-snapshot/dev/tools.go"),
77
78 // symbols split up
79 q("bufio flush writer", "github.com/golang/go/src/net/http/transfer.go"), // bufioFlushWriter
80 q("coverage data writer", "github.com/golang/go/src/internal/coverage/encodecounter/encode.go"), // CoverageDataWriter
81
82 // sourcegraph/conc vs golang/go
83 q("WaitGroup", "github.com/sourcegraph/conc/waitgroup.go"),
84 }
85
86 var indexDir string
87 if *useShardCache {
88 t.Logf("reusing index dir to speed up testing. If you have unexpected results remove %s", shardCache)
89 indexDir = shardCache
90 } else {
91 indexDir = t.TempDir()
92 }
93
94 for _, u := range archiveURLs {
95 if err := indexURL(indexDir, u); err != nil {
96 t.Fatal(err)
97 }
98 }
99
100 ss, err := search.NewDirectorySearcher(indexDir)
101 if err != nil {
102 t.Fatalf("NewDirectorySearcher(%s): %v", indexDir, err)
103 }
104 defer ss.Close()
105
106 var ranks []int
107 for _, rq := range queries {
108 // normalise queryStr for writing to fs
109 name := strings.Map(func(r rune) rune {
110 if strings.ContainsRune(" :", r) {
111 return '_'
112 }
113 if '0' <= r && r <= '9' ||
114 'a' <= r && r <= 'z' ||
115 'A' <= r && r <= 'Z' {
116 return r
117 }
118 return -1
119 }, rq.Query)
120
121 t.Run(name, func(t *testing.T) {
122 q, err := query.Parse(rq.Query)
123 if err != nil {
124 t.Fatal(err)
125 }
126
127 sOpts := zoekt.SearchOptions{
128 // Use the same options sourcegraph has by default
129 ChunkMatches: true,
130 MaxWallTime: 20 * time.Second,
131 ShardMaxMatchCount: 10_000 * 10,
132 TotalMaxMatchCount: 100_000 * 10,
133 MaxDocDisplayCount: 500,
134
135 DebugScore: *debugScore,
136 }
137 result, err := ss.Search(context.Background(), q, &sOpts)
138 if err != nil {
139 t.Fatal(err)
140 }
141
142 ranks = append(ranks, targetRank(rq, result.Files))
143
144 var gotBuf bytes.Buffer
145 marshalMatches(&gotBuf, rq, q, result.Files)
146 assertGolden(t, name, gotBuf.Bytes())
147 })
148 }
149
150 t.Run("rank_stats", func(t *testing.T) {
151 if len(ranks) != len(queries) {
152 t.Skip("not computing rank stats since not all query cases ran")
153 }
154
155 var gotBuf bytes.Buffer
156 printf := func(format string, a ...any) {
157 _, _ = fmt.Fprintf(&gotBuf, format, a...)
158 }
159
160 printf("queries: %d\n", len(ranks))
161
162 for _, recallThreshold := range []int{1, 5} {
163 count := 0
164 for _, rank := range ranks {
165 if rank <= recallThreshold && rank > 0 {
166 count++
167 }
168 }
169 countp := float64(count) * 100 / float64(len(ranks))
170 printf("recall@%d: %d (%.0f%%)\n", recallThreshold, count, countp)
171 }
172
173 // Mean reciprocal rank
174 mrr := float64(0)
175 for _, rank := range ranks {
176 if rank > 0 {
177 mrr += 1 / float64(rank)
178 }
179 }
180 mrr /= float64(len(ranks))
181 printf("mrr: %f\n", mrr)
182
183 assertGolden(t, "rank_stats", gotBuf.Bytes())
184 })
185}
186
187func assertGolden(t *testing.T, name string, got []byte) {
188 t.Helper()
189
190 wantPath := filepath.Join("testdata", name+".txt")
191 if *update {
192 if err := os.WriteFile(wantPath, got, 0o600); err != nil {
193 t.Fatal(err)
194 }
195 }
196 want, err := os.ReadFile(wantPath)
197 if err != nil {
198 t.Fatal(err)
199 }
200
201 if d := cmp.Diff(string(want), string(got)); d != "" {
202 t.Fatalf("unexpected (-want, +got):\n%s", d)
203 }
204}
205
// rankingQuery is a single ranking test case.
type rankingQuery struct {
	// Query is the raw query string that gets parsed and searched for.
	Query string

	// Target is the document name, in the form produced by docName
	// (repository + "/" + file name), whose rank is measured.
	Target string
}
210
// tarballCache is the directory in which indexURL keeps downloaded archives
// so they are fetched only once. The name is suffixed with $USER.
var tarballCache = "/tmp/zoekt-test-ranking-tarballs-" + os.Getenv("USER")

// shardCache is the index directory TestRanking reuses across runs when the
// -shard_cache flag is set. The name is suffixed with $USER.
var shardCache = "/tmp/zoekt-test-ranking-shards-" + os.Getenv("USER")
215
216func indexURL(indexDir, u string) error {
217 if err := os.MkdirAll(tarballCache, 0o700); err != nil {
218 return err
219 }
220
221 opts := archive.Options{
222 Archive: u,
223 Incremental: true,
224 }
225 opts.SetDefaults() // sets metadata like Name and the codeload URL
226 u = opts.Archive
227
228 // if opts.Commit is set but opts.Branch is not, then we just need to give
229 // the commit a name for testing.
230 if opts.Commit != "" && opts.Branch == "" {
231 opts.Branch = "test"
232 }
233
234 // update Archive location to cached location
235 cacheBase := fmt.Sprintf("%s-%s%s.tar.gz", url.QueryEscape(opts.Name), opts.Branch, opts.Commit) // assume .tar.gz
236 path := filepath.Join(tarballCache, cacheBase)
237 opts.Archive = path
238
239 if _, err := os.Stat(path); os.IsNotExist(err) {
240 if err := download(u, path); err != nil {
241 return err
242 }
243 }
244
245 // TODO scip
246 // languageMap := make(ctags.LanguageMap)
247 // for _, lang := range []string{"kotlin", "rust", "ruby", "go", "python", "javascript", "c_sharp", "scala", "typescript", "zig"} {
248 // languageMap[lang] = ctags.ScipCTags
249 // }
250
251 err := archive.Index(opts, index.Options{
252 IndexDir: indexDir,
253 CTagsMustSucceed: true,
254 RepositoryDescription: zoekt.Repository{
255 // Use the latest commit date to calculate the repo rank when loading the shard.
256 // This is the same setting we use in production.
257 RawConfig: map[string]string{"latestCommitDate": "1"},
258 },
259 })
260 if err != nil {
261 return fmt.Errorf("failed to index %s: %w", opts.Archive, err)
262 }
263
264 return nil
265}
266
267func download(url, dst string) error {
268 tmpPath := dst + ".part"
269
270 rc, err := archive.OpenReader(url)
271 if err != nil {
272 return err
273 }
274 defer rc.Close()
275
276 f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
277 if err != nil {
278 return err
279 }
280 defer f.Close()
281
282 _, err = io.Copy(f, rc)
283 if err != nil {
284 return err
285 }
286
287 err = f.Close()
288 if err != nil {
289 return err
290 }
291
292 return os.Rename(tmpPath, dst)
293}
294
// Limits on how much of a search result marshalMatches writes out.
const (
	// chunkMatchesPerFile is the maximum number of chunk matches printed
	// for a single file; the rest are only counted.
	chunkMatchesPerFile = 3

	// fileMatchesPerSearch is the maximum number of file matches printed
	// for a single search; the rest are only counted.
	fileMatchesPerSearch = 6
)
299
300func docName(f zoekt.FileMatch) string {
301 return f.Repository + "/" + f.FileName
302}
303
304func marshalMatches(w io.Writer, rq rankingQuery, q query.Q, files []zoekt.FileMatch) {
305 _, _ = fmt.Fprintf(w, "queryString: %s\n", rq.Query)
306 _, _ = fmt.Fprintf(w, "query: %s\n", q)
307 _, _ = fmt.Fprintf(w, "targetRank: %d\n\n", targetRank(rq, files))
308
309 files, hiddenFiles := splitAtIndex(files, fileMatchesPerSearch)
310 for _, f := range files {
311 doc := docName(f)
312 if doc == rq.Target {
313 doc = "**" + doc + "**"
314 }
315 _, _ = fmt.Fprintf(w, "%s%s\n", doc, addTabIfNonEmpty(f.Debug))
316
317 chunks, hidden := splitAtIndex(f.ChunkMatches, chunkMatchesPerFile)
318
319 for _, m := range chunks {
320 _, _ = fmt.Fprintf(w, "%d:%s%s\n", m.ContentStart.LineNumber, strings.TrimRight(string(m.Content), "\n"), addTabIfNonEmpty(m.DebugScore))
321 }
322
323 if len(hidden) > 0 {
324 _, _ = fmt.Fprintf(w, "hidden %d more line matches\n", len(hidden))
325 }
326 _, _ = fmt.Fprintln(w)
327 }
328
329 if len(hiddenFiles) > 0 {
330 fmt.Fprintf(w, "hidden %d more file matches\n", len(hiddenFiles))
331 }
332}
333
334func targetRank(rq rankingQuery, files []zoekt.FileMatch) int {
335 for i, f := range files {
336 if docName(f) == rq.Target {
337 return i + 1
338 }
339 }
340 return -1
341}
342
// splitAtIndex splits s into a head of at most idx elements and a tail
// holding the remaining elements.
//
// When s has idx or fewer elements the head is s itself and the tail is
// nil. A negative idx is treated as 0 (empty head, everything in the tail)
// rather than panicking on an out-of-range slice expression. Both results
// share s's backing array.
func splitAtIndex[E any](s []E, idx int) ([]E, []E) {
	if idx < 0 {
		idx = 0
	}
	if idx >= len(s) {
		return s, nil
	}
	return s[:idx], s[idx:]
}
349
// addTabIfNonEmpty returns s prefixed with a tab character, or the empty
// string when s is empty. It lets optional debug text be appended to a line
// without leaving a trailing tab when there is nothing to append.
func addTabIfNonEmpty(s string) string {
	if s == "" {
		return ""
	}
	return "\t" + s
}
356
// requireCTags makes sure ctags is available before a test relies on it.
//
// It returns normally when the CTAGS_COMMAND environment variable is set or
// when a universal-ctags binary is found on PATH. Otherwise the test fails
// if the CI environment variable is set, and is skipped if it is not.
func requireCTags(tb testing.TB) {
	tb.Helper()

	if os.Getenv("CTAGS_COMMAND") != "" {
		return
	}

	_, lookErr := exec.LookPath("universal-ctags")
	if lookErr == nil {
		return
	}

	// On CI we require ctags to be available. Otherwise we skip
	if os.Getenv("CI") == "" {
		tb.Skip("universal-ctags is missing")
		return
	}
	tb.Fatal("universal-ctags is missing")
}
374
// checkScipCTags returns the scip-ctags command to use: the value of the
// SCIP_CTAGS_COMMAND environment variable when it is non-empty, otherwise
// the path of a scip-ctags binary found on PATH. It returns the empty string
// when neither is available.
func checkScipCTags() string {
	if fromEnv := os.Getenv("SCIP_CTAGS_COMMAND"); fromEnv != "" {
		return fromEnv
	}

	found, err := exec.LookPath("scip-ctags")
	if err != nil {
		return ""
	}
	return found
}