// Fork of https://github.com/sourcegraph/zoekt
1package e2e
2
3import (
4 "bytes"
5 "context"
6 "flag"
7 "fmt"
8 "io"
9 "net/url"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "strings"
14 "testing"
15 "time"
16
17 "github.com/google/go-cmp/cmp"
18 "github.com/sourcegraph/zoekt"
19 "github.com/sourcegraph/zoekt/build"
20 "github.com/sourcegraph/zoekt/internal/archive"
21 "github.com/sourcegraph/zoekt/query"
22 "github.com/sourcegraph/zoekt/shards"
23)
24
var (
	// update makes assertGolden overwrite the golden files under testdata
	// with the output produced by the current run before comparing.
	update = flag.Bool("update", false, "update golden file")

	// useShardCache makes TestRanking index into the shared shardCache
	// directory instead of a fresh temporary directory.
	useShardCache = flag.Bool("shard_cache", false, "cache computed shards for faster test runs")

	// debugScore is forwarded to zoekt.SearchOptions.DebugScore and adds much
	// more output to the golden files. It is meant for local debugging only:
	// do not commit golden files generated with it.
	debugScore = flag.Bool("debug_score", false, "include debug output in golden files.")
)
33
34func TestRanking(t *testing.T) {
35 if testing.Short() {
36 t.Skip("skipping due to short flag")
37 }
38
39 requireCTags(t)
40
41 archiveURLs := []string{
42 "https://github.com/sourcegraph/sourcegraph/tree/v5.2.2",
43 "https://github.com/golang/go/tree/go1.21.4",
44 "https://github.com/sourcegraph/cody/tree/vscode-v0.14.5",
45 // The commit before ranking e2e tests were added to avoid matching
46 // content inside our golden files.
47 "https://github.com/sourcegraph/zoekt/commit/ef907c2371176aa3f97713d5bf182983ef090c6a",
48 }
49 q := func(query, target string) rankingQuery {
50 return rankingQuery{Query: query, Target: target}
51 }
52 queries := []rankingQuery{
53 // golang/go
54 q("test server", "github.com/golang/go/src/net/http/httptest/server.go"),
55 q("bytes buffer", "github.com/golang/go/src/bytes/buffer.go"),
56 q("bufio buffer", "github.com/golang/go/src/bufio/scan.go"),
57
58 // sourcegraph/sourcegraph
59 q("graphql type User", "github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/schema.graphql"),
60 q("Get database/user", "github.com/sourcegraph/sourcegraph/internal/database/users.go"),
61 q("InternalDoer", "github.com/sourcegraph/sourcegraph/internal/httpcli/client.go"),
62 q("Repository metadata Write rbac", "github.com/sourcegraph/sourcegraph/internal/rbac/constants.go"), // unsure if this is the best doc?
63
64 // cody
65 q("generate unit test", "github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts"),
66 q("r:cody sourcegraph url", "github.com/sourcegraph/cody/lib/shared/src/sourcegraph-api/graphql/client.ts"),
67
68 // zoekt
69 q("zoekt searcher", "github.com/sourcegraph/zoekt/api.go"),
70
71 // exact phrases
72 q("assets are not configured for this binary", "github.com/sourcegraph/sourcegraph/ui/assets/assets.go"),
73 q("sourcegraph/server docker image build", "github.com/sourcegraph/sourcegraph/dev/tools.go"),
74
75 // symbols split up
76 q("bufio flush writer", "github.com/golang/go/src/net/http/transfer.go"), // bufioFlushWriter
77 q("coverage data writer", "github.com/golang/go/src/internal/coverage/encodecounter/encode.go"), // CoverageDataWriter
78 }
79
80 var indexDir string
81 if *useShardCache {
82 t.Logf("reusing index dir to speed up testing. If you have unexpected results remove %s", shardCache)
83 indexDir = shardCache
84 } else {
85 indexDir = t.TempDir()
86 }
87
88 for _, u := range archiveURLs {
89 if err := indexURL(indexDir, u); err != nil {
90 t.Fatal(err)
91 }
92 }
93
94 ss, err := shards.NewDirectorySearcher(indexDir)
95 if err != nil {
96 t.Fatalf("NewDirectorySearcher(%s): %v", indexDir, err)
97 }
98 defer ss.Close()
99
100 var ranks []int
101 for _, rq := range queries {
102 // normalise queryStr for writing to fs
103 name := strings.Map(func(r rune) rune {
104 if strings.ContainsRune(" :", r) {
105 return '_'
106 }
107 if '0' <= r && r <= '9' ||
108 'a' <= r && r <= 'z' ||
109 'A' <= r && r <= 'Z' {
110 return r
111 }
112 return -1
113 }, rq.Query)
114
115 t.Run(name, func(t *testing.T) {
116 q, err := query.Parse(rq.Query)
117 if err != nil {
118 t.Fatal(err)
119 }
120
121 // q is marshalled as part of the test, so avoid our rewrites for
122 // ranking.
123 qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})
124
125 sOpts := zoekt.SearchOptions{
126 // Use the same options sourcegraph has by default
127 ChunkMatches: true,
128 MaxWallTime: 20 * time.Second,
129 ShardMaxMatchCount: 10_000 * 10,
130 TotalMaxMatchCount: 100_000 * 10,
131 MaxDocDisplayCount: 500,
132
133 DebugScore: *debugScore,
134 }
135 result, err := ss.Search(context.Background(), qSearch, &sOpts)
136 if err != nil {
137 t.Fatal(err)
138 }
139
140 ranks = append(ranks, targetRank(rq, result.Files))
141
142 var gotBuf bytes.Buffer
143 marshalMatches(&gotBuf, rq, q, result.Files)
144 assertGolden(t, name, gotBuf.Bytes())
145 })
146 }
147
148 t.Run("rank_stats", func(t *testing.T) {
149 if len(ranks) != len(queries) {
150 t.Skip("not computing rank stats since not all query cases ran")
151 }
152
153 var gotBuf bytes.Buffer
154 printf := func(format string, a ...any) {
155 _, _ = fmt.Fprintf(&gotBuf, format, a...)
156 }
157
158 printf("queries: %d\n", len(ranks))
159
160 for _, recallThreshold := range []int{1, 5} {
161 count := 0
162 for _, rank := range ranks {
163 if rank <= recallThreshold && rank > 0 {
164 count++
165 }
166 }
167 countp := float64(count) * 100 / float64(len(ranks))
168 printf("recall@%d: %d (%.0f%%)\n", recallThreshold, count, countp)
169 }
170
171 // Mean reciprocal rank
172 mrr := float64(0)
173 for _, rank := range ranks {
174 if rank > 0 {
175 mrr += 1 / float64(rank)
176 }
177 }
178 mrr /= float64(len(ranks))
179 printf("mrr: %f\n", mrr)
180
181 assertGolden(t, "rank_stats", gotBuf.Bytes())
182 })
183}
184
185func assertGolden(t *testing.T, name string, got []byte) {
186 t.Helper()
187
188 wantPath := filepath.Join("testdata", name+".txt")
189 if *update {
190 if err := os.WriteFile(wantPath, got, 0o600); err != nil {
191 t.Fatal(err)
192 }
193 }
194 want, err := os.ReadFile(wantPath)
195 if err != nil {
196 t.Fatal(err)
197 }
198
199 if d := cmp.Diff(string(want), string(got)); d != "" {
200 t.Fatalf("unexpected (-want, +got):\n%s", d)
201 }
202}
203
// rankingQuery pairs a search query with the document we expect it to find.
type rankingQuery struct {
	// Query is the raw query string; it is parsed with query.Parse.
	Query string

	// Target is the expected document, written as repository name and file
	// name joined by "/" (the same form docName produces).
	Target string
}
208
// tarballCache is the directory where downloaded archives are kept between
// runs; shardCache is the index directory reused when -shard_cache is set.
// Both are suffixed with the value of $USER.
var tarballCache, shardCache = func() (string, string) {
	user := os.Getenv("USER")
	return "/tmp/zoekt-test-ranking-tarballs-" + user, "/tmp/zoekt-test-ranking-shards-" + user
}()
213
214func indexURL(indexDir, u string) error {
215 if err := os.MkdirAll(tarballCache, 0o700); err != nil {
216 return err
217 }
218
219 opts := archive.Options{
220 Archive: u,
221 Incremental: true,
222 }
223 opts.SetDefaults() // sets metadata like Name and the codeload URL
224 u = opts.Archive
225
226 // if opts.Commit is set but opts.Branch is not, then we just need to give
227 // the commit a name for testing.
228 if opts.Commit != "" && opts.Branch == "" {
229 opts.Branch = "test"
230 }
231
232 // update Archive location to cached location
233 cacheBase := fmt.Sprintf("%s-%s%s.tar.gz", url.QueryEscape(opts.Name), opts.Branch, opts.Commit) // assume .tar.gz
234 path := filepath.Join(tarballCache, cacheBase)
235 opts.Archive = path
236
237 if _, err := os.Stat(path); os.IsNotExist(err) {
238 if err := download(u, path); err != nil {
239 return err
240 }
241 }
242
243 // TODO scip
244 // languageMap := make(ctags.LanguageMap)
245 // for _, lang := range []string{"kotlin", "rust", "ruby", "go", "python", "javascript", "c_sharp", "scala", "typescript", "zig"} {
246 // languageMap[lang] = ctags.ScipCTags
247 // }
248
249 err := archive.Index(opts, build.Options{
250 IndexDir: indexDir,
251 CTagsMustSucceed: true,
252 })
253 if err != nil {
254 return fmt.Errorf("failed to index %s: %w", opts.Archive, err)
255 }
256
257 return nil
258}
259
260func download(url, dst string) error {
261 tmpPath := dst + ".part"
262
263 rc, err := archive.OpenReader(url)
264 if err != nil {
265 return err
266 }
267 defer rc.Close()
268
269 f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
270 if err != nil {
271 return err
272 }
273 defer f.Close()
274
275 _, err = io.Copy(f, rc)
276 if err != nil {
277 return err
278 }
279
280 err = f.Close()
281 if err != nil {
282 return err
283 }
284
285 return os.Rename(tmpPath, dst)
286}
287
// chunkMatchesPerFile is the number of chunk matches marshalMatches prints
// for each file; the remainder is only reported as a count.
const chunkMatchesPerFile = 3

// fileMatchesPerSearch is the number of file matches marshalMatches prints
// for each search; the remainder is only reported as a count.
const fileMatchesPerSearch = 6
292
293func docName(f zoekt.FileMatch) string {
294 return f.Repository + "/" + f.FileName
295}
296
297func marshalMatches(w io.Writer, rq rankingQuery, q query.Q, files []zoekt.FileMatch) {
298 _, _ = fmt.Fprintf(w, "queryString: %s\n", rq.Query)
299 _, _ = fmt.Fprintf(w, "query: %s\n", q)
300 _, _ = fmt.Fprintf(w, "targetRank: %d\n\n", targetRank(rq, files))
301
302 files, hiddenFiles := splitAtIndex(files, fileMatchesPerSearch)
303 for _, f := range files {
304 doc := docName(f)
305 if doc == rq.Target {
306 doc = "**" + doc + "**"
307 }
308 _, _ = fmt.Fprintf(w, "%s%s\n", doc, addTabIfNonEmpty(f.Debug))
309
310 chunks, hidden := splitAtIndex(f.ChunkMatches, chunkMatchesPerFile)
311
312 for _, m := range chunks {
313 _, _ = fmt.Fprintf(w, "%d:%s%s\n", m.ContentStart.LineNumber, string(m.Content), addTabIfNonEmpty(m.DebugScore))
314 }
315
316 if len(hidden) > 0 {
317 _, _ = fmt.Fprintf(w, "hidden %d more line matches\n", len(hidden))
318 }
319 _, _ = fmt.Fprintln(w)
320 }
321
322 if len(hiddenFiles) > 0 {
323 fmt.Fprintf(w, "hidden %d more file matches\n", len(hiddenFiles))
324 }
325}
326
327func targetRank(rq rankingQuery, files []zoekt.FileMatch) int {
328 for i, f := range files {
329 if docName(f) == rq.Target {
330 return i + 1
331 }
332 }
333 return -1
334}
335
// splitAtIndex splits s into the first idx elements and the remainder.
//
// When idx is at or beyond len(s) the whole slice is returned together with
// a nil remainder. A negative idx is treated as 0 instead of panicking, so
// the first slice is empty and the remainder is all of s. Both results share
// s's backing array.
func splitAtIndex[E any](s []E, idx int) ([]E, []E) {
	if idx < 0 {
		idx = 0
	}
	if idx < len(s) {
		return s[:idx], s[idx:]
	}
	return s, nil
}
342
// addTabIfNonEmpty prefixes s with a tab character, unless s is empty, in
// which case the empty string is returned unchanged.
func addTabIfNonEmpty(s string) string {
	if s == "" {
		return ""
	}
	return "\t" + s
}
349
// requireCTags makes sure ctags is available before a test continues.
//
// It returns normally when the CTAGS_COMMAND environment variable is set or
// a "universal-ctags" binary is found on PATH. Otherwise the test fails if
// the CI environment variable is set, and is skipped if it is not.
func requireCTags(tb testing.TB) {
	tb.Helper()

	if cmd := os.Getenv("CTAGS_COMMAND"); cmd != "" {
		return
	}

	_, lookErr := exec.LookPath("universal-ctags")
	if lookErr == nil {
		return
	}

	// On CI we require ctags to be available. Otherwise we skip
	switch os.Getenv("CI") {
	case "":
		tb.Skip("universal-ctags is missing")
	default:
		tb.Fatal("universal-ctags is missing")
	}
}