fork of https://github.com/sourcegraph/zoekt
1package zoekt
2
3import (
4 "bytes"
5 "fmt"
6 "testing"
7 "testing/quick"
8 "unicode/utf8"
9
10 "github.com/google/go-cmp/cmp"
11)
12
13func getNewlines(data []byte) newlines {
14 var locs []uint32
15 for i, c := range data {
16 if c == '\n' {
17 locs = append(locs, uint32(i))
18 }
19 }
20 return newlines{
21 locs: locs,
22 fileSize: uint32(len(data)),
23 }
24}
25
26func TestGetLines(t *testing.T) {
27 contents := [][]byte{
28 []byte("one\ntwo\nthree\nfour"),
29 []byte("one\ntwo\nthree\nfour\n"),
30 []byte("one"),
31 []byte(""),
32 }
33
34 for _, content := range contents {
35 t.Run("", func(t *testing.T) {
36 newLines := getNewlines(content)
37 // Trim the last newline before splitting because a trailing newline does not constitute a new line
38 lines := bytes.Split(bytes.TrimSuffix(content, []byte{'\n'}), []byte{'\n'})
39 wantGetLines := func(low, high int) []byte {
40 low--
41 high--
42 if low < 0 {
43 low = 0
44 }
45 if low >= len(lines) {
46 return nil
47 }
48 if high <= 0 {
49 return nil
50 }
51 if high > len(lines) {
52 high = len(lines)
53 }
54 return bytes.Join(lines[low:high], []byte{'\n'})
55 }
56
57 for low := -1; low <= len(lines)+2; low++ {
58 for high := low; high <= len(lines)+2; high++ {
59 want := wantGetLines(low, high)
60 got := newLines.getLines(content, low, high)
61 if d := cmp.Diff(string(want), string(got)); d != "" {
62 t.Fatal(d)
63 }
64 }
65 }
66 })
67 }
68}
69
70func TestAtOffset(t *testing.T) {
71 cases := []struct {
72 data []byte
73 offset uint32
74 lineNumber int
75 lineStart int
76 lineEnd int
77 }{{
78 data: []byte("0.2.4.\n7.9.11.\n"),
79 offset: 0,
80 lineNumber: 1, lineStart: 0, lineEnd: 6,
81 }, {
82 data: []byte("0.2.4.\n7.9.11.\n"),
83 offset: 6,
84 lineNumber: 1, lineStart: 0, lineEnd: 6,
85 }, {
86 data: []byte("0.2.4.\n7.9.11.\n"),
87 offset: 2,
88 lineNumber: 1, lineStart: 0, lineEnd: 6,
89 }, {
90 data: []byte("0.2.4.\n7.9.11.\n"),
91 offset: 2,
92 lineNumber: 1, lineStart: 0, lineEnd: 6,
93 }, {
94 data: []byte("0.2.4.\n7.9.11.\n"),
95 offset: 7,
96 lineNumber: 2, lineStart: 7, lineEnd: 14,
97 }, {
98 data: []byte("0.2.4.\n7.9.11.\n"),
99 offset: 11,
100 lineNumber: 2, lineStart: 7, lineEnd: 14,
101 }, {
102 data: []byte("0.2.4.\n7.9.11.\n"),
103 offset: 15,
104 lineNumber: 3, lineStart: 15, lineEnd: 15,
105 }, {
106 data: []byte("0.2.4.\n7.9.11."),
107 offset: 7,
108 lineNumber: 2, lineStart: 7, lineEnd: 14,
109 }, {
110 data: []byte("\n\n"),
111 offset: 0,
112 lineNumber: 1, lineStart: 0, lineEnd: 0,
113 }, {
114 data: []byte("\n\n"),
115 offset: 1,
116 lineNumber: 2, lineStart: 1, lineEnd: 1,
117 }, {
118 data: []byte("\n\n"),
119 offset: 3,
120 lineNumber: 3, lineStart: 2, lineEnd: 2,
121 }, {
122 data: []byte("line with no newlines"),
123 offset: 3,
124 lineNumber: 1, lineStart: 0, lineEnd: 21,
125 }}
126
127 for _, tt := range cases {
128 t.Run("", func(t *testing.T) {
129 nls := getNewlines(tt.data)
130 gotLineNumber, gotLineStart, gotLineEnd := nls.atOffset(tt.offset)
131 if gotLineNumber != tt.lineNumber {
132 t.Fatalf("expected line number %d, got %d", tt.lineNumber, gotLineNumber)
133 }
134 if gotLineStart != tt.lineStart {
135 t.Fatalf("expected line start %d, got %d", tt.lineStart, gotLineStart)
136 }
137 if gotLineEnd != tt.lineEnd {
138 t.Fatalf("expected line end %d, got %d", tt.lineEnd, gotLineEnd)
139 }
140 })
141 }
142}
143
144func TestLineBounds(t *testing.T) {
145 cases := []struct {
146 data []byte
147 lineNumber int
148 start uint32
149 end uint32
150 }{{
151 data: []byte("0.2.4.\n7.9.11.\n"),
152 lineNumber: 1,
153 start: 0, end: 6,
154 }, {
155 data: []byte("0.2.4.\n7.9.11.\n"),
156 lineNumber: 2,
157 start: 7, end: 14,
158 }, {
159 data: []byte("0.2.4.\n7.9.11.\n"),
160 lineNumber: 0,
161 start: 0, end: 0,
162 }, {
163 data: []byte("0.2.4.\n7.9.11.\n"),
164 lineNumber: -1,
165 start: 0, end: 0,
166 }, {
167 data: []byte("0.2.4.\n7.9.11.\n"),
168 lineNumber: 202002,
169 start: 15, end: 15,
170 }, {
171 data: []byte("\n\n"),
172 lineNumber: 1,
173 start: 0, end: 0,
174 }, {
175 data: []byte("\n\n"),
176 lineNumber: 2,
177 start: 1, end: 1,
178 }, {
179 data: []byte("\n\n"),
180 lineNumber: 3,
181 start: 2, end: 2,
182 }}
183
184 for _, tt := range cases {
185 t.Run("", func(t *testing.T) {
186 nls := getNewlines(tt.data)
187 gotStart, gotEnd := nls.lineBounds(tt.lineNumber)
188 if gotStart != tt.start {
189 t.Fatalf("expected line start %d, got %d", tt.start, gotStart)
190 }
191 if gotEnd != tt.end {
192 t.Fatalf("expected line end %d, got %d", tt.end, gotEnd)
193 }
194 })
195 }
196}
197
198func TestChunkMatches(t *testing.T) {
199 content := []byte(`0.2.4.6.8.10.
20013.16.19.22.
20126.29.32.35.
20239.42.45.48.
20352.55.58.61.
20465.68.71.74.
20578.81.84.87.
206`)
207 match_0_2 := &candidateMatch{byteOffset: 0, byteMatchSz: 2}
208 match_6_10 := &candidateMatch{byteOffset: 6, byteMatchSz: 4}
209 match_10_16 := &candidateMatch{byteOffset: 10, byteMatchSz: 6}
210 match_19_42 := &candidateMatch{byteOffset: 19, byteMatchSz: 23}
211 match_45_48 := &candidateMatch{byteOffset: 45, byteMatchSz: 3}
212 match_71_72 := &candidateMatch{byteOffset: 71, byteMatchSz: 1}
213
214 cases := []struct {
215 candidateMatches []*candidateMatch
216 numContextLines int
217 want []candidateChunk
218 }{{
219 candidateMatches: []*candidateMatch{match_0_2},
220 numContextLines: 0,
221 want: []candidateChunk{{
222 firstLine: 1,
223 minOffset: 0,
224 lastLine: 1,
225 maxOffset: 2,
226 candidates: []*candidateMatch{match_0_2},
227 }},
228 }, {
229 candidateMatches: []*candidateMatch{match_0_2},
230 numContextLines: 5,
231 want: []candidateChunk{{
232 firstLine: 1,
233 minOffset: 0,
234 lastLine: 1,
235 maxOffset: 2,
236 candidates: []*candidateMatch{match_0_2},
237 }},
238 }, {
239 candidateMatches: []*candidateMatch{match_0_2, match_6_10},
240 numContextLines: 0,
241 want: []candidateChunk{{
242 firstLine: 1,
243 minOffset: 0,
244 lastLine: 1,
245 maxOffset: 10,
246 candidates: []*candidateMatch{match_0_2, match_6_10},
247 }},
248 }, {
249 candidateMatches: []*candidateMatch{match_0_2, match_10_16},
250 numContextLines: 0,
251 want: []candidateChunk{{
252 firstLine: 1,
253 minOffset: 0,
254 lastLine: 2,
255 maxOffset: 16,
256 candidates: []*candidateMatch{match_0_2, match_10_16},
257 }},
258 }, {
259 candidateMatches: []*candidateMatch{match_0_2, match_19_42},
260 numContextLines: 0,
261 want: []candidateChunk{{
262 firstLine: 1,
263 minOffset: 0,
264 lastLine: 1,
265 maxOffset: 2,
266 candidates: []*candidateMatch{match_0_2},
267 }, {
268 firstLine: 2,
269 minOffset: 19,
270 lastLine: 4,
271 maxOffset: 42,
272 candidates: []*candidateMatch{match_19_42},
273 }},
274 }, {
275 candidateMatches: []*candidateMatch{match_0_2, match_19_42},
276 numContextLines: 1,
277 want: []candidateChunk{{
278 firstLine: 1,
279 minOffset: 0,
280 lastLine: 4,
281 maxOffset: 42,
282 candidates: []*candidateMatch{match_0_2, match_19_42},
283 }},
284 }, {
285 candidateMatches: []*candidateMatch{
286 match_0_2, match_19_42, match_45_48, match_71_72,
287 },
288 numContextLines: 0,
289 want: []candidateChunk{{
290 firstLine: 1,
291 minOffset: 0,
292 lastLine: 1,
293 maxOffset: 2,
294 candidates: []*candidateMatch{match_0_2},
295 }, {
296 firstLine: 2,
297 minOffset: 19,
298 lastLine: 4,
299 maxOffset: 48,
300 candidates: []*candidateMatch{match_19_42, match_45_48},
301 }, {
302 firstLine: 6,
303 minOffset: 71,
304 lastLine: 6,
305 maxOffset: 72,
306 candidates: []*candidateMatch{match_71_72},
307 }},
308 }, {
309 candidateMatches: []*candidateMatch{
310 match_0_2, match_19_42, match_45_48, match_71_72,
311 },
312 numContextLines: 100,
313 want: []candidateChunk{{
314 firstLine: 1,
315 minOffset: 0,
316 lastLine: 6,
317 maxOffset: 72,
318 candidates: []*candidateMatch{match_0_2, match_19_42, match_45_48, match_71_72},
319 }},
320 }}
321
322 newlines := getNewlines(content)
323 for _, tt := range cases {
324 t.Run("", func(t *testing.T) {
325 got := chunkCandidates(tt.candidateMatches, newlines, tt.numContextLines)
326 if diff := cmp.Diff(fmt.Sprintf("%#v\n", tt.want), fmt.Sprintf("%#v\n", got)); diff != "" {
327 t.Fatal(diff)
328 }
329 })
330 }
331}
332
333func BenchmarkColumnHelper(b *testing.B) {
334 // We simulate looking up columns of evenly spaced matches
335 const matches = 10_000
336 const match = "match"
337 const space = " "
338 const dist = uint32(len(match) + len(space))
339 data := bytes.Repeat([]byte(match+space), matches)
340
341 b.ResetTimer()
342
343 for i := 0; i < b.N; i++ {
344 columnHelper := columnHelper{data: data}
345
346 lineOffset := 0
347 offset := uint32(0)
348 for offset < uint32(len(data)) {
349 col := columnHelper.get(lineOffset, offset)
350 if col != offset+1 {
351 b.Fatal("column is not offset even though data is ASCII")
352 }
353 offset += dist
354 }
355 }
356}
357
358func TestColumnHelper(t *testing.T) {
359 f := func(line0, line1 string) bool {
360 data := []byte(line0 + line1)
361 lineOffset := len(line0)
362
363 columnHelper := columnHelper{data: data}
364
365 // We check every second rune returns the correct answer
366 offset := lineOffset
367 column := 1
368 for offset < len(data) {
369 if column%2 == 0 {
370 got := columnHelper.get(lineOffset, uint32(offset))
371 if got != uint32(column) {
372 return false
373 }
374 }
375 _, size := utf8.DecodeRune(data[offset:])
376 offset += size
377 column++
378 }
379
380 return true
381 }
382
383 if err := quick.Check(f, nil); err != nil {
384 t.Fatal(err)
385 }
386
387 // Corner cases
388
389 // empty data, shouldn't happen but just in case it slips through
390 ch := columnHelper{data: nil}
391 if got := ch.get(0, 0); got != 1 {
392 t.Fatal("empty data didn't return 1", got)
393 }
394
395 // Repeating a call to get should return the same value
396 // empty data, shouldn't happen but just in case it slips through
397 ch = columnHelper{data: []byte("hello\nworld")}
398 if got := ch.get(6, 8); got != 3 {
399 t.Fatal("unexpected value for third column on second line", got)
400 }
401 if got := ch.get(6, 8); got != 3 {
402 t.Fatal("unexpected value for repeated call for third column on second line", got)
403 }
404
405 // Now make sure if we go backwards we do not incorrectly use the cache
406 if got := ch.get(6, 6); got != 1 {
407 t.Fatal("unexpected value for backwards call for first column on second line", got)
408 }
409}
410
411func TestFindMaxOverlappingSection(t *testing.T) {
412 secs := []DocumentSection{
413 {Start: 0, End: 5},
414 {Start: 8, End: 19},
415 {Start: 22, End: 26},
416 }
417 // 012345678901234567890123456
418 // [....[
419 // [..........[
420 // [...[
421
422 testcases := []struct {
423 name string
424 off uint32
425 sz uint32
426 wantSecIdx uint32
427 wantOverlap bool
428 }{
429 {off: 0, sz: 1, wantSecIdx: 0, wantOverlap: true},
430 {off: 0, sz: 5, wantSecIdx: 0, wantOverlap: true},
431 {off: 2, sz: 5, wantSecIdx: 0, wantOverlap: true},
432 {off: 2, sz: 50, wantSecIdx: 1, wantOverlap: true},
433 {off: 4, sz: 10, wantSecIdx: 1, wantOverlap: true},
434 {off: 5, sz: 15, wantSecIdx: 1, wantOverlap: true},
435 {off: 18, sz: 10, wantSecIdx: 2, wantOverlap: true},
436
437 // Prefer full overlap, break ties by preferring the earlier section
438 {off: 10, sz: 20, wantSecIdx: 2, wantOverlap: true},
439 {off: 0, sz: 100, wantSecIdx: 0, wantOverlap: true},
440 {off: 8, sz: 100, wantSecIdx: 1, wantOverlap: true},
441 {off: 0, sz: 10, wantSecIdx: 0, wantOverlap: true},
442 {off: 16, sz: 10, wantSecIdx: 2, wantOverlap: true},
443
444 // No overlap
445 {off: 5, sz: 2, wantOverlap: false},
446 {off: 20, sz: 1, wantOverlap: false},
447 {off: 99, sz: 1, wantOverlap: false},
448 {off: 0, sz: 0, wantOverlap: false},
449 }
450
451 for _, tt := range testcases {
452 t.Run(fmt.Sprintf("off=%d size=%d", tt.off, tt.sz), func(t *testing.T) {
453 got, haveOverlap := findMaxOverlappingSection(secs, tt.off, tt.sz)
454 if haveOverlap != tt.wantOverlap {
455 t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap)
456 }
457 if got != tt.wantSecIdx {
458 t.Fatalf("expected section %d, got %d", tt.wantSecIdx, got)
459 }
460 })
461 }
462}