fork of https://github.com/sourcegraph/zoekt
1package index
2
3import (
4 "bytes"
5 "fmt"
6 "testing"
7 "testing/quick"
8 "unicode/utf8"
9
10 "github.com/google/go-cmp/cmp"
11)
12
13func getNewlines(data []byte) newlines {
14 var locs []uint32
15 for i, c := range data {
16 if c == '\n' {
17 locs = append(locs, uint32(i))
18 }
19 }
20 return newlines{
21 locs: locs,
22 fileSize: uint32(len(data)),
23 }
24}
25
26func TestGetLines(t *testing.T) {
27 contents := [][]byte{
28 []byte("one\ntwo\nthree\nfour"),
29 []byte("one\ntwo\nthree\nfour\n"),
30 []byte("one"),
31 []byte(""),
32 }
33
34 for _, content := range contents {
35 t.Run("", func(t *testing.T) {
36 newLines := getNewlines(content)
37 lines := bytes.SplitAfter(content, []byte{'\n'})
38 if len(lines) > 0 && len(lines[len(lines)-1]) == 0 {
39 // A trailing newline does not delimit an empty line at the end of a file
40 lines = lines[:len(lines)-1]
41 }
42 wantGetLines := func(low, high int) []byte {
43 low--
44 high--
45 if low < 0 {
46 low = 0
47 }
48 if low >= len(lines) {
49 return nil
50 }
51 if high <= 0 {
52 return nil
53 }
54 if high > len(lines) {
55 high = len(lines)
56 }
57 return bytes.Join(lines[low:high], nil)
58 }
59
60 for low := -1; low <= len(lines)+2; low++ {
61 for high := low; high <= len(lines)+2; high++ {
62 want := wantGetLines(low, high)
63 got := newLines.getLines(content, low, high)
64 if d := cmp.Diff(string(want), string(got)); d != "" {
65 t.Fatal(d)
66 }
67 }
68 }
69 })
70 }
71}
72
73func TestAtOffset(t *testing.T) {
74 cases := []struct {
75 data []byte
76 offset uint32
77 lineNumber int
78 lineStart uint32
79 lineEnd uint32
80 }{{
81 data: []byte("0.2.4.\n7.9.11.\n"),
82 offset: 0,
83 lineNumber: 1, lineStart: 0, lineEnd: 7,
84 }, {
85 data: []byte("0.2.4.\n7.9.11.\n"),
86 offset: 6,
87 lineNumber: 1, lineStart: 0, lineEnd: 7,
88 }, {
89 data: []byte("0.2.4.\n7.9.11.\n"),
90 offset: 2,
91 lineNumber: 1, lineStart: 0, lineEnd: 7,
92 }, {
93 data: []byte("0.2.4.\n7.9.11.\n"),
94 offset: 2,
95 lineNumber: 1, lineStart: 0, lineEnd: 7,
96 }, {
97 data: []byte("0.2.4.\n7.9.11.\n"),
98 offset: 7,
99 lineNumber: 2, lineStart: 7, lineEnd: 15,
100 }, {
101 data: []byte("0.2.4.\n7.9.11.\n"),
102 offset: 11,
103 lineNumber: 2, lineStart: 7, lineEnd: 15,
104 }, {
105 data: []byte("0.2.4.\n7.9.11.\n"),
106 offset: 15,
107 lineNumber: 3, lineStart: 15, lineEnd: 15,
108 }, {
109 data: []byte("0.2.4.\n7.9.11."),
110 offset: 7,
111 lineNumber: 2, lineStart: 7, lineEnd: 14,
112 }, {
113 data: []byte("\n\n"),
114 offset: 0,
115 lineNumber: 1, lineStart: 0, lineEnd: 1,
116 }, {
117 data: []byte("\n\n"),
118 offset: 1,
119 lineNumber: 2, lineStart: 1, lineEnd: 2,
120 }, {
121 data: []byte("\n\n"),
122 offset: 3,
123 lineNumber: 3, lineStart: 2, lineEnd: 2,
124 }, {
125 data: []byte("line with no newlines"),
126 offset: 3,
127 lineNumber: 1, lineStart: 0, lineEnd: 21,
128 }}
129
130 for _, tt := range cases {
131 t.Run("", func(t *testing.T) {
132 nls := getNewlines(tt.data)
133 gotLineNumber := nls.atOffset(tt.offset)
134 if gotLineNumber != tt.lineNumber {
135 t.Fatalf("expected line number %d, got %d", tt.lineNumber, gotLineNumber)
136 }
137 if gotLineStart := nls.lineStart(gotLineNumber); gotLineStart != tt.lineStart {
138 t.Fatalf("expected line start %d, got %d", tt.lineStart, gotLineStart)
139 }
140 if gotLineEnd := nls.lineStart(gotLineNumber + 1); gotLineEnd != tt.lineEnd {
141 t.Fatalf("expected line end %d, got %d", tt.lineEnd, gotLineEnd)
142 }
143 })
144 }
145}
146
147func TestLineBounds(t *testing.T) {
148 cases := []struct {
149 data []byte
150 lineNumber int
151 start uint32
152 end uint32
153 }{{
154 data: []byte("0.2.4.\n7.9.11.\n"),
155 lineNumber: 1,
156 start: 0, end: 7,
157 }, {
158 data: []byte("0.2.4.\n7.9.11.\n"),
159 lineNumber: 2,
160 start: 7, end: 15,
161 }, {
162 data: []byte("0.2.4.\n7.9.11.\n"),
163 lineNumber: 0,
164 start: 0, end: 0,
165 }, {
166 data: []byte("0.2.4.\n7.9.11.\n"),
167 lineNumber: -1,
168 start: 0, end: 0,
169 }, {
170 data: []byte("0.2.4.\n7.9.11.\n"),
171 lineNumber: 202002,
172 start: 15, end: 15,
173 }, {
174 data: []byte("\n\n"),
175 lineNumber: 1,
176 start: 0, end: 1,
177 }, {
178 data: []byte("\n\n"),
179 lineNumber: 2,
180 start: 1, end: 2,
181 }, {
182 data: []byte("\n\n"),
183 lineNumber: 3,
184 start: 2, end: 2,
185 }}
186
187 for _, tt := range cases {
188 t.Run("", func(t *testing.T) {
189 nls := getNewlines(tt.data)
190 gotStart := nls.lineStart(tt.lineNumber)
191 if gotStart != tt.start {
192 t.Fatalf("expected line start %d, got %d", tt.start, gotStart)
193 }
194 gotEnd := nls.lineStart(tt.lineNumber + 1)
195 if gotEnd != tt.end {
196 t.Fatalf("expected line end %d, got %d", tt.end, gotEnd)
197 }
198 })
199 }
200}
201
202func TestChunkMatches(t *testing.T) {
203 content := []byte(`0.2.4.6.8.10.
20413.16.19.22.
20526.29.32.35.
20639.42.45.48.
20752.55.58.61.
20865.68.71.74.
20978.81.84.87.
210`)
211 match_0_2 := &candidateMatch{byteOffset: 0, byteMatchSz: 2}
212 match_6_10 := &candidateMatch{byteOffset: 6, byteMatchSz: 4}
213 match_10_16 := &candidateMatch{byteOffset: 10, byteMatchSz: 6}
214 match_19_42 := &candidateMatch{byteOffset: 19, byteMatchSz: 23}
215 match_45_48 := &candidateMatch{byteOffset: 45, byteMatchSz: 3}
216 match_71_72 := &candidateMatch{byteOffset: 71, byteMatchSz: 1}
217
218 cases := []struct {
219 candidateMatches []*candidateMatch
220 numContextLines int
221 want []candidateChunk
222 }{{
223 candidateMatches: []*candidateMatch{match_0_2},
224 numContextLines: 0,
225 want: []candidateChunk{{
226 firstLine: 1,
227 minOffset: 0,
228 lastLine: 1,
229 maxOffset: 2,
230 candidates: []*candidateMatch{match_0_2},
231 }},
232 }, {
233 candidateMatches: []*candidateMatch{match_0_2},
234 numContextLines: 5,
235 want: []candidateChunk{{
236 firstLine: 1,
237 minOffset: 0,
238 lastLine: 1,
239 maxOffset: 2,
240 candidates: []*candidateMatch{match_0_2},
241 }},
242 }, {
243 candidateMatches: []*candidateMatch{match_0_2, match_6_10},
244 numContextLines: 0,
245 want: []candidateChunk{{
246 firstLine: 1,
247 minOffset: 0,
248 lastLine: 1,
249 maxOffset: 10,
250 candidates: []*candidateMatch{match_0_2, match_6_10},
251 }},
252 }, {
253 candidateMatches: []*candidateMatch{match_0_2, match_10_16},
254 numContextLines: 0,
255 want: []candidateChunk{{
256 firstLine: 1,
257 minOffset: 0,
258 lastLine: 2,
259 maxOffset: 16,
260 candidates: []*candidateMatch{match_0_2, match_10_16},
261 }},
262 }, {
263 candidateMatches: []*candidateMatch{match_0_2, match_19_42},
264 numContextLines: 0,
265 want: []candidateChunk{{
266 firstLine: 1,
267 minOffset: 0,
268 lastLine: 1,
269 maxOffset: 2,
270 candidates: []*candidateMatch{match_0_2},
271 }, {
272 firstLine: 2,
273 minOffset: 19,
274 lastLine: 4,
275 maxOffset: 42,
276 candidates: []*candidateMatch{match_19_42},
277 }},
278 }, {
279 candidateMatches: []*candidateMatch{match_0_2, match_19_42},
280 numContextLines: 1,
281 want: []candidateChunk{{
282 firstLine: 1,
283 minOffset: 0,
284 lastLine: 4,
285 maxOffset: 42,
286 candidates: []*candidateMatch{match_0_2, match_19_42},
287 }},
288 }, {
289 candidateMatches: []*candidateMatch{
290 match_0_2, match_19_42, match_45_48, match_71_72,
291 },
292 numContextLines: 0,
293 want: []candidateChunk{{
294 firstLine: 1,
295 minOffset: 0,
296 lastLine: 1,
297 maxOffset: 2,
298 candidates: []*candidateMatch{match_0_2},
299 }, {
300 firstLine: 2,
301 minOffset: 19,
302 lastLine: 4,
303 maxOffset: 48,
304 candidates: []*candidateMatch{match_19_42, match_45_48},
305 }, {
306 firstLine: 6,
307 minOffset: 71,
308 lastLine: 6,
309 maxOffset: 72,
310 candidates: []*candidateMatch{match_71_72},
311 }},
312 }, {
313 candidateMatches: []*candidateMatch{
314 match_0_2, match_19_42, match_45_48, match_71_72,
315 },
316 numContextLines: 100,
317 want: []candidateChunk{{
318 firstLine: 1,
319 minOffset: 0,
320 lastLine: 6,
321 maxOffset: 72,
322 candidates: []*candidateMatch{match_0_2, match_19_42, match_45_48, match_71_72},
323 }},
324 }}
325
326 newlines := getNewlines(content)
327 for _, tt := range cases {
328 t.Run("", func(t *testing.T) {
329 got := chunkCandidates(tt.candidateMatches, newlines, tt.numContextLines)
330 if diff := cmp.Diff(fmt.Sprintf("%#v\n", tt.want), fmt.Sprintf("%#v\n", got)); diff != "" {
331 t.Fatal(diff)
332 }
333 })
334 }
335}
336
337func BenchmarkColumnHelper(b *testing.B) {
338 // We simulate looking up columns of evenly spaced matches
339 const matches = 10_000
340 const match = "match"
341 const space = " "
342 const dist = uint32(len(match) + len(space))
343 data := bytes.Repeat([]byte(match+space), matches)
344
345 b.ResetTimer()
346
347 for i := 0; i < b.N; i++ {
348 columnHelper := columnHelper{data: data}
349
350 lineOffset := 0
351 offset := uint32(0)
352 for offset < uint32(len(data)) {
353 col := columnHelper.get(lineOffset, offset)
354 if col != offset+1 {
355 b.Fatal("column is not offset even though data is ASCII")
356 }
357 offset += dist
358 }
359 }
360}
361
362func TestColumnHelper(t *testing.T) {
363 f := func(line0, line1 string) bool {
364 data := []byte(line0 + line1)
365 lineOffset := len(line0)
366
367 columnHelper := columnHelper{data: data}
368
369 // We check every second rune returns the correct answer
370 offset := lineOffset
371 column := 1
372 for offset < len(data) {
373 if column%2 == 0 {
374 got := columnHelper.get(lineOffset, uint32(offset))
375 if got != uint32(column) {
376 return false
377 }
378 }
379 _, size := utf8.DecodeRune(data[offset:])
380 offset += size
381 column++
382 }
383
384 return true
385 }
386
387 if err := quick.Check(f, nil); err != nil {
388 t.Fatal(err)
389 }
390
391 // Corner cases
392
393 // empty data, shouldn't happen but just in case it slips through
394 ch := columnHelper{data: nil}
395 if got := ch.get(0, 0); got != 1 {
396 t.Fatal("empty data didn't return 1", got)
397 }
398
399 // Repeating a call to get should return the same value
400 // empty data, shouldn't happen but just in case it slips through
401 ch = columnHelper{data: []byte("hello\nworld")}
402 if got := ch.get(6, 8); got != 3 {
403 t.Fatal("unexpected value for third column on second line", got)
404 }
405 if got := ch.get(6, 8); got != 3 {
406 t.Fatal("unexpected value for repeated call for third column on second line", got)
407 }
408
409 // Now make sure if we go backwards we do not incorrectly use the cache
410 if got := ch.get(6, 6); got != 1 {
411 t.Fatal("unexpected value for backwards call for first column on second line", got)
412 }
413}
414
415func TestFindMaxOverlappingSection(t *testing.T) {
416 secs := []DocumentSection{
417 {Start: 0, End: 5},
418 {Start: 8, End: 19},
419 {Start: 22, End: 26},
420 }
421 // 012345678901234567890123456
422 // [....[
423 // [..........[
424 // [...[
425
426 testcases := []struct {
427 name string
428 off uint32
429 sz uint32
430 wantSecIdx uint32
431 wantOverlap bool
432 }{
433 {off: 0, sz: 1, wantSecIdx: 0, wantOverlap: true},
434 {off: 0, sz: 5, wantSecIdx: 0, wantOverlap: true},
435 {off: 2, sz: 5, wantSecIdx: 0, wantOverlap: true},
436 {off: 2, sz: 50, wantSecIdx: 1, wantOverlap: true},
437 {off: 4, sz: 10, wantSecIdx: 1, wantOverlap: true},
438 {off: 5, sz: 15, wantSecIdx: 1, wantOverlap: true},
439 {off: 18, sz: 10, wantSecIdx: 2, wantOverlap: true},
440
441 // Prefer full overlap, break ties by preferring the earlier section
442 {off: 10, sz: 20, wantSecIdx: 2, wantOverlap: true},
443 {off: 0, sz: 100, wantSecIdx: 0, wantOverlap: true},
444 {off: 8, sz: 100, wantSecIdx: 1, wantOverlap: true},
445 {off: 0, sz: 10, wantSecIdx: 0, wantOverlap: true},
446 {off: 16, sz: 10, wantSecIdx: 2, wantOverlap: true},
447
448 // No overlap
449 {off: 5, sz: 2, wantOverlap: false},
450 {off: 20, sz: 1, wantOverlap: false},
451 {off: 99, sz: 1, wantOverlap: false},
452 {off: 0, sz: 0, wantOverlap: false},
453 }
454
455 for _, tt := range testcases {
456 t.Run(fmt.Sprintf("off=%d size=%d", tt.off, tt.sz), func(t *testing.T) {
457 got, haveOverlap := findMaxOverlappingSection(secs, tt.off, tt.sz)
458 if haveOverlap != tt.wantOverlap {
459 t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap)
460 }
461 if got != tt.wantSecIdx {
462 t.Fatalf("expected section %d, got %d", tt.wantSecIdx, got)
463 }
464 })
465 }
466}