index/contentprovider_test.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / contentprovider_test.go
at main 12 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1package index
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"testing"
  7	"testing/quick"
  8	"unicode/utf8"
  9
 10	"github.com/google/go-cmp/cmp"
 11)
 12
 13func getNewlines(data []byte) newlines {
 14	var locs []uint32
 15	for i, c := range data {
 16		if c == '\n' {
 17			locs = append(locs, uint32(i))
 18		}
 19	}
 20	return newlines{
 21		locs:     locs,
 22		fileSize: uint32(len(data)),
 23	}
 24}
 25
 26func TestGetLines(t *testing.T) {
 27	contents := [][]byte{
 28		[]byte("one\ntwo\nthree\nfour"),
 29		[]byte("one\ntwo\nthree\nfour\n"),
 30		[]byte("one"),
 31		[]byte(""),
 32	}
 33
 34	for _, content := range contents {
 35		t.Run("", func(t *testing.T) {
 36			newLines := getNewlines(content)
 37			lines := bytes.SplitAfter(content, []byte{'\n'})
 38			if len(lines) > 0 && len(lines[len(lines)-1]) == 0 {
 39				// A trailing newline does not delimit an empty line at the end of a file
 40				lines = lines[:len(lines)-1]
 41			}
 42			wantGetLines := func(low, high int) []byte {
 43				low--
 44				high--
 45				if low < 0 {
 46					low = 0
 47				}
 48				if low >= len(lines) {
 49					return nil
 50				}
 51				if high <= 0 {
 52					return nil
 53				}
 54				if high > len(lines) {
 55					high = len(lines)
 56				}
 57				return bytes.Join(lines[low:high], nil)
 58			}
 59
 60			for low := -1; low <= len(lines)+2; low++ {
 61				for high := low; high <= len(lines)+2; high++ {
 62					want := wantGetLines(low, high)
 63					got := newLines.getLines(content, low, high)
 64					if d := cmp.Diff(string(want), string(got)); d != "" {
 65						t.Fatal(d)
 66					}
 67				}
 68			}
 69		})
 70	}
 71}
 72
 73func TestAtOffset(t *testing.T) {
 74	cases := []struct {
 75		data       []byte
 76		offset     uint32
 77		lineNumber int
 78		lineStart  uint32
 79		lineEnd    uint32
 80	}{{
 81		data:       []byte("0.2.4.\n7.9.11.\n"),
 82		offset:     0,
 83		lineNumber: 1, lineStart: 0, lineEnd: 7,
 84	}, {
 85		data:       []byte("0.2.4.\n7.9.11.\n"),
 86		offset:     6,
 87		lineNumber: 1, lineStart: 0, lineEnd: 7,
 88	}, {
 89		data:       []byte("0.2.4.\n7.9.11.\n"),
 90		offset:     2,
 91		lineNumber: 1, lineStart: 0, lineEnd: 7,
 92	}, {
 93		data:       []byte("0.2.4.\n7.9.11.\n"),
 94		offset:     2,
 95		lineNumber: 1, lineStart: 0, lineEnd: 7,
 96	}, {
 97		data:       []byte("0.2.4.\n7.9.11.\n"),
 98		offset:     7,
 99		lineNumber: 2, lineStart: 7, lineEnd: 15,
100	}, {
101		data:       []byte("0.2.4.\n7.9.11.\n"),
102		offset:     11,
103		lineNumber: 2, lineStart: 7, lineEnd: 15,
104	}, {
105		data:       []byte("0.2.4.\n7.9.11.\n"),
106		offset:     15,
107		lineNumber: 3, lineStart: 15, lineEnd: 15,
108	}, {
109		data:       []byte("0.2.4.\n7.9.11."),
110		offset:     7,
111		lineNumber: 2, lineStart: 7, lineEnd: 14,
112	}, {
113		data:       []byte("\n\n"),
114		offset:     0,
115		lineNumber: 1, lineStart: 0, lineEnd: 1,
116	}, {
117		data:       []byte("\n\n"),
118		offset:     1,
119		lineNumber: 2, lineStart: 1, lineEnd: 2,
120	}, {
121		data:       []byte("\n\n"),
122		offset:     3,
123		lineNumber: 3, lineStart: 2, lineEnd: 2,
124	}, {
125		data:       []byte("line with no newlines"),
126		offset:     3,
127		lineNumber: 1, lineStart: 0, lineEnd: 21,
128	}}
129
130	for _, tt := range cases {
131		t.Run("", func(t *testing.T) {
132			nls := getNewlines(tt.data)
133			gotLineNumber := nls.atOffset(tt.offset)
134			if gotLineNumber != tt.lineNumber {
135				t.Fatalf("expected line number %d, got %d", tt.lineNumber, gotLineNumber)
136			}
137			if gotLineStart := nls.lineStart(gotLineNumber); gotLineStart != tt.lineStart {
138				t.Fatalf("expected line start %d, got %d", tt.lineStart, gotLineStart)
139			}
140			if gotLineEnd := nls.lineStart(gotLineNumber + 1); gotLineEnd != tt.lineEnd {
141				t.Fatalf("expected line end %d, got %d", tt.lineEnd, gotLineEnd)
142			}
143		})
144	}
145}
146
147func TestLineBounds(t *testing.T) {
148	cases := []struct {
149		data       []byte
150		lineNumber int
151		start      uint32
152		end        uint32
153	}{{
154		data:       []byte("0.2.4.\n7.9.11.\n"),
155		lineNumber: 1,
156		start:      0, end: 7,
157	}, {
158		data:       []byte("0.2.4.\n7.9.11.\n"),
159		lineNumber: 2,
160		start:      7, end: 15,
161	}, {
162		data:       []byte("0.2.4.\n7.9.11.\n"),
163		lineNumber: 0,
164		start:      0, end: 0,
165	}, {
166		data:       []byte("0.2.4.\n7.9.11.\n"),
167		lineNumber: -1,
168		start:      0, end: 0,
169	}, {
170		data:       []byte("0.2.4.\n7.9.11.\n"),
171		lineNumber: 202002,
172		start:      15, end: 15,
173	}, {
174		data:       []byte("\n\n"),
175		lineNumber: 1,
176		start:      0, end: 1,
177	}, {
178		data:       []byte("\n\n"),
179		lineNumber: 2,
180		start:      1, end: 2,
181	}, {
182		data:       []byte("\n\n"),
183		lineNumber: 3,
184		start:      2, end: 2,
185	}}
186
187	for _, tt := range cases {
188		t.Run("", func(t *testing.T) {
189			nls := getNewlines(tt.data)
190			gotStart := nls.lineStart(tt.lineNumber)
191			if gotStart != tt.start {
192				t.Fatalf("expected line start %d, got %d", tt.start, gotStart)
193			}
194			gotEnd := nls.lineStart(tt.lineNumber + 1)
195			if gotEnd != tt.end {
196				t.Fatalf("expected line end %d, got %d", tt.end, gotEnd)
197			}
198		})
199	}
200}
201
202func TestChunkMatches(t *testing.T) {
203	content := []byte(`0.2.4.6.8.10.
20413.16.19.22.
20526.29.32.35.
20639.42.45.48.
20752.55.58.61.
20865.68.71.74.
20978.81.84.87.
210`)
211	match_0_2 := &candidateMatch{byteOffset: 0, byteMatchSz: 2}
212	match_6_10 := &candidateMatch{byteOffset: 6, byteMatchSz: 4}
213	match_10_16 := &candidateMatch{byteOffset: 10, byteMatchSz: 6}
214	match_19_42 := &candidateMatch{byteOffset: 19, byteMatchSz: 23}
215	match_45_48 := &candidateMatch{byteOffset: 45, byteMatchSz: 3}
216	match_71_72 := &candidateMatch{byteOffset: 71, byteMatchSz: 1}
217
218	cases := []struct {
219		candidateMatches []*candidateMatch
220		numContextLines  int
221		want             []candidateChunk
222	}{{
223		candidateMatches: []*candidateMatch{match_0_2},
224		numContextLines:  0,
225		want: []candidateChunk{{
226			firstLine:  1,
227			minOffset:  0,
228			lastLine:   1,
229			maxOffset:  2,
230			candidates: []*candidateMatch{match_0_2},
231		}},
232	}, {
233		candidateMatches: []*candidateMatch{match_0_2},
234		numContextLines:  5,
235		want: []candidateChunk{{
236			firstLine:  1,
237			minOffset:  0,
238			lastLine:   1,
239			maxOffset:  2,
240			candidates: []*candidateMatch{match_0_2},
241		}},
242	}, {
243		candidateMatches: []*candidateMatch{match_0_2, match_6_10},
244		numContextLines:  0,
245		want: []candidateChunk{{
246			firstLine:  1,
247			minOffset:  0,
248			lastLine:   1,
249			maxOffset:  10,
250			candidates: []*candidateMatch{match_0_2, match_6_10},
251		}},
252	}, {
253		candidateMatches: []*candidateMatch{match_0_2, match_10_16},
254		numContextLines:  0,
255		want: []candidateChunk{{
256			firstLine:  1,
257			minOffset:  0,
258			lastLine:   2,
259			maxOffset:  16,
260			candidates: []*candidateMatch{match_0_2, match_10_16},
261		}},
262	}, {
263		candidateMatches: []*candidateMatch{match_0_2, match_19_42},
264		numContextLines:  0,
265		want: []candidateChunk{{
266			firstLine:  1,
267			minOffset:  0,
268			lastLine:   1,
269			maxOffset:  2,
270			candidates: []*candidateMatch{match_0_2},
271		}, {
272			firstLine:  2,
273			minOffset:  19,
274			lastLine:   4,
275			maxOffset:  42,
276			candidates: []*candidateMatch{match_19_42},
277		}},
278	}, {
279		candidateMatches: []*candidateMatch{match_0_2, match_19_42},
280		numContextLines:  1,
281		want: []candidateChunk{{
282			firstLine:  1,
283			minOffset:  0,
284			lastLine:   4,
285			maxOffset:  42,
286			candidates: []*candidateMatch{match_0_2, match_19_42},
287		}},
288	}, {
289		candidateMatches: []*candidateMatch{
290			match_0_2, match_19_42, match_45_48, match_71_72,
291		},
292		numContextLines: 0,
293		want: []candidateChunk{{
294			firstLine:  1,
295			minOffset:  0,
296			lastLine:   1,
297			maxOffset:  2,
298			candidates: []*candidateMatch{match_0_2},
299		}, {
300			firstLine:  2,
301			minOffset:  19,
302			lastLine:   4,
303			maxOffset:  48,
304			candidates: []*candidateMatch{match_19_42, match_45_48},
305		}, {
306			firstLine:  6,
307			minOffset:  71,
308			lastLine:   6,
309			maxOffset:  72,
310			candidates: []*candidateMatch{match_71_72},
311		}},
312	}, {
313		candidateMatches: []*candidateMatch{
314			match_0_2, match_19_42, match_45_48, match_71_72,
315		},
316		numContextLines: 100,
317		want: []candidateChunk{{
318			firstLine:  1,
319			minOffset:  0,
320			lastLine:   6,
321			maxOffset:  72,
322			candidates: []*candidateMatch{match_0_2, match_19_42, match_45_48, match_71_72},
323		}},
324	}}
325
326	newlines := getNewlines(content)
327	for _, tt := range cases {
328		t.Run("", func(t *testing.T) {
329			got := chunkCandidates(tt.candidateMatches, newlines, tt.numContextLines)
330			if diff := cmp.Diff(fmt.Sprintf("%#v\n", tt.want), fmt.Sprintf("%#v\n", got)); diff != "" {
331				t.Fatal(diff)
332			}
333		})
334	}
335}
336
337func BenchmarkColumnHelper(b *testing.B) {
338	// We simulate looking up columns of evenly spaced matches
339	const matches = 10_000
340	const match = "match"
341	const space = "         "
342	const dist = uint32(len(match) + len(space))
343	data := bytes.Repeat([]byte(match+space), matches)
344
345	b.ResetTimer()
346
347	for i := 0; i < b.N; i++ {
348		columnHelper := columnHelper{data: data}
349
350		lineOffset := 0
351		offset := uint32(0)
352		for offset < uint32(len(data)) {
353			col := columnHelper.get(lineOffset, offset)
354			if col != offset+1 {
355				b.Fatal("column is not offset even though data is ASCII")
356			}
357			offset += dist
358		}
359	}
360}
361
362func TestColumnHelper(t *testing.T) {
363	f := func(line0, line1 string) bool {
364		data := []byte(line0 + line1)
365		lineOffset := len(line0)
366
367		columnHelper := columnHelper{data: data}
368
369		// We check every second rune returns the correct answer
370		offset := lineOffset
371		column := 1
372		for offset < len(data) {
373			if column%2 == 0 {
374				got := columnHelper.get(lineOffset, uint32(offset))
375				if got != uint32(column) {
376					return false
377				}
378			}
379			_, size := utf8.DecodeRune(data[offset:])
380			offset += size
381			column++
382		}
383
384		return true
385	}
386
387	if err := quick.Check(f, nil); err != nil {
388		t.Fatal(err)
389	}
390
391	// Corner cases
392
393	// empty data, shouldn't happen but just in case it slips through
394	ch := columnHelper{data: nil}
395	if got := ch.get(0, 0); got != 1 {
396		t.Fatal("empty data didn't return 1", got)
397	}
398
399	// Repeating a call to get should return the same value
400	// empty data, shouldn't happen but just in case it slips through
401	ch = columnHelper{data: []byte("hello\nworld")}
402	if got := ch.get(6, 8); got != 3 {
403		t.Fatal("unexpected value for third column on second line", got)
404	}
405	if got := ch.get(6, 8); got != 3 {
406		t.Fatal("unexpected value for repeated call for third column on second line", got)
407	}
408
409	// Now make sure if we go backwards we do not incorrectly use the cache
410	if got := ch.get(6, 6); got != 1 {
411		t.Fatal("unexpected value for backwards call for first column on second line", got)
412	}
413}
414
415func TestFindMaxOverlappingSection(t *testing.T) {
416	secs := []DocumentSection{
417		{Start: 0, End: 5},
418		{Start: 8, End: 19},
419		{Start: 22, End: 26},
420	}
421	// 012345678901234567890123456
422	// [....[
423	//         [..........[
424	//                       [...[
425
426	testcases := []struct {
427		name        string
428		off         uint32
429		sz          uint32
430		wantSecIdx  uint32
431		wantOverlap bool
432	}{
433		{off: 0, sz: 1, wantSecIdx: 0, wantOverlap: true},
434		{off: 0, sz: 5, wantSecIdx: 0, wantOverlap: true},
435		{off: 2, sz: 5, wantSecIdx: 0, wantOverlap: true},
436		{off: 2, sz: 50, wantSecIdx: 1, wantOverlap: true},
437		{off: 4, sz: 10, wantSecIdx: 1, wantOverlap: true},
438		{off: 5, sz: 15, wantSecIdx: 1, wantOverlap: true},
439		{off: 18, sz: 10, wantSecIdx: 2, wantOverlap: true},
440
441		// Prefer full overlap, break ties by preferring the earlier section
442		{off: 10, sz: 20, wantSecIdx: 2, wantOverlap: true},
443		{off: 0, sz: 100, wantSecIdx: 0, wantOverlap: true},
444		{off: 8, sz: 100, wantSecIdx: 1, wantOverlap: true},
445		{off: 0, sz: 10, wantSecIdx: 0, wantOverlap: true},
446		{off: 16, sz: 10, wantSecIdx: 2, wantOverlap: true},
447
448		// No overlap
449		{off: 5, sz: 2, wantOverlap: false},
450		{off: 20, sz: 1, wantOverlap: false},
451		{off: 99, sz: 1, wantOverlap: false},
452		{off: 0, sz: 0, wantOverlap: false},
453	}
454
455	for _, tt := range testcases {
456		t.Run(fmt.Sprintf("off=%d size=%d", tt.off, tt.sz), func(t *testing.T) {
457			got, haveOverlap := findMaxOverlappingSection(secs, tt.off, tt.sz)
458			if haveOverlap != tt.wantOverlap {
459				t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap)
460			}
461			if got != tt.wantSecIdx {
462				t.Fatalf("expected section %d, got %d", tt.wantSecIdx, got)
463			}
464		})
465	}
466}
Configure Feed

Configure Feed