contentprovider_test.go at 59ab9491e1337dda8f1123048dce0ecd2877fa8c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider_test.go
at 59ab9491e1337dda8f1123048dce0ecd2877fa8c 12 kB View raw
Stefan Hengl ranking: allow partial overlap with symbol (#742) 2y ago
  1package zoekt
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"testing"
  7	"testing/quick"
  8	"unicode/utf8"
  9
 10	"github.com/google/go-cmp/cmp"
 11)
 12
 13func getNewlines(data []byte) newlines {
 14	var locs []uint32
 15	for i, c := range data {
 16		if c == '\n' {
 17			locs = append(locs, uint32(i))
 18		}
 19	}
 20	return newlines{
 21		locs:     locs,
 22		fileSize: uint32(len(data)),
 23	}
 24}
 25
 26func TestGetLines(t *testing.T) {
 27	contents := [][]byte{
 28		[]byte("one\ntwo\nthree\nfour"),
 29		[]byte("one\ntwo\nthree\nfour\n"),
 30		[]byte("one"),
 31		[]byte(""),
 32	}
 33
 34	for _, content := range contents {
 35		t.Run("", func(t *testing.T) {
 36			newLines := getNewlines(content)
 37			// Trim the last newline before splitting because a trailing newline does not constitute a new line
 38			lines := bytes.Split(bytes.TrimSuffix(content, []byte{'\n'}), []byte{'\n'})
 39			wantGetLines := func(low, high int) []byte {
 40				low--
 41				high--
 42				if low < 0 {
 43					low = 0
 44				}
 45				if low >= len(lines) {
 46					return nil
 47				}
 48				if high <= 0 {
 49					return nil
 50				}
 51				if high > len(lines) {
 52					high = len(lines)
 53				}
 54				return bytes.Join(lines[low:high], []byte{'\n'})
 55			}
 56
 57			for low := -1; low <= len(lines)+2; low++ {
 58				for high := low; high <= len(lines)+2; high++ {
 59					want := wantGetLines(low, high)
 60					got := newLines.getLines(content, low, high)
 61					if d := cmp.Diff(string(want), string(got)); d != "" {
 62						t.Fatal(d)
 63					}
 64				}
 65			}
 66		})
 67	}
 68}
 69
 70func TestAtOffset(t *testing.T) {
 71	cases := []struct {
 72		data       []byte
 73		offset     uint32
 74		lineNumber int
 75		lineStart  int
 76		lineEnd    int
 77	}{{
 78		data:       []byte("0.2.4.\n7.9.11.\n"),
 79		offset:     0,
 80		lineNumber: 1, lineStart: 0, lineEnd: 6,
 81	}, {
 82		data:       []byte("0.2.4.\n7.9.11.\n"),
 83		offset:     6,
 84		lineNumber: 1, lineStart: 0, lineEnd: 6,
 85	}, {
 86		data:       []byte("0.2.4.\n7.9.11.\n"),
 87		offset:     2,
 88		lineNumber: 1, lineStart: 0, lineEnd: 6,
 89	}, {
 90		data:       []byte("0.2.4.\n7.9.11.\n"),
 91		offset:     2,
 92		lineNumber: 1, lineStart: 0, lineEnd: 6,
 93	}, {
 94		data:       []byte("0.2.4.\n7.9.11.\n"),
 95		offset:     7,
 96		lineNumber: 2, lineStart: 7, lineEnd: 14,
 97	}, {
 98		data:       []byte("0.2.4.\n7.9.11.\n"),
 99		offset:     11,
100		lineNumber: 2, lineStart: 7, lineEnd: 14,
101	}, {
102		data:       []byte("0.2.4.\n7.9.11.\n"),
103		offset:     15,
104		lineNumber: 3, lineStart: 15, lineEnd: 15,
105	}, {
106		data:       []byte("0.2.4.\n7.9.11."),
107		offset:     7,
108		lineNumber: 2, lineStart: 7, lineEnd: 14,
109	}, {
110		data:       []byte("\n\n"),
111		offset:     0,
112		lineNumber: 1, lineStart: 0, lineEnd: 0,
113	}, {
114		data:       []byte("\n\n"),
115		offset:     1,
116		lineNumber: 2, lineStart: 1, lineEnd: 1,
117	}, {
118		data:       []byte("\n\n"),
119		offset:     3,
120		lineNumber: 3, lineStart: 2, lineEnd: 2,
121	}, {
122		data:       []byte("line with no newlines"),
123		offset:     3,
124		lineNumber: 1, lineStart: 0, lineEnd: 21,
125	}}
126
127	for _, tt := range cases {
128		t.Run("", func(t *testing.T) {
129			nls := getNewlines(tt.data)
130			gotLineNumber, gotLineStart, gotLineEnd := nls.atOffset(tt.offset)
131			if gotLineNumber != tt.lineNumber {
132				t.Fatalf("expected line number %d, got %d", tt.lineNumber, gotLineNumber)
133			}
134			if gotLineStart != tt.lineStart {
135				t.Fatalf("expected line start %d, got %d", tt.lineStart, gotLineStart)
136			}
137			if gotLineEnd != tt.lineEnd {
138				t.Fatalf("expected line end %d, got %d", tt.lineEnd, gotLineEnd)
139			}
140		})
141	}
142}
143
144func TestLineBounds(t *testing.T) {
145	cases := []struct {
146		data       []byte
147		lineNumber int
148		start      uint32
149		end        uint32
150	}{{
151		data:       []byte("0.2.4.\n7.9.11.\n"),
152		lineNumber: 1,
153		start:      0, end: 6,
154	}, {
155		data:       []byte("0.2.4.\n7.9.11.\n"),
156		lineNumber: 2,
157		start:      7, end: 14,
158	}, {
159		data:       []byte("0.2.4.\n7.9.11.\n"),
160		lineNumber: 0,
161		start:      0, end: 0,
162	}, {
163		data:       []byte("0.2.4.\n7.9.11.\n"),
164		lineNumber: -1,
165		start:      0, end: 0,
166	}, {
167		data:       []byte("0.2.4.\n7.9.11.\n"),
168		lineNumber: 202002,
169		start:      15, end: 15,
170	}, {
171		data:       []byte("\n\n"),
172		lineNumber: 1,
173		start:      0, end: 0,
174	}, {
175		data:       []byte("\n\n"),
176		lineNumber: 2,
177		start:      1, end: 1,
178	}, {
179		data:       []byte("\n\n"),
180		lineNumber: 3,
181		start:      2, end: 2,
182	}}
183
184	for _, tt := range cases {
185		t.Run("", func(t *testing.T) {
186			nls := getNewlines(tt.data)
187			gotStart, gotEnd := nls.lineBounds(tt.lineNumber)
188			if gotStart != tt.start {
189				t.Fatalf("expected line start %d, got %d", tt.start, gotStart)
190			}
191			if gotEnd != tt.end {
192				t.Fatalf("expected line end %d, got %d", tt.end, gotEnd)
193			}
194		})
195	}
196}
197
198func TestChunkMatches(t *testing.T) {
199	content := []byte(`0.2.4.6.8.10.
20013.16.19.22.
20126.29.32.35.
20239.42.45.48.
20352.55.58.61.
20465.68.71.74.
20578.81.84.87.
206`)
207	match_0_2 := &candidateMatch{byteOffset: 0, byteMatchSz: 2}
208	match_6_10 := &candidateMatch{byteOffset: 6, byteMatchSz: 4}
209	match_10_16 := &candidateMatch{byteOffset: 10, byteMatchSz: 6}
210	match_19_42 := &candidateMatch{byteOffset: 19, byteMatchSz: 23}
211	match_45_48 := &candidateMatch{byteOffset: 45, byteMatchSz: 3}
212	match_71_72 := &candidateMatch{byteOffset: 71, byteMatchSz: 1}
213
214	cases := []struct {
215		candidateMatches []*candidateMatch
216		numContextLines  int
217		want             []candidateChunk
218	}{{
219		candidateMatches: []*candidateMatch{match_0_2},
220		numContextLines:  0,
221		want: []candidateChunk{{
222			firstLine:  1,
223			minOffset:  0,
224			lastLine:   1,
225			maxOffset:  2,
226			candidates: []*candidateMatch{match_0_2},
227		}},
228	}, {
229		candidateMatches: []*candidateMatch{match_0_2},
230		numContextLines:  5,
231		want: []candidateChunk{{
232			firstLine:  1,
233			minOffset:  0,
234			lastLine:   1,
235			maxOffset:  2,
236			candidates: []*candidateMatch{match_0_2},
237		}},
238	}, {
239		candidateMatches: []*candidateMatch{match_0_2, match_6_10},
240		numContextLines:  0,
241		want: []candidateChunk{{
242			firstLine:  1,
243			minOffset:  0,
244			lastLine:   1,
245			maxOffset:  10,
246			candidates: []*candidateMatch{match_0_2, match_6_10},
247		}},
248	}, {
249		candidateMatches: []*candidateMatch{match_0_2, match_10_16},
250		numContextLines:  0,
251		want: []candidateChunk{{
252			firstLine:  1,
253			minOffset:  0,
254			lastLine:   2,
255			maxOffset:  16,
256			candidates: []*candidateMatch{match_0_2, match_10_16},
257		}},
258	}, {
259		candidateMatches: []*candidateMatch{match_0_2, match_19_42},
260		numContextLines:  0,
261		want: []candidateChunk{{
262			firstLine:  1,
263			minOffset:  0,
264			lastLine:   1,
265			maxOffset:  2,
266			candidates: []*candidateMatch{match_0_2},
267		}, {
268			firstLine:  2,
269			minOffset:  19,
270			lastLine:   4,
271			maxOffset:  42,
272			candidates: []*candidateMatch{match_19_42},
273		}},
274	}, {
275		candidateMatches: []*candidateMatch{match_0_2, match_19_42},
276		numContextLines:  1,
277		want: []candidateChunk{{
278			firstLine:  1,
279			minOffset:  0,
280			lastLine:   4,
281			maxOffset:  42,
282			candidates: []*candidateMatch{match_0_2, match_19_42},
283		}},
284	}, {
285		candidateMatches: []*candidateMatch{
286			match_0_2, match_19_42, match_45_48, match_71_72,
287		},
288		numContextLines: 0,
289		want: []candidateChunk{{
290			firstLine:  1,
291			minOffset:  0,
292			lastLine:   1,
293			maxOffset:  2,
294			candidates: []*candidateMatch{match_0_2},
295		}, {
296			firstLine:  2,
297			minOffset:  19,
298			lastLine:   4,
299			maxOffset:  48,
300			candidates: []*candidateMatch{match_19_42, match_45_48},
301		}, {
302			firstLine:  6,
303			minOffset:  71,
304			lastLine:   6,
305			maxOffset:  72,
306			candidates: []*candidateMatch{match_71_72},
307		}},
308	}, {
309		candidateMatches: []*candidateMatch{
310			match_0_2, match_19_42, match_45_48, match_71_72,
311		},
312		numContextLines: 100,
313		want: []candidateChunk{{
314			firstLine:  1,
315			minOffset:  0,
316			lastLine:   6,
317			maxOffset:  72,
318			candidates: []*candidateMatch{match_0_2, match_19_42, match_45_48, match_71_72},
319		}},
320	}}
321
322	newlines := getNewlines(content)
323	for _, tt := range cases {
324		t.Run("", func(t *testing.T) {
325			got := chunkCandidates(tt.candidateMatches, newlines, tt.numContextLines)
326			if diff := cmp.Diff(fmt.Sprintf("%#v\n", tt.want), fmt.Sprintf("%#v\n", got)); diff != "" {
327				t.Fatal(diff)
328			}
329		})
330	}
331}
332
333func BenchmarkColumnHelper(b *testing.B) {
334	// We simulate looking up columns of evenly spaced matches
335	const matches = 10_000
336	const match = "match"
337	const space = "         "
338	const dist = uint32(len(match) + len(space))
339	data := bytes.Repeat([]byte(match+space), matches)
340
341	b.ResetTimer()
342
343	for i := 0; i < b.N; i++ {
344		columnHelper := columnHelper{data: data}
345
346		lineOffset := 0
347		offset := uint32(0)
348		for offset < uint32(len(data)) {
349			col := columnHelper.get(lineOffset, offset)
350			if col != offset+1 {
351				b.Fatal("column is not offset even though data is ASCII")
352			}
353			offset += dist
354		}
355	}
356}
357
358func TestColumnHelper(t *testing.T) {
359	f := func(line0, line1 string) bool {
360		data := []byte(line0 + line1)
361		lineOffset := len(line0)
362
363		columnHelper := columnHelper{data: data}
364
365		// We check every second rune returns the correct answer
366		offset := lineOffset
367		column := 1
368		for offset < len(data) {
369			if column%2 == 0 {
370				got := columnHelper.get(lineOffset, uint32(offset))
371				if got != uint32(column) {
372					return false
373				}
374			}
375			_, size := utf8.DecodeRune(data[offset:])
376			offset += size
377			column++
378		}
379
380		return true
381	}
382
383	if err := quick.Check(f, nil); err != nil {
384		t.Fatal(err)
385	}
386
387	// Corner cases
388
389	// empty data, shouldn't happen but just in case it slips through
390	ch := columnHelper{data: nil}
391	if got := ch.get(0, 0); got != 1 {
392		t.Fatal("empty data didn't return 1", got)
393	}
394
395	// Repeating a call to get should return the same value
396	// empty data, shouldn't happen but just in case it slips through
397	ch = columnHelper{data: []byte("hello\nworld")}
398	if got := ch.get(6, 8); got != 3 {
399		t.Fatal("unexpected value for third column on second line", got)
400	}
401	if got := ch.get(6, 8); got != 3 {
402		t.Fatal("unexpected value for repeated call for third column on second line", got)
403	}
404
405	// Now make sure if we go backwards we do not incorrectly use the cache
406	if got := ch.get(6, 6); got != 1 {
407		t.Fatal("unexpected value for backwards call for first column on second line", got)
408	}
409}
410
411func TestFindMaxOverlappingSection(t *testing.T) {
412	secs := []DocumentSection{
413		{Start: 0, End: 5},
414		{Start: 8, End: 19},
415		{Start: 22, End: 26},
416	}
417	// 012345678901234567890123456
418	// [....[
419	//         [..........[
420	//                       [...[
421
422	testcases := []struct {
423		name        string
424		off         uint32
425		sz          uint32
426		wantSecIdx  uint32
427		wantOverlap bool
428	}{
429		{off: 0, sz: 1, wantSecIdx: 0, wantOverlap: true},
430		{off: 0, sz: 5, wantSecIdx: 0, wantOverlap: true},
431		{off: 2, sz: 5, wantSecIdx: 0, wantOverlap: true},
432		{off: 2, sz: 50, wantSecIdx: 1, wantOverlap: true},
433		{off: 4, sz: 10, wantSecIdx: 1, wantOverlap: true},
434		{off: 5, sz: 15, wantSecIdx: 1, wantOverlap: true},
435		{off: 18, sz: 10, wantSecIdx: 2, wantOverlap: true},
436
437		// Prefer full overlap, break ties by preferring the earlier section
438		{off: 10, sz: 20, wantSecIdx: 2, wantOverlap: true},
439		{off: 0, sz: 100, wantSecIdx: 0, wantOverlap: true},
440		{off: 8, sz: 100, wantSecIdx: 1, wantOverlap: true},
441		{off: 0, sz: 10, wantSecIdx: 0, wantOverlap: true},
442		{off: 16, sz: 10, wantSecIdx: 2, wantOverlap: true},
443
444		// No overlap
445		{off: 5, sz: 2, wantOverlap: false},
446		{off: 20, sz: 1, wantOverlap: false},
447		{off: 99, sz: 1, wantOverlap: false},
448		{off: 0, sz: 0, wantOverlap: false},
449	}
450
451	for _, tt := range testcases {
452		t.Run(fmt.Sprintf("off=%d size=%d", tt.off, tt.sz), func(t *testing.T) {
453			got, haveOverlap := findMaxOverlappingSection(secs, tt.off, tt.sz)
454			if haveOverlap != tt.wantOverlap {
455				t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap)
456			}
457			if got != tt.wantSecIdx {
458				t.Fatalf("expected section %d, got %d", tt.wantSecIdx, got)
459			}
460		})
461	}
462}
Configure Feed

Configure Feed