fork of https://github.com/sourcegraph/zoekt
1package index
2
3import (
4 "math/rand"
5 "reflect"
6 "slices"
7 "testing"
8 "testing/quick"
9)
10
11const exampleQuery = "const data: Event = { ...JSON.parse(message.data), type: message.event }"
12
13func genFrequencies(ngramOffs []runeNgramOff, max int) []uint32 {
14 seen := map[ngram]uint32{}
15 var frequencies []uint32
16 for _, n := range ngramOffs {
17 freq, ok := seen[n.ngram]
18 if !ok {
19 freq = uint32(rand.Intn(max))
20 seen[n.ngram] = freq
21 }
22 frequencies = append(frequencies, freq)
23 }
24 return frequencies
25}
26
27func BenchmarkMinFrequencyNgramOffsets(b *testing.B) {
28 ngramOffs := splitNGrams([]byte(exampleQuery))
29 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
30 frequencies := genFrequencies(ngramOffs, 100)
31 for i := 0; i < b.N; i++ {
32 x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies)
33 if x0 == x1 {
34 b.Fatal("should not be the same")
35 }
36 }
37}
38
39func TestMinFrequencyNgramOffsets(t *testing.T) {
40 // Our implementation has ill-defined tie breaks when the 2nd smallest
41 // frequency can be tied with others. Fixing that would make the CPU perf
42 // worse, so what we do instead is just validate that what we get back is
43 // acceptable.
44 if err := quick.Check(func(s string, maxFreq uint16) bool {
45 // Ensure maximum frequency is nonzero so that random sampling will work
46 maxFreq = max(maxFreq, 1)
47
48 ngramOffs := splitNGrams([]byte(s))
49 if len(ngramOffs) == 0 {
50 return true
51 }
52
53 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
54 frequencies := genFrequencies(ngramOffs, int(maxFreq))
55 x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies)
56
57 if x0.index > x1.index {
58 t.Log("x0 should be before x1")
59 return false
60 }
61
62 if len(ngramOffs) <= 1 {
63 return true
64 }
65
66 // Now we just assert that we found two items with the smallest
67 // frequencies.
68 idx0 := slices.IndexFunc(ngramOffs, func(a runeNgramOff) bool { return a == x0 })
69 idx1 := slices.IndexFunc(ngramOffs, func(a runeNgramOff) bool { return a == x1 })
70 start := []uint32{frequencies[idx0], frequencies[idx1]}
71 slices.Sort(start)
72 slices.Sort(frequencies)
73 return reflect.DeepEqual(start, frequencies[:2])
74 }, nil); err != nil {
75 t.Fatal(err)
76 }
77}
78
79func TestFindSelectiveNGrams(t *testing.T) {
80 if err := quick.Check(func(s string, maxFreq uint16) bool {
81 // Ensure maximum frequency is nonzero so that random sampling will work
82 maxFreq = max(maxFreq, 1)
83
84 ngramOffs := splitNGrams([]byte(s))
85 if len(ngramOffs) == 0 {
86 return true
87 }
88
89 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
90 indexMap := make([]int, len(ngramOffs))
91 for i, n := range ngramOffs {
92 indexMap[n.index] = i
93 }
94
95 frequencies := genFrequencies(ngramOffs, int(maxFreq))
96 x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
97
98 if len(ngramOffs) <= 1 {
99 return true
100 }
101
102 // Just assert the invariant that x0 is before x1. This test mostly checks
103 // for out-of-bounds errors.
104 return x0.index < x1.index
105 }, nil); err != nil {
106 t.Fatal(err)
107 }
108}