···16161717import (
1818 "encoding/binary"
1919+ "errors"
1920 "fmt"
2021 "hash/crc64"
2122 "log"
const maxUInt32 = 0xffffffff

// min2Index returns the index of the smallest value in xs (idx0) and the
// index of the second smallest value (idx1).
//
// Comparisons use <=, so when values tie the later element wins, and
// elements equal to maxUInt32 can still be selected. For an empty or a
// single-element slice both results are 0; callers must check len(xs) before
// using the results as indexes.
func min2Index(xs []uint32) (idx0, idx1 int) {
	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
	for i, x := range xs {
		switch {
		case x <= min0:
			// New minimum: the previous minimum becomes the runner-up.
			idx0, idx1 = i, idx0
			min0, min1 = x, min0
		case x <= min1:
			idx1 = i
			min1 = x
		}
	}
	return idx0, idx1
}
336339337337-func lastMinarg(xs []uint32) uint32 {
338338- m := uint32(maxUInt32)
339339- j := len(xs)
340340- for i, x := range xs {
341341- if x <= m {
342342- m = x
343343- j = i
340340+// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
341341+// the distance iterator. If they have the same frequency, we maximise the
342342+// distance between them. first will always have a smaller index than last.
343343+func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
344344+ firstI, lastI := min2Index(frequencies)
345345+ // If the frequencies are equal lets maximise distance in the query
346346+ // string. This optimization normally triggers for long repeated trigrams
347347+ // in a string, eg a query like "AAAAA..."
348348+ if frequencies[firstI] == frequencies[lastI] {
349349+ for i, freq := range frequencies {
350350+ if freq != frequencies[firstI] {
351351+ continue
352352+ }
353353+ if ngramOffs[i].index < ngramOffs[firstI].index {
354354+ firstI = i
355355+ }
356356+ if ngramOffs[i].index > ngramOffs[lastI].index {
357357+ lastI = i
358358+ }
344359 }
345360 }
346346- return uint32(j)
361361+ first = ngramOffs[firstI]
362362+ last = ngramOffs[lastI]
363363+ // Ensure first appears before last to make distance logic below clean.
364364+ if first.index > last.index {
365365+ last, first = first, last
366366+ }
367367+ return first, last
347368}
348369349370func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
···382403383404 // Find the 2 least common ngrams from the string.
384405 ngramOffs := splitNGrams([]byte(query.Pattern))
406406+407407+ // protect against accidental searching of empty strings
408408+ if len(ngramOffs) == 0 {
409409+ return nil, errors.New("iterateNgrams needs non empty string")
410410+ }
411411+385412 // PERF: Sort to increase the chances adjacent checks are in the same btree
386413 // bucket (which can cause disk IO).
387414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
···415442 frequencies = append(frequencies, freq)
416443 }
417444418418- var first, last runeNgramOff
419419- {
420420- firstI := firstMinarg(frequencies)
421421- frequencies[firstI] = maxUInt32
422422- lastI := lastMinarg(frequencies)
423423- first = ngramOffs[firstI]
424424- last = ngramOffs[lastI]
425425- if first.index > last.index {
426426- last, first = first, last
427427- }
428428- }
445445+ // first and last are now the smallest trigram posting lists to iterate
446446+ // through.
447447+ first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
429448430449 iter := &ngramDocIterator{
431450 leftPad: first.index,
+79
indexdata_test.go
···11+package zoekt
22+33+import (
44+ "math/rand"
55+ "reflect"
66+ "testing"
77+ "testing/quick"
88+99+ "golang.org/x/exp/slices"
1010+)
1111+1212+const exampleQuery = "const data: Event = { ...JSON.parse(message.data), type: message.event }"
1313+1414+func genFrequencies(ngramOffs []runeNgramOff, max int) []uint32 {
1515+ seen := map[ngram]uint32{}
1616+ var frequencies []uint32
1717+ for _, n := range ngramOffs {
1818+ freq, ok := seen[n.ngram]
1919+ if !ok {
2020+ freq = uint32(rand.Intn(max))
2121+ seen[n.ngram] = freq
2222+ }
2323+ frequencies = append(frequencies, freq)
2424+ }
2525+ return frequencies
2626+}
2727+2828+func BenchmarkMinFrequencyNgramOffsets(b *testing.B) {
2929+ ngramOffs := splitNGrams([]byte(exampleQuery))
3030+ slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
3131+ return a.ngram < b.ngram
3232+ })
3333+ frequencies := genFrequencies(ngramOffs, 100)
3434+ for i := 0; i < b.N; i++ {
3535+ x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies)
3636+ if x0 == x1 {
3737+ b.Fatal("should not be the same")
3838+ }
3939+ }
4040+}
4141+4242+func TestMinFrequencyNgramOffsets(t *testing.T) {
4343+ // Our implementation has ill-defined tie breaks when the 2nd smallest
4444+ // frequency can be tied with others. Fixing that would make the CPU perf
4545+ // worse, so what we do instead is just validate that what we get back is
4646+ // acceptable.
4747+ if err := quick.Check(func(s string, maxFreq uint16) bool {
4848+ ngramOffs := splitNGrams([]byte(s))
4949+ if len(ngramOffs) == 0 {
5050+ return true
5151+ }
5252+5353+ slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
5454+ return a.ngram < b.ngram
5555+ })
5656+ frequencies := genFrequencies(ngramOffs, int(maxFreq))
5757+ x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies)
5858+5959+ if x0.index > x1.index {
6060+ t.Log("x0 should be before x1")
6161+ return false
6262+ }
6363+6464+ if len(ngramOffs) <= 1 {
6565+ return true
6666+ }
6767+6868+ // Now we just assert that we found two items with the smallest
6969+ // frequencies.
7070+ idx0 := slices.Index[runeNgramOff](ngramOffs, x0)
7171+ idx1 := slices.Index[runeNgramOff](ngramOffs, x1)
7272+ start := []uint32{frequencies[idx0], frequencies[idx1]}
7373+ slices.Sort(start)
7474+ slices.Sort(frequencies)
7575+ return reflect.DeepEqual(start, frequencies[:2])
7676+ }, nil); err != nil {
7777+ t.Fatal(err)
7878+ }
7979+}