fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

maximise distance between ngrams (#618)

This reintroduces the optimization that we lost when we began sorting ngrams.

+124 -26
+45 -26
indexdata.go
··· 16 16 17 17 import ( 18 18 "encoding/binary" 19 + "errors" 19 20 "fmt" 20 21 "hash/crc64" 21 22 "log" ··· 322 323 323 324 const maxUInt32 = 0xffffffff 324 325 325 - func firstMinarg(xs []uint32) uint32 { 326 - m := uint32(maxUInt32) 327 - j := len(xs) 326 + func min2Index(xs []uint32) (idx0, idx1 int) { 327 + min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 328 328 for i, x := range xs { 329 - if x < m { 330 - m = x 331 - j = i 329 + if x <= min0 { 330 + idx0, idx1 = i, idx0 331 + min0, min1 = x, min0 332 + } else if x <= min1 { 333 + idx1 = i 334 + min1 = x 332 335 } 333 336 } 334 - return uint32(j) 337 + return 335 338 } 336 339 337 - func lastMinarg(xs []uint32) uint32 { 338 - m := uint32(maxUInt32) 339 - j := len(xs) 340 - for i, x := range xs { 341 - if x <= m { 342 - m = x 343 - j = i 340 + // minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to 341 + // the distance iterator. If they have the same frequency, we maximise the 342 + // distance between them. first will always have a smaller index than last. 343 + func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 344 + firstI, lastI := min2Index(frequencies) 345 + // If the frequencies are equal lets maximise distance in the query 346 + // string. This optimization normally triggers for long repeated trigrams 347 + // in a string, eg a query like "AAAAA..." 348 + if frequencies[firstI] == frequencies[lastI] { 349 + for i, freq := range frequencies { 350 + if freq != frequencies[firstI] { 351 + continue 352 + } 353 + if ngramOffs[i].index < ngramOffs[firstI].index { 354 + firstI = i 355 + } 356 + if ngramOffs[i].index > ngramOffs[lastI].index { 357 + lastI = i 358 + } 344 359 } 345 360 } 346 - return uint32(j) 361 + first = ngramOffs[firstI] 362 + last = ngramOffs[lastI] 363 + // Ensure first appears before last to make distance logic below clean. 
364 + if first.index > last.index { 365 + last, first = first, last 366 + } 367 + return first, last 347 368 } 348 369 349 370 func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { ··· 382 403 383 404 // Find the 2 least common ngrams from the string. 384 405 ngramOffs := splitNGrams([]byte(query.Pattern)) 406 + 407 + // protect against accidental searching of empty strings 408 + if len(ngramOffs) == 0 { 409 + return nil, errors.New("iterateNgrams needs non empty string") 410 + } 411 + 385 412 // PERF: Sort to increase the chances adjacent checks are in the same btree 386 413 // bucket (which can cause disk IO). 387 414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { ··· 415 442 frequencies = append(frequencies, freq) 416 443 } 417 444 418 - var first, last runeNgramOff 419 - { 420 - firstI := firstMinarg(frequencies) 421 - frequencies[firstI] = maxUInt32 422 - lastI := lastMinarg(frequencies) 423 - first = ngramOffs[firstI] 424 - last = ngramOffs[lastI] 425 - if first.index > last.index { 426 - last, first = first, last 427 - } 428 - } 445 + // first and last are now the smallest trigram posting lists to iterate 446 + // through. 447 + first, last := minFrequencyNgramOffsets(ngramOffs, frequencies) 429 448 430 449 iter := &ngramDocIterator{ 431 450 leftPad: first.index,
+79
indexdata_test.go
··· 1 + package zoekt 2 + 3 + import ( 4 + "math/rand" 5 + "reflect" 6 + "testing" 7 + "testing/quick" 8 + 9 + "golang.org/x/exp/slices" 10 + ) 11 + 12 + const exampleQuery = "const data: Event = { ...JSON.parse(message.data), type: message.event }" 13 + 14 + func genFrequencies(ngramOffs []runeNgramOff, max int) []uint32 { 15 + seen := map[ngram]uint32{} 16 + var frequencies []uint32 17 + for _, n := range ngramOffs { 18 + freq, ok := seen[n.ngram] 19 + if !ok { 20 + freq = uint32(rand.Intn(max)) 21 + seen[n.ngram] = freq 22 + } 23 + frequencies = append(frequencies, freq) 24 + } 25 + return frequencies 26 + } 27 + 28 + func BenchmarkMinFrequencyNgramOffsets(b *testing.B) { 29 + ngramOffs := splitNGrams([]byte(exampleQuery)) 30 + slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 31 + return a.ngram < b.ngram 32 + }) 33 + frequencies := genFrequencies(ngramOffs, 100) 34 + for i := 0; i < b.N; i++ { 35 + x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies) 36 + if x0 == x1 { 37 + b.Fatal("should not be the same") 38 + } 39 + } 40 + } 41 + 42 + func TestMinFrequencyNgramOffsets(t *testing.T) { 43 + // Our implementation has ill-defined tie breaks when the 2nd smallest 44 + // frequency can be tied with others. Fixing that would make the CPU perf 45 + // worse, so what we do instead is just validate that what we get back is 46 + // acceptable. 
47 + if err := quick.Check(func(s string, maxFreq uint16) bool { 48 + ngramOffs := splitNGrams([]byte(s)) 49 + if len(ngramOffs) == 0 { 50 + return true 51 + } 52 + 53 + slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 54 + return a.ngram < b.ngram 55 + }) 56 + frequencies := genFrequencies(ngramOffs, int(maxFreq)) 57 + x0, x1 := minFrequencyNgramOffsets(ngramOffs, frequencies) 58 + 59 + if x0.index > x1.index { 60 + t.Log("x0 should be before x1") 61 + return false 62 + } 63 + 64 + if len(ngramOffs) <= 1 { 65 + return true 66 + } 67 + 68 + // Now we just assert that we found two items with the smallest 69 + // frequencies. 70 + idx0 := slices.Index[runeNgramOff](ngramOffs, x0) 71 + idx1 := slices.Index[runeNgramOff](ngramOffs, x1) 72 + start := []uint32{frequencies[idx0], frequencies[idx1]} 73 + slices.Sort(start) 74 + slices.Sort(frequencies) 75 + return reflect.DeepEqual(start, frequencies[:2]) 76 + }, nil); err != nil { 77 + t.Fatal(err) 78 + } 79 + }