fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Avoid overlapping trigrams in distanceHitIterator (#779)

We select the two least frequent trigrams to create the candidate match
iterator. It's common for these trigrams to overlap. This change shifts the
first and last trigrams to avoid overlap, which is guaranteed to result in a
smaller intersection. For frequent terms, this can substantially reduce the
number of candidate matches we consider.

+68 -16
+2 -2
bits.go
··· 110 110 type runeNgramOff struct { 111 111 ngram ngram 112 112 // index is the original index inside of the returned array of splitNGrams 113 - index uint32 113 + index int 114 114 } 115 115 116 116 func (a runeNgramOff) Compare(b runeNgramOff) int { ··· 149 149 ng := runesToNGram(runeGram) 150 150 result = append(result, runeNgramOff{ 151 151 ngram: ng, 152 - index: uint32(len(result)), 152 + index: len(result), 153 153 }) 154 154 } 155 155 return result
+2 -2
index_test.go
··· 441 441 Want: Stats{ 442 442 FilesLoaded: 1, 443 443 ContentBytesLoaded: 22, 444 - IndexBytesLoaded: 8, 444 + IndexBytesLoaded: 10, 445 445 NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND 446 446 NgramLookups: 104, 447 447 MatchCount: 2, ··· 556 556 }}, 557 557 Want: Stats{ 558 558 ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents 559 - IndexBytesLoaded: 8, 559 + IndexBytesLoaded: 10, 560 560 FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index 561 561 FilesLoaded: 2, 562 562 MatchCount: 0, // even though there is a match it doesn't align with a symbol
+36 -12
indexdata.go
··· 336 336 return 337 337 } 338 338 339 - // minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to 340 - // the distance iterator. If they have the same frequency, we maximise the 341 - // distance between them. first will always have a smaller index than last. 339 + // findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 340 + // produce a small file intersection. It finds the two lowest frequency ngrams, making 341 + // sure to maximize the distance between them in case of ties. It avoids overlapping 342 + // trigrams to keep their intersection as small as possible. 343 + // 344 + // Invariant: first will always have a smaller index than last. 345 + func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 346 + first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 347 + 348 + // If the trigrams are overlapping, then try to shift one to reduce overlap. 349 + // This is guaranteed to produce a smaller intersection. 350 + if last.index-first.index < ngramSize { 351 + newFirstIndex := max(last.index-ngramSize, 0) 352 + if newFirstIndex != first.index { 353 + first = ngramOffs[indexMap[newFirstIndex]] 354 + } 355 + 356 + newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 357 + if newLastIndex != last.index { 358 + last = ngramOffs[indexMap[newLastIndex]] 359 + } 360 + } 361 + return 362 + } 363 + 342 364 func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 343 365 firstI, lastI := min2Index(frequencies) 344 366 // If the frequencies are equal lets maximise distance in the query ··· 357 379 } 358 380 } 359 381 } 382 + 360 383 first = ngramOffs[firstI] 361 384 last = ngramOffs[lastI] 362 - // Ensure first appears before last to make distance logic below clean. 385 + 386 + // Ensure first appears before last as a helpful invariant. 363 387 if first.index > last.index { 364 388 last, first = first, last 365 389 } 366 - return first, last 390 + return 367 391 } 368 392 369 393 func (data *indexData) ngrams(filename bool) btreeIndex { ··· 412 436 // bucket (which can cause disk IO). 413 437 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 414 438 frequencies := make([]uint32, 0, len(ngramOffs)) 439 + indexMap := make([]int, len(ngramOffs)) 415 440 ngramLookups := 0 416 441 ngrams := d.ngrams(query.FileName) 417 - for _, o := range ngramOffs { 442 + for i, o := range ngramOffs { 418 443 var freq uint32 419 444 if query.CaseSensitive { 420 445 freq = ngrams.Get(o.ngram).sz ··· 438 463 } 439 464 440 465 frequencies = append(frequencies, freq) 466 + indexMap[o.index] = i 441 467 } 442 468 443 - // first and last are now the smallest trigram posting lists to iterate 444 - // through. 445 - first, last := minFrequencyNgramOffsets(ngramOffs, frequencies) 469 + first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 446 470 447 471 iter := &ngramDocIterator{ 448 - leftPad: first.index, 449 - rightPad: uint32(utf8.RuneCountInString(str)) - first.index, 472 + leftPad: uint32(first.index), 473 + rightPad: uint32(utf8.RuneCountInString(str) - first.index), 450 474 ngramLookups: ngramLookups, 451 475 } 452 476 if query.FileName { ··· 456 480 } 457 481 458 482 if first != last { 459 - runeDist := last.index - first.index 483 + runeDist := uint32(last.index - first.index) 460 484 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 461 485 if err != nil { 462 486 return nil, err
+28
indexdata_test.go
··· 72 72 t.Fatal(err) 73 73 } 74 74 } 75 + 76 + func TestFindSelectiveNGrams(t *testing.T) { 77 + if err := quick.Check(func(s string, maxFreq uint16) bool { 78 + ngramOffs := splitNGrams([]byte(s)) 79 + if len(ngramOffs) == 0 { 80 + return true 81 + } 82 + 83 + slices.SortFunc(ngramOffs, runeNgramOff.Compare) 84 + indexMap := make([]int, len(ngramOffs)) 85 + for i, n := range ngramOffs { 86 + indexMap[n.index] = i 87 + } 88 + 89 + frequencies := genFrequencies(ngramOffs, int(maxFreq)) 90 + x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 91 + 92 + if len(ngramOffs) <= 1 { 93 + return true 94 + } 95 + 96 + // Just assert the invariant that x0 is before x1. This test mostly checks 97 + // for out-of-bounds errors. 98 + return x0.index < x1.index 99 + }, nil); err != nil { 100 + t.Fatal(err) 101 + } 102 + }