Avoid overlapping trigrams in distanceHitIterator (#779) · boltless.me/zoekt@fe8f2a3

+2 -2

bits.go

··· 110 110 type runeNgramOff struct { 111 111 ngram ngram 112 112 // index is the original index inside of the returned array of splitNGrams 113 - index uint32 113 + index int 114 114 } 115 115 116 116 func (a runeNgramOff) Compare(b runeNgramOff) int { ··· 149 149 ng := runesToNGram(runeGram) 150 150 result = append(result, runeNgramOff{ 151 151 ngram: ng, 152 - index: uint32(len(result)), 152 + index: len(result), 153 153 }) 154 154 } 155 155 return result

+2 -2

index_test.go

··· 441 441 Want: Stats{ 442 442 FilesLoaded: 1, 443 443 ContentBytesLoaded: 22, 444 - IndexBytesLoaded: 8, 444 + IndexBytesLoaded: 10, 445 445 NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND 446 446 NgramLookups: 104, 447 447 MatchCount: 2, ··· 556 556 }}, 557 557 Want: Stats{ 558 558 ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents 559 - IndexBytesLoaded: 8, 559 + IndexBytesLoaded: 10, 560 560 FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index 561 561 FilesLoaded: 2, 562 562 MatchCount: 0, // even though there is a match it doesn't align with a symbol

+36 -12

indexdata.go

··· 336 336 return 337 337 } 338 338 339 - // minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to 340 - // the distance iterator. If they have the same frequency, we maximise the 341 - // distance between them. first will always have a smaller index than last. 339 + // findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 340 + // produce a small file intersection. It finds the two lowest frequency ngrams, making 341 + // sure to maximize the distance between them in case of ties. It avoids overlapping 342 + // trigrams to keep their intersection as small as possible. 343 + // 344 + // Invariant: first will always have a smaller index than last. 345 + func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 346 + first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 347 + 348 + // If the trigrams are overlapping, then try to shift one to reduce overlap. 349 + // This is guaranteed to produce a smaller intersection. 350 + if last.index-first.index < ngramSize { 351 + newFirstIndex := max(last.index-ngramSize, 0) 352 + if newFirstIndex != first.index { 353 + first = ngramOffs[indexMap[newFirstIndex]] 354 + } 355 + 356 + newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 357 + if newLastIndex != last.index { 358 + last = ngramOffs[indexMap[newLastIndex]] 359 + } 360 + } 361 + return 362 + } 363 + 342 364 func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 343 365 firstI, lastI := min2Index(frequencies) 344 366 // If the frequencies are equal lets maximise distance in the query ··· 357 379 } 358 380 } 359 381 } 382 + 360 383 first = ngramOffs[firstI] 361 384 last = ngramOffs[lastI] 362 - // Ensure first appears before last to make distance logic below clean. 385 + 386 + // Ensure first appears before last as a helpful invariant. 363 387 if first.index > last.index { 364 388 last, first = first, last 365 389 } 366 - return first, last 390 + return 367 391 } 368 392 369 393 func (data *indexData) ngrams(filename bool) btreeIndex { ··· 412 436 // bucket (which can cause disk IO). 413 437 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 414 438 frequencies := make([]uint32, 0, len(ngramOffs)) 439 + indexMap := make([]int, len(ngramOffs)) 415 440 ngramLookups := 0 416 441 ngrams := d.ngrams(query.FileName) 417 - for _, o := range ngramOffs { 442 + for i, o := range ngramOffs { 418 443 var freq uint32 419 444 if query.CaseSensitive { 420 445 freq = ngrams.Get(o.ngram).sz ··· 438 463 } 439 464 440 465 frequencies = append(frequencies, freq) 466 + indexMap[o.index] = i 441 467 } 442 468 443 - // first and last are now the smallest trigram posting lists to iterate 444 - // through. 445 - first, last := minFrequencyNgramOffsets(ngramOffs, frequencies) 469 + first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 446 470 447 471 iter := &ngramDocIterator{ 448 - leftPad: first.index, 449 - rightPad: uint32(utf8.RuneCountInString(str)) - first.index, 472 + leftPad: uint32(first.index), 473 + rightPad: uint32(utf8.RuneCountInString(str) - first.index), 450 474 ngramLookups: ngramLookups, 451 475 } 452 476 if query.FileName { ··· 456 480 } 457 481 458 482 if first != last { 459 - runeDist := last.index - first.index 483 + runeDist := uint32(last.index - first.index) 460 484 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 461 485 if err != nil { 462 486 return nil, err

+28

indexdata_test.go

··· 72 72 t.Fatal(err) 73 73 } 74 74 } 75 + 76 + func TestFindSelectiveNGrams(t *testing.T) { 77 + if err := quick.Check(func(s string, maxFreq uint16) bool { 78 + ngramOffs := splitNGrams([]byte(s)) 79 + if len(ngramOffs) == 0 { 80 + return true 81 + } 82 + 83 + slices.SortFunc(ngramOffs, runeNgramOff.Compare) 84 + indexMap := make([]int, len(ngramOffs)) 85 + for i, n := range ngramOffs { 86 + indexMap[n.index] = i 87 + } 88 + 89 + frequencies := genFrequencies(ngramOffs, int(maxFreq)) 90 + x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 91 + 92 + if len(ngramOffs) <= 1 { 93 + return true 94 + } 95 + 96 + // Just assert the invariant that x0 is before x1. This test mostly checks 97 + // for out-of-bounds errors. 98 + return x0.index < x1.index 99 + }, nil); err != nil { 100 + t.Fatal(err) 101 + } 102 + }

Configure Feed

Configure Feed