fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

indexdata: read posting list iff all ng exist (#619)

The purpose of this commit is to reduce disk I/O when a shard is skipped
because of missing ngrams.

To achieve this, we first check whether ALL ngrams exist in the shard before
loading the posting lists to determine their frequency. This means we have to
loop over the ngrams twice, but in exchange we avoid loading any posting list
when the shard would have been skipped anyway.

Test plan: This is a refactor, so relying on CI

+101 -18
+57 -4
btree.go
··· 304 304 return 305 305 } 306 306 307 + // NgramIndexes returns the indexes of the ngrams in the index. We return a 308 + // slice of slices because we have to keep track of ngram variants in case of 309 + // case-insensitive search. 310 + func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { 311 + lookups := 0 312 + ngramIndexes := make([]int, 0, len(ngrams)) 313 + 314 + for _, ng := range ngrams { 315 + ix := b.ngramIndex(ng) 316 + lookups++ 317 + if ix == -1 { 318 + return nil, len(ngramIndexes) + 1 319 + } 320 + ngramIndexes = append(ngramIndexes, ix) 321 + } 322 + 323 + return ngramIndexes, len(ngramIndexes) 324 + } 325 + 326 + func (b btreeIndex) ngramIndex(ng ngram) int { 327 + if b.bt == nil { 328 + return -1 329 + } 330 + 331 + // find bucket 332 + bucketIndex, postingIndexOffset := b.bt.find(ng) 333 + 334 + // read bucket into memory 335 + off, sz := b.getBucket(bucketIndex) 336 + bucket, err := b.file.Read(off, sz) 337 + if err != nil { 338 + return -1 339 + } 340 + 341 + // find ngram in bucket 342 + getNGram := func(i int) ngram { 343 + i *= ngramEncoding 344 + return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 345 + } 346 + 347 + bucketSize := len(bucket) / ngramEncoding 348 + x := sort.Search(bucketSize, func(i int) bool { 349 + return ng <= getNGram(i) 350 + }) 351 + 352 + // return index of associated posting list 353 + if x >= bucketSize || getNGram(x) != ng { 354 + return -1 355 + } 356 + 357 + return postingIndexOffset + x 358 + } 359 + 307 360 // Get returns the simple section of the posting list associated with the 308 361 // ngram. The logic is as follows: 309 362 // 1. 
Search the inner nodes to find the bucket that may contain ng (in MEM) ··· 341 394 return simpleSection{} 342 395 } 343 396 344 - return b.getPostingList(postingIndexOffset + x) 397 + return b.GetPostingList(postingIndexOffset + x) 345 398 } 346 399 347 - // getPostingList returns the simple section pointing to the posting list of 400 + // GetPostingList returns the simple section pointing to the posting list of 348 401 // the ngram at ngramIndex. 349 402 // 350 403 // Assumming we don't hit a page boundary, which should be rare given that we 351 404 // only read 8 bytes, we need 1 disk access to read the posting offset. 352 - func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 405 + func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { 353 406 relativeOffsetBytes := uint32(ngramIndex) * 4 354 407 355 408 if relativeOffsetBytes+8 <= b.postingIndex.sz { ··· 422 475 // decode all ngrams in the bucket and fill map 423 476 for i := 0; i < len(bucket)/ngramEncoding; i++ { 424 477 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 425 - m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 478 + m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) 426 479 } 427 480 case *innerNode: 428 481 return
+44 -14
indexdata.go
··· 23 23 "math/bits" 24 24 "unicode/utf8" 25 25 26 + "golang.org/x/exp/slices" 27 + 26 28 "github.com/sourcegraph/zoekt/query" 27 - "golang.org/x/exp/slices" 28 29 ) 29 30 30 31 // indexData holds the pattern-independent data that we have to have ··· 413 414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 414 415 return a.ngram < b.ngram 415 416 }) 417 + 418 + index := d.ngrams(query.FileName) 416 419 frequencies := make([]uint32, 0, len(ngramOffs)) 417 420 ngramLookups := 0 418 - ngrams := d.ngrams(query.FileName) 419 - for _, o := range ngramOffs { 420 - var freq uint32 421 - if query.CaseSensitive { 422 - freq = ngrams.Get(o.ngram).sz 423 - ngramLookups++ 424 - } else { 425 - for _, v := range generateCaseNgrams(o.ngram) { 426 - freq += ngrams.Get(v).sz 427 - ngramLookups++ 428 - } 421 + if query.CaseSensitive { 422 + // Perf: Look up ngram indexes without loading posting lists. This way we can 423 + // stop early if a ngram does not exist. On the flip side we incur an additional 424 + // loop and more memory allocations. 
425 + 426 + ngrams := make([]ngram, 0, len(ngramOffs)) 427 + for _, ng := range ngramOffs { 428 + ngrams = append(ngrams, ng.ngram) 429 429 } 430 430 431 - if freq == 0 { 431 + var ngramIndexes []int 432 + ngramIndexes, ngramLookups = index.NgramIndexes(ngrams) 433 + if len(ngramIndexes) == 0 { 432 434 return &ngramIterationResults{ 433 435 matchIterator: &noMatchTree{ 434 436 Why: "freq=0", ··· 439 441 }, nil 440 442 } 441 443 442 - frequencies = append(frequencies, freq) 444 + for _, ngramIndex := range ngramIndexes { 445 + frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz) 446 + } 447 + } else { 448 + for _, o := range ngramOffs { 449 + var freq uint32 450 + if query.CaseSensitive { 451 + freq = index.Get(o.ngram).sz 452 + ngramLookups++ 453 + } else { 454 + for _, v := range generateCaseNgrams(o.ngram) { 455 + freq += index.Get(v).sz 456 + ngramLookups++ 457 + } 458 + } 459 + 460 + if freq == 0 { 461 + return &ngramIterationResults{ 462 + matchIterator: &noMatchTree{ 463 + Why: "freq=0", 464 + Stats: Stats{ 465 + NgramLookups: ngramLookups, 466 + }, 467 + }, 468 + }, nil 469 + } 470 + 471 + frequencies = append(frequencies, freq) 472 + } 443 473 } 444 474 445 475 // first and last are now the smallest trigram posting lists to iterate