indexdata: read posting list iff all ng exist (#619) · boltless.me/zoekt@b7e5070

+101 -18

2 changed files

Expand all

btree.go

indexdata.go

+57 -4

btree.go

··· 304 304 return 305 305 } 306 306 307 + // NgramIndexes returns the posting-list indexes of the given ngrams, together 308 + // with the number of ngram lookups performed. If any ngram is missing from the 309 + // index, it stops early and returns nil instead of the indexes. 310 + func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { 311 + lookups := 0 312 + ngramIndexes := make([]int, 0, len(ngrams)) 313 + 314 + for _, ng := range ngrams { 315 + ix := b.ngramIndex(ng) 316 + lookups++ 317 + if ix == -1 { 318 + return nil, len(ngramIndexes) + 1 319 + } 320 + ngramIndexes = append(ngramIndexes, ix) 321 + } 322 + 323 + return ngramIndexes, len(ngramIndexes) 324 + } 325 + 326 + func (b btreeIndex) ngramIndex(ng ngram) int { 327 + if b.bt == nil { 328 + return -1 329 + } 330 + 331 + // find bucket 332 + bucketIndex, postingIndexOffset := b.bt.find(ng) 333 + 334 + // read bucket into memory 335 + off, sz := b.getBucket(bucketIndex) 336 + bucket, err := b.file.Read(off, sz) 337 + if err != nil { 338 + return -1 339 + } 340 + 341 + // find ngram in bucket 342 + getNGram := func(i int) ngram { 343 + i *= ngramEncoding 344 + return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 345 + } 346 + 347 + bucketSize := len(bucket) / ngramEncoding 348 + x := sort.Search(bucketSize, func(i int) bool { 349 + return ng <= getNGram(i) 350 + }) 351 + 352 + // return index of associated posting list 353 + if x >= bucketSize || getNGram(x) != ng { 354 + return -1 355 + } 356 + 357 + return postingIndexOffset + x 358 + } 359 + 307 360 // Get returns the simple section of the posting list associated with the 308 361 // ngram. The logic is as follows: 309 362 // 1. 
Search the inner nodes to find the bucket that may contain ng (in MEM) ··· 341 394 return simpleSection{} 342 395 } 343 396 344 - return b.getPostingList(postingIndexOffset + x) 397 + return b.GetPostingList(postingIndexOffset + x) 345 398 } 346 399 347 - // getPostingList returns the simple section pointing to the posting list of 400 + // GetPostingList returns the simple section pointing to the posting list of 348 401 // the ngram at ngramIndex. 349 402 // 350 403 // Assuming we don't hit a page boundary, which should be rare given that we 351 404 // only read 8 bytes, we need 1 disk access to read the posting offset. 352 - func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 405 + func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { 353 406 relativeOffsetBytes := uint32(ngramIndex) * 4 354 407 355 408 if relativeOffsetBytes+8 <= b.postingIndex.sz { ··· 422 475 // decode all ngrams in the bucket and fill map 423 476 for i := 0; i < len(bucket)/ngramEncoding; i++ { 424 477 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 425 - m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 478 + m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) 426 479 } 427 480 case *innerNode: 428 481 return

+44 -14

indexdata.go

··· 23 23 "math/bits" 24 24 "unicode/utf8" 25 25 26 + "golang.org/x/exp/slices" 27 + 26 28 "github.com/sourcegraph/zoekt/query" 27 - "golang.org/x/exp/slices" 28 29 ) 29 30 30 31 // indexData holds the pattern-independent data that we have to have ··· 413 414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 414 415 return a.ngram < b.ngram 415 416 }) 417 + 418 + index := d.ngrams(query.FileName) 416 419 frequencies := make([]uint32, 0, len(ngramOffs)) 417 420 ngramLookups := 0 418 - ngrams := d.ngrams(query.FileName) 419 - for _, o := range ngramOffs { 420 - var freq uint32 421 - if query.CaseSensitive { 422 - freq = ngrams.Get(o.ngram).sz 423 - ngramLookups++ 424 - } else { 425 - for _, v := range generateCaseNgrams(o.ngram) { 426 - freq += ngrams.Get(v).sz 427 - ngramLookups++ 428 - } 421 + if query.CaseSensitive { 422 + // Perf: Look up ngram indexes without loading posting lists. This way we can 423 + // stop early if a ngram does not exist. On the flip side we incur an additional 424 + // loop and more memory allocations. 
425 + 426 + ngrams := make([]ngram, 0, len(ngramOffs)) 427 + for _, ng := range ngramOffs { 428 + ngrams = append(ngrams, ng.ngram) 429 429 } 430 430 431 - if freq == 0 { 431 + var ngramIndexes []int 432 + ngramIndexes, ngramLookups = index.NgramIndexes(ngrams) 433 + if len(ngramIndexes) == 0 { 432 434 return &ngramIterationResults{ 433 435 matchIterator: &noMatchTree{ 434 436 Why: "freq=0", ··· 439 441 }, nil 440 442 } 441 443 442 - frequencies = append(frequencies, freq) 444 + for _, ngramIndex := range ngramIndexes { 445 + frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz) 446 + } 447 + } else { 448 + for _, o := range ngramOffs { 449 + var freq uint32 450 + if query.CaseSensitive { 451 + freq = index.Get(o.ngram).sz 452 + ngramLookups++ 453 + } else { 454 + for _, v := range generateCaseNgrams(o.ngram) { 455 + freq += index.Get(v).sz 456 + ngramLookups++ 457 + } 458 + } 459 + 460 + if freq == 0 { 461 + return &ngramIterationResults{ 462 + matchIterator: &noMatchTree{ 463 + Why: "freq=0", 464 + Stats: Stats{ 465 + NgramLookups: ngramLookups, 466 + }, 467 + }, 468 + }, nil 469 + } 470 + 471 + frequencies = append(frequencies, freq) 472 + } 443 473 } 444 474 445 475 // first and last are now the smallest trigram posting lists to iterate

Configure Feed

Configure Feed