Revert "indexdata: read posting list iff all ng exist (#619)" (#626) · boltless.me/zoekt@f9b3ea5

+18 -101

2 changed files

Expand all

btree.go

indexdata.go

+4 -57

btree.go

··· 304 304 return 305 305 } 306 306 307 - // NgramIndexes returns the indexes of the ngrams in the index. We return a 308 - // slice of slices because we have to keep track of ngram variants in case of 309 - // case-insensitive search. 310 - func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { 311 - lookups := 0 312 - ngramIndexes := make([]int, 0, len(ngrams)) 313 - 314 - for _, ng := range ngrams { 315 - ix := b.ngramIndex(ng) 316 - lookups++ 317 - if ix == -1 { 318 - return nil, len(ngramIndexes) + 1 319 - } 320 - ngramIndexes = append(ngramIndexes, ix) 321 - } 322 - 323 - return ngramIndexes, len(ngramIndexes) 324 - } 325 - 326 - func (b btreeIndex) ngramIndex(ng ngram) int { 327 - if b.bt == nil { 328 - return -1 329 - } 330 - 331 - // find bucket 332 - bucketIndex, postingIndexOffset := b.bt.find(ng) 333 - 334 - // read bucket into memory 335 - off, sz := b.getBucket(bucketIndex) 336 - bucket, err := b.file.Read(off, sz) 337 - if err != nil { 338 - return -1 339 - } 340 - 341 - // find ngram in bucket 342 - getNGram := func(i int) ngram { 343 - i *= ngramEncoding 344 - return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 345 - } 346 - 347 - bucketSize := len(bucket) / ngramEncoding 348 - x := sort.Search(bucketSize, func(i int) bool { 349 - return ng <= getNGram(i) 350 - }) 351 - 352 - // return index of associated posting list 353 - if x >= bucketSize || getNGram(x) != ng { 354 - return -1 355 - } 356 - 357 - return postingIndexOffset + x 358 - } 359 - 360 307 // Get returns the simple section of the posting list associated with the 361 308 // ngram. The logic is as follows: 362 309 // 1. 
Search the inner nodes to find the bucket that may contain ng (in MEM) ··· 394 341 return simpleSection{} 395 342 } 396 343 397 - return b.GetPostingList(postingIndexOffset + x) 344 + return b.getPostingList(postingIndexOffset + x) 398 345 } 399 346 400 - // GetPostingList returns the simple section pointing to the posting list of 347 + // getPostingList returns the simple section pointing to the posting list of 401 348 // the ngram at ngramIndex. 402 349 // 403 350 // Assuming we don't hit a page boundary, which should be rare given that we 404 351 // only read 8 bytes, we need 1 disk access to read the posting offset. 405 - func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { 352 + func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 406 353 relativeOffsetBytes := uint32(ngramIndex) * 4 407 354 408 355 if relativeOffsetBytes+8 <= b.postingIndex.sz { ··· 475 422 // decode all ngrams in the bucket and fill map 476 423 for i := 0; i < len(bucket)/ngramEncoding; i++ { 477 424 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 478 - m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) 425 + m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 479 426 } 480 427 case *innerNode: 481 428 return

+14 -44

indexdata.go

··· 23 23 "math/bits" 24 24 "unicode/utf8" 25 25 26 + "github.com/sourcegraph/zoekt/query" 26 27 "golang.org/x/exp/slices" 27 - 28 - "github.com/sourcegraph/zoekt/query" 29 28 ) 30 29 31 30 // indexData holds the pattern-independent data that we have to have ··· 414 413 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 415 414 return a.ngram < b.ngram 416 415 }) 417 - 418 - index := d.ngrams(query.FileName) 419 416 frequencies := make([]uint32, 0, len(ngramOffs)) 420 417 ngramLookups := 0 421 - if query.CaseSensitive { 422 - // Perf: Look up ngram indexes without loading posting lists. This way we can 423 - // stop early if a ngram does not exist. On the flip side we incur an additional 424 - // loop and more memory allocations. 425 - 426 - ngrams := make([]ngram, 0, len(ngramOffs)) 427 - for _, ng := range ngramOffs { 428 - ngrams = append(ngrams, ng.ngram) 418 + ngrams := d.ngrams(query.FileName) 419 + for _, o := range ngramOffs { 420 + var freq uint32 421 + if query.CaseSensitive { 422 + freq = ngrams.Get(o.ngram).sz 423 + ngramLookups++ 424 + } else { 425 + for _, v := range generateCaseNgrams(o.ngram) { 426 + freq += ngrams.Get(v).sz 427 + ngramLookups++ 428 + } 429 429 } 430 430 431 - var ngramIndexes []int 432 - ngramIndexes, ngramLookups = index.NgramIndexes(ngrams) 433 - if len(ngramIndexes) == 0 { 431 + if freq == 0 { 434 432 return &ngramIterationResults{ 435 433 matchIterator: &noMatchTree{ 436 434 Why: "freq=0", ··· 441 439 }, nil 442 440 } 443 441 444 - for _, ngramIndex := range ngramIndexes { 445 - frequencies = append(frequencies, index.GetPostingList(ngramIndex).sz) 446 - } 447 - } else { 448 - for _, o := range ngramOffs { 449 - var freq uint32 450 - if query.CaseSensitive { 451 - freq = index.Get(o.ngram).sz 452 - ngramLookups++ 453 - } else { 454 - for _, v := range generateCaseNgrams(o.ngram) { 455 - freq += index.Get(v).sz 456 - ngramLookups++ 457 - } 458 - } 459 - 460 - if freq == 0 { 461 - return &ngramIterationResults{ 462 - 
matchIterator: &noMatchTree{ 463 - Why: "freq=0", 464 - Stats: Stats{ 465 - NgramLookups: ngramLookups, 466 - }, 467 - }, 468 - }, nil 469 - } 470 - 471 - frequencies = append(frequencies, freq) 472 - } 442 + frequencies = append(frequencies, freq) 473 443 } 474 444 475 445 // first and last are now the smallest trigram posting lists to iterate

Configure Feed

Configure Feed