indexData: experimental ngram map via binary search (#365)

This is an experiment where, instead of creating an in-memory
representation of the ngram map, we lazily read and decode the
data in the search request path.

This uses the fact that we marshal the ngrams to disk in sorted
order, so we can binary search them. This requires no changes to how we
marshal, so it can be done without re-indexing.

I expect this to be slower, but I am not sure how much slower. Either
way, it will give us good evidence as to whether investing in a better
on-disk map (e.g. a B-tree or a perfect hash map) would be worthwhile.

It is gated behind the environment variable ZOEKT_ENABLE_NGRAM_BS. I
intend to turn it on in a dogfood cluster to see the experimental impact. If
that goes well, I'll roll it out to a fraction of our production cluster.

Test Plan: ZOEKT_ENABLE_NGRAM_BS=1 go test ./... and some manual testing
with and without it turned on.

author

Keegan Carruthers-Smith committer

GitHub date 4 years ago (Jun 13, 2022, 12:02 PM +0200) commit b61c934e b61c934e5050d8900ecd669a25ff927bd031d2d5 parent aacaf0c9 aacaf0c9e2e435ccacdb6e7d052558e3c54a593c

+117 -4

4 changed files

Expand all

index_test.go

indexdata.go

ngramoffset.go

read.go

index_test.go

··· 1124 1124 }, 1125 1125 } 1126 1126 1127 + if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 1128 + want.Stats.IndexBytes = 228 1129 + } 1130 + 1127 1131 if diff := cmp.Diff(want, res); diff != "" { 1128 1132 t.Fatalf("mismatch (-want +got):\n%s", diff) 1129 1133 }

+1 -1

indexdata.go

··· 33 33 34 34 file IndexFile 35 35 36 - ngrams combinedNgramOffset 36 + ngrams ngramMap 37 37 38 38 newlinesStart uint32 39 39 newlinesIndex []uint32

+87

ngramoffset.go

··· 15 15 package zoekt 16 16 17 17 import ( 18 + "encoding/binary" 18 19 "sort" 19 20 ) 20 21 ··· 356 357 func (a *asciiNgramOffset) SizeBytes() int { 357 358 return 4*len(a.entries) + 4*len(a.chunkOffsets) 358 359 } 360 + 361 + // ngramMap is a transient type while we investigate the performance of 362 + // combinedNgramOffset (established) vs binarySearch (new). 363 + // 364 + // It is like an interface, but we do the drudgery so we still get a useful 365 + // zero value (instead of nil panics in tests). 366 + type ngramMap struct { 367 + offsetMap combinedNgramOffset 368 + bsMap binarySearchNgram 369 + } 370 + 371 + func (m ngramMap) Get(gram ngram) simpleSection { 372 + if m.offsetMap.asc != nil { 373 + return m.offsetMap.Get(gram) 374 + } 375 + return m.bsMap.Get(gram) 376 + } 377 + 378 + func (m ngramMap) DumpMap() map[ngram]simpleSection { 379 + if m.offsetMap.asc != nil { 380 + return m.offsetMap.DumpMap() 381 + } 382 + return m.bsMap.DumpMap() 383 + } 384 + 385 + func (m ngramMap) SizeBytes() int { 386 + if m.offsetMap.asc != nil { 387 + return m.offsetMap.SizeBytes() 388 + } 389 + return 0 // binarySearch only uses mmaped data. 390 + } 391 + 392 + type binarySearchNgram struct { 393 + // ngramText is the bytes at indexTOC.ngramText 394 + // 395 + // It is a sorted ngramSlice marshalled as a list of bigendian uint64s. 396 + ngramText []byte 397 + // postingOffsets is the index section of the compoundSection for the posting 398 + // lists. 399 + // 400 + // It is a list of offsets in the order corresponding to ngramText. It 401 + // is marshalled as a list of bigendian uint32s. 402 + postingOffsets []uint32 403 + // postingDataSentinelOffset is where postingData ends in the index file. 404 + // This is used to calculate the size of the last posting. 
405 + postingDataSentinelOffset uint32 406 + } 407 + 408 + func (b binarySearchNgram) Get(gram ngram) (ss simpleSection) { 409 + getNGram := func(i int) ngram { 410 + i *= ngramEncoding 411 + return ngram(binary.BigEndian.Uint64(b.ngramText[i : i+ngramEncoding])) 412 + } 413 + 414 + size := len(b.ngramText) / ngramEncoding 415 + if size == 0 { 416 + return simpleSection{} 417 + } 418 + x := sort.Search(size, func(i int) bool { return gram <= getNGram(i) }) 419 + if x >= size || getNGram(x) != gram { 420 + return simpleSection{} 421 + } 422 + 423 + if x+1 < size { 424 + return simpleSection{ 425 + off: b.postingOffsets[x], 426 + sz: b.postingOffsets[x+1] - b.postingOffsets[x], 427 + } 428 + } else { 429 + return simpleSection{ 430 + off: b.postingOffsets[x], 431 + sz: b.postingDataSentinelOffset - b.postingOffsets[x], 432 + } 433 + } 434 + } 435 + 436 + func (b binarySearchNgram) DumpMap() map[ngram]simpleSection { 437 + ngramText := b.ngramText 438 + m := make(map[ngram]simpleSection, len(ngramText)/ngramEncoding) 439 + for len(ngramText) > 0 { 440 + gram := ngram(binary.BigEndian.Uint64(ngramText)) 441 + ngramText = ngramText[ngramEncoding:] 442 + m[gram] = b.Get(gram) 443 + } 444 + return m 445 + }

+25 -3

read.go

··· 289 289 return nil, err 290 290 } 291 291 292 - d.ngrams, err = d.readNgrams(toc) 293 - if err != nil { 294 - return nil, err 292 + if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 293 + bsMap, err := d.readBinarySearchNgrams(toc) 294 + if err != nil { 295 + return nil, err 296 + } 297 + d.ngrams = ngramMap{bsMap: bsMap} 298 + } else { 299 + offsetMap, err := d.readNgrams(toc) 300 + if err != nil { 301 + return nil, err 302 + } 303 + d.ngrams = ngramMap{offsetMap: offsetMap} 295 304 } 296 305 297 306 if os.Getenv("ZOEKT_ENABLE_BLOOM") != "" { ··· 469 478 } 470 479 471 480 return makeCombinedNgramOffset(ngrams, postingsIndex), nil 481 + } 482 + 483 + func (d *indexData) readBinarySearchNgrams(toc *indexTOC) (binarySearchNgram, error) { 484 + ngramText, err := d.readSectionBlob(toc.ngramText) 485 + if err != nil { 486 + return binarySearchNgram{}, err 487 + } 488 + 489 + return binarySearchNgram{ 490 + ngramText: ngramText, 491 + postingOffsets: toc.postings.offsets, 492 + postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz, 493 + }, nil 472 494 } 473 495 474 496 func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {

Configure Feed

Configure Feed