fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

indexData: experimental ngram map via binary search (#365)

This is an experiment where, instead of creating an in-memory
representation of the ngram map, we lazily read and decode the data in
the search request path.

This relies on the fact that we marshal the ngrams to disk in sorted
order, so we can binary search them. This requires no changes to how we
marshal, so it can be done without re-indexing.

I expect this to be slower, but I am not sure how much slower. But it
will give us good evidence that investing in a better on-disk map (i.e. a
B-tree or a perfect hash map) would be worthwhile.

It is gated behind the environment variable ZOEKT_ENABLE_NGRAM_BS. I
intend to turn it on in a dogfood cluster to see experimental impact. If
that goes well I'll roll it out to a fraction of our production cluster.

Test Plan: ZOEKT_ENABLE_NGRAM_BS=1 go test ./... and some manual testing
with and without it turned on.

+117 -4
+4
index_test.go
··· 1124 1124 }, 1125 1125 } 1126 1126 1127 + if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 1128 + want.Stats.IndexBytes = 228 1129 + } 1130 + 1127 1131 if diff := cmp.Diff(want, res); diff != "" { 1128 1132 t.Fatalf("mismatch (-want +got):\n%s", diff) 1129 1133 }
+1 -1
indexdata.go
··· 33 33 34 34 file IndexFile 35 35 36 - ngrams combinedNgramOffset 36 + ngrams ngramMap 37 37 38 38 newlinesStart uint32 39 39 newlinesIndex []uint32
+87
ngramoffset.go
··· 15 15 package zoekt 16 16 17 17 import ( 18 + "encoding/binary" 18 19 "sort" 19 20 ) 20 21 ··· 356 357 func (a *asciiNgramOffset) SizeBytes() int { 357 358 return 4*len(a.entries) + 4*len(a.chunkOffsets) 358 359 } 360 + 361 + // ngramMap is an transient type while we investigate the performance of 362 + // combinedNgramOffset (established) vs binarySearch (new). 363 + // 364 + // It is like an interface, but we do the drudgery so we still get a useful 365 + // zero value (instead of nil panics in tests). 366 + type ngramMap struct { 367 + offsetMap combinedNgramOffset 368 + bsMap binarySearchNgram 369 + } 370 + 371 + func (m ngramMap) Get(gram ngram) simpleSection { 372 + if m.offsetMap.asc != nil { 373 + return m.offsetMap.Get(gram) 374 + } 375 + return m.bsMap.Get(gram) 376 + } 377 + 378 + func (m ngramMap) DumpMap() map[ngram]simpleSection { 379 + if m.offsetMap.asc != nil { 380 + return m.offsetMap.DumpMap() 381 + } 382 + return m.bsMap.DumpMap() 383 + } 384 + 385 + func (m ngramMap) SizeBytes() int { 386 + if m.offsetMap.asc != nil { 387 + return m.offsetMap.SizeBytes() 388 + } 389 + return 0 // binarySearch only uses mmaped data. 390 + } 391 + 392 + type binarySearchNgram struct { 393 + // ngramText is the bytes at indexTOC.ngramText 394 + // 395 + // It is a sorted ngramSlice marshalled as list of bigendian uint64s. 396 + ngramText []byte 397 + // postingIndex is the index section of the compoundSection for the posting 398 + // lists. 399 + // 400 + // It is a list of offsets in the a order corresponding with ngramText. It 401 + // is marshalled as a list of bigendian uint32s. 402 + postingOffsets []uint32 403 + // postingDataSentinelOffset is where postingData ends in the index file. 404 + // This is used to calculate the size of the last posting. 
405 + postingDataSentinelOffset uint32 406 + } 407 + 408 + func (b binarySearchNgram) Get(gram ngram) (ss simpleSection) { 409 + getNGram := func(i int) ngram { 410 + i *= ngramEncoding 411 + return ngram(binary.BigEndian.Uint64(b.ngramText[i : i+ngramEncoding])) 412 + } 413 + 414 + size := len(b.ngramText) / ngramEncoding 415 + if size == 0 { 416 + return simpleSection{} 417 + } 418 + x := sort.Search(size, func(i int) bool { return gram <= getNGram(i) }) 419 + if x >= size || getNGram(x) != gram { 420 + return simpleSection{} 421 + } 422 + 423 + if x+1 < size { 424 + return simpleSection{ 425 + off: b.postingOffsets[x], 426 + sz: b.postingOffsets[x+1] - b.postingOffsets[x], 427 + } 428 + } else { 429 + return simpleSection{ 430 + off: b.postingOffsets[x], 431 + sz: b.postingDataSentinelOffset - b.postingOffsets[x], 432 + } 433 + } 434 + } 435 + 436 + func (b binarySearchNgram) DumpMap() map[ngram]simpleSection { 437 + ngramText := b.ngramText 438 + m := make(map[ngram]simpleSection, len(ngramText)/ngramEncoding) 439 + for len(ngramText) > 0 { 440 + gram := ngram(binary.BigEndian.Uint64(ngramText)) 441 + ngramText = ngramText[ngramEncoding:] 442 + m[gram] = b.Get(gram) 443 + } 444 + return m 445 + }
+25 -3
read.go
··· 289 289 return nil, err 290 290 } 291 291 292 - d.ngrams, err = d.readNgrams(toc) 293 - if err != nil { 294 - return nil, err 292 + if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 293 + bsMap, err := d.readBinarySearchNgrams(toc) 294 + if err != nil { 295 + return nil, err 296 + } 297 + d.ngrams = ngramMap{bsMap: bsMap} 298 + } else { 299 + offsetMap, err := d.readNgrams(toc) 300 + if err != nil { 301 + return nil, err 302 + } 303 + d.ngrams = ngramMap{offsetMap: offsetMap} 295 304 } 296 305 297 306 if os.Getenv("ZOEKT_ENABLE_BLOOM") != "" { ··· 469 478 } 470 479 471 480 return makeCombinedNgramOffset(ngrams, postingsIndex), nil 481 + } 482 + 483 + func (d *indexData) readBinarySearchNgrams(toc *indexTOC) (binarySearchNgram, error) { 484 + ngramText, err := d.readSectionBlob(toc.ngramText) 485 + if err != nil { 486 + return binarySearchNgram{}, err 487 + } 488 + 489 + return binarySearchNgram{ 490 + ngramText: ngramText, 491 + postingOffsets: toc.postings.offsets, 492 + postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz, 493 + }, nil 472 494 } 473 495 474 496 func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {