set b-tree as default (#540)

The use of a b-tree has proven successful. Webserver's heap for a production instance of Sourcegraph shrank by 45% without a noticeable impact on latency. Hence we make the b-tree the default. It can be disabled by setting ZOEKT_DISABLE_BTREE=true for webserver.

At the same time we remove binarySearchNgram, because it uses more disk accesses than the b-tree and keeps a reference to the posting offsets.

Once the b-tree has proven itself over a longer period of time, I will remove the alternative code path (combinedNgramOffset) and clean up the temporary structs and interfaces I created.

author

Stefan Hengl committer

GitHub date 3 years ago (Feb 21, 2023, 11:36 AM +0100) commit 00cd03c5 00cd03c58fcce5119cfeb27a2b4ece43300d0722 parent 58cf4748 58cf4748830ac0eded1517cc8c2454694c531fbd

+9 -88

3 changed files

Expand all

index_test.go

ngramoffset.go

read.go

+1 -1

index_test.go

··· 1775 1775 Stats: RepoStats{ 1776 1776 Shards: 1, 1777 1777 Documents: 4, 1778 - IndexBytes: 300, 1778 + IndexBytes: 412, 1779 1779 ContentBytes: 68, 1780 1780 NewLinesCount: 4, 1781 1781 DefaultBranchNewLinesCount: 2,

-60

ngramoffset.go

··· 15 15 package zoekt 16 16 17 17 import ( 18 - "encoding/binary" 19 18 "sort" 20 19 ) 21 20 ··· 400 399 } 401 400 return n.bt.SizeBytes() 402 401 } 403 - 404 - type binarySearchNgram struct { 405 - // ngramText is the bytes at indexTOC.ngramText 406 - // 407 - // It is a sorted ngramSlice marshalled as list of bigendian uint64s. 408 - ngramText []byte 409 - // postingIndex is the index section of the compoundSection for the posting 410 - // lists. 411 - // 412 - // It is a list of offsets in the a order corresponding with ngramText. It 413 - // is marshalled as a list of bigendian uint32s. 414 - postingOffsets []uint32 415 - // postingDataSentinelOffset is where postingData ends in the index file. 416 - // This is used to calculate the size of the last posting. 417 - postingDataSentinelOffset uint32 418 - } 419 - 420 - func (b binarySearchNgram) Get(gram ngram) (ss simpleSection) { 421 - getNGram := func(i int) ngram { 422 - i *= ngramEncoding 423 - return ngram(binary.BigEndian.Uint64(b.ngramText[i : i+ngramEncoding])) 424 - } 425 - 426 - size := len(b.ngramText) / ngramEncoding 427 - if size == 0 { 428 - return simpleSection{} 429 - } 430 - x := sort.Search(size, func(i int) bool { return gram <= getNGram(i) }) 431 - if x >= size || getNGram(x) != gram { 432 - return simpleSection{} 433 - } 434 - 435 - if x+1 < size { 436 - return simpleSection{ 437 - off: b.postingOffsets[x], 438 - sz: b.postingOffsets[x+1] - b.postingOffsets[x], 439 - } 440 - } else { 441 - return simpleSection{ 442 - off: b.postingOffsets[x], 443 - sz: b.postingDataSentinelOffset - b.postingOffsets[x], 444 - } 445 - } 446 - } 447 - 448 - func (b binarySearchNgram) DumpMap() map[ngram]simpleSection { 449 - ngramText := b.ngramText 450 - m := make(map[ngram]simpleSection, len(ngramText)/ngramEncoding) 451 - for len(ngramText) > 0 { 452 - gram := ngram(binary.BigEndian.Uint64(ngramText)) 453 - ngramText = ngramText[ngramEncoding:] 454 - m[gram] = b.Get(gram) 455 - } 456 - return m 457 - } 458 
- 459 - func (b binarySearchNgram) SizeBytes() int { 460 - return 0 // binarySearch only uses mmaped data. 461 - }

+8 -27

read.go

··· 288 288 return nil, err 289 289 } 290 290 291 - if os.Getenv("ZOEKT_ENABLE_BTREE") != "" { 292 - bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) 291 + if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 292 + offsetMap, err := d.readNgrams(toc) 293 293 if err != nil { 294 294 return nil, err 295 295 } 296 - d.ngrams = bt 297 - } else if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 298 - bsMap, err := d.readBinarySearchNgrams(toc) 299 - if err != nil { 300 - return nil, err 301 - } 302 - d.ngrams = bsMap 296 + d.ngrams = offsetMap 303 297 } else { 304 - offsetMap, err := d.readNgrams(toc) 298 + bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) 305 299 if err != nil { 306 300 return nil, err 307 301 } 308 - d.ngrams = offsetMap 302 + d.ngrams = bt 309 303 } 310 304 311 305 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) ··· 320 314 321 315 d.fileNameIndex = toc.fileNames.relativeIndex() 322 316 323 - if os.Getenv("ZOEKT_ENABLE_BTREE_NAME") != "" { 324 - d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 325 - } else { 317 + if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 326 318 d.fileNameNgrams.m, err = d.readFileNameNgrams(toc) 319 + } else { 320 + d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 327 321 } 328 322 if err != nil { 329 323 return nil, err ··· 484 478 } 485 479 486 480 return makeCombinedNgramOffset(ngrams, postingsIndex), nil 487 - } 488 - 489 - func (d *indexData) readBinarySearchNgrams(toc *indexTOC) (binarySearchNgram, error) { 490 - ngramText, err := d.readSectionBlob(toc.ngramText) 491 - if err != nil { 492 - return binarySearchNgram{}, err 493 - } 494 - 495 - return binarySearchNgram{ 496 - ngramText: ngramText, 497 - postingOffsets: toc.postings.offsets, 498 - postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz, 499 - }, nil 500 481 } 501 482 502 483 func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) 
(btreeIndex, error) {

Configure Feed

Configure Feed