zoekt: calculate bucket offsets lazily (#528) · boltless.me/zoekt@2845327

+62 -10

btree.go

··· 49 49 type btree struct { 50 50 root node 51 51 opts btreeOpts 52 + 53 + lastBucketIndex int 52 54 } 53 55 54 56 type btreeOpts struct { ··· 61 63 } 62 64 63 65 func newBtree(opts btreeOpts) *btree { 64 - return &btree{&leaf{}, opts} 66 + return &btree{ 67 + root: &leaf{}, 68 + opts: opts, 69 + } 65 70 } 66 71 72 + // insert inserts ng into bt. 73 + // 74 + // Note: when all inserts are done, freeze must be called. 67 75 func (bt *btree) insert(ng ngram) { 68 76 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 69 77 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} ··· 85 93 86 94 func (bt *btree) visit(f func(n node)) { 87 95 bt.root.visit(f) 96 + } 97 + 98 + // freeze must be called once we are done inserting. It backfills "pointers" to 99 + // the buckets and posting lists. 100 + func (bt *btree) freeze() { 101 + // Note: Instead of backfilling we could maintain state during insertion, 102 + // however the visitor pattern seems more natural and shouldn't be a 103 + // performance issue, because, based on the typical number of trigrams 104 + // (500k) per shard, the b-trees we construct here only have around 1000 105 + // leaf nodes. 106 + offset, bucketIndex := 0, 0 107 + bt.visit(func(no node) { 108 + switch n := no.(type) { 109 + case *leaf: 110 + n.bucketIndex = bucketIndex 111 + bucketIndex++ 112 + 113 + n.postingIndexOffset = offset 114 + offset += n.bucketSize 115 + case *innerNode: 116 + return 117 + } 118 + }) 119 + 120 + bt.lastBucketIndex = bucketIndex - 1 88 121 } 89 122 90 123 func (bt *btree) sizeBytes() int { ··· 250 283 // We need the index to read buckets into memory. 
251 284 file IndexFile 252 285 253 - bucketOffsets []uint32 286 + // buckets 287 + ngramSec simpleSection 254 288 255 289 postingOffsets []uint32 256 290 postingDataSentinelOffset uint32 257 291 } 258 292 259 - func (b btreeIndex) SizeBytes() int { 260 - return b.bt.sizeBytes() + 2*int(sliceHeaderBytes) + 4*len(b.bucketOffsets) + 4*len(b.postingOffsets) 293 + func (b btreeIndex) SizeBytes() (sz int) { 294 + // btree 295 + sz += int(pointerSize) + b.bt.sizeBytes() 296 + // ngramSec 297 + sz += 8 298 + // postingOffsets 299 + sz += int(sliceHeaderBytes) + 4*len(b.postingOffsets) 300 + // postingDataSentinelOffset 301 + sz += 4 302 + return 261 303 } 262 304 263 305 // Get returns the simple section of the posting list associated with the ··· 271 313 bucketIndex, postingIndexOffset := b.bt.find(ng) 272 314 273 315 // read bucket into memory 274 - off := b.bucketOffsets[bucketIndex] 275 - sz := b.bucketOffsets[bucketIndex+1] - off 276 - 316 + off, sz := b.getBucket(bucketIndex) 277 317 bucket, err := b.file.Read(off, sz) 278 318 if err != nil { 279 319 return simpleSection{} ··· 298 338 return b.getPostingList(postingIndexOffset + x) 299 339 } 300 340 341 + func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 342 + // All but the rightmost bucket have exactly bucketSize/2 ngrams 343 + sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) 344 + off = b.ngramSec.off + uint32(bucketIndex)*sz 345 + 346 + // Rightmost bucket has size up to the end of the ngramSec. 
347 + if bucketIndex == b.bt.lastBucketIndex { 348 + sz = b.ngramSec.off + b.ngramSec.sz - off 349 + } 350 + 351 + return 352 + } 353 + 301 354 func (b btreeIndex) DumpMap() map[ngram]simpleSection { 302 - m := make(map[ngram]simpleSection, len(b.bucketOffsets)*b.bt.opts.bucketSize) 355 + m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 303 356 304 357 b.bt.visit(func(no node) { 305 358 switch n := no.(type) { 306 359 case *leaf: 307 360 // read bucket into memory 308 - off := b.bucketOffsets[n.bucketIndex] 309 - sz := b.bucketOffsets[n.bucketIndex+1] - off 361 + off, sz := b.getBucket(n.bucketIndex) 310 362 bucket, _ := b.file.Read(off, sz) 311 363 312 364 // decode all ngrams in the bucket and fill map

+61 -51

btree_test.go

··· 3 3 import ( 4 4 "fmt" 5 5 "testing" 6 - 7 - "github.com/google/go-cmp/cmp" 8 6 ) 9 7 10 8 func TestBTree_sorted(t *testing.T) { ··· 66 64 } 67 65 } 68 66 69 - func TestCreateBucketsFromNgramText(t *testing.T) { 70 - var ngramTextOff uint32 = 7 71 - 72 - offset := func(i int) uint32 { 73 - return ngramTextOff + uint32((i * ngramEncoding)) 74 - } 67 + func TestGetBucket(t *testing.T) { 68 + var off uint32 = 13 69 + bucketSize := 4 75 70 76 71 cases := []struct { 77 - opts btreeOpts 78 - ngrams []ngram 79 - wantOffsets []uint32 72 + nNgrams int 73 + bucketIndex int 74 + wantOff uint32 75 + wantSz uint32 80 76 }{ 81 - { 82 - opts: btreeOpts{v: 2, bucketSize: 4}, 83 - ngrams: []ngram{}, 84 - wantOffsets: []uint32{offset(0), offset(0)}, 85 - }, 86 - { 87 - opts: btreeOpts{v: 2, bucketSize: 4}, 88 - ngrams: []ngram{1}, 89 - wantOffsets: []uint32{offset(0), offset(1)}, 90 - }, 77 + // tiny B-tree with just 1 bucket. 91 78 { 92 - opts: btreeOpts{v: 2, bucketSize: 4}, 93 - ngrams: []ngram{1, 2}, 94 - wantOffsets: []uint32{offset(0), offset(2)}, 79 + nNgrams: 1, 80 + bucketIndex: 0, 81 + wantOff: off, 82 + wantSz: 8, 95 83 }, 96 84 { 97 - opts: btreeOpts{v: 2, bucketSize: 4}, 98 - ngrams: []ngram{1, 2, 3}, 99 - wantOffsets: []uint32{offset(0), offset(3)}, 85 + nNgrams: 2, 86 + bucketIndex: 0, 87 + wantOff: off, 88 + wantSz: 16, 100 89 }, 101 90 { 102 - opts: btreeOpts{v: 2, bucketSize: 4}, 103 - ngrams: []ngram{1, 2, 3, 4}, 104 - wantOffsets: []uint32{offset(0), offset(4)}, 91 + nNgrams: 3, 92 + bucketIndex: 0, 93 + wantOff: off, 94 + wantSz: 24, 105 95 }, 96 + // B-tree with 10 ngrams, think 1,2,3,4,5,6,7,8,9,10 106 97 { 107 - opts: btreeOpts{v: 2, bucketSize: 4}, 108 - ngrams: []ngram{1, 2, 3, 4, 5}, 109 - wantOffsets: []uint32{offset(0), offset(2), offset(5)}, 98 + nNgrams: 10, 99 + bucketIndex: 0, 100 + wantOff: off, 101 + wantSz: 16, 110 102 }, 111 103 { 112 - opts: btreeOpts{v: 2, bucketSize: 4}, 113 - ngrams: []ngram{1, 2, 3, 4, 5, 6}, 114 - wantOffsets: 
[]uint32{offset(0), offset(2), offset(6)}, 104 + nNgrams: 10, 105 + bucketIndex: 1, 106 + wantOff: off + 16, 107 + wantSz: 16, 115 108 }, 116 109 { 117 - opts: btreeOpts{v: 2, bucketSize: 4}, 118 - ngrams: []ngram{1, 2, 3, 4, 5, 6, 7}, 119 - wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(7)}, 110 + nNgrams: 10, 111 + bucketIndex: 2, 112 + wantOff: off + 32, 113 + wantSz: 16, 120 114 }, 121 115 { 122 - opts: btreeOpts{v: 2, bucketSize: 4}, 123 - ngrams: []ngram{1, 2, 3, 4, 5, 6, 7, 8}, 124 - wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(8)}, 116 + nNgrams: 10, 117 + bucketIndex: 3, 118 + wantOff: off + 48, 119 + wantSz: 32, 125 120 }, 126 121 { 127 - opts: btreeOpts{v: 2, bucketSize: 4}, 128 - ngrams: []ngram{1, 2, 3, 4, 5, 6, 7, 8, 9}, 129 - wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(6), offset(9)}, 122 + nNgrams: 9, 123 + bucketIndex: 3, 124 + wantOff: off + 48, 125 + wantSz: 24, 130 126 }, 131 127 } 132 128 133 129 for _, tt := range cases { 134 130 t.Run("", func(t *testing.T) { 135 - toc := &indexTOC{} 136 - toc.ngramText.sz = uint32(len(tt.ngrams) * ngramEncoding) 137 - toc.ngramText.off = ngramTextOff 138 - haveOffsets := createBucketOffsets(toc.ngramText, tt.opts.bucketSize) 131 + bi := btreeIndex{ 132 + ngramSec: simpleSection{off: off, sz: uint32(tt.nNgrams * ngramEncoding)}, 133 + } 139 134 140 - if d := cmp.Diff(tt.wantOffsets, haveOffsets); d != "" { 141 - t.Fatalf("-want,+got\n%s", d) 135 + bt := newBtree(btreeOpts{ 136 + bucketSize: bucketSize, 137 + v: 2, 138 + }) 139 + for i := 0; i < tt.nNgrams; i++ { 140 + bt.insert(ngram(i + 1)) 141 + } 142 + bt.freeze() 143 + 144 + bi.bt = bt 145 + 146 + off, sz := bi.getBucket(tt.bucketIndex) 147 + if off != tt.wantOff { 148 + t.Fatalf("off: want %d, got %d", tt.wantOff, off) 149 + } 150 + if sz != tt.wantSz { 151 + t.Fatalf("sz: want %d, got %d", tt.wantSz, sz) 142 152 } 143 153 }) 144 154 }

+2 -37

read.go

··· 512 512 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 513 513 bt.insert(ng) 514 514 } 515 - 516 - // backfill "pointers" to the buckets and posting lists. Instead of 517 - // backfilling we could maintain state during insertion, however the 518 - // visitor pattern seems more natural and shouldn't be a performance issue, 519 - // because, based on the typical number of trigrams (500k) per shard, the 520 - // b-trees we construct here only have around 1000 leaf nodes. 521 - offset, bucketIndex := 0, 0 522 - bt.visit(func(no node) { 523 - switch n := no.(type) { 524 - case *leaf: 525 - n.bucketIndex = bucketIndex 526 - bucketIndex++ 527 - 528 - n.postingIndexOffset = offset 529 - offset += n.bucketSize 530 - case *innerNode: 531 - return 532 - } 533 - }) 515 + bt.freeze() 534 516 535 517 bi.bt = bt 536 518 537 - bi.bucketOffsets = createBucketOffsets(toc.ngramText, btreeBucketSize) 519 + bi.ngramSec = toc.ngramText 538 520 539 521 bi.postingOffsets = toc.postings.offsets 540 522 bi.postingDataSentinelOffset = toc.postings.data.off + toc.postings.data.sz 541 523 542 524 return bi, nil 543 - } 544 - 545 - // Because we insert ngrams into the btree in order, we can easily reconstruct 546 - // the buckets from the ngramText simpleSection just by knowing the bucketSize. 547 - // The last item of the returned slice is the sentinel value sec.off + sec.sz. 548 - func createBucketOffsets(sec simpleSection, bucketSize int) []uint32 { 549 - step := uint32((bucketSize / 2) * ngramEncoding) 550 - 551 - offsets := make([]uint32, 0, ((sec.sz-1)/step)+1) 552 - offsets = append(offsets, sec.off) 553 - 554 - sentinel := sec.off + sec.sz 555 - for off := sec.off + step; off+step < sentinel; off = off + step { 556 - offsets = append(offsets, off) 557 - } 558 - offsets = append(offsets, sentinel) 559 - return offsets 560 525 } 561 526 562 527 func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {

Configure Feed

Configure Feed