webserver: use B+-tree to map ngrams to posting lists (#522) · boltless.me/zoekt@0c03110

+1

api.go

··· 31 31 const sliceHeaderBytes uint64 = 24 32 32 const stringHeaderBytes uint64 = 16 33 33 const pointerSize uint64 = 8 34 + const interfaceBytes uint64 = 16 34 35 35 36 // FileMatch contains all the matches within a file. 36 37 type FileMatch struct {

+337

btree.go

··· 1 + // B+-tree 2 + // 3 + // The tree we implement here is a B+-tree based on a paper by Ceylan and 4 + // Mihalcea [1]. 5 + // 6 + // B+-trees store all values in the leaves. In our case we store trigrams with 7 + // the goal to quickly retrieve a pointer to the posting list for a given 8 + // trigram. We choose the number of trigrams to store at each leaf based on the 9 + // page size, i.e. we make sure we are able to load a bucket of ngrams with a 10 + // single disk access. 11 + // 12 + // Here is an example of what our B+-tree looks like for a simple input: 13 + // 14 + // input: "hello world", bucketSize=2, v=2 15 + // 16 + // legend: ()=inner node, []=leaf 17 + // 18 + // (level 1) (hel, lo_) 19 + // 20 + // (level 2) (ell) (llo) (o_w, orl, rld) 21 + // 22 + // (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor] 23 + // 24 + // The leaves are stored as part of the index on disk (mmapped) while all inner 25 + // nodes are loaded into memory when we load the shard. 26 + // 27 + // [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram 28 + // Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages 29 + // 103-108 30 + 31 + package zoekt 32 + 33 + import ( 34 + "encoding/binary" 35 + "fmt" 36 + "sort" 37 + ) 38 + 39 + // btreeBucketSize should be chosen such that in the final tree the buckets are 40 + // as close to the page size as possible, but not above. We insert ngrams in 41 + // order(!), which means after a split of a leaf, the left leaf is not affected 42 + // by further inserts and its size is fixed to bucketSize/2. The rightmost leaf 43 + // might store up to btreeBucketSize ngrams, but the expected size is 44 + // btreeBucketSize/2, too. 45 + // 46 + // On Linux "getconf PAGESIZE" returns the number of bytes in a memory page. 
47 + const btreeBucketSize = (4096 * 2) / ngramEncoding 48 + 49 + type btree struct { 50 + root node 51 + opts btreeOpts 52 + } 53 + 54 + type btreeOpts struct { 55 + // How many ngrams can be stored at a leaf node. 56 + bucketSize int 57 + // all inner nodes, except root, have [v, 2v] children. In the literature, 58 + // b-trees are inconsistently categorized either by the number of children 59 + // or by the number of keys. We choose the former. 60 + v int 61 + } 62 + 63 + func newBtree(opts btreeOpts) *btree { 64 + return &btree{&leaf{}, opts} 65 + } 66 + 67 + func (bt *btree) insert(ng ngram) { 68 + if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 69 + bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} 70 + } 71 + bt.root.insert(ng, bt.opts) 72 + } 73 + 74 + // find returns the tuple (bucketIndex, postingIndexOffset), both of which are 75 + // stored at the leaf level. They are effectively pointers to the bucket and 76 + // the posting lists for ngrams stored in the bucket. Since ngrams and their 77 + // posting lists are stored in order, knowing the index of the posting list of 78 + // the first item in the bucket is sufficient. 
79 + func (bt *btree) find(ng ngram) (int, int) { 80 + if bt.root == nil { 81 + return -1, -1 82 + } 83 + return bt.root.find(ng) 84 + } 85 + 86 + func (bt *btree) visit(f func(n node)) { 87 + bt.root.visit(f) 88 + } 89 + 90 + func (bt *btree) sizeBytes() int { 91 + sz := 2 * 8 // opts 92 + 93 + sz += int(interfaceBytes) 94 + 95 + bt.visit(func(n node) { 96 + sz += n.sizeBytes() 97 + }) 98 + 99 + return sz 100 + } 101 + 102 + type node interface { 103 + insert(ng ngram, opts btreeOpts) 104 + maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) 105 + find(ng ngram) (int, int) 106 + visit(func(n node)) 107 + sizeBytes() int 108 + } 109 + 110 + type innerNode struct { 111 + keys []ngram 112 + children []node 113 + } 114 + 115 + type leaf struct { 116 + bucketIndex int 117 + // postingIndexOffset is the index of the posting list of the first ngram 118 + // in the bucket. This is enough to determine the index of the posting list 119 + // for every other key in the bucket. 120 + postingIndexOffset int 121 + 122 + // Optimization: Because we insert ngrams in order, we don't actually have 123 + // to fill the buckets. We just have to keep track of the size of the 124 + // bucket, so we know when to split, and the key that we have to propagate 125 + // up to the parent node when we split. 126 + // 127 + // If in the future we decide to mutate buckets, we have to replace 128 + // bucketSize and splitKey by []ngram. 
129 + bucketSize int 130 + splitKey ngram 131 + } 132 + 133 + func (n *innerNode) sizeBytes() int { 134 + return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes) 135 + } 136 + 137 + func (n *leaf) sizeBytes() int { 138 + return 4 * 8 139 + } 140 + 141 + func (n *leaf) insert(ng ngram, opts btreeOpts) { 142 + n.bucketSize++ 143 + 144 + if n.bucketSize == (opts.bucketSize/2)+1 { 145 + n.splitKey = ng 146 + } 147 + } 148 + 149 + func (n *innerNode) insert(ng ngram, opts btreeOpts) { 150 + insertAt := func(i int) { 151 + // Invariant: Nodes always have a free slot. 152 + // 153 + // We split full nodes on the the way down to the leaf. This has the 154 + // advantage that inserts are handled in a single pass. 155 + if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok { 156 + n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...) 157 + n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...) 158 + 159 + // A split might shift the target index by 1. 
160 + if ng >= n.keys[i] { 161 + i++ 162 + } 163 + } 164 + n.children[i].insert(ng, opts) 165 + } 166 + 167 + for i, k := range n.keys { 168 + if ng < k { 169 + insertAt(i) 170 + return 171 + } 172 + } 173 + insertAt(len(n.children) - 1) 174 + } 175 + 176 + // See btree.find 177 + func (n *innerNode) find(ng ngram) (int, int) { 178 + for i, k := range n.keys { 179 + if ng < k { 180 + return n.children[i].find(ng) 181 + } 182 + } 183 + return n.children[len(n.children)-1].find(ng) 184 + } 185 + 186 + // See btree.find 187 + func (n *leaf) find(ng ngram) (int, int) { 188 + return int(n.bucketIndex), int(n.postingIndexOffset) 189 + } 190 + 191 + func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 192 + if n.bucketSize < opts.bucketSize { 193 + return 194 + } 195 + return &leaf{bucketSize: opts.bucketSize / 2}, 196 + &leaf{bucketSize: opts.bucketSize / 2}, 197 + n.splitKey, 198 + true 199 + } 200 + 201 + func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 202 + if len(n.children) < 2*opts.v { 203 + return 204 + } 205 + return &innerNode{ 206 + keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...), 207 + children: append(make([]node, 0, opts.v), n.children[:opts.v]...)}, 208 + &innerNode{ 209 + keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...), 210 + children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...)}, 211 + n.keys[opts.v-1], 212 + true 213 + } 214 + 215 + func (n *leaf) visit(f func(n node)) { 216 + f(n) 217 + return 218 + } 219 + 220 + func (n *innerNode) visit(f func(n node)) { 221 + f(n) 222 + for _, child := range n.children { 223 + child.visit(f) 224 + } 225 + } 226 + 227 + func (bt *btree) String() string { 228 + s := "" 229 + s += fmt.Sprintf("%+v", bt.opts) 230 + bt.root.visit(func(n node) { 231 + switch nd := n.(type) { 232 + case *leaf: 233 + return 234 + case *innerNode: 235 + s += fmt.Sprintf("[") 236 + for _, key := range nd.keys { 
237 + s += fmt.Sprintf("%d,", key) 238 + } 239 + s = s[:len(s)-1] // remove trailing comma 240 + s += fmt.Sprintf("]") 241 + 242 + } 243 + }) 244 + return s 245 + } 246 + 247 + type btreeIndex struct { 248 + bt *btree 249 + 250 + // We need the index to read buckets into memory. 251 + file IndexFile 252 + 253 + bucketOffsets []uint32 254 + 255 + postingOffsets []uint32 256 + postingDataSentinelOffset uint32 257 + } 258 + 259 + func (b btreeIndex) SizeBytes() int { 260 + return b.bt.sizeBytes() + 2*int(sliceHeaderBytes) + 4*len(b.bucketOffsets) + 4*len(b.postingOffsets) 261 + } 262 + 263 + // Get returns the simple section of the posting list associated with the 264 + // ngram. The logic is as follows: 265 + // 1. Search the inner nodes to find the bucket that may contain ng (in MEM) 266 + // 2. Read the bucket from disk (1 disk access) 267 + // 3. Binary search the bucket (in MEM) 268 + // 4. Return the simple section pointing to the posting list (in MEM) 269 + func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 270 + // find bucket 271 + bucketIndex, postingIndexOffset := b.bt.find(ng) 272 + 273 + // read bucket into memory 274 + off := b.bucketOffsets[bucketIndex] 275 + sz := b.bucketOffsets[bucketIndex+1] - off 276 + 277 + bucket, err := b.file.Read(off, sz) 278 + if err != nil { 279 + return simpleSection{} 280 + } 281 + 282 + // find ngram in bucket 283 + getNGram := func(i int) ngram { 284 + i *= ngramEncoding 285 + return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 286 + } 287 + 288 + bucketSize := len(bucket) / ngramEncoding 289 + x := sort.Search(bucketSize, func(i int) bool { 290 + return ng <= getNGram(i) 291 + }) 292 + 293 + // return associated posting list 294 + if x >= bucketSize || getNGram(x) != ng { 295 + return simpleSection{} 296 + } 297 + 298 + return b.getPostingList(postingIndexOffset + x) 299 + } 300 + 301 + func (b btreeIndex) DumpMap() map[ngram]simpleSection { 302 + m := make(map[ngram]simpleSection, 
len(b.bucketOffsets)*b.bt.opts.bucketSize) 303 + 304 + b.bt.visit(func(no node) { 305 + switch n := no.(type) { 306 + case *leaf: 307 + // read bucket into memory 308 + off := b.bucketOffsets[n.bucketIndex] 309 + sz := b.bucketOffsets[n.bucketIndex+1] - off 310 + bucket, _ := b.file.Read(off, sz) 311 + 312 + // decode all ngrams in the bucket and fill map 313 + for i := 0; i < len(bucket)/ngramEncoding; i++ { 314 + gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 315 + m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 316 + } 317 + case *innerNode: 318 + return 319 + } 320 + }) 321 + 322 + return m 323 + } 324 + 325 + func (b btreeIndex) getPostingList(postingIndex int) simpleSection { 326 + if postingIndex+1 < len(b.postingOffsets) { 327 + return simpleSection{ 328 + off: b.postingOffsets[postingIndex], 329 + sz: b.postingOffsets[postingIndex+1] - b.postingOffsets[postingIndex], 330 + } 331 + } else { 332 + return simpleSection{ 333 + off: b.postingOffsets[postingIndex], 334 + sz: b.postingDataSentinelOffset - b.postingOffsets[postingIndex], 335 + } 336 + } 337 + }

+152

btree_test.go

··· 1 + package zoekt 2 + 3 + import ( 4 + "fmt" 5 + "testing" 6 + 7 + "github.com/google/go-cmp/cmp" 8 + ) 9 + 10 + func TestBTree_sorted(t *testing.T) { 11 + bt := newBtree(btreeOpts{bucketSize: 2, v: 2}) 12 + insertMany(t, bt, []ngram{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}) 13 + // inner nodes only 14 + // 15 + // [3,5,7] 16 + // / / \ \ 17 + // [2] [4] [6] [8,9] 18 + // 19 + want := "{bucketSize:2 v:2}[3,5,7][2][4][6][8,9]" 20 + if s := bt.String(); s != want { 21 + t.Fatalf("\nwant:%s\ngot: %s", want, s) 22 + } 23 + } 24 + 25 + func TestFindBucket(t *testing.T) { 26 + bt := newBtree(btreeOpts{bucketSize: 4, v: 2}) 27 + insertMany(t, bt, []ngram{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}) 28 + 29 + buckets := 0 30 + offset := 0 31 + bt.visit(func(no node) { 32 + switch n := no.(type) { 33 + case *leaf: 34 + n.bucketIndex = buckets 35 + buckets++ 36 + n.postingIndexOffset = offset 37 + offset += n.bucketSize 38 + case *innerNode: 39 + return 40 + } 41 + }) 42 + 43 + cases := []struct { 44 + ng ngram 45 + wantBucketIndex int 46 + wantPostingIndexOffset int 47 + }{ 48 + { 49 + ng: 7, 50 + wantBucketIndex: 3, 51 + wantPostingIndexOffset: 6, 52 + }, 53 + } 54 + 55 + for _, tt := range cases { 56 + t.Run(fmt.Sprintf("ngram: %d", tt.ng), func(t *testing.T) { 57 + haveBucketIndex, havePostingIndexOffset := bt.find(tt.ng) 58 + if tt.wantBucketIndex != haveBucketIndex { 59 + t.Fatalf("bucketIndex: want %d, got %d", tt.wantBucketIndex, haveBucketIndex) 60 + } 61 + 62 + if tt.wantPostingIndexOffset != havePostingIndexOffset { 63 + t.Fatalf("postingIndexOffset: want %d, got %d", tt.wantPostingIndexOffset, havePostingIndexOffset) 64 + } 65 + }) 66 + } 67 + } 68 + 69 + func TestCreateBucketsFromNgramText(t *testing.T) { 70 + var ngramTextOff uint32 = 7 71 + 72 + offset := func(i int) uint32 { 73 + return ngramTextOff + uint32((i * ngramEncoding)) 74 + } 75 + 76 + cases := []struct { 77 + opts btreeOpts 78 + ngrams []ngram 79 + wantOffsets []uint32 80 + }{ 81 + { 82 + opts: btreeOpts{v: 2, 
bucketSize: 4}, 83 + ngrams: []ngram{}, 84 + wantOffsets: []uint32{offset(0), offset(0)}, 85 + }, 86 + { 87 + opts: btreeOpts{v: 2, bucketSize: 4}, 88 + ngrams: []ngram{1}, 89 + wantOffsets: []uint32{offset(0), offset(1)}, 90 + }, 91 + { 92 + opts: btreeOpts{v: 2, bucketSize: 4}, 93 + ngrams: []ngram{1, 2}, 94 + wantOffsets: []uint32{offset(0), offset(2)}, 95 + }, 96 + { 97 + opts: btreeOpts{v: 2, bucketSize: 4}, 98 + ngrams: []ngram{1, 2, 3}, 99 + wantOffsets: []uint32{offset(0), offset(3)}, 100 + }, 101 + { 102 + opts: btreeOpts{v: 2, bucketSize: 4}, 103 + ngrams: []ngram{1, 2, 3, 4}, 104 + wantOffsets: []uint32{offset(0), offset(4)}, 105 + }, 106 + { 107 + opts: btreeOpts{v: 2, bucketSize: 4}, 108 + ngrams: []ngram{1, 2, 3, 4, 5}, 109 + wantOffsets: []uint32{offset(0), offset(2), offset(5)}, 110 + }, 111 + { 112 + opts: btreeOpts{v: 2, bucketSize: 4}, 113 + ngrams: []ngram{1, 2, 3, 4, 5, 6}, 114 + wantOffsets: []uint32{offset(0), offset(2), offset(6)}, 115 + }, 116 + { 117 + opts: btreeOpts{v: 2, bucketSize: 4}, 118 + ngrams: []ngram{1, 2, 3, 4, 5, 6, 7}, 119 + wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(7)}, 120 + }, 121 + { 122 + opts: btreeOpts{v: 2, bucketSize: 4}, 123 + ngrams: []ngram{1, 2, 3, 4, 5, 6, 7, 8}, 124 + wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(8)}, 125 + }, 126 + { 127 + opts: btreeOpts{v: 2, bucketSize: 4}, 128 + ngrams: []ngram{1, 2, 3, 4, 5, 6, 7, 8, 9}, 129 + wantOffsets: []uint32{offset(0), offset(2), offset(4), offset(6), offset(9)}, 130 + }, 131 + } 132 + 133 + for _, tt := range cases { 134 + t.Run("", func(t *testing.T) { 135 + toc := &indexTOC{} 136 + toc.ngramText.sz = uint32(len(tt.ngrams) * ngramEncoding) 137 + toc.ngramText.off = ngramTextOff 138 + haveOffsets := createBucketOffsets(toc.ngramText, tt.opts.bucketSize) 139 + 140 + if d := cmp.Diff(tt.wantOffsets, haveOffsets); d != "" { 141 + t.Fatalf("-want,+got\n%s", d) 142 + } 143 + }) 144 + } 145 + } 146 + 147 + func insertMany(t 
*testing.T, bt *btree, ngrams []ngram) { 148 + t.Helper() 149 + for _, ng := range ngrams { 150 + bt.insert(ng) 151 + } 152 + }

+70 -1

read.go

··· 289 289 return nil, err 290 290 } 291 291 292 - if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 292 + if os.Getenv("ZOEKT_ENABLE_BTREE") != "" { 293 + bt, err := d.newBtreeIndex(toc) 294 + if err != nil { 295 + return nil, err 296 + } 297 + d.ngrams = bt 298 + } else if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" { 293 299 bsMap, err := d.readBinarySearchNgrams(toc) 294 300 if err != nil { 295 301 return nil, err ··· 488 494 postingOffsets: toc.postings.offsets, 489 495 postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz, 490 496 }, nil 497 + } 498 + 499 + func (d *indexData) newBtreeIndex(toc *indexTOC) (btreeIndex, error) { 500 + bi := btreeIndex{file: d.file} 501 + 502 + textContent, err := d.readSectionBlob(toc.ngramText) 503 + if err != nil { 504 + return btreeIndex{}, err 505 + } 506 + 507 + // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by 508 + // half the bucketSize) and 20 nodes on level 2 (all but the rightmost 509 + // inner nodes will have exactly v=50 children) plus a root node. 510 + bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50}) 511 + for i := 0; i < len(textContent); i += ngramEncoding { 512 + ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 513 + bt.insert(ng) 514 + } 515 + 516 + // backfill "pointers" to the buckets and posting lists. Instead of 517 + // backfilling we could maintain state during insertion, however the 518 + // visitor pattern seems more natural and shouldn't be a performance issue, 519 + // because, based on the typical number of trigrams (500k) per shard, the 520 + // b-trees we construct here only have around 1000 leaf nodes. 
521 + offset, bucketIndex := 0, 0 522 + bt.visit(func(no node) { 523 + switch n := no.(type) { 524 + case *leaf: 525 + n.bucketIndex = bucketIndex 526 + bucketIndex++ 527 + 528 + n.postingIndexOffset = offset 529 + offset += n.bucketSize 530 + case *innerNode: 531 + return 532 + } 533 + }) 534 + 535 + bi.bt = bt 536 + 537 + bi.bucketOffsets = createBucketOffsets(toc.ngramText, btreeBucketSize) 538 + 539 + bi.postingOffsets = toc.postings.offsets 540 + bi.postingDataSentinelOffset = toc.postings.data.off + toc.postings.data.sz 541 + 542 + return bi, nil 543 + } 544 + 545 + // Because we insert ngrams into the btree in order, we can easily reconstruct 546 + // the buckets from the ngramText simpleSection just by knowing the bucketSize. 547 + // The last item of the returned slice is the sentinel value sec.off + sec.sz. 548 + func createBucketOffsets(sec simpleSection, bucketSize int) []uint32 { 549 + step := uint32((bucketSize / 2) * ngramEncoding) 550 + 551 + offsets := make([]uint32, 0, ((sec.sz-1)/step)+1) 552 + offsets = append(offsets, sec.off) 553 + 554 + sentinel := sec.off + sec.sz 555 + for off := sec.off + step; off+step < sentinel; off = off + step { 556 + offsets = append(offsets, off) 557 + } 558 + offsets = append(offsets, sentinel) 559 + return offsets 491 560 } 492 561 493 562 func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {

Configure Feed

Configure Feed