fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 12 kB View raw
1// B+-tree 2// 3// The tree we implement here is a B+-tree based on a paper by Ceylan and 4// Mihalcea [1]. 5// 6// B+-trees store all values in the leaves. In our case we store trigrams with 7// the goal to quickly retrieve a pointer to the posting list for a given 8// trigram. We choose the number of trigrams to store at each leaf based on the 9// page size, IE we make sure we are able to load a bucket of ngrams with a 10// single disk access. 11// 12// Here is an example of how our B+-tree looks like for a simple input: 13// 14// input: "hello world", bucketSize=2, v=2 15// 16// legend: ()=inner node, []=leaf 17// 18// (level 1) (hel, lo_) 19// 20// (level 2) (ell) (llo) (o_w, irl, red) 21// 22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor] 23// 24// The leafs are stored as part of the index on disk (mmaped) while all inner 25// nodes are loaded into memory when we load the shard. 26// 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages 29// 103-108 30 31package index 32 33import ( 34 "encoding/binary" 35 "fmt" 36 "sort" 37) 38 39// btreeBucketSize should be chosen such that in the final tree the buckets are 40// as close to the page size as possible, but not above. We insert ngrams in 41// order(!), which means after a split of a leaf, the left leaf is not affected 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf 43// might store up to btreeBucketSize ngrams, but the expected size is 44// btreeBucketSize/2, too. 45// 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page. 47const btreeBucketSize = (4096 * 2) / ngramEncoding 48 49const ( 50 interfaceBytes uint64 = 16 51 pointerSize uint64 = 8 52) 53 54type btree struct { 55 root node 56 opts btreeOpts 57 58 lastBucketIndex int 59} 60 61type btreeOpts struct { 62 // How many ngrams can be stored at a leaf node. 63 bucketSize int 64 // all inner nodes, except root, have [v, 2v] children. In the literature, 65 // b-trees are inconsistently categorized either by the number of children 66 // or by the number of keys. We choose the former. 67 v int 68} 69 70func newBtree(opts btreeOpts) *btree { 71 return &btree{ 72 root: &leaf{}, 73 opts: opts, 74 } 75} 76 77// insert inserts ng into bt. 78// 79// Note: when all inserts are done, freeze must be called. 80func (bt *btree) insert(ng ngram) { 81 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 82 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} 83 } 84 bt.root.insert(ng, bt.opts) 85} 86 87// find returns the tuple (bucketIndex, postingIndexOffset), both of which are 88// stored at the leaf level. They are effectively pointers to the bucket and 89// the posting lists for ngrams stored in the bucket. Since ngrams and their 90// posting lists are stored in order, knowing the index of the posting list of 91// the first item in the bucket is sufficient. 92func (bt *btree) find(ng ngram) (int, int) { 93 if bt.root == nil { 94 return -1, -1 95 } 96 return bt.root.find(ng) 97} 98 99func (bt *btree) visit(f func(n node)) { 100 bt.root.visit(f) 101} 102 103// freeze must be called once we are done inserting. It backfills "pointers" to 104// the buckets and posting lists. 105func (bt *btree) freeze() { 106 // Note: Instead of backfilling we could maintain state during insertion, 107 // however the visitor pattern seems more natural and shouldn't be a 108 // performance issue, because, based on the typical number of trigrams 109 // (500k) per shard, the b-trees we construct here only have around 1000 110 // leaf nodes. 111 offset, bucketIndex := 0, 0 112 bt.visit(func(no node) { 113 switch n := no.(type) { 114 case *leaf: 115 n.bucketIndex = bucketIndex 116 bucketIndex++ 117 118 n.postingIndexOffset = offset 119 offset += n.bucketSize 120 case *innerNode: 121 return 122 } 123 }) 124 125 bt.lastBucketIndex = bucketIndex - 1 126} 127 128func (bt *btree) sizeBytes() int { 129 sz := 2 * 8 // opts 130 131 sz += int(interfaceBytes) 132 133 bt.visit(func(n node) { 134 sz += n.sizeBytes() 135 }) 136 137 return sz 138} 139 140type node interface { 141 insert(ng ngram, opts btreeOpts) 142 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) 143 find(ng ngram) (int, int) 144 visit(func(n node)) 145 sizeBytes() int 146} 147 148type innerNode struct { 149 keys []ngram 150 children []node 151} 152 153type leaf struct { 154 bucketIndex int 155 // postingIndexOffset is the index of the posting list of the first ngram 156 // in the bucket. This is enough to determine the index of the posting list 157 // for every other key in the bucket. 158 postingIndexOffset int 159 160 // Optimization: Because we insert ngrams in order, we don't actually have 161 // to fill the buckets. We just have to keep track of the size of the 162 // bucket, so we know when to split, and the key that we have to propagate 163 // up to the parent node when we split. 164 // 165 // If in the future we decide to mutate buckets, we have to replace 166 // bucketSize and splitKey by []ngram. 167 bucketSize int 168 splitKey ngram 169} 170 171func (n *innerNode) sizeBytes() int { 172 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes) 173} 174 175func (n *leaf) sizeBytes() int { 176 return 4 * 8 177} 178 179func (n *leaf) insert(ng ngram, opts btreeOpts) { 180 n.bucketSize++ 181 182 if n.bucketSize == (opts.bucketSize/2)+1 { 183 n.splitKey = ng 184 } 185} 186 187func (n *innerNode) insert(ng ngram, opts btreeOpts) { 188 insertAt := func(i int) { 189 // Invariant: Nodes always have a free slot. 190 // 191 // We split full nodes on the the way down to the leaf. This has the 192 // advantage that inserts are handled in a single pass. 193 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok { 194 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...) 195 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...) 196 197 // A split might shift the target index by 1. 198 if ng >= n.keys[i] { 199 i++ 200 } 201 } 202 n.children[i].insert(ng, opts) 203 } 204 205 for i, k := range n.keys { 206 if ng < k { 207 insertAt(i) 208 return 209 } 210 } 211 insertAt(len(n.children) - 1) 212} 213 214// See btree.find 215func (n *innerNode) find(ng ngram) (int, int) { 216 for i, k := range n.keys { 217 if ng < k { 218 return n.children[i].find(ng) 219 } 220 } 221 return n.children[len(n.children)-1].find(ng) 222} 223 224// See btree.find 225func (n *leaf) find(ng ngram) (int, int) { 226 return int(n.bucketIndex), int(n.postingIndexOffset) 227} 228 229func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 230 if n.bucketSize < opts.bucketSize { 231 return 232 } 233 return &leaf{bucketSize: opts.bucketSize / 2}, 234 &leaf{bucketSize: opts.bucketSize / 2}, 235 n.splitKey, 236 true 237} 238 239func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 240 if len(n.children) < 2*opts.v { 241 return 242 } 243 return &innerNode{ 244 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...), 245 children: append(make([]node, 0, opts.v), n.children[:opts.v]...), 246 }, 247 &innerNode{ 248 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...), 249 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...), 250 }, 251 n.keys[opts.v-1], 252 true 253} 254 255func (n *leaf) visit(f func(n node)) { 256 f(n) 257 return 258} 259 260func (n *innerNode) visit(f func(n node)) { 261 f(n) 262 for _, child := range n.children { 263 child.visit(f) 264 } 265} 266 267func (bt *btree) String() string { 268 s := "" 269 s += fmt.Sprintf("%+v", bt.opts) 270 bt.root.visit(func(n node) { 271 switch nd := n.(type) { 272 case *leaf: 273 return 274 case *innerNode: 275 s += fmt.Sprintf("[") 276 for _, key := range nd.keys { 277 s += fmt.Sprintf("%d,", key) 278 } 279 s = s[:len(s)-1] // remove trailing comma 280 s += fmt.Sprintf("]") 281 282 } 283 }) 284 return s 285} 286 287type btreeIndex struct { 288 bt *btree 289 290 // We need the index to read buckets into memory. 291 file IndexFile 292 293 // buckets 294 ngramSec simpleSection 295 296 postingIndex simpleSection 297} 298 299// SizeBytes returns how much memory this structure uses in the heap. 300func (b btreeIndex) SizeBytes() (sz int) { 301 // btree 302 if b.bt != nil { 303 sz += int(pointerSize) + b.bt.sizeBytes() 304 } 305 // ngramSec 306 sz += 8 307 // postingIndex 308 sz += 8 309 // postingDataSentinelOffset 310 sz += 4 311 return 312} 313 314// Get returns the simple section of the posting list associated with the 315// ngram. The logic is as follows: 316// 1. Search the inner nodes to find the bucket that may contain ng (in MEM) 317// 2. Read the bucket from disk (1 disk access) 318// 3. Binary search the bucket (in MEM) 319// 4. Return the simple section pointing to the posting list (in MEM) 320func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 321 if b.bt == nil { 322 return simpleSection{} 323 } 324 325 // find bucket 326 bucketIndex, postingIndexOffset := b.bt.find(ng) 327 328 // read bucket into memory 329 off, sz := b.getBucket(bucketIndex) 330 bucket, err := b.file.Read(off, sz) 331 if err != nil { 332 return simpleSection{} 333 } 334 335 // find ngram in bucket 336 getNGram := func(i int) ngram { 337 i *= ngramEncoding 338 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 339 } 340 341 bucketSize := len(bucket) / ngramEncoding 342 x := sort.Search(bucketSize, func(i int) bool { 343 return ng <= getNGram(i) 344 }) 345 346 // return associated posting list 347 if x >= bucketSize || getNGram(x) != ng { 348 return simpleSection{} 349 } 350 351 return b.getPostingList(postingIndexOffset + x) 352} 353 354// getPostingList returns the simple section pointing to the posting list of 355// the ngram at ngramIndex. 356// 357// Assumming we don't hit a page boundary, which should be rare given that we 358// only read 8 bytes, we need 1 disk access to read the posting offset. 359func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 360 relativeOffsetBytes := uint32(ngramIndex) * 4 361 362 if relativeOffsetBytes+8 <= b.postingIndex.sz { 363 // read 2 offsets 364 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8) 365 if err != nil { 366 return simpleSection{} 367 } 368 369 start := binary.BigEndian.Uint32(o[0:4]) 370 end := binary.BigEndian.Uint32(o[4:8]) 371 return simpleSection{ 372 off: start, 373 sz: end - start, 374 } 375 } else { 376 // last ngram => read 1 offset and calculate the size of the posting 377 // list from the offset of index section. 378 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4) 379 if err != nil { 380 return simpleSection{} 381 } 382 383 start := binary.BigEndian.Uint32(o[0:4]) 384 return simpleSection{ 385 off: start, 386 // The layout of the posting list compound section on disk is 387 // 388 // start b.postingIndex.off 389 // v v 390 // [[posting lists (simple section)][index (simple section)]] 391 // <----------> 392 // last posting list 393 // 394 sz: b.postingIndex.off - start, 395 } 396 } 397} 398 399func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 400 // All but the rightmost bucket have exactly bucketSize/2 ngrams 401 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) 402 off = b.ngramSec.off + uint32(bucketIndex)*sz 403 404 // Rightmost bucket has size upto the end of the ngramSec. 405 if bucketIndex == b.bt.lastBucketIndex { 406 sz = b.ngramSec.off + b.ngramSec.sz - off 407 } 408 409 return 410} 411 412// DumpMap is a debug method which returns the btree as an in-memory 413// representation. This is how zoekt represents the ngram index in 414// google/zoekt. 415func (b btreeIndex) DumpMap() map[ngram]simpleSection { 416 if b.bt == nil { 417 return nil 418 } 419 420 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 421 422 b.bt.visit(func(no node) { 423 switch n := no.(type) { 424 case *leaf: 425 // read bucket into memory 426 off, sz := b.getBucket(n.bucketIndex) 427 bucket, _ := b.file.Read(off, sz) 428 429 // decode all ngrams in the bucket and fill map 430 for i := range len(bucket) / ngramEncoding { 431 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 432 m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 433 } 434 case *innerNode: 435 return 436 } 437 }) 438 439 return m 440}