fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// B+-tree 2// 3// The tree we implement here is a B+-tree based on a paper by Ceylan and 4// Mihalcea [1]. 5// 6// B+-trees store all values in the leaves. In our case we store trigrams with 7// the goal to quickly retrieve a pointer to the posting list for a given 8// trigram. We choose the number of trigrams to store at each leaf based on the 9// page size, IE we make sure we are able to load a bucket of ngrams with a 10// single disk access. 11// 12// Here is an example of how our B+-tree looks like for a simple input: 13// 14// input: "hello world", bucketSize=2, v=2 15// 16// legend: ()=inner node, []=leaf 17// 18// (level 1) (hel, lo_) 19// 20// (level 2) (ell) (llo) (o_w, irl, red) 21// 22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor] 23// 24// The leafs are stored as part of the index on disk (mmaped) while all inner 25// nodes are loaded into memory when we load the shard. 26// 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages 29// 103-108 30 31package zoekt 32 33import ( 34 "encoding/binary" 35 "fmt" 36 "sort" 37) 38 39// btreeBucketSize should be chosen such that in the final tree the buckets are 40// as close to the page size as possible, but not above. We insert ngrams in 41// order(!), which means after a split of a leaf, the left leaf is not affected 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf 43// might store up to btreeBucketSize ngrams, but the expected size is 44// btreeBucketSize/2, too. 45// 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page. 47const btreeBucketSize = (4096 * 2) / ngramEncoding 48 49type btree struct { 50 root node 51 opts btreeOpts 52 53 lastBucketIndex int 54} 55 56type btreeOpts struct { 57 // How many ngrams can be stored at a leaf node. 58 bucketSize int 59 // all inner nodes, except root, have [v, 2v] children. In the literature, 60 // b-trees are inconsistently categorized either by the number of children 61 // or by the number of keys. We choose the former. 62 v int 63} 64 65func newBtree(opts btreeOpts) *btree { 66 return &btree{ 67 root: &leaf{}, 68 opts: opts, 69 } 70} 71 72// insert inserts ng into bt. 73// 74// Note: when all inserts are done, freeze must be called. 75func (bt *btree) insert(ng ngram) { 76 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 77 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} 78 } 79 bt.root.insert(ng, bt.opts) 80} 81 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are 83// stored at the leaf level. They are effectively pointers to the bucket and 84// the posting lists for ngrams stored in the bucket. Since ngrams and their 85// posting lists are stored in order, knowing the index of the posting list of 86// the first item in the bucket is sufficient. 87func (bt *btree) find(ng ngram) (int, int) { 88 if bt.root == nil { 89 return -1, -1 90 } 91 return bt.root.find(ng) 92} 93 94func (bt *btree) visit(f func(n node)) { 95 bt.root.visit(f) 96} 97 98// freeze must be called once we are done inserting. It backfills "pointers" to 99// the buckets and posting lists. 100func (bt *btree) freeze() { 101 // Note: Instead of backfilling we could maintain state during insertion, 102 // however the visitor pattern seems more natural and shouldn't be a 103 // performance issue, because, based on the typical number of trigrams 104 // (500k) per shard, the b-trees we construct here only have around 1000 105 // leaf nodes. 106 offset, bucketIndex := 0, 0 107 bt.visit(func(no node) { 108 switch n := no.(type) { 109 case *leaf: 110 n.bucketIndex = bucketIndex 111 bucketIndex++ 112 113 n.postingIndexOffset = offset 114 offset += n.bucketSize 115 case *innerNode: 116 return 117 } 118 }) 119 120 bt.lastBucketIndex = bucketIndex - 1 121} 122 123func (bt *btree) sizeBytes() int { 124 sz := 2 * 8 // opts 125 126 sz += int(interfaceBytes) 127 128 bt.visit(func(n node) { 129 sz += n.sizeBytes() 130 }) 131 132 return sz 133} 134 135type node interface { 136 insert(ng ngram, opts btreeOpts) 137 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) 138 find(ng ngram) (int, int) 139 visit(func(n node)) 140 sizeBytes() int 141} 142 143type innerNode struct { 144 keys []ngram 145 children []node 146} 147 148type leaf struct { 149 bucketIndex int 150 // postingIndexOffset is the index of the posting list of the first ngram 151 // in the bucket. This is enough to determine the index of the posting list 152 // for every other key in the bucket. 153 postingIndexOffset int 154 155 // Optimization: Because we insert ngrams in order, we don't actually have 156 // to fill the buckets. We just have to keep track of the size of the 157 // bucket, so we know when to split, and the key that we have to propagate 158 // up to the parent node when we split. 159 // 160 // If in the future we decide to mutate buckets, we have to replace 161 // bucketSize and splitKey by []ngram. 162 bucketSize int 163 splitKey ngram 164} 165 166func (n *innerNode) sizeBytes() int { 167 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes) 168} 169 170func (n *leaf) sizeBytes() int { 171 return 4 * 8 172} 173 174func (n *leaf) insert(ng ngram, opts btreeOpts) { 175 n.bucketSize++ 176 177 if n.bucketSize == (opts.bucketSize/2)+1 { 178 n.splitKey = ng 179 } 180} 181 182func (n *innerNode) insert(ng ngram, opts btreeOpts) { 183 insertAt := func(i int) { 184 // Invariant: Nodes always have a free slot. 185 // 186 // We split full nodes on the the way down to the leaf. This has the 187 // advantage that inserts are handled in a single pass. 188 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok { 189 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...) 190 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...) 191 192 // A split might shift the target index by 1. 193 if ng >= n.keys[i] { 194 i++ 195 } 196 } 197 n.children[i].insert(ng, opts) 198 } 199 200 for i, k := range n.keys { 201 if ng < k { 202 insertAt(i) 203 return 204 } 205 } 206 insertAt(len(n.children) - 1) 207} 208 209// See btree.find 210func (n *innerNode) find(ng ngram) (int, int) { 211 for i, k := range n.keys { 212 if ng < k { 213 return n.children[i].find(ng) 214 } 215 } 216 return n.children[len(n.children)-1].find(ng) 217} 218 219// See btree.find 220func (n *leaf) find(ng ngram) (int, int) { 221 return int(n.bucketIndex), int(n.postingIndexOffset) 222} 223 224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 225 if n.bucketSize < opts.bucketSize { 226 return 227 } 228 return &leaf{bucketSize: opts.bucketSize / 2}, 229 &leaf{bucketSize: opts.bucketSize / 2}, 230 n.splitKey, 231 true 232} 233 234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 235 if len(n.children) < 2*opts.v { 236 return 237 } 238 return &innerNode{ 239 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...), 240 children: append(make([]node, 0, opts.v), n.children[:opts.v]...), 241 }, 242 &innerNode{ 243 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...), 244 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...), 245 }, 246 n.keys[opts.v-1], 247 true 248} 249 250func (n *leaf) visit(f func(n node)) { 251 f(n) 252 return 253} 254 255func (n *innerNode) visit(f func(n node)) { 256 f(n) 257 for _, child := range n.children { 258 child.visit(f) 259 } 260} 261 262func (bt *btree) String() string { 263 s := "" 264 s += fmt.Sprintf("%+v", bt.opts) 265 bt.root.visit(func(n node) { 266 switch nd := n.(type) { 267 case *leaf: 268 return 269 case *innerNode: 270 s += fmt.Sprintf("[") 271 for _, key := range nd.keys { 272 s += fmt.Sprintf("%d,", key) 273 } 274 s = s[:len(s)-1] // remove trailing comma 275 s += fmt.Sprintf("]") 276 277 } 278 }) 279 return s 280} 281 282type btreeIndex struct { 283 bt *btree 284 285 // We need the index to read buckets into memory. 286 file IndexFile 287 288 // buckets 289 ngramSec simpleSection 290 291 postingIndex simpleSection 292} 293 294// SizeBytes returns how much memory this structure uses in the heap. 295func (b btreeIndex) SizeBytes() (sz int) { 296 // btree 297 if b.bt != nil { 298 sz += int(pointerSize) + b.bt.sizeBytes() 299 } 300 // ngramSec 301 sz += 8 302 // postingIndex 303 sz += 8 304 // postingDataSentinelOffset 305 sz += 4 306 return 307} 308 309// Get returns the simple section of the posting list associated with the 310// ngram. The logic is as follows: 311// 1. Search the inner nodes to find the bucket that may contain ng (in MEM) 312// 2. Read the bucket from disk (1 disk access) 313// 3. Binary search the bucket (in MEM) 314// 4. Return the simple section pointing to the posting list (in MEM) 315func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 316 if b.bt == nil { 317 return simpleSection{} 318 } 319 320 // find bucket 321 bucketIndex, postingIndexOffset := b.bt.find(ng) 322 323 // read bucket into memory 324 off, sz := b.getBucket(bucketIndex) 325 bucket, err := b.file.Read(off, sz) 326 if err != nil { 327 return simpleSection{} 328 } 329 330 // find ngram in bucket 331 getNGram := func(i int) ngram { 332 i *= ngramEncoding 333 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 334 } 335 336 bucketSize := len(bucket) / ngramEncoding 337 x := sort.Search(bucketSize, func(i int) bool { 338 return ng <= getNGram(i) 339 }) 340 341 // return associated posting list 342 if x >= bucketSize || getNGram(x) != ng { 343 return simpleSection{} 344 } 345 346 return b.getPostingList(postingIndexOffset + x) 347} 348 349// getPostingList returns the simple section pointing to the posting list of 350// the ngram at ngramIndex. 351// 352// Assumming we don't hit a page boundary, which should be rare given that we 353// only read 8 bytes, we need 1 disk access to read the posting offset. 354func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 355 relativeOffsetBytes := uint32(ngramIndex) * 4 356 357 if relativeOffsetBytes+8 <= b.postingIndex.sz { 358 // read 2 offsets 359 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8) 360 if err != nil { 361 return simpleSection{} 362 } 363 364 start := binary.BigEndian.Uint32(o[0:4]) 365 end := binary.BigEndian.Uint32(o[4:8]) 366 return simpleSection{ 367 off: start, 368 sz: end - start, 369 } 370 } else { 371 // last ngram => read 1 offset and calculate the size of the posting 372 // list from the offset of index section. 373 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4) 374 if err != nil { 375 return simpleSection{} 376 } 377 378 start := binary.BigEndian.Uint32(o[0:4]) 379 return simpleSection{ 380 off: start, 381 // The layout of the posting list compound section on disk is 382 // 383 // start b.postingIndex.off 384 // v v 385 // [[posting lists (simple section)][index (simple section)]] 386 // <----------> 387 // last posting list 388 // 389 sz: b.postingIndex.off - start, 390 } 391 } 392} 393 394func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 395 // All but the rightmost bucket have exactly bucketSize/2 ngrams 396 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) 397 off = b.ngramSec.off + uint32(bucketIndex)*sz 398 399 // Rightmost bucket has size upto the end of the ngramSec. 400 if bucketIndex == b.bt.lastBucketIndex { 401 sz = b.ngramSec.off + b.ngramSec.sz - off 402 } 403 404 return 405} 406 407// DumpMap is a debug method which returns the btree as an in-memory 408// representation. This is how zoekt represents the ngram index in 409// google/zoekt. 410func (b btreeIndex) DumpMap() map[ngram]simpleSection { 411 if b.bt == nil { 412 return nil 413 } 414 415 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 416 417 b.bt.visit(func(no node) { 418 switch n := no.(type) { 419 case *leaf: 420 // read bucket into memory 421 off, sz := b.getBucket(n.bucketIndex) 422 bucket, _ := b.file.Read(off, sz) 423 424 // decode all ngrams in the bucket and fill map 425 for i := 0; i < len(bucket)/ngramEncoding; i++ { 426 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 427 m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 428 } 429 case *innerNode: 430 return 431 } 432 }) 433 434 return m 435}