fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// B+-tree 2// 3// The tree we implement here is a B+-tree based on a paper by Ceylan and 4// Mihalcea [1]. 5// 6// B+-trees store all values in the leaves. In our case we store trigrams with 7// the goal to quickly retrieve a pointer to the posting list for a given 8// trigram. We choose the number of trigrams to store at each leaf based on the 9// page size, IE we make sure we are able to load a bucket of ngrams with a 10// single disk access. 11// 12// Here is an example of how our B+-tree looks like for a simple input: 13// 14// input: "hello world", bucketSize=2, v=2 15// 16// legend: ()=inner node, []=leaf 17// 18// (level 1) (hel, lo_) 19// 20// (level 2) (ell) (llo) (o_w, irl, red) 21// 22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor] 23// 24// The leafs are stored as part of the index on disk (mmaped) while all inner 25// nodes are loaded into memory when we load the shard. 26// 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages 29// 103-108 30 31package zoekt 32 33import ( 34 "encoding/binary" 35 "fmt" 36 "sort" 37) 38 39// btreeBucketSize should be chosen such that in the final tree the buckets are 40// as close to the page size as possible, but not above. We insert ngrams in 41// order(!), which means after a split of a leaf, the left leaf is not affected 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf 43// might store up to btreeBucketSize ngrams, but the expected size is 44// btreeBucketSize/2, too. 45// 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page. 47const btreeBucketSize = (4096 * 2) / ngramEncoding 48 49type btree struct { 50 root node 51 opts btreeOpts 52 53 lastBucketIndex int 54} 55 56type btreeOpts struct { 57 // How many ngrams can be stored at a leaf node. 58 bucketSize int 59 // all inner nodes, except root, have [v, 2v] children. In the literature, 60 // b-trees are inconsistently categorized either by the number of children 61 // or by the number of keys. We choose the former. 62 v int 63} 64 65func newBtree(opts btreeOpts) *btree { 66 return &btree{ 67 root: &leaf{}, 68 opts: opts, 69 } 70} 71 72// insert inserts ng into bt. 73// 74// Note: when all inserts are done, freeze must be called. 75func (bt *btree) insert(ng ngram) { 76 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 77 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} 78 } 79 bt.root.insert(ng, bt.opts) 80} 81 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are 83// stored at the leaf level. They are effectively pointers to the bucket and 84// the posting lists for ngrams stored in the bucket. Since ngrams and their 85// posting lists are stored in order, knowing the index of the posting list of 86// the first item in the bucket is sufficient. 87func (bt *btree) find(ng ngram) (int, int) { 88 if bt.root == nil { 89 return -1, -1 90 } 91 return bt.root.find(ng) 92} 93 94func (bt *btree) visit(f func(n node)) { 95 bt.root.visit(f) 96} 97 98// freeze must be called once we are done inserting. It backfills "pointers" to 99// the buckets and posting lists. 100func (bt *btree) freeze() { 101 // Note: Instead of backfilling we could maintain state during insertion, 102 // however the visitor pattern seems more natural and shouldn't be a 103 // performance issue, because, based on the typical number of trigrams 104 // (500k) per shard, the b-trees we construct here only have around 1000 105 // leaf nodes. 106 offset, bucketIndex := 0, 0 107 bt.visit(func(no node) { 108 switch n := no.(type) { 109 case *leaf: 110 n.bucketIndex = bucketIndex 111 bucketIndex++ 112 113 n.postingIndexOffset = offset 114 offset += n.bucketSize 115 case *innerNode: 116 return 117 } 118 }) 119 120 bt.lastBucketIndex = bucketIndex - 1 121} 122 123func (bt *btree) sizeBytes() int { 124 sz := 2 * 8 // opts 125 126 sz += int(interfaceBytes) 127 128 bt.visit(func(n node) { 129 sz += n.sizeBytes() 130 }) 131 132 return sz 133} 134 135type node interface { 136 insert(ng ngram, opts btreeOpts) 137 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) 138 find(ng ngram) (int, int) 139 visit(func(n node)) 140 sizeBytes() int 141} 142 143type innerNode struct { 144 keys []ngram 145 children []node 146} 147 148type leaf struct { 149 bucketIndex int 150 // postingIndexOffset is the index of the posting list of the first ngram 151 // in the bucket. This is enough to determine the index of the posting list 152 // for every other key in the bucket. 153 postingIndexOffset int 154 155 // Optimization: Because we insert ngrams in order, we don't actually have 156 // to fill the buckets. We just have to keep track of the size of the 157 // bucket, so we know when to split, and the key that we have to propagate 158 // up to the parent node when we split. 159 // 160 // If in the future we decide to mutate buckets, we have to replace 161 // bucketSize and splitKey by []ngram. 162 bucketSize int 163 splitKey ngram 164} 165 166func (n *innerNode) sizeBytes() int { 167 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes) 168} 169 170func (n *leaf) sizeBytes() int { 171 return 4 * 8 172} 173 174func (n *leaf) insert(ng ngram, opts btreeOpts) { 175 n.bucketSize++ 176 177 if n.bucketSize == (opts.bucketSize/2)+1 { 178 n.splitKey = ng 179 } 180} 181 182func (n *innerNode) insert(ng ngram, opts btreeOpts) { 183 insertAt := func(i int) { 184 // Invariant: Nodes always have a free slot. 185 // 186 // We split full nodes on the the way down to the leaf. This has the 187 // advantage that inserts are handled in a single pass. 188 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok { 189 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...) 190 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...) 191 192 // A split might shift the target index by 1. 193 if ng >= n.keys[i] { 194 i++ 195 } 196 } 197 n.children[i].insert(ng, opts) 198 } 199 200 for i, k := range n.keys { 201 if ng < k { 202 insertAt(i) 203 return 204 } 205 } 206 insertAt(len(n.children) - 1) 207} 208 209// See btree.find 210func (n *innerNode) find(ng ngram) (int, int) { 211 for i, k := range n.keys { 212 if ng < k { 213 return n.children[i].find(ng) 214 } 215 } 216 return n.children[len(n.children)-1].find(ng) 217} 218 219// See btree.find 220func (n *leaf) find(ng ngram) (int, int) { 221 return int(n.bucketIndex), int(n.postingIndexOffset) 222} 223 224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 225 if n.bucketSize < opts.bucketSize { 226 return 227 } 228 return &leaf{bucketSize: opts.bucketSize / 2}, 229 &leaf{bucketSize: opts.bucketSize / 2}, 230 n.splitKey, 231 true 232} 233 234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 235 if len(n.children) < 2*opts.v { 236 return 237 } 238 return &innerNode{ 239 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...), 240 children: append(make([]node, 0, opts.v), n.children[:opts.v]...)}, 241 &innerNode{ 242 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...), 243 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...)}, 244 n.keys[opts.v-1], 245 true 246} 247 248func (n *leaf) visit(f func(n node)) { 249 f(n) 250 return 251} 252 253func (n *innerNode) visit(f func(n node)) { 254 f(n) 255 for _, child := range n.children { 256 child.visit(f) 257 } 258} 259 260func (bt *btree) String() string { 261 s := "" 262 s += fmt.Sprintf("%+v", bt.opts) 263 bt.root.visit(func(n node) { 264 switch nd := n.(type) { 265 case *leaf: 266 return 267 case *innerNode: 268 s += fmt.Sprintf("[") 269 for _, key := range nd.keys { 270 s += fmt.Sprintf("%d,", key) 271 } 272 s = s[:len(s)-1] // remove trailing comma 273 s += fmt.Sprintf("]") 274 275 } 276 }) 277 return s 278} 279 280type btreeIndex struct { 281 bt *btree 282 283 // We need the index to read buckets into memory. 284 file IndexFile 285 286 // buckets 287 ngramSec simpleSection 288 289 postingIndex simpleSection 290} 291 292// SizeBytes returns how much memory this structure uses in the heap. 293func (b btreeIndex) SizeBytes() (sz int) { 294 // btree 295 if b.bt != nil { 296 sz += int(pointerSize) + b.bt.sizeBytes() 297 } 298 // ngramSec 299 sz += 8 300 // postingIndex 301 sz += 8 302 // postingDataSentinelOffset 303 sz += 4 304 return 305} 306 307// NgramIndexes returns the indexes of the ngrams in the index. We return a 308// slice of slices because we have to keep track of ngram variants in case of 309// case-insensitive search. 310func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) { 311 lookups := 0 312 ngramIndexes := make([]int, 0, len(ngrams)) 313 314 for _, ng := range ngrams { 315 ix := b.ngramIndex(ng) 316 lookups++ 317 if ix == -1 { 318 return nil, len(ngramIndexes) + 1 319 } 320 ngramIndexes = append(ngramIndexes, ix) 321 } 322 323 return ngramIndexes, len(ngramIndexes) 324} 325 326func (b btreeIndex) ngramIndex(ng ngram) int { 327 if b.bt == nil { 328 return -1 329 } 330 331 // find bucket 332 bucketIndex, postingIndexOffset := b.bt.find(ng) 333 334 // read bucket into memory 335 off, sz := b.getBucket(bucketIndex) 336 bucket, err := b.file.Read(off, sz) 337 if err != nil { 338 return -1 339 } 340 341 // find ngram in bucket 342 getNGram := func(i int) ngram { 343 i *= ngramEncoding 344 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 345 } 346 347 bucketSize := len(bucket) / ngramEncoding 348 x := sort.Search(bucketSize, func(i int) bool { 349 return ng <= getNGram(i) 350 }) 351 352 // return index of associated posting list 353 if x >= bucketSize || getNGram(x) != ng { 354 return -1 355 } 356 357 return postingIndexOffset + x 358} 359 360// Get returns the simple section of the posting list associated with the 361// ngram. The logic is as follows: 362// 1. Search the inner nodes to find the bucket that may contain ng (in MEM) 363// 2. Read the bucket from disk (1 disk access) 364// 3. Binary search the bucket (in MEM) 365// 4. Return the simple section pointing to the posting list (in MEM) 366func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 367 if b.bt == nil { 368 return simpleSection{} 369 } 370 371 // find bucket 372 bucketIndex, postingIndexOffset := b.bt.find(ng) 373 374 // read bucket into memory 375 off, sz := b.getBucket(bucketIndex) 376 bucket, err := b.file.Read(off, sz) 377 if err != nil { 378 return simpleSection{} 379 } 380 381 // find ngram in bucket 382 getNGram := func(i int) ngram { 383 i *= ngramEncoding 384 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 385 } 386 387 bucketSize := len(bucket) / ngramEncoding 388 x := sort.Search(bucketSize, func(i int) bool { 389 return ng <= getNGram(i) 390 }) 391 392 // return associated posting list 393 if x >= bucketSize || getNGram(x) != ng { 394 return simpleSection{} 395 } 396 397 return b.GetPostingList(postingIndexOffset + x) 398} 399 400// GetPostingList returns the simple section pointing to the posting list of 401// the ngram at ngramIndex. 402// 403// Assumming we don't hit a page boundary, which should be rare given that we 404// only read 8 bytes, we need 1 disk access to read the posting offset. 405func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection { 406 relativeOffsetBytes := uint32(ngramIndex) * 4 407 408 if relativeOffsetBytes+8 <= b.postingIndex.sz { 409 // read 2 offsets 410 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8) 411 if err != nil { 412 return simpleSection{} 413 } 414 415 start := binary.BigEndian.Uint32(o[0:4]) 416 end := binary.BigEndian.Uint32(o[4:8]) 417 return simpleSection{ 418 off: start, 419 sz: end - start, 420 } 421 } else { 422 // last ngram => read 1 offset and calculate the size of the posting 423 // list from the offset of index section. 424 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4) 425 if err != nil { 426 return simpleSection{} 427 } 428 429 start := binary.BigEndian.Uint32(o[0:4]) 430 return simpleSection{ 431 off: start, 432 // The layout of the posting list compound section on disk is 433 // 434 // start b.postingIndex.off 435 // v v 436 // [[posting lists (simple section)][index (simple section)]] 437 // <----------> 438 // last posting list 439 // 440 sz: b.postingIndex.off - start, 441 } 442 } 443} 444 445func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 446 // All but the rightmost bucket have exactly bucketSize/2 ngrams 447 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) 448 off = b.ngramSec.off + uint32(bucketIndex)*sz 449 450 // Rightmost bucket has size upto the end of the ngramSec. 451 if bucketIndex == b.bt.lastBucketIndex { 452 sz = b.ngramSec.off + b.ngramSec.sz - off 453 } 454 455 return 456} 457 458// DumpMap is a debug method which returns the btree as an in-memory 459// representation. This is how zoekt represents the ngram index in 460// google/zoekt. 461func (b btreeIndex) DumpMap() map[ngram]simpleSection { 462 if b.bt == nil { 463 return nil 464 } 465 466 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 467 468 b.bt.visit(func(no node) { 469 switch n := no.(type) { 470 case *leaf: 471 // read bucket into memory 472 off, sz := b.getBucket(n.bucketIndex) 473 bucket, _ := b.file.Read(off, sz) 474 475 // decode all ngrams in the bucket and fill map 476 for i := 0; i < len(bucket)/ngramEncoding; i++ { 477 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 478 m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i) 479 } 480 case *innerNode: 481 return 482 } 483 }) 484 485 return m 486}