fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// B+-tree 2// 3// The tree we implement here is a B+-tree based on a paper by Ceylan and 4// Mihalcea [1]. 5// 6// B+-trees store all values in the leaves. In our case we store trigrams with 7// the goal to quickly retrieve a pointer to the posting list for a given 8// trigram. We choose the number of trigrams to store at each leaf based on the 9// page size, IE we make sure we are able to load a bucket of ngrams with a 10// single disk access. 11// 12// Here is an example of how our B+-tree looks like for a simple input: 13// 14// input: "hello world", bucketSize=2, v=2 15// 16// legend: ()=inner node, []=leaf 17// 18// (level 1) (hel, lo_) 19// 20// (level 2) (ell) (llo) (o_w, irl, red) 21// 22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor] 23// 24// The leafs are stored as part of the index on disk (mmaped) while all inner 25// nodes are loaded into memory when we load the shard. 26// 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages 29// 103-108 30 31package zoekt 32 33import ( 34 "encoding/binary" 35 "fmt" 36 "sort" 37) 38 39// btreeBucketSize should be chosen such that in the final tree the buckets are 40// as close to the page size as possible, but not above. We insert ngrams in 41// order(!), which means after a split of a leaf, the left leaf is not affected 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf 43// might store up to btreeBucketSize ngrams, but the expected size is 44// btreeBucketSize/2, too. 45// 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page. 47const btreeBucketSize = (4096 * 2) / ngramEncoding 48 49type btree struct { 50 root node 51 opts btreeOpts 52 53 lastBucketIndex int 54} 55 56type btreeOpts struct { 57 // How many ngrams can be stored at a leaf node. 58 bucketSize int 59 // all inner nodes, except root, have [v, 2v] children. In the literature, 60 // b-trees are inconsistently categorized either by the number of children 61 // or by the number of keys. We choose the former. 62 v int 63} 64 65func newBtree(opts btreeOpts) *btree { 66 return &btree{ 67 root: &leaf{}, 68 opts: opts, 69 } 70} 71 72// insert inserts ng into bt. 73// 74// Note: when all inserts are done, freeze must be called. 75func (bt *btree) insert(ng ngram) { 76 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok { 77 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}} 78 } 79 bt.root.insert(ng, bt.opts) 80} 81 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are 83// stored at the leaf level. They are effectively pointers to the bucket and 84// the posting lists for ngrams stored in the bucket. Since ngrams and their 85// posting lists are stored in order, knowing the index of the posting list of 86// the first item in the bucket is sufficient. 87func (bt *btree) find(ng ngram) (int, int) { 88 if bt.root == nil { 89 return -1, -1 90 } 91 return bt.root.find(ng) 92} 93 94func (bt *btree) visit(f func(n node)) { 95 bt.root.visit(f) 96} 97 98// freeze must be called once we are done inserting. It backfills "pointers" to 99// the buckets and posting lists. 100func (bt *btree) freeze() { 101 // Note: Instead of backfilling we could maintain state during insertion, 102 // however the visitor pattern seems more natural and shouldn't be a 103 // performance issue, because, based on the typical number of trigrams 104 // (500k) per shard, the b-trees we construct here only have around 1000 105 // leaf nodes. 106 offset, bucketIndex := 0, 0 107 bt.visit(func(no node) { 108 switch n := no.(type) { 109 case *leaf: 110 n.bucketIndex = bucketIndex 111 bucketIndex++ 112 113 n.postingIndexOffset = offset 114 offset += n.bucketSize 115 case *innerNode: 116 return 117 } 118 }) 119 120 bt.lastBucketIndex = bucketIndex - 1 121} 122 123func (bt *btree) sizeBytes() int { 124 sz := 2 * 8 // opts 125 126 sz += int(interfaceBytes) 127 128 bt.visit(func(n node) { 129 sz += n.sizeBytes() 130 }) 131 132 return sz 133} 134 135type node interface { 136 insert(ng ngram, opts btreeOpts) 137 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) 138 find(ng ngram) (int, int) 139 visit(func(n node)) 140 sizeBytes() int 141} 142 143type innerNode struct { 144 keys []ngram 145 children []node 146} 147 148type leaf struct { 149 bucketIndex int 150 // postingIndexOffset is the index of the posting list of the first ngram 151 // in the bucket. This is enough to determine the index of the posting list 152 // for every other key in the bucket. 153 postingIndexOffset int 154 155 // Optimization: Because we insert ngrams in order, we don't actually have 156 // to fill the buckets. We just have to keep track of the size of the 157 // bucket, so we know when to split, and the key that we have to propagate 158 // up to the parent node when we split. 159 // 160 // If in the future we decide to mutate buckets, we have to replace 161 // bucketSize and splitKey by []ngram. 162 bucketSize int 163 splitKey ngram 164} 165 166func (n *innerNode) sizeBytes() int { 167 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes) 168} 169 170func (n *leaf) sizeBytes() int { 171 return 4 * 8 172} 173 174func (n *leaf) insert(ng ngram, opts btreeOpts) { 175 n.bucketSize++ 176 177 if n.bucketSize == (opts.bucketSize/2)+1 { 178 n.splitKey = ng 179 } 180} 181 182func (n *innerNode) insert(ng ngram, opts btreeOpts) { 183 insertAt := func(i int) { 184 // Invariant: Nodes always have a free slot. 185 // 186 // We split full nodes on the the way down to the leaf. This has the 187 // advantage that inserts are handled in a single pass. 188 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok { 189 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...) 190 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...) 191 192 // A split might shift the target index by 1. 193 if ng >= n.keys[i] { 194 i++ 195 } 196 } 197 n.children[i].insert(ng, opts) 198 } 199 200 for i, k := range n.keys { 201 if ng < k { 202 insertAt(i) 203 return 204 } 205 } 206 insertAt(len(n.children) - 1) 207} 208 209// See btree.find 210func (n *innerNode) find(ng ngram) (int, int) { 211 for i, k := range n.keys { 212 if ng < k { 213 return n.children[i].find(ng) 214 } 215 } 216 return n.children[len(n.children)-1].find(ng) 217} 218 219// See btree.find 220func (n *leaf) find(ng ngram) (int, int) { 221 return int(n.bucketIndex), int(n.postingIndexOffset) 222} 223 224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 225 if n.bucketSize < opts.bucketSize { 226 return 227 } 228 return &leaf{bucketSize: opts.bucketSize / 2}, 229 &leaf{bucketSize: opts.bucketSize / 2}, 230 n.splitKey, 231 true 232} 233 234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) { 235 if len(n.children) < 2*opts.v { 236 return 237 } 238 return &innerNode{ 239 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...), 240 children: append(make([]node, 0, opts.v), n.children[:opts.v]...)}, 241 &innerNode{ 242 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...), 243 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...)}, 244 n.keys[opts.v-1], 245 true 246} 247 248func (n *leaf) visit(f func(n node)) { 249 f(n) 250 return 251} 252 253func (n *innerNode) visit(f func(n node)) { 254 f(n) 255 for _, child := range n.children { 256 child.visit(f) 257 } 258} 259 260func (bt *btree) String() string { 261 s := "" 262 s += fmt.Sprintf("%+v", bt.opts) 263 bt.root.visit(func(n node) { 264 switch nd := n.(type) { 265 case *leaf: 266 return 267 case *innerNode: 268 s += fmt.Sprintf("[") 269 for _, key := range nd.keys { 270 s += fmt.Sprintf("%d,", key) 271 } 272 s = s[:len(s)-1] // remove trailing comma 273 s += fmt.Sprintf("]") 274 275 } 276 }) 277 return s 278} 279 280type btreeIndex struct { 281 bt *btree 282 283 // We need the index to read buckets into memory. 284 file IndexFile 285 286 // buckets 287 ngramSec simpleSection 288 289 postingIndex simpleSection 290} 291 292// SizeBytes returns how much memory this structure uses in the heap. 293func (b btreeIndex) SizeBytes() (sz int) { 294 // btree 295 if b.bt != nil { 296 sz += int(pointerSize) + b.bt.sizeBytes() 297 } 298 // ngramSec 299 sz += 8 300 // postingIndex 301 sz += 8 302 // postingDataSentinelOffset 303 sz += 4 304 return 305} 306 307// Get returns the simple section of the posting list associated with the 308// ngram. The logic is as follows: 309// 1. Search the inner nodes to find the bucket that may contain ng (in MEM) 310// 2. Read the bucket from disk (1 disk access) 311// 3. Binary search the bucket (in MEM) 312// 4. Return the simple section pointing to the posting list (in MEM) 313func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 314 if b.bt == nil { 315 return simpleSection{} 316 } 317 318 // find bucket 319 bucketIndex, postingIndexOffset := b.bt.find(ng) 320 321 // read bucket into memory 322 off, sz := b.getBucket(bucketIndex) 323 bucket, err := b.file.Read(off, sz) 324 if err != nil { 325 return simpleSection{} 326 } 327 328 // find ngram in bucket 329 getNGram := func(i int) ngram { 330 i *= ngramEncoding 331 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding])) 332 } 333 334 bucketSize := len(bucket) / ngramEncoding 335 x := sort.Search(bucketSize, func(i int) bool { 336 return ng <= getNGram(i) 337 }) 338 339 // return associated posting list 340 if x >= bucketSize || getNGram(x) != ng { 341 return simpleSection{} 342 } 343 344 return b.getPostingList(postingIndexOffset + x) 345} 346 347// getPostingList returns the simple section pointing to the posting list of 348// the ngram at ngramIndex. 349// 350// Assumming we don't hit a page boundary, which should be rare given that we 351// only read 8 bytes, we need 1 disk access to read the posting offset. 352func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 353 relativeOffsetBytes := uint32(ngramIndex) * 4 354 355 if relativeOffsetBytes+8 <= b.postingIndex.sz { 356 // read 2 offsets 357 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8) 358 if err != nil { 359 return simpleSection{} 360 } 361 362 start := binary.BigEndian.Uint32(o[0:4]) 363 end := binary.BigEndian.Uint32(o[4:8]) 364 return simpleSection{ 365 off: start, 366 sz: end - start, 367 } 368 } else { 369 // last ngram => read 1 offset and calculate the size of the posting 370 // list from the offset of index section. 371 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4) 372 if err != nil { 373 return simpleSection{} 374 } 375 376 start := binary.BigEndian.Uint32(o[0:4]) 377 return simpleSection{ 378 off: start, 379 // The layout of the posting list compound section on disk is 380 // 381 // start b.postingIndex.off 382 // v v 383 // [[posting lists (simple section)][index (simple section)]] 384 // <----------> 385 // last posting list 386 // 387 sz: b.postingIndex.off - start, 388 } 389 } 390} 391 392func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 393 // All but the rightmost bucket have exactly bucketSize/2 ngrams 394 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) 395 off = b.ngramSec.off + uint32(bucketIndex)*sz 396 397 // Rightmost bucket has size upto the end of the ngramSec. 398 if bucketIndex == b.bt.lastBucketIndex { 399 sz = b.ngramSec.off + b.ngramSec.sz - off 400 } 401 402 return 403} 404 405// DumpMap is a debug method which returns the btree as an in-memory 406// representation. This is how zoekt represents the ngram index in 407// google/zoekt. 408func (b btreeIndex) DumpMap() map[ngram]simpleSection { 409 if b.bt == nil { 410 return nil 411 } 412 413 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 414 415 b.bt.visit(func(no node) { 416 switch n := no.(type) { 417 case *leaf: 418 // read bucket into memory 419 off, sz := b.getBucket(n.bucketIndex) 420 bucket, _ := b.file.Read(off, sz) 421 422 // decode all ngrams in the bucket and fill map 423 for i := 0; i < len(bucket)/ngramEncoding; i++ { 424 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:])) 425 m[gram] = b.getPostingList(int(n.postingIndexOffset) + i) 426 } 427 case *innerNode: 428 return 429 } 430 }) 431 432 return m 433}