fork of https://github.com/sourcegraph/zoekt
1// B+-tree
2//
3// The tree we implement here is a B+-tree based on a paper by Ceylan and
4// Mihalcea [1].
5//
6// B+-trees store all values in the leaves. In our case we store trigrams with
7// the goal to quickly retrieve a pointer to the posting list for a given
8// trigram. We choose the number of trigrams to store at each leaf based on the
9// page size, IE we make sure we are able to load a bucket of ngrams with a
10// single disk access.
11//
12// Here is an example of how our B+-tree looks like for a simple input:
13//
14// input: "hello world", bucketSize=2, v=2
15//
16// legend: ()=inner node, []=leaf
17//
18// (level 1) (hel, lo_)
19//
20// (level 2) (ell) (llo) (o_w, irl, red)
21//
22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor]
23//
24// The leafs are stored as part of the index on disk (mmaped) while all inner
25// nodes are loaded into memory when we load the shard.
26//
27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
29// 103-108
30
31package zoekt
32
33import (
34 "encoding/binary"
35 "fmt"
36 "sort"
37)
38
39// btreeBucketSize should be chosen such that in the final tree the buckets are
40// as close to the page size as possible, but not above. We insert ngrams in
41// order(!), which means after a split of a leaf, the left leaf is not affected
42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
43// might store up to btreeBucketSize ngrams, but the expected size is
44// btreeBucketSize/2, too.
45//
46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
47const btreeBucketSize = (4096 * 2) / ngramEncoding
48
49type btree struct {
50 root node
51 opts btreeOpts
52
53 lastBucketIndex int
54}
55
56type btreeOpts struct {
57 // How many ngrams can be stored at a leaf node.
58 bucketSize int
59 // all inner nodes, except root, have [v, 2v] children. In the literature,
60 // b-trees are inconsistently categorized either by the number of children
61 // or by the number of keys. We choose the former.
62 v int
63}
64
65func newBtree(opts btreeOpts) *btree {
66 return &btree{
67 root: &leaf{},
68 opts: opts,
69 }
70}
71
72// insert inserts ng into bt.
73//
74// Note: when all inserts are done, freeze must be called.
75func (bt *btree) insert(ng ngram) {
76 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
77 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
78 }
79 bt.root.insert(ng, bt.opts)
80}
81
82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
83// stored at the leaf level. They are effectively pointers to the bucket and
84// the posting lists for ngrams stored in the bucket. Since ngrams and their
85// posting lists are stored in order, knowing the index of the posting list of
86// the first item in the bucket is sufficient.
87func (bt *btree) find(ng ngram) (int, int) {
88 if bt.root == nil {
89 return -1, -1
90 }
91 return bt.root.find(ng)
92}
93
94func (bt *btree) visit(f func(n node)) {
95 bt.root.visit(f)
96}
97
98// freeze must be called once we are done inserting. It backfills "pointers" to
99// the buckets and posting lists.
100func (bt *btree) freeze() {
101 // Note: Instead of backfilling we could maintain state during insertion,
102 // however the visitor pattern seems more natural and shouldn't be a
103 // performance issue, because, based on the typical number of trigrams
104 // (500k) per shard, the b-trees we construct here only have around 1000
105 // leaf nodes.
106 offset, bucketIndex := 0, 0
107 bt.visit(func(no node) {
108 switch n := no.(type) {
109 case *leaf:
110 n.bucketIndex = bucketIndex
111 bucketIndex++
112
113 n.postingIndexOffset = offset
114 offset += n.bucketSize
115 case *innerNode:
116 return
117 }
118 })
119
120 bt.lastBucketIndex = bucketIndex - 1
121}
122
123func (bt *btree) sizeBytes() int {
124 sz := 2 * 8 // opts
125
126 sz += int(interfaceBytes)
127
128 bt.visit(func(n node) {
129 sz += n.sizeBytes()
130 })
131
132 return sz
133}
134
135type node interface {
136 insert(ng ngram, opts btreeOpts)
137 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
138 find(ng ngram) (int, int)
139 visit(func(n node))
140 sizeBytes() int
141}
142
143type innerNode struct {
144 keys []ngram
145 children []node
146}
147
148type leaf struct {
149 bucketIndex int
150 // postingIndexOffset is the index of the posting list of the first ngram
151 // in the bucket. This is enough to determine the index of the posting list
152 // for every other key in the bucket.
153 postingIndexOffset int
154
155 // Optimization: Because we insert ngrams in order, we don't actually have
156 // to fill the buckets. We just have to keep track of the size of the
157 // bucket, so we know when to split, and the key that we have to propagate
158 // up to the parent node when we split.
159 //
160 // If in the future we decide to mutate buckets, we have to replace
161 // bucketSize and splitKey by []ngram.
162 bucketSize int
163 splitKey ngram
164}
165
166func (n *innerNode) sizeBytes() int {
167 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
168}
169
170func (n *leaf) sizeBytes() int {
171 return 4 * 8
172}
173
174func (n *leaf) insert(ng ngram, opts btreeOpts) {
175 n.bucketSize++
176
177 if n.bucketSize == (opts.bucketSize/2)+1 {
178 n.splitKey = ng
179 }
180}
181
182func (n *innerNode) insert(ng ngram, opts btreeOpts) {
183 insertAt := func(i int) {
184 // Invariant: Nodes always have a free slot.
185 //
186 // We split full nodes on the the way down to the leaf. This has the
187 // advantage that inserts are handled in a single pass.
188 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
189 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
190 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
191
192 // A split might shift the target index by 1.
193 if ng >= n.keys[i] {
194 i++
195 }
196 }
197 n.children[i].insert(ng, opts)
198 }
199
200 for i, k := range n.keys {
201 if ng < k {
202 insertAt(i)
203 return
204 }
205 }
206 insertAt(len(n.children) - 1)
207}
208
209// See btree.find
210func (n *innerNode) find(ng ngram) (int, int) {
211 for i, k := range n.keys {
212 if ng < k {
213 return n.children[i].find(ng)
214 }
215 }
216 return n.children[len(n.children)-1].find(ng)
217}
218
219// See btree.find
220func (n *leaf) find(ng ngram) (int, int) {
221 return int(n.bucketIndex), int(n.postingIndexOffset)
222}
223
224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
225 if n.bucketSize < opts.bucketSize {
226 return
227 }
228 return &leaf{bucketSize: opts.bucketSize / 2},
229 &leaf{bucketSize: opts.bucketSize / 2},
230 n.splitKey,
231 true
232}
233
234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
235 if len(n.children) < 2*opts.v {
236 return
237 }
238 return &innerNode{
239 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
240 children: append(make([]node, 0, opts.v), n.children[:opts.v]...),
241 },
242 &innerNode{
243 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
244 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...),
245 },
246 n.keys[opts.v-1],
247 true
248}
249
250func (n *leaf) visit(f func(n node)) {
251 f(n)
252 return
253}
254
255func (n *innerNode) visit(f func(n node)) {
256 f(n)
257 for _, child := range n.children {
258 child.visit(f)
259 }
260}
261
262func (bt *btree) String() string {
263 s := ""
264 s += fmt.Sprintf("%+v", bt.opts)
265 bt.root.visit(func(n node) {
266 switch nd := n.(type) {
267 case *leaf:
268 return
269 case *innerNode:
270 s += fmt.Sprintf("[")
271 for _, key := range nd.keys {
272 s += fmt.Sprintf("%d,", key)
273 }
274 s = s[:len(s)-1] // remove trailing comma
275 s += fmt.Sprintf("]")
276
277 }
278 })
279 return s
280}
281
282type btreeIndex struct {
283 bt *btree
284
285 // We need the index to read buckets into memory.
286 file IndexFile
287
288 // buckets
289 ngramSec simpleSection
290
291 postingIndex simpleSection
292}
293
294// SizeBytes returns how much memory this structure uses in the heap.
295func (b btreeIndex) SizeBytes() (sz int) {
296 // btree
297 if b.bt != nil {
298 sz += int(pointerSize) + b.bt.sizeBytes()
299 }
300 // ngramSec
301 sz += 8
302 // postingIndex
303 sz += 8
304 // postingDataSentinelOffset
305 sz += 4
306 return
307}
308
309// Get returns the simple section of the posting list associated with the
310// ngram. The logic is as follows:
311// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
312// 2. Read the bucket from disk (1 disk access)
313// 3. Binary search the bucket (in MEM)
314// 4. Return the simple section pointing to the posting list (in MEM)
315func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
316 if b.bt == nil {
317 return simpleSection{}
318 }
319
320 // find bucket
321 bucketIndex, postingIndexOffset := b.bt.find(ng)
322
323 // read bucket into memory
324 off, sz := b.getBucket(bucketIndex)
325 bucket, err := b.file.Read(off, sz)
326 if err != nil {
327 return simpleSection{}
328 }
329
330 // find ngram in bucket
331 getNGram := func(i int) ngram {
332 i *= ngramEncoding
333 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
334 }
335
336 bucketSize := len(bucket) / ngramEncoding
337 x := sort.Search(bucketSize, func(i int) bool {
338 return ng <= getNGram(i)
339 })
340
341 // return associated posting list
342 if x >= bucketSize || getNGram(x) != ng {
343 return simpleSection{}
344 }
345
346 return b.getPostingList(postingIndexOffset + x)
347}
348
349// getPostingList returns the simple section pointing to the posting list of
350// the ngram at ngramIndex.
351//
352// Assumming we don't hit a page boundary, which should be rare given that we
353// only read 8 bytes, we need 1 disk access to read the posting offset.
354func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
355 relativeOffsetBytes := uint32(ngramIndex) * 4
356
357 if relativeOffsetBytes+8 <= b.postingIndex.sz {
358 // read 2 offsets
359 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
360 if err != nil {
361 return simpleSection{}
362 }
363
364 start := binary.BigEndian.Uint32(o[0:4])
365 end := binary.BigEndian.Uint32(o[4:8])
366 return simpleSection{
367 off: start,
368 sz: end - start,
369 }
370 } else {
371 // last ngram => read 1 offset and calculate the size of the posting
372 // list from the offset of index section.
373 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
374 if err != nil {
375 return simpleSection{}
376 }
377
378 start := binary.BigEndian.Uint32(o[0:4])
379 return simpleSection{
380 off: start,
381 // The layout of the posting list compound section on disk is
382 //
383 // start b.postingIndex.off
384 // v v
385 // [[posting lists (simple section)][index (simple section)]]
386 // <---------->
387 // last posting list
388 //
389 sz: b.postingIndex.off - start,
390 }
391 }
392}
393
394func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
395 // All but the rightmost bucket have exactly bucketSize/2 ngrams
396 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
397 off = b.ngramSec.off + uint32(bucketIndex)*sz
398
399 // Rightmost bucket has size upto the end of the ngramSec.
400 if bucketIndex == b.bt.lastBucketIndex {
401 sz = b.ngramSec.off + b.ngramSec.sz - off
402 }
403
404 return
405}
406
407// DumpMap is a debug method which returns the btree as an in-memory
408// representation. This is how zoekt represents the ngram index in
409// google/zoekt.
410func (b btreeIndex) DumpMap() map[ngram]simpleSection {
411 if b.bt == nil {
412 return nil
413 }
414
415 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
416
417 b.bt.visit(func(no node) {
418 switch n := no.(type) {
419 case *leaf:
420 // read bucket into memory
421 off, sz := b.getBucket(n.bucketIndex)
422 bucket, _ := b.file.Read(off, sz)
423
424 // decode all ngrams in the bucket and fill map
425 for i := 0; i < len(bucket)/ngramEncoding; i++ {
426 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
427 m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
428 }
429 case *innerNode:
430 return
431 }
432 })
433
434 return m
435}