fork of https://github.com/sourcegraph/zoekt
1// B+-tree
2//
3// The tree we implement here is a B+-tree based on a paper by Ceylan and
4// Mihalcea [1].
5//
6// B+-trees store all values in the leaves. In our case we store trigrams with
7// the goal to quickly retrieve a pointer to the posting list for a given
8// trigram. We choose the number of trigrams to store at each leaf based on the
9// page size, IE we make sure we are able to load a bucket of ngrams with a
10// single disk access.
11//
12// Here is an example of how our B+-tree looks like for a simple input:
13//
14// input: "hello world", bucketSize=2, v=2
15//
16// legend: ()=inner node, []=leaf
17//
18// (level 1) (hel, lo_)
19//
20// (level 2) (ell) (llo) (o_w, irl, red)
21//
22// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor]
23//
24// The leafs are stored as part of the index on disk (mmaped) while all inner
25// nodes are loaded into memory when we load the shard.
26//
27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
29// 103-108
30
31package index
32
33import (
34 "encoding/binary"
35 "fmt"
36 "sort"
37)
38
39// btreeBucketSize should be chosen such that in the final tree the buckets are
40// as close to the page size as possible, but not above. We insert ngrams in
41// order(!), which means after a split of a leaf, the left leaf is not affected
42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
43// might store up to btreeBucketSize ngrams, but the expected size is
44// btreeBucketSize/2, too.
45//
46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
47const btreeBucketSize = (4096 * 2) / ngramEncoding
48
49const (
50 interfaceBytes uint64 = 16
51 pointerSize uint64 = 8
52)
53
54type btree struct {
55 root node
56 opts btreeOpts
57
58 lastBucketIndex int
59}
60
61type btreeOpts struct {
62 // How many ngrams can be stored at a leaf node.
63 bucketSize int
64 // all inner nodes, except root, have [v, 2v] children. In the literature,
65 // b-trees are inconsistently categorized either by the number of children
66 // or by the number of keys. We choose the former.
67 v int
68}
69
70func newBtree(opts btreeOpts) *btree {
71 return &btree{
72 root: &leaf{},
73 opts: opts,
74 }
75}
76
77// insert inserts ng into bt.
78//
79// Note: when all inserts are done, freeze must be called.
80func (bt *btree) insert(ng ngram) {
81 if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
82 bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
83 }
84 bt.root.insert(ng, bt.opts)
85}
86
87// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
88// stored at the leaf level. They are effectively pointers to the bucket and
89// the posting lists for ngrams stored in the bucket. Since ngrams and their
90// posting lists are stored in order, knowing the index of the posting list of
91// the first item in the bucket is sufficient.
92func (bt *btree) find(ng ngram) (int, int) {
93 if bt.root == nil {
94 return -1, -1
95 }
96 return bt.root.find(ng)
97}
98
99func (bt *btree) visit(f func(n node)) {
100 bt.root.visit(f)
101}
102
103// freeze must be called once we are done inserting. It backfills "pointers" to
104// the buckets and posting lists.
105func (bt *btree) freeze() {
106 // Note: Instead of backfilling we could maintain state during insertion,
107 // however the visitor pattern seems more natural and shouldn't be a
108 // performance issue, because, based on the typical number of trigrams
109 // (500k) per shard, the b-trees we construct here only have around 1000
110 // leaf nodes.
111 offset, bucketIndex := 0, 0
112 bt.visit(func(no node) {
113 switch n := no.(type) {
114 case *leaf:
115 n.bucketIndex = bucketIndex
116 bucketIndex++
117
118 n.postingIndexOffset = offset
119 offset += n.bucketSize
120 case *innerNode:
121 return
122 }
123 })
124
125 bt.lastBucketIndex = bucketIndex - 1
126}
127
128func (bt *btree) sizeBytes() int {
129 sz := 2 * 8 // opts
130
131 sz += int(interfaceBytes)
132
133 bt.visit(func(n node) {
134 sz += n.sizeBytes()
135 })
136
137 return sz
138}
139
140type node interface {
141 insert(ng ngram, opts btreeOpts)
142 maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
143 find(ng ngram) (int, int)
144 visit(func(n node))
145 sizeBytes() int
146}
147
148type innerNode struct {
149 keys []ngram
150 children []node
151}
152
153type leaf struct {
154 bucketIndex int
155 // postingIndexOffset is the index of the posting list of the first ngram
156 // in the bucket. This is enough to determine the index of the posting list
157 // for every other key in the bucket.
158 postingIndexOffset int
159
160 // Optimization: Because we insert ngrams in order, we don't actually have
161 // to fill the buckets. We just have to keep track of the size of the
162 // bucket, so we know when to split, and the key that we have to propagate
163 // up to the parent node when we split.
164 //
165 // If in the future we decide to mutate buckets, we have to replace
166 // bucketSize and splitKey by []ngram.
167 bucketSize int
168 splitKey ngram
169}
170
171func (n *innerNode) sizeBytes() int {
172 return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
173}
174
175func (n *leaf) sizeBytes() int {
176 return 4 * 8
177}
178
179func (n *leaf) insert(ng ngram, opts btreeOpts) {
180 n.bucketSize++
181
182 if n.bucketSize == (opts.bucketSize/2)+1 {
183 n.splitKey = ng
184 }
185}
186
187func (n *innerNode) insert(ng ngram, opts btreeOpts) {
188 insertAt := func(i int) {
189 // Invariant: Nodes always have a free slot.
190 //
191 // We split full nodes on the the way down to the leaf. This has the
192 // advantage that inserts are handled in a single pass.
193 if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
194 n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
195 n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
196
197 // A split might shift the target index by 1.
198 if ng >= n.keys[i] {
199 i++
200 }
201 }
202 n.children[i].insert(ng, opts)
203 }
204
205 for i, k := range n.keys {
206 if ng < k {
207 insertAt(i)
208 return
209 }
210 }
211 insertAt(len(n.children) - 1)
212}
213
214// See btree.find
215func (n *innerNode) find(ng ngram) (int, int) {
216 for i, k := range n.keys {
217 if ng < k {
218 return n.children[i].find(ng)
219 }
220 }
221 return n.children[len(n.children)-1].find(ng)
222}
223
224// See btree.find
225func (n *leaf) find(ng ngram) (int, int) {
226 return int(n.bucketIndex), int(n.postingIndexOffset)
227}
228
229func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
230 if n.bucketSize < opts.bucketSize {
231 return
232 }
233 return &leaf{bucketSize: opts.bucketSize / 2},
234 &leaf{bucketSize: opts.bucketSize / 2},
235 n.splitKey,
236 true
237}
238
239func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
240 if len(n.children) < 2*opts.v {
241 return
242 }
243 return &innerNode{
244 keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
245 children: append(make([]node, 0, opts.v), n.children[:opts.v]...),
246 },
247 &innerNode{
248 keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
249 children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...),
250 },
251 n.keys[opts.v-1],
252 true
253}
254
255func (n *leaf) visit(f func(n node)) {
256 f(n)
257 return
258}
259
260func (n *innerNode) visit(f func(n node)) {
261 f(n)
262 for _, child := range n.children {
263 child.visit(f)
264 }
265}
266
267func (bt *btree) String() string {
268 s := ""
269 s += fmt.Sprintf("%+v", bt.opts)
270 bt.root.visit(func(n node) {
271 switch nd := n.(type) {
272 case *leaf:
273 return
274 case *innerNode:
275 s += fmt.Sprintf("[")
276 for _, key := range nd.keys {
277 s += fmt.Sprintf("%d,", key)
278 }
279 s = s[:len(s)-1] // remove trailing comma
280 s += fmt.Sprintf("]")
281
282 }
283 })
284 return s
285}
286
287type btreeIndex struct {
288 bt *btree
289
290 // We need the index to read buckets into memory.
291 file IndexFile
292
293 // buckets
294 ngramSec simpleSection
295
296 postingIndex simpleSection
297}
298
299// SizeBytes returns how much memory this structure uses in the heap.
300func (b btreeIndex) SizeBytes() (sz int) {
301 // btree
302 if b.bt != nil {
303 sz += int(pointerSize) + b.bt.sizeBytes()
304 }
305 // ngramSec
306 sz += 8
307 // postingIndex
308 sz += 8
309 // postingDataSentinelOffset
310 sz += 4
311 return
312}
313
314// Get returns the simple section of the posting list associated with the
315// ngram. The logic is as follows:
316// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
317// 2. Read the bucket from disk (1 disk access)
318// 3. Binary search the bucket (in MEM)
319// 4. Return the simple section pointing to the posting list (in MEM)
320func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
321 if b.bt == nil {
322 return simpleSection{}
323 }
324
325 // find bucket
326 bucketIndex, postingIndexOffset := b.bt.find(ng)
327
328 // read bucket into memory
329 off, sz := b.getBucket(bucketIndex)
330 bucket, err := b.file.Read(off, sz)
331 if err != nil {
332 return simpleSection{}
333 }
334
335 // find ngram in bucket
336 getNGram := func(i int) ngram {
337 i *= ngramEncoding
338 return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
339 }
340
341 bucketSize := len(bucket) / ngramEncoding
342 x := sort.Search(bucketSize, func(i int) bool {
343 return ng <= getNGram(i)
344 })
345
346 // return associated posting list
347 if x >= bucketSize || getNGram(x) != ng {
348 return simpleSection{}
349 }
350
351 return b.getPostingList(postingIndexOffset + x)
352}
353
354// getPostingList returns the simple section pointing to the posting list of
355// the ngram at ngramIndex.
356//
357// Assumming we don't hit a page boundary, which should be rare given that we
358// only read 8 bytes, we need 1 disk access to read the posting offset.
359func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
360 relativeOffsetBytes := uint32(ngramIndex) * 4
361
362 if relativeOffsetBytes+8 <= b.postingIndex.sz {
363 // read 2 offsets
364 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
365 if err != nil {
366 return simpleSection{}
367 }
368
369 start := binary.BigEndian.Uint32(o[0:4])
370 end := binary.BigEndian.Uint32(o[4:8])
371 return simpleSection{
372 off: start,
373 sz: end - start,
374 }
375 } else {
376 // last ngram => read 1 offset and calculate the size of the posting
377 // list from the offset of index section.
378 o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
379 if err != nil {
380 return simpleSection{}
381 }
382
383 start := binary.BigEndian.Uint32(o[0:4])
384 return simpleSection{
385 off: start,
386 // The layout of the posting list compound section on disk is
387 //
388 // start b.postingIndex.off
389 // v v
390 // [[posting lists (simple section)][index (simple section)]]
391 // <---------->
392 // last posting list
393 //
394 sz: b.postingIndex.off - start,
395 }
396 }
397}
398
399func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
400 // All but the rightmost bucket have exactly bucketSize/2 ngrams
401 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
402 off = b.ngramSec.off + uint32(bucketIndex)*sz
403
404 // Rightmost bucket has size upto the end of the ngramSec.
405 if bucketIndex == b.bt.lastBucketIndex {
406 sz = b.ngramSec.off + b.ngramSec.sz - off
407 }
408
409 return
410}
411
412// DumpMap is a debug method which returns the btree as an in-memory
413// representation. This is how zoekt represents the ngram index in
414// google/zoekt.
415func (b btreeIndex) DumpMap() map[ngram]simpleSection {
416 if b.bt == nil {
417 return nil
418 }
419
420 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
421
422 b.bt.visit(func(no node) {
423 switch n := no.(type) {
424 case *leaf:
425 // read bucket into memory
426 off, sz := b.getBucket(n.bucketIndex)
427 bucket, _ := b.file.Read(off, sz)
428
429 // decode all ngrams in the bucket and fill map
430 for i := range len(bucket) / ngramEncoding {
431 gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
432 m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
433 }
434 case *innerNode:
435 return
436 }
437 })
438
439 return m
440}