btree.go at da6013d595d0fc739c84e1fd2fd3bdbfe440ea2f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / btree.go
at da6013d595d0fc739c84e1fd2fd3bdbfe440ea2f 12 kB View raw
Keegan Carruthers-Smith all: gofumpt -l -w . 2y ago
  1// B+-tree
  2//
  3// The tree we implement here is a B+-tree based on a paper by Ceylan and
  4// Mihalcea [1].
  5//
  6// B+-trees store all values in the leaves. In our case we store trigrams with
  7// the goal to quickly retrieve a pointer to the posting list for a given
  8// trigram. We choose the number of trigrams to store at each leaf based on the
  9// page size, IE we make sure we are able to load a bucket of ngrams with a
 10// single disk access.
 11//
 12// Here is an example of how our B+-tree looks like for a simple input:
 13//
 14// input: "hello world", bucketSize=2, v=2
 15//
 16// legend: ()=inner node, []=leaf
 17//
 18// (level 1)                        (hel, lo_)
 19//
 20// (level 2)          (ell)           (llo)           (o_w, irl, red)
 21//
 22// (level 3)      [_wo]  [ell]    [hel]  [llo]    [lo_] [o_w] [orl] [rld, wor]
 23//
 24// The leafs are stored as part of the index on disk (mmaped) while all inner
 25// nodes are loaded into memory when we load the shard.
 26//
 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
 29// 103-108
 30
 31package zoekt
 32
 33import (
 34	"encoding/binary"
 35	"fmt"
 36	"sort"
 37)
 38
 39// btreeBucketSize should be chosen such that in the final tree the buckets are
 40// as close to the page size as possible, but not above. We insert ngrams in
 41// order(!), which means after a split of a leaf, the left leaf is not affected
 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
 43// might store up to btreeBucketSize ngrams, but the expected size is
 44// btreeBucketSize/2, too.
 45//
 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
 47const btreeBucketSize = (4096 * 2) / ngramEncoding
 48
 49type btree struct {
 50	root node
 51	opts btreeOpts
 52
 53	lastBucketIndex int
 54}
 55
 56type btreeOpts struct {
 57	// How many ngrams can be stored at a leaf node.
 58	bucketSize int
 59	// all inner nodes, except root, have [v, 2v] children. In the literature,
 60	// b-trees are inconsistently categorized either by the number of children
 61	// or by the number of keys. We choose the former.
 62	v int
 63}
 64
 65func newBtree(opts btreeOpts) *btree {
 66	return &btree{
 67		root: &leaf{},
 68		opts: opts,
 69	}
 70}
 71
 72// insert inserts ng into bt.
 73//
 74// Note: when all inserts are done, freeze must be called.
 75func (bt *btree) insert(ng ngram) {
 76	if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
 77		bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
 78	}
 79	bt.root.insert(ng, bt.opts)
 80}
 81
 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
 83// stored at the leaf level. They are effectively pointers to the bucket and
 84// the posting lists for ngrams stored in the bucket. Since ngrams and their
 85// posting lists are stored in order, knowing the index of the posting list of
 86// the first item in the bucket is sufficient.
 87func (bt *btree) find(ng ngram) (int, int) {
 88	if bt.root == nil {
 89		return -1, -1
 90	}
 91	return bt.root.find(ng)
 92}
 93
 94func (bt *btree) visit(f func(n node)) {
 95	bt.root.visit(f)
 96}
 97
 98// freeze must be called once we are done inserting. It backfills "pointers" to
 99// the buckets and posting lists.
100func (bt *btree) freeze() {
101	// Note: Instead of backfilling we could maintain state during insertion,
102	// however the visitor pattern seems more natural and shouldn't be a
103	// performance issue, because, based on the typical number of trigrams
104	// (500k) per shard, the b-trees we construct here only have around 1000
105	// leaf nodes.
106	offset, bucketIndex := 0, 0
107	bt.visit(func(no node) {
108		switch n := no.(type) {
109		case *leaf:
110			n.bucketIndex = bucketIndex
111			bucketIndex++
112
113			n.postingIndexOffset = offset
114			offset += n.bucketSize
115		case *innerNode:
116			return
117		}
118	})
119
120	bt.lastBucketIndex = bucketIndex - 1
121}
122
123func (bt *btree) sizeBytes() int {
124	sz := 2 * 8 // opts
125
126	sz += int(interfaceBytes)
127
128	bt.visit(func(n node) {
129		sz += n.sizeBytes()
130	})
131
132	return sz
133}
134
135type node interface {
136	insert(ng ngram, opts btreeOpts)
137	maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
138	find(ng ngram) (int, int)
139	visit(func(n node))
140	sizeBytes() int
141}
142
143type innerNode struct {
144	keys     []ngram
145	children []node
146}
147
148type leaf struct {
149	bucketIndex int
150	// postingIndexOffset is the index of the posting list of the first ngram
151	// in the bucket. This is enough to determine the index of the posting list
152	// for every other key in the bucket.
153	postingIndexOffset int
154
155	// Optimization: Because we insert ngrams in order, we don't actually have
156	// to fill the buckets. We just have to keep track of the size of the
157	// bucket, so we know when to split, and the key that we have to propagate
158	// up to the parent node when we split.
159	//
160	// If in the future we decide to mutate buckets, we have to replace
161	// bucketSize and splitKey by []ngram.
162	bucketSize int
163	splitKey   ngram
164}
165
166func (n *innerNode) sizeBytes() int {
167	return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
168}
169
170func (n *leaf) sizeBytes() int {
171	return 4 * 8
172}
173
174func (n *leaf) insert(ng ngram, opts btreeOpts) {
175	n.bucketSize++
176
177	if n.bucketSize == (opts.bucketSize/2)+1 {
178		n.splitKey = ng
179	}
180}
181
182func (n *innerNode) insert(ng ngram, opts btreeOpts) {
183	insertAt := func(i int) {
184		// Invariant: Nodes always have a free slot.
185		//
186		// We split full nodes on the the way down to the leaf. This has the
187		// advantage that inserts are handled in a single pass.
188		if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
189			n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
190			n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
191
192			// A split might shift the target index by 1.
193			if ng >= n.keys[i] {
194				i++
195			}
196		}
197		n.children[i].insert(ng, opts)
198	}
199
200	for i, k := range n.keys {
201		if ng < k {
202			insertAt(i)
203			return
204		}
205	}
206	insertAt(len(n.children) - 1)
207}
208
209// See btree.find
210func (n *innerNode) find(ng ngram) (int, int) {
211	for i, k := range n.keys {
212		if ng < k {
213			return n.children[i].find(ng)
214		}
215	}
216	return n.children[len(n.children)-1].find(ng)
217}
218
219// See btree.find
220func (n *leaf) find(ng ngram) (int, int) {
221	return int(n.bucketIndex), int(n.postingIndexOffset)
222}
223
224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
225	if n.bucketSize < opts.bucketSize {
226		return
227	}
228	return &leaf{bucketSize: opts.bucketSize / 2},
229		&leaf{bucketSize: opts.bucketSize / 2},
230		n.splitKey,
231		true
232}
233
234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
235	if len(n.children) < 2*opts.v {
236		return
237	}
238	return &innerNode{
239			keys:     append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
240			children: append(make([]node, 0, opts.v), n.children[:opts.v]...),
241		},
242		&innerNode{
243			keys:     append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
244			children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...),
245		},
246		n.keys[opts.v-1],
247		true
248}
249
250func (n *leaf) visit(f func(n node)) {
251	f(n)
252	return
253}
254
255func (n *innerNode) visit(f func(n node)) {
256	f(n)
257	for _, child := range n.children {
258		child.visit(f)
259	}
260}
261
262func (bt *btree) String() string {
263	s := ""
264	s += fmt.Sprintf("%+v", bt.opts)
265	bt.root.visit(func(n node) {
266		switch nd := n.(type) {
267		case *leaf:
268			return
269		case *innerNode:
270			s += fmt.Sprintf("[")
271			for _, key := range nd.keys {
272				s += fmt.Sprintf("%d,", key)
273			}
274			s = s[:len(s)-1] // remove trailing comma
275			s += fmt.Sprintf("]")
276
277		}
278	})
279	return s
280}
281
282type btreeIndex struct {
283	bt *btree
284
285	// We need the index to read buckets into memory.
286	file IndexFile
287
288	// buckets
289	ngramSec simpleSection
290
291	postingIndex simpleSection
292}
293
294// SizeBytes returns how much memory this structure uses in the heap.
295func (b btreeIndex) SizeBytes() (sz int) {
296	// btree
297	if b.bt != nil {
298		sz += int(pointerSize) + b.bt.sizeBytes()
299	}
300	// ngramSec
301	sz += 8
302	// postingIndex
303	sz += 8
304	// postingDataSentinelOffset
305	sz += 4
306	return
307}
308
309// Get returns the simple section of the posting list associated with the
310// ngram. The logic is as follows:
311// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
312// 2. Read the bucket from disk (1 disk access)
313// 3. Binary search the bucket (in MEM)
314// 4. Return the simple section pointing to the posting list (in MEM)
315func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
316	if b.bt == nil {
317		return simpleSection{}
318	}
319
320	// find bucket
321	bucketIndex, postingIndexOffset := b.bt.find(ng)
322
323	// read bucket into memory
324	off, sz := b.getBucket(bucketIndex)
325	bucket, err := b.file.Read(off, sz)
326	if err != nil {
327		return simpleSection{}
328	}
329
330	// find ngram in bucket
331	getNGram := func(i int) ngram {
332		i *= ngramEncoding
333		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
334	}
335
336	bucketSize := len(bucket) / ngramEncoding
337	x := sort.Search(bucketSize, func(i int) bool {
338		return ng <= getNGram(i)
339	})
340
341	// return associated posting list
342	if x >= bucketSize || getNGram(x) != ng {
343		return simpleSection{}
344	}
345
346	return b.getPostingList(postingIndexOffset + x)
347}
348
349// getPostingList returns the simple section pointing to the posting list of
350// the ngram at ngramIndex.
351//
352// Assumming we don't hit a page boundary, which should be rare given that we
353// only read 8 bytes, we need 1 disk access to read the posting offset.
354func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
355	relativeOffsetBytes := uint32(ngramIndex) * 4
356
357	if relativeOffsetBytes+8 <= b.postingIndex.sz {
358		// read 2 offsets
359		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
360		if err != nil {
361			return simpleSection{}
362		}
363
364		start := binary.BigEndian.Uint32(o[0:4])
365		end := binary.BigEndian.Uint32(o[4:8])
366		return simpleSection{
367			off: start,
368			sz:  end - start,
369		}
370	} else {
371		// last ngram => read 1 offset and calculate the size of the posting
372		// list from the offset of index section.
373		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
374		if err != nil {
375			return simpleSection{}
376		}
377
378		start := binary.BigEndian.Uint32(o[0:4])
379		return simpleSection{
380			off: start,
381			// The layout of the posting list compound section on disk is
382			//
383			//                      start       b.postingIndex.off
384			//                      v           v
385			// [[posting lists (simple section)][index (simple section)]]
386			//                      <---------->
387			//                    last posting list
388			//
389			sz: b.postingIndex.off - start,
390		}
391	}
392}
393
394func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
395	// All but the rightmost bucket have exactly bucketSize/2 ngrams
396	sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
397	off = b.ngramSec.off + uint32(bucketIndex)*sz
398
399	// Rightmost bucket has size upto the end of the ngramSec.
400	if bucketIndex == b.bt.lastBucketIndex {
401		sz = b.ngramSec.off + b.ngramSec.sz - off
402	}
403
404	return
405}
406
407// DumpMap is a debug method which returns the btree as an in-memory
408// representation. This is how zoekt represents the ngram index in
409// google/zoekt.
410func (b btreeIndex) DumpMap() map[ngram]simpleSection {
411	if b.bt == nil {
412		return nil
413	}
414
415	m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
416
417	b.bt.visit(func(no node) {
418		switch n := no.(type) {
419		case *leaf:
420			// read bucket into memory
421			off, sz := b.getBucket(n.bucketIndex)
422			bucket, _ := b.file.Read(off, sz)
423
424			// decode all ngrams in the bucket and fill map
425			for i := 0; i < len(bucket)/ngramEncoding; i++ {
426				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
427				m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
428			}
429		case *innerNode:
430			return
431		}
432	})
433
434	return m
435}
Configure Feed

Configure Feed