index/btree.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / btree.go
at main 12 kB View raw
Keegan Carruthers-Smith all: run modernize across codebase (#919) 1y ago
  1// B+-tree
  2//
  3// The tree we implement here is a B+-tree based on a paper by Ceylan and
  4// Mihalcea [1].
  5//
  6// B+-trees store all values in the leaves. In our case we store trigrams with
  7// the goal to quickly retrieve a pointer to the posting list for a given
  8// trigram. We choose the number of trigrams to store at each leaf based on the
  9// page size, IE we make sure we are able to load a bucket of ngrams with a
 10// single disk access.
 11//
 12// Here is an example of how our B+-tree looks like for a simple input:
 13//
 14// input: "hello world", bucketSize=2, v=2
 15//
 16// legend: ()=inner node, []=leaf
 17//
 18// (level 1)                        (hel, lo_)
 19//
 20// (level 2)          (ell)           (llo)           (o_w, irl, red)
 21//
 22// (level 3)      [_wo]  [ell]    [hel]  [llo]    [lo_] [o_w] [orl] [rld, wor]
 23//
 24// The leafs are stored as part of the index on disk (mmaped) while all inner
 25// nodes are loaded into memory when we load the shard.
 26//
 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
 29// 103-108
 30
 31package index
 32
 33import (
 34	"encoding/binary"
 35	"fmt"
 36	"sort"
 37)
 38
 39// btreeBucketSize should be chosen such that in the final tree the buckets are
 40// as close to the page size as possible, but not above. We insert ngrams in
 41// order(!), which means after a split of a leaf, the left leaf is not affected
 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
 43// might store up to btreeBucketSize ngrams, but the expected size is
 44// btreeBucketSize/2, too.
 45//
 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
 47const btreeBucketSize = (4096 * 2) / ngramEncoding
 48
 49const (
 50	interfaceBytes uint64 = 16
 51	pointerSize    uint64 = 8
 52)
 53
 54type btree struct {
 55	root node
 56	opts btreeOpts
 57
 58	lastBucketIndex int
 59}
 60
 61type btreeOpts struct {
 62	// How many ngrams can be stored at a leaf node.
 63	bucketSize int
 64	// all inner nodes, except root, have [v, 2v] children. In the literature,
 65	// b-trees are inconsistently categorized either by the number of children
 66	// or by the number of keys. We choose the former.
 67	v int
 68}
 69
 70func newBtree(opts btreeOpts) *btree {
 71	return &btree{
 72		root: &leaf{},
 73		opts: opts,
 74	}
 75}
 76
 77// insert inserts ng into bt.
 78//
 79// Note: when all inserts are done, freeze must be called.
 80func (bt *btree) insert(ng ngram) {
 81	if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
 82		bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
 83	}
 84	bt.root.insert(ng, bt.opts)
 85}
 86
 87// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
 88// stored at the leaf level. They are effectively pointers to the bucket and
 89// the posting lists for ngrams stored in the bucket. Since ngrams and their
 90// posting lists are stored in order, knowing the index of the posting list of
 91// the first item in the bucket is sufficient.
 92func (bt *btree) find(ng ngram) (int, int) {
 93	if bt.root == nil {
 94		return -1, -1
 95	}
 96	return bt.root.find(ng)
 97}
 98
 99func (bt *btree) visit(f func(n node)) {
100	bt.root.visit(f)
101}
102
103// freeze must be called once we are done inserting. It backfills "pointers" to
104// the buckets and posting lists.
105func (bt *btree) freeze() {
106	// Note: Instead of backfilling we could maintain state during insertion,
107	// however the visitor pattern seems more natural and shouldn't be a
108	// performance issue, because, based on the typical number of trigrams
109	// (500k) per shard, the b-trees we construct here only have around 1000
110	// leaf nodes.
111	offset, bucketIndex := 0, 0
112	bt.visit(func(no node) {
113		switch n := no.(type) {
114		case *leaf:
115			n.bucketIndex = bucketIndex
116			bucketIndex++
117
118			n.postingIndexOffset = offset
119			offset += n.bucketSize
120		case *innerNode:
121			return
122		}
123	})
124
125	bt.lastBucketIndex = bucketIndex - 1
126}
127
128func (bt *btree) sizeBytes() int {
129	sz := 2 * 8 // opts
130
131	sz += int(interfaceBytes)
132
133	bt.visit(func(n node) {
134		sz += n.sizeBytes()
135	})
136
137	return sz
138}
139
140type node interface {
141	insert(ng ngram, opts btreeOpts)
142	maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
143	find(ng ngram) (int, int)
144	visit(func(n node))
145	sizeBytes() int
146}
147
148type innerNode struct {
149	keys     []ngram
150	children []node
151}
152
153type leaf struct {
154	bucketIndex int
155	// postingIndexOffset is the index of the posting list of the first ngram
156	// in the bucket. This is enough to determine the index of the posting list
157	// for every other key in the bucket.
158	postingIndexOffset int
159
160	// Optimization: Because we insert ngrams in order, we don't actually have
161	// to fill the buckets. We just have to keep track of the size of the
162	// bucket, so we know when to split, and the key that we have to propagate
163	// up to the parent node when we split.
164	//
165	// If in the future we decide to mutate buckets, we have to replace
166	// bucketSize and splitKey by []ngram.
167	bucketSize int
168	splitKey   ngram
169}
170
171func (n *innerNode) sizeBytes() int {
172	return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
173}
174
175func (n *leaf) sizeBytes() int {
176	return 4 * 8
177}
178
179func (n *leaf) insert(ng ngram, opts btreeOpts) {
180	n.bucketSize++
181
182	if n.bucketSize == (opts.bucketSize/2)+1 {
183		n.splitKey = ng
184	}
185}
186
187func (n *innerNode) insert(ng ngram, opts btreeOpts) {
188	insertAt := func(i int) {
189		// Invariant: Nodes always have a free slot.
190		//
191		// We split full nodes on the the way down to the leaf. This has the
192		// advantage that inserts are handled in a single pass.
193		if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
194			n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
195			n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
196
197			// A split might shift the target index by 1.
198			if ng >= n.keys[i] {
199				i++
200			}
201		}
202		n.children[i].insert(ng, opts)
203	}
204
205	for i, k := range n.keys {
206		if ng < k {
207			insertAt(i)
208			return
209		}
210	}
211	insertAt(len(n.children) - 1)
212}
213
214// See btree.find
215func (n *innerNode) find(ng ngram) (int, int) {
216	for i, k := range n.keys {
217		if ng < k {
218			return n.children[i].find(ng)
219		}
220	}
221	return n.children[len(n.children)-1].find(ng)
222}
223
224// See btree.find
225func (n *leaf) find(ng ngram) (int, int) {
226	return int(n.bucketIndex), int(n.postingIndexOffset)
227}
228
229func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
230	if n.bucketSize < opts.bucketSize {
231		return
232	}
233	return &leaf{bucketSize: opts.bucketSize / 2},
234		&leaf{bucketSize: opts.bucketSize / 2},
235		n.splitKey,
236		true
237}
238
239func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
240	if len(n.children) < 2*opts.v {
241		return
242	}
243	return &innerNode{
244			keys:     append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
245			children: append(make([]node, 0, opts.v), n.children[:opts.v]...),
246		},
247		&innerNode{
248			keys:     append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
249			children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...),
250		},
251		n.keys[opts.v-1],
252		true
253}
254
255func (n *leaf) visit(f func(n node)) {
256	f(n)
257	return
258}
259
260func (n *innerNode) visit(f func(n node)) {
261	f(n)
262	for _, child := range n.children {
263		child.visit(f)
264	}
265}
266
267func (bt *btree) String() string {
268	s := ""
269	s += fmt.Sprintf("%+v", bt.opts)
270	bt.root.visit(func(n node) {
271		switch nd := n.(type) {
272		case *leaf:
273			return
274		case *innerNode:
275			s += fmt.Sprintf("[")
276			for _, key := range nd.keys {
277				s += fmt.Sprintf("%d,", key)
278			}
279			s = s[:len(s)-1] // remove trailing comma
280			s += fmt.Sprintf("]")
281
282		}
283	})
284	return s
285}
286
287type btreeIndex struct {
288	bt *btree
289
290	// We need the index to read buckets into memory.
291	file IndexFile
292
293	// buckets
294	ngramSec simpleSection
295
296	postingIndex simpleSection
297}
298
299// SizeBytes returns how much memory this structure uses in the heap.
300func (b btreeIndex) SizeBytes() (sz int) {
301	// btree
302	if b.bt != nil {
303		sz += int(pointerSize) + b.bt.sizeBytes()
304	}
305	// ngramSec
306	sz += 8
307	// postingIndex
308	sz += 8
309	// postingDataSentinelOffset
310	sz += 4
311	return
312}
313
314// Get returns the simple section of the posting list associated with the
315// ngram. The logic is as follows:
316// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
317// 2. Read the bucket from disk (1 disk access)
318// 3. Binary search the bucket (in MEM)
319// 4. Return the simple section pointing to the posting list (in MEM)
320func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
321	if b.bt == nil {
322		return simpleSection{}
323	}
324
325	// find bucket
326	bucketIndex, postingIndexOffset := b.bt.find(ng)
327
328	// read bucket into memory
329	off, sz := b.getBucket(bucketIndex)
330	bucket, err := b.file.Read(off, sz)
331	if err != nil {
332		return simpleSection{}
333	}
334
335	// find ngram in bucket
336	getNGram := func(i int) ngram {
337		i *= ngramEncoding
338		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
339	}
340
341	bucketSize := len(bucket) / ngramEncoding
342	x := sort.Search(bucketSize, func(i int) bool {
343		return ng <= getNGram(i)
344	})
345
346	// return associated posting list
347	if x >= bucketSize || getNGram(x) != ng {
348		return simpleSection{}
349	}
350
351	return b.getPostingList(postingIndexOffset + x)
352}
353
354// getPostingList returns the simple section pointing to the posting list of
355// the ngram at ngramIndex.
356//
357// Assumming we don't hit a page boundary, which should be rare given that we
358// only read 8 bytes, we need 1 disk access to read the posting offset.
359func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
360	relativeOffsetBytes := uint32(ngramIndex) * 4
361
362	if relativeOffsetBytes+8 <= b.postingIndex.sz {
363		// read 2 offsets
364		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
365		if err != nil {
366			return simpleSection{}
367		}
368
369		start := binary.BigEndian.Uint32(o[0:4])
370		end := binary.BigEndian.Uint32(o[4:8])
371		return simpleSection{
372			off: start,
373			sz:  end - start,
374		}
375	} else {
376		// last ngram => read 1 offset and calculate the size of the posting
377		// list from the offset of index section.
378		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
379		if err != nil {
380			return simpleSection{}
381		}
382
383		start := binary.BigEndian.Uint32(o[0:4])
384		return simpleSection{
385			off: start,
386			// The layout of the posting list compound section on disk is
387			//
388			//                      start       b.postingIndex.off
389			//                      v           v
390			// [[posting lists (simple section)][index (simple section)]]
391			//                      <---------->
392			//                    last posting list
393			//
394			sz: b.postingIndex.off - start,
395		}
396	}
397}
398
399func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
400	// All but the rightmost bucket have exactly bucketSize/2 ngrams
401	sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
402	off = b.ngramSec.off + uint32(bucketIndex)*sz
403
404	// Rightmost bucket has size upto the end of the ngramSec.
405	if bucketIndex == b.bt.lastBucketIndex {
406		sz = b.ngramSec.off + b.ngramSec.sz - off
407	}
408
409	return
410}
411
412// DumpMap is a debug method which returns the btree as an in-memory
413// representation. This is how zoekt represents the ngram index in
414// google/zoekt.
415func (b btreeIndex) DumpMap() map[ngram]simpleSection {
416	if b.bt == nil {
417		return nil
418	}
419
420	m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
421
422	b.bt.visit(func(no node) {
423		switch n := no.(type) {
424		case *leaf:
425			// read bucket into memory
426			off, sz := b.getBucket(n.bucketIndex)
427			bucket, _ := b.file.Read(off, sz)
428
429			// decode all ngrams in the bucket and fill map
430			for i := range len(bucket) / ngramEncoding {
431				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
432				m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
433			}
434		case *innerNode:
435			return
436		}
437	})
438
439	return m
440}
Configure Feed

Configure Feed