btree.go at e068116194eadd9ef4d619fc53371289bf317d58 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / btree.go
at e068116194eadd9ef4d619fc53371289bf317d58 12 kB View raw
Keegan Carruthers-Smith Revert "indexdata: read posting list iff all ng exist (#619)" (#626) 2y ago
  1// B+-tree
  2//
  3// The tree we implement here is a B+-tree based on a paper by Ceylan and
  4// Mihalcea [1].
  5//
  6// B+-trees store all values in the leaves. In our case we store trigrams with
  7// the goal to quickly retrieve a pointer to the posting list for a given
  8// trigram. We choose the number of trigrams to store at each leaf based on the
  9// page size, IE we make sure we are able to load a bucket of ngrams with a
 10// single disk access.
 11//
 12// Here is an example of how our B+-tree looks like for a simple input:
 13//
 14// input: "hello world", bucketSize=2, v=2
 15//
 16// legend: ()=inner node, []=leaf
 17//
 18// (level 1)                        (hel, lo_)
 19//
 20// (level 2)          (ell)           (llo)           (o_w, irl, red)
 21//
 22// (level 3)      [_wo]  [ell]    [hel]  [llo]    [lo_] [o_w] [orl] [rld, wor]
 23//
 24// The leafs are stored as part of the index on disk (mmaped) while all inner
 25// nodes are loaded into memory when we load the shard.
 26//
 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
 29// 103-108
 30
 31package zoekt
 32
 33import (
 34	"encoding/binary"
 35	"fmt"
 36	"sort"
 37)
 38
 39// btreeBucketSize should be chosen such that in the final tree the buckets are
 40// as close to the page size as possible, but not above. We insert ngrams in
 41// order(!), which means after a split of a leaf, the left leaf is not affected
 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
 43// might store up to btreeBucketSize ngrams, but the expected size is
 44// btreeBucketSize/2, too.
 45//
 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
 47const btreeBucketSize = (4096 * 2) / ngramEncoding
 48
 49type btree struct {
 50	root node
 51	opts btreeOpts
 52
 53	lastBucketIndex int
 54}
 55
 56type btreeOpts struct {
 57	// How many ngrams can be stored at a leaf node.
 58	bucketSize int
 59	// all inner nodes, except root, have [v, 2v] children. In the literature,
 60	// b-trees are inconsistently categorized either by the number of children
 61	// or by the number of keys. We choose the former.
 62	v int
 63}
 64
 65func newBtree(opts btreeOpts) *btree {
 66	return &btree{
 67		root: &leaf{},
 68		opts: opts,
 69	}
 70}
 71
 72// insert inserts ng into bt.
 73//
 74// Note: when all inserts are done, freeze must be called.
 75func (bt *btree) insert(ng ngram) {
 76	if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
 77		bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
 78	}
 79	bt.root.insert(ng, bt.opts)
 80}
 81
 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
 83// stored at the leaf level. They are effectively pointers to the bucket and
 84// the posting lists for ngrams stored in the bucket. Since ngrams and their
 85// posting lists are stored in order, knowing the index of the posting list of
 86// the first item in the bucket is sufficient.
 87func (bt *btree) find(ng ngram) (int, int) {
 88	if bt.root == nil {
 89		return -1, -1
 90	}
 91	return bt.root.find(ng)
 92}
 93
 94func (bt *btree) visit(f func(n node)) {
 95	bt.root.visit(f)
 96}
 97
 98// freeze must be called once we are done inserting. It backfills "pointers" to
 99// the buckets and posting lists.
100func (bt *btree) freeze() {
101	// Note: Instead of backfilling we could maintain state during insertion,
102	// however the visitor pattern seems more natural and shouldn't be a
103	// performance issue, because, based on the typical number of trigrams
104	// (500k) per shard, the b-trees we construct here only have around 1000
105	// leaf nodes.
106	offset, bucketIndex := 0, 0
107	bt.visit(func(no node) {
108		switch n := no.(type) {
109		case *leaf:
110			n.bucketIndex = bucketIndex
111			bucketIndex++
112
113			n.postingIndexOffset = offset
114			offset += n.bucketSize
115		case *innerNode:
116			return
117		}
118	})
119
120	bt.lastBucketIndex = bucketIndex - 1
121}
122
123func (bt *btree) sizeBytes() int {
124	sz := 2 * 8 // opts
125
126	sz += int(interfaceBytes)
127
128	bt.visit(func(n node) {
129		sz += n.sizeBytes()
130	})
131
132	return sz
133}
134
135type node interface {
136	insert(ng ngram, opts btreeOpts)
137	maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
138	find(ng ngram) (int, int)
139	visit(func(n node))
140	sizeBytes() int
141}
142
143type innerNode struct {
144	keys     []ngram
145	children []node
146}
147
148type leaf struct {
149	bucketIndex int
150	// postingIndexOffset is the index of the posting list of the first ngram
151	// in the bucket. This is enough to determine the index of the posting list
152	// for every other key in the bucket.
153	postingIndexOffset int
154
155	// Optimization: Because we insert ngrams in order, we don't actually have
156	// to fill the buckets. We just have to keep track of the size of the
157	// bucket, so we know when to split, and the key that we have to propagate
158	// up to the parent node when we split.
159	//
160	// If in the future we decide to mutate buckets, we have to replace
161	// bucketSize and splitKey by []ngram.
162	bucketSize int
163	splitKey   ngram
164}
165
166func (n *innerNode) sizeBytes() int {
167	return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
168}
169
170func (n *leaf) sizeBytes() int {
171	return 4 * 8
172}
173
174func (n *leaf) insert(ng ngram, opts btreeOpts) {
175	n.bucketSize++
176
177	if n.bucketSize == (opts.bucketSize/2)+1 {
178		n.splitKey = ng
179	}
180}
181
182func (n *innerNode) insert(ng ngram, opts btreeOpts) {
183	insertAt := func(i int) {
184		// Invariant: Nodes always have a free slot.
185		//
186		// We split full nodes on the the way down to the leaf. This has the
187		// advantage that inserts are handled in a single pass.
188		if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
189			n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
190			n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
191
192			// A split might shift the target index by 1.
193			if ng >= n.keys[i] {
194				i++
195			}
196		}
197		n.children[i].insert(ng, opts)
198	}
199
200	for i, k := range n.keys {
201		if ng < k {
202			insertAt(i)
203			return
204		}
205	}
206	insertAt(len(n.children) - 1)
207}
208
209// See btree.find
210func (n *innerNode) find(ng ngram) (int, int) {
211	for i, k := range n.keys {
212		if ng < k {
213			return n.children[i].find(ng)
214		}
215	}
216	return n.children[len(n.children)-1].find(ng)
217}
218
219// See btree.find
220func (n *leaf) find(ng ngram) (int, int) {
221	return int(n.bucketIndex), int(n.postingIndexOffset)
222}
223
224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
225	if n.bucketSize < opts.bucketSize {
226		return
227	}
228	return &leaf{bucketSize: opts.bucketSize / 2},
229		&leaf{bucketSize: opts.bucketSize / 2},
230		n.splitKey,
231		true
232}
233
234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
235	if len(n.children) < 2*opts.v {
236		return
237	}
238	return &innerNode{
239			keys:     append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
240			children: append(make([]node, 0, opts.v), n.children[:opts.v]...)},
241		&innerNode{
242			keys:     append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
243			children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...)},
244		n.keys[opts.v-1],
245		true
246}
247
248func (n *leaf) visit(f func(n node)) {
249	f(n)
250	return
251}
252
253func (n *innerNode) visit(f func(n node)) {
254	f(n)
255	for _, child := range n.children {
256		child.visit(f)
257	}
258}
259
260func (bt *btree) String() string {
261	s := ""
262	s += fmt.Sprintf("%+v", bt.opts)
263	bt.root.visit(func(n node) {
264		switch nd := n.(type) {
265		case *leaf:
266			return
267		case *innerNode:
268			s += fmt.Sprintf("[")
269			for _, key := range nd.keys {
270				s += fmt.Sprintf("%d,", key)
271			}
272			s = s[:len(s)-1] // remove trailing comma
273			s += fmt.Sprintf("]")
274
275		}
276	})
277	return s
278}
279
280type btreeIndex struct {
281	bt *btree
282
283	// We need the index to read buckets into memory.
284	file IndexFile
285
286	// buckets
287	ngramSec simpleSection
288
289	postingIndex simpleSection
290}
291
292// SizeBytes returns how much memory this structure uses in the heap.
293func (b btreeIndex) SizeBytes() (sz int) {
294	// btree
295	if b.bt != nil {
296		sz += int(pointerSize) + b.bt.sizeBytes()
297	}
298	// ngramSec
299	sz += 8
300	// postingIndex
301	sz += 8
302	// postingDataSentinelOffset
303	sz += 4
304	return
305}
306
307// Get returns the simple section of the posting list associated with the
308// ngram. The logic is as follows:
309// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
310// 2. Read the bucket from disk (1 disk access)
311// 3. Binary search the bucket (in MEM)
312// 4. Return the simple section pointing to the posting list (in MEM)
313func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
314	if b.bt == nil {
315		return simpleSection{}
316	}
317
318	// find bucket
319	bucketIndex, postingIndexOffset := b.bt.find(ng)
320
321	// read bucket into memory
322	off, sz := b.getBucket(bucketIndex)
323	bucket, err := b.file.Read(off, sz)
324	if err != nil {
325		return simpleSection{}
326	}
327
328	// find ngram in bucket
329	getNGram := func(i int) ngram {
330		i *= ngramEncoding
331		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
332	}
333
334	bucketSize := len(bucket) / ngramEncoding
335	x := sort.Search(bucketSize, func(i int) bool {
336		return ng <= getNGram(i)
337	})
338
339	// return associated posting list
340	if x >= bucketSize || getNGram(x) != ng {
341		return simpleSection{}
342	}
343
344	return b.getPostingList(postingIndexOffset + x)
345}
346
347// getPostingList returns the simple section pointing to the posting list of
348// the ngram at ngramIndex.
349//
350// Assumming we don't hit a page boundary, which should be rare given that we
351// only read 8 bytes, we need 1 disk access to read the posting offset.
352func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
353	relativeOffsetBytes := uint32(ngramIndex) * 4
354
355	if relativeOffsetBytes+8 <= b.postingIndex.sz {
356		// read 2 offsets
357		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
358		if err != nil {
359			return simpleSection{}
360		}
361
362		start := binary.BigEndian.Uint32(o[0:4])
363		end := binary.BigEndian.Uint32(o[4:8])
364		return simpleSection{
365			off: start,
366			sz:  end - start,
367		}
368	} else {
369		// last ngram => read 1 offset and calculate the size of the posting
370		// list from the offset of index section.
371		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
372		if err != nil {
373			return simpleSection{}
374		}
375
376		start := binary.BigEndian.Uint32(o[0:4])
377		return simpleSection{
378			off: start,
379			// The layout of the posting list compound section on disk is
380			//
381			//                      start       b.postingIndex.off
382			//                      v           v
383			// [[posting lists (simple section)][index (simple section)]]
384			//                      <---------->
385			//                    last posting list
386			//
387			sz: b.postingIndex.off - start,
388		}
389	}
390}
391
392func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
393	// All but the rightmost bucket have exactly bucketSize/2 ngrams
394	sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
395	off = b.ngramSec.off + uint32(bucketIndex)*sz
396
397	// Rightmost bucket has size upto the end of the ngramSec.
398	if bucketIndex == b.bt.lastBucketIndex {
399		sz = b.ngramSec.off + b.ngramSec.sz - off
400	}
401
402	return
403}
404
405// DumpMap is a debug method which returns the btree as an in-memory
406// representation. This is how zoekt represents the ngram index in
407// google/zoekt.
408func (b btreeIndex) DumpMap() map[ngram]simpleSection {
409	if b.bt == nil {
410		return nil
411	}
412
413	m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
414
415	b.bt.visit(func(no node) {
416		switch n := no.(type) {
417		case *leaf:
418			// read bucket into memory
419			off, sz := b.getBucket(n.bucketIndex)
420			bucket, _ := b.file.Read(off, sz)
421
422			// decode all ngrams in the bucket and fill map
423			for i := 0; i < len(bucket)/ngramEncoding; i++ {
424				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
425				m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
426			}
427		case *innerNode:
428			return
429		}
430	})
431
432	return m
433}
Configure Feed

Configure Feed