btree.go at b7e5070bfb563ea836e532dc524a733dd5d684c8 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / btree.go
at b7e5070bfb563ea836e532dc524a733dd5d684c8 13 kB View raw
Stefan Hengl indexdata: read posting list iff all ng exist (#619) 2y ago
  1// B+-tree
  2//
  3// The tree we implement here is a B+-tree based on a paper by Ceylan and
  4// Mihalcea [1].
  5//
  6// B+-trees store all values in the leaves. In our case we store trigrams with
  7// the goal to quickly retrieve a pointer to the posting list for a given
  8// trigram. We choose the number of trigrams to store at each leaf based on the
  9// page size, IE we make sure we are able to load a bucket of ngrams with a
 10// single disk access.
 11//
 12// Here is an example of how our B+-tree looks like for a simple input:
 13//
 14// input: "hello world", bucketSize=2, v=2
 15//
 16// legend: ()=inner node, []=leaf
 17//
 18// (level 1)                        (hel, lo_)
 19//
 20// (level 2)          (ell)           (llo)           (o_w, irl, red)
 21//
 22// (level 3)      [_wo]  [ell]    [hel]  [llo]    [lo_] [o_w] [orl] [rld, wor]
 23//
 24// The leafs are stored as part of the index on disk (mmaped) while all inner
 25// nodes are loaded into memory when we load the shard.
 26//
 27// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
 28// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
 29// 103-108
 30
 31package zoekt
 32
 33import (
 34	"encoding/binary"
 35	"fmt"
 36	"sort"
 37)
 38
 39// btreeBucketSize should be chosen such that in the final tree the buckets are
 40// as close to the page size as possible, but not above. We insert ngrams in
 41// order(!), which means after a split of a leaf, the left leaf is not affected
 42// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
 43// might store up to btreeBucketSize ngrams, but the expected size is
 44// btreeBucketSize/2, too.
 45//
 46// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
 47const btreeBucketSize = (4096 * 2) / ngramEncoding
 48
 49type btree struct {
 50	root node
 51	opts btreeOpts
 52
 53	lastBucketIndex int
 54}
 55
 56type btreeOpts struct {
 57	// How many ngrams can be stored at a leaf node.
 58	bucketSize int
 59	// all inner nodes, except root, have [v, 2v] children. In the literature,
 60	// b-trees are inconsistently categorized either by the number of children
 61	// or by the number of keys. We choose the former.
 62	v int
 63}
 64
 65func newBtree(opts btreeOpts) *btree {
 66	return &btree{
 67		root: &leaf{},
 68		opts: opts,
 69	}
 70}
 71
 72// insert inserts ng into bt.
 73//
 74// Note: when all inserts are done, freeze must be called.
 75func (bt *btree) insert(ng ngram) {
 76	if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
 77		bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
 78	}
 79	bt.root.insert(ng, bt.opts)
 80}
 81
 82// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
 83// stored at the leaf level. They are effectively pointers to the bucket and
 84// the posting lists for ngrams stored in the bucket. Since ngrams and their
 85// posting lists are stored in order, knowing the index of the posting list of
 86// the first item in the bucket is sufficient.
 87func (bt *btree) find(ng ngram) (int, int) {
 88	if bt.root == nil {
 89		return -1, -1
 90	}
 91	return bt.root.find(ng)
 92}
 93
 94func (bt *btree) visit(f func(n node)) {
 95	bt.root.visit(f)
 96}
 97
 98// freeze must be called once we are done inserting. It backfills "pointers" to
 99// the buckets and posting lists.
100func (bt *btree) freeze() {
101	// Note: Instead of backfilling we could maintain state during insertion,
102	// however the visitor pattern seems more natural and shouldn't be a
103	// performance issue, because, based on the typical number of trigrams
104	// (500k) per shard, the b-trees we construct here only have around 1000
105	// leaf nodes.
106	offset, bucketIndex := 0, 0
107	bt.visit(func(no node) {
108		switch n := no.(type) {
109		case *leaf:
110			n.bucketIndex = bucketIndex
111			bucketIndex++
112
113			n.postingIndexOffset = offset
114			offset += n.bucketSize
115		case *innerNode:
116			return
117		}
118	})
119
120	bt.lastBucketIndex = bucketIndex - 1
121}
122
123func (bt *btree) sizeBytes() int {
124	sz := 2 * 8 // opts
125
126	sz += int(interfaceBytes)
127
128	bt.visit(func(n node) {
129		sz += n.sizeBytes()
130	})
131
132	return sz
133}
134
135type node interface {
136	insert(ng ngram, opts btreeOpts)
137	maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
138	find(ng ngram) (int, int)
139	visit(func(n node))
140	sizeBytes() int
141}
142
143type innerNode struct {
144	keys     []ngram
145	children []node
146}
147
148type leaf struct {
149	bucketIndex int
150	// postingIndexOffset is the index of the posting list of the first ngram
151	// in the bucket. This is enough to determine the index of the posting list
152	// for every other key in the bucket.
153	postingIndexOffset int
154
155	// Optimization: Because we insert ngrams in order, we don't actually have
156	// to fill the buckets. We just have to keep track of the size of the
157	// bucket, so we know when to split, and the key that we have to propagate
158	// up to the parent node when we split.
159	//
160	// If in the future we decide to mutate buckets, we have to replace
161	// bucketSize and splitKey by []ngram.
162	bucketSize int
163	splitKey   ngram
164}
165
166func (n *innerNode) sizeBytes() int {
167	return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
168}
169
170func (n *leaf) sizeBytes() int {
171	return 4 * 8
172}
173
174func (n *leaf) insert(ng ngram, opts btreeOpts) {
175	n.bucketSize++
176
177	if n.bucketSize == (opts.bucketSize/2)+1 {
178		n.splitKey = ng
179	}
180}
181
182func (n *innerNode) insert(ng ngram, opts btreeOpts) {
183	insertAt := func(i int) {
184		// Invariant: Nodes always have a free slot.
185		//
186		// We split full nodes on the the way down to the leaf. This has the
187		// advantage that inserts are handled in a single pass.
188		if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
189			n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
190			n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
191
192			// A split might shift the target index by 1.
193			if ng >= n.keys[i] {
194				i++
195			}
196		}
197		n.children[i].insert(ng, opts)
198	}
199
200	for i, k := range n.keys {
201		if ng < k {
202			insertAt(i)
203			return
204		}
205	}
206	insertAt(len(n.children) - 1)
207}
208
209// See btree.find
210func (n *innerNode) find(ng ngram) (int, int) {
211	for i, k := range n.keys {
212		if ng < k {
213			return n.children[i].find(ng)
214		}
215	}
216	return n.children[len(n.children)-1].find(ng)
217}
218
219// See btree.find
220func (n *leaf) find(ng ngram) (int, int) {
221	return int(n.bucketIndex), int(n.postingIndexOffset)
222}
223
224func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
225	if n.bucketSize < opts.bucketSize {
226		return
227	}
228	return &leaf{bucketSize: opts.bucketSize / 2},
229		&leaf{bucketSize: opts.bucketSize / 2},
230		n.splitKey,
231		true
232}
233
234func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
235	if len(n.children) < 2*opts.v {
236		return
237	}
238	return &innerNode{
239			keys:     append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
240			children: append(make([]node, 0, opts.v), n.children[:opts.v]...)},
241		&innerNode{
242			keys:     append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
243			children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...)},
244		n.keys[opts.v-1],
245		true
246}
247
248func (n *leaf) visit(f func(n node)) {
249	f(n)
250	return
251}
252
253func (n *innerNode) visit(f func(n node)) {
254	f(n)
255	for _, child := range n.children {
256		child.visit(f)
257	}
258}
259
260func (bt *btree) String() string {
261	s := ""
262	s += fmt.Sprintf("%+v", bt.opts)
263	bt.root.visit(func(n node) {
264		switch nd := n.(type) {
265		case *leaf:
266			return
267		case *innerNode:
268			s += fmt.Sprintf("[")
269			for _, key := range nd.keys {
270				s += fmt.Sprintf("%d,", key)
271			}
272			s = s[:len(s)-1] // remove trailing comma
273			s += fmt.Sprintf("]")
274
275		}
276	})
277	return s
278}
279
280type btreeIndex struct {
281	bt *btree
282
283	// We need the index to read buckets into memory.
284	file IndexFile
285
286	// buckets
287	ngramSec simpleSection
288
289	postingIndex simpleSection
290}
291
292// SizeBytes returns how much memory this structure uses in the heap.
293func (b btreeIndex) SizeBytes() (sz int) {
294	// btree
295	if b.bt != nil {
296		sz += int(pointerSize) + b.bt.sizeBytes()
297	}
298	// ngramSec
299	sz += 8
300	// postingIndex
301	sz += 8
302	// postingDataSentinelOffset
303	sz += 4
304	return
305}
306
307// NgramIndexes returns the indexes of the ngrams in the index. We return a
308// slice of slices because we have to keep track of ngram variants in case of
309// case-insensitive search.
310func (b btreeIndex) NgramIndexes(ngrams []ngram) ([]int, int) {
311	lookups := 0
312	ngramIndexes := make([]int, 0, len(ngrams))
313
314	for _, ng := range ngrams {
315		ix := b.ngramIndex(ng)
316		lookups++
317		if ix == -1 {
318			return nil, len(ngramIndexes) + 1
319		}
320		ngramIndexes = append(ngramIndexes, ix)
321	}
322
323	return ngramIndexes, len(ngramIndexes)
324}
325
326func (b btreeIndex) ngramIndex(ng ngram) int {
327	if b.bt == nil {
328		return -1
329	}
330
331	// find bucket
332	bucketIndex, postingIndexOffset := b.bt.find(ng)
333
334	// read bucket into memory
335	off, sz := b.getBucket(bucketIndex)
336	bucket, err := b.file.Read(off, sz)
337	if err != nil {
338		return -1
339	}
340
341	// find ngram in bucket
342	getNGram := func(i int) ngram {
343		i *= ngramEncoding
344		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
345	}
346
347	bucketSize := len(bucket) / ngramEncoding
348	x := sort.Search(bucketSize, func(i int) bool {
349		return ng <= getNGram(i)
350	})
351
352	// return index of associated posting list
353	if x >= bucketSize || getNGram(x) != ng {
354		return -1
355	}
356
357	return postingIndexOffset + x
358}
359
360// Get returns the simple section of the posting list associated with the
361// ngram. The logic is as follows:
362// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
363// 2. Read the bucket from disk (1 disk access)
364// 3. Binary search the bucket (in MEM)
365// 4. Return the simple section pointing to the posting list (in MEM)
366func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
367	if b.bt == nil {
368		return simpleSection{}
369	}
370
371	// find bucket
372	bucketIndex, postingIndexOffset := b.bt.find(ng)
373
374	// read bucket into memory
375	off, sz := b.getBucket(bucketIndex)
376	bucket, err := b.file.Read(off, sz)
377	if err != nil {
378		return simpleSection{}
379	}
380
381	// find ngram in bucket
382	getNGram := func(i int) ngram {
383		i *= ngramEncoding
384		return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
385	}
386
387	bucketSize := len(bucket) / ngramEncoding
388	x := sort.Search(bucketSize, func(i int) bool {
389		return ng <= getNGram(i)
390	})
391
392	// return associated posting list
393	if x >= bucketSize || getNGram(x) != ng {
394		return simpleSection{}
395	}
396
397	return b.GetPostingList(postingIndexOffset + x)
398}
399
400// GetPostingList returns the simple section pointing to the posting list of
401// the ngram at ngramIndex.
402//
403// Assumming we don't hit a page boundary, which should be rare given that we
404// only read 8 bytes, we need 1 disk access to read the posting offset.
405func (b btreeIndex) GetPostingList(ngramIndex int) simpleSection {
406	relativeOffsetBytes := uint32(ngramIndex) * 4
407
408	if relativeOffsetBytes+8 <= b.postingIndex.sz {
409		// read 2 offsets
410		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
411		if err != nil {
412			return simpleSection{}
413		}
414
415		start := binary.BigEndian.Uint32(o[0:4])
416		end := binary.BigEndian.Uint32(o[4:8])
417		return simpleSection{
418			off: start,
419			sz:  end - start,
420		}
421	} else {
422		// last ngram => read 1 offset and calculate the size of the posting
423		// list from the offset of index section.
424		o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
425		if err != nil {
426			return simpleSection{}
427		}
428
429		start := binary.BigEndian.Uint32(o[0:4])
430		return simpleSection{
431			off: start,
432			// The layout of the posting list compound section on disk is
433			//
434			//                      start       b.postingIndex.off
435			//                      v           v
436			// [[posting lists (simple section)][index (simple section)]]
437			//                      <---------->
438			//                    last posting list
439			//
440			sz: b.postingIndex.off - start,
441		}
442	}
443}
444
445func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
446	// All but the rightmost bucket have exactly bucketSize/2 ngrams
447	sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
448	off = b.ngramSec.off + uint32(bucketIndex)*sz
449
450	// Rightmost bucket has size upto the end of the ngramSec.
451	if bucketIndex == b.bt.lastBucketIndex {
452		sz = b.ngramSec.off + b.ngramSec.sz - off
453	}
454
455	return
456}
457
458// DumpMap is a debug method which returns the btree as an in-memory
459// representation. This is how zoekt represents the ngram index in
460// google/zoekt.
461func (b btreeIndex) DumpMap() map[ngram]simpleSection {
462	if b.bt == nil {
463		return nil
464	}
465
466	m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
467
468	b.bt.visit(func(no node) {
469		switch n := no.(type) {
470		case *leaf:
471			// read bucket into memory
472			off, sz := b.getBucket(n.bucketIndex)
473			bucket, _ := b.file.Read(off, sz)
474
475			// decode all ngrams in the bucket and fill map
476			for i := 0; i < len(bucket)/ngramEncoding; i++ {
477				gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
478				m[gram] = b.GetPostingList(int(n.postingIndexOffset) + i)
479			}
480		case *innerNode:
481			return
482		}
483	})
484
485	return m
486}
Configure Feed

Configure Feed