ngramoffset.go at 0cb60676ea92e8904e4075743174adbb1ea47af6 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / ngramoffset.go
at 0cb60676ea92e8904e4075743174adbb1ea47af6 13 kB View raw
Stefan Hengl set b-tree as default (#540) 3y ago
  1// Copyright 2021 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"sort"
 19)
 20
 21// shrinkUint32Slice copies slices with excess capacity to precisely sized ones
 22// to avoid wasting memory. It should be used on slices with long static durations.
 23func shrinkUint32Slice(a []uint32) []uint32 {
 24	if cap(a)-len(a) < 32 {
 25		return a
 26	}
 27	out := make([]uint32, len(a))
 28	copy(out, a)
 29	return out
 30}
 31
 32type topOffset struct {
 33	top, off uint32
 34}
 35
 36// arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search
 37// to satisfy requests. A three-level trie (over the runes of an ngram) uses 20%
 38// more memory than this simple two-level split.
 39type arrayNgramOffset struct {
 40	// tops specify where the bottom halves of ngrams with the 32-bit top half begin.
 41	// The offset of the next value is used to find where the bottom section ends.
 42	tops []topOffset
 43
 44	// bots are bottom halves of an ngram, referenced by tops
 45	bots []uint32
 46
 47	// offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting
 48	// adjacent offsets.
 49	offsets []uint32
 50}
 51
 52func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset {
 53	arr := arrayNgramOffset{
 54		bots: make([]uint32, 0, len(ngrams)),
 55	}
 56	arr.offsets = shrinkUint32Slice(offsets)
 57
 58	lastTop := uint32(0xffffffff)
 59	lastStart := uint32(0)
 60	for i, v := range ngrams {
 61		curTop := uint32(v >> 32)
 62		if curTop != lastTop {
 63			if lastTop != 0xffffffff {
 64				arr.tops = append(arr.tops, topOffset{lastTop, lastStart})
 65			}
 66			lastTop = curTop
 67			lastStart = uint32(i)
 68		}
 69		arr.bots = append(arr.bots, uint32(v))
 70	}
 71	// add a sentinel value to make it simple to compute sizes
 72	arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))})
 73
 74	// shrink arr.tops to minimal size
 75	tops := make([]topOffset, len(arr.tops))
 76	copy(tops, arr.tops)
 77	arr.tops = tops
 78
 79	return arr
 80}
 81
 82func (a *arrayNgramOffset) Get(gram ngram) simpleSection {
 83	if a.tops == nil {
 84		return simpleSection{}
 85	}
 86
 87	top, bot := uint32(uint64(gram)>>32), uint32(gram)
 88
 89	topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top })
 90	if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top {
 91		return simpleSection{}
 92	}
 93
 94	botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off]
 95	botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot })
 96	if botIdx == len(botsSec) || botsSec[botIdx] != bot {
 97		return simpleSection{}
 98	}
 99	idx := botIdx + int(a.tops[topIdx].off)
100	return simpleSection{
101		off: a.offsets[idx],
102		sz:  a.offsets[idx+1] - a.offsets[idx],
103	}
104}
105
106func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection {
107	m := make(map[ngram]simpleSection, len(a.offsets)-1)
108	for i := 0; i < len(a.tops)-1; i++ {
109		top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off
110		botSec := a.bots[botStart:botEnd]
111		for j, bot := range botSec {
112			idx := int(botStart) + j
113			m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{
114				off: a.offsets[idx],
115				sz:  a.offsets[idx+1] - a.offsets[idx],
116			}
117		}
118	}
119	return m
120}
121
122func (a *arrayNgramOffset) SizeBytes() int {
123	return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets)
124}
125
126// combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping,
127// falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths.
128type combinedNgramOffset struct {
129	asc *asciiNgramOffset
130	uni *arrayNgramOffset
131}
132
133func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset {
134	// split ngrams & offsets into ascii ngrams and unicode ngrams,
135	// since ascii ngrams can be represented much more compactly (21b instead of 63b)
136
137	// allocate these arrays based off of rough measurements of what their typical
138	// sizes are-- source code is mostly ASCII with a little bit of Unicode.
139	// Allocating 101% of the total number of ngrams gives a little space for the
140	// duplicate entries used to mark section ends.
141	ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100)
142	offsetsAscii := make([]uint32, 0, len(ngrams)*101/100)
143
144	ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100)
145	offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100)
146
147	for i, ng := range ngrams {
148		if ng&ngramAsciiMask == ng { // is ngram ascii-only?
149			ngp := ngramAsciiToPacked(ng)
150			if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] {
151				// at the end of a section we insert an extra offset with the same ngram,
152				// so the size of the segment can be calculated properly
153				ngramsAscii = append(ngramsAscii, ngp, ngp)
154				offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1])
155			} else {
156				ngramsAscii = append(ngramsAscii, ngp)
157				offsetsAscii = append(offsetsAscii, offsets[i])
158			}
159			// note: len(offsets) == len(ngrams) + 1
160			if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength {
161				// max-length ascii sections can't be represented properly in the ascii mapping,
162				// and are duplicated in the normal unicode entries.
163				ngramsUnicode = append(ngramsUnicode, ng, ng)
164				offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
165			}
166		} else {
167			if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] {
168				ngramsUnicode = append(ngramsUnicode, ng, ng)
169				offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
170			} else {
171				ngramsUnicode = append(ngramsUnicode, ng)
172				offsetsUnicode = append(offsetsUnicode, offsets[i])
173			}
174		}
175	}
176
177	// The last segment always has an extra trailing ngram entry that we don't need, and
178	// is only present for spacing and alignment. Trim it.
179	if len(ngramsAscii) > 0 {
180		ngramsAscii = ngramsAscii[:len(ngramsAscii)-1]
181	}
182	if len(ngramsUnicode) > 0 {
183		ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1]
184	}
185
186	asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii)
187	uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode)
188
189	return combinedNgramOffset{asc, &uni}
190}
191
192// Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate
193// offset based on the underlying ASCII or Unicode offset index.
194func (a combinedNgramOffset) Get(gram ngram) simpleSection {
195	if a.asc == nil {
196		return simpleSection{}
197	}
198
199	var sec simpleSection
200	if gram&ngramAsciiMask == gram {
201		sec = a.asc.Get(gram)
202		if sec.sz == ngramAsciiMaxSectionLength {
203			// Fallback: this section's length was too long to store in the
204			// ASCII map, find it in the Unicode map.
205			sec = a.uni.Get(gram)
206		}
207	} else {
208		sec = a.uni.Get(gram)
209	}
210
211	return sec
212}
213
214func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection {
215	m := a.asc.DumpMap()
216	for k, v := range a.uni.DumpMap() {
217		m[k] = v
218	}
219	return m
220}
221
222func (a combinedNgramOffset) SizeBytes() int {
223	return a.asc.SizeBytes() + a.uni.SizeBytes()
224}
225
226const ngramAsciiMask = 127 | 127<<21 | 127<<42
227
228// Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum.
229// We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to
230// a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't
231// seem worth it.
232const ngramAsciiMaxSectionLength = (1 << 11) - 1
233
234type ngramAscii uint32
235
236func ngramAsciiToPacked(ng ngram) ngramAscii {
237	return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14)))
238}
239
240func ngramAsciiPackedToNgram(ng ngramAscii) ngram {
241	return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14)
242}
243
244// asciiNgramOffset stores ascii trigrams packed together with short lengths,
245// using offsets for a chunk of entries to limit the number of lengths that must
246// be summed to compute a section's offset.
247type asciiNgramOffset struct {
248	entries      []uint32 // (chara << 25 | charb << 18 | charc << 11 | length)
249	chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength]
250}
251
252// asciiNgramOffsetChunkLength specifies how many entries share one initial offset.
253// It must be a power of 2, and was chosen empirically by measuring RAM usage:
254// 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB
255const asciiNgramOffsetChunkLength = 32
256
257func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset {
258	ao := &asciiNgramOffset{
259		entries:      make([]uint32, 0, len(ngrams)),
260		chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength),
261	}
262
263	for i, ng := range ngrams {
264		if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
265			ao.chunkOffsets = append(ao.chunkOffsets, offsets[i])
266		}
267		length := offsets[i+1] - offsets[i]
268
269		for {
270			if length < ngramAsciiMaxSectionLength {
271				ao.entries = append(ao.entries, uint32(ng)<<11|length)
272				break
273			} else {
274				// entries with lengths that are too long can't be represented fully in this
275				// map, but we repeatedly insert offsets to make the next entry's offset computable
276				// by summing the offsets in the preceding entries in the chunk, including
277				// this invalid one.
278				ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength)
279				length -= ngramAsciiMaxSectionLength
280				if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
281					// We reached the end of the chunk, so there's no need to reach the
282					// offset for the next entry.
283					break
284				}
285			}
286		}
287	}
288
289	ao.entries = shrinkUint32Slice(ao.entries)
290	ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets)
291
292	return ao
293}
294
295// Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength
296// if the length of the ngram is too large for this type and it should cascade to the next entry.
297func (a *asciiNgramOffset) Get(gram ngram) simpleSection {
298	if gram&ngramAsciiMask != gram {
299		return simpleSection{}
300	}
301	g := uint32(ngramAsciiToPacked(gram) << 11)
302
303	idx := sort.Search(len(a.entries), func(i int) bool {
304		return a.entries[i] >= g
305	})
306
307	if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 {
308		return simpleSection{}
309	}
310
311	length := a.entries[idx] & ngramAsciiMaxSectionLength
312	if length == ngramAsciiMaxSectionLength {
313		// this ascii ngram's section length is too large to be represented;
314		// repeate the Get() on the unicode map to get the correct result.
315		return simpleSection{
316			off: 0,
317			sz:  ngramAsciiMaxSectionLength,
318		}
319	}
320
321	chunkNum := idx / asciiNgramOffsetChunkLength
322	chunkBase := chunkNum * asciiNgramOffsetChunkLength
323	offset := a.chunkOffsets[chunkNum]
324	for i := chunkBase; i < idx; i++ {
325		offset += a.entries[i] & ngramAsciiMaxSectionLength
326	}
327
328	return simpleSection{
329		off: offset,
330		sz:  length,
331	}
332}
333
334func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection {
335	m := make(map[ngram]simpleSection, len(a.entries))
336	off := uint32(0)
337	for i, ent := range a.entries {
338		if i%asciiNgramOffsetChunkLength == 0 {
339			off = a.chunkOffsets[i/asciiNgramOffsetChunkLength]
340		}
341		length := ent & ngramAsciiMaxSectionLength
342		if length == ngramAsciiMaxSectionLength {
343			// This entry is an ascii gram with a section too long
344			// to be represented, so skip the entry.
345			continue
346		}
347		m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{
348			off: off,
349			sz:  length,
350		}
351		off += length
352	}
353	return m
354}
355
356func (a *asciiNgramOffset) SizeBytes() int {
357	return 4*len(a.entries) + 4*len(a.chunkOffsets)
358}
359
360type ngramIndex interface {
361	Get(gram ngram) simpleSection
362	DumpMap() map[ngram]simpleSection
363	SizeBytes() int
364}
365
366// This is a temporary type to wrap two very different implementations of the
367// inverted index for the purpose of feature-flagging. We will remove this after
368// we enable the b-tree permanently.
369//
370// Alternatively we could have adapted/extended the interface "ngramIndex".
371// However, adapting the existing implementations and their tests to match the
372// access pattern of map[ngram][]byte seems more cumbersome than this makeshift
373// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a
374// concrete type.
375type fileNameNgrams struct {
376	m  map[ngram][]byte
377	bt btreeIndex
378}
379
380func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) {
381	if n.m != nil {
382		return n.m[ng], nil
383	}
384	sec := n.bt.Get(ng)
385	return n.bt.file.Read(sec.off, sec.sz)
386}
387
388func (n fileNameNgrams) Frequency(ng ngram) uint32 {
389	if n.m != nil {
390		return uint32(len(n.m[ng]))
391	}
392	return n.bt.Get(ng).sz
393}
394
395func (n fileNameNgrams) SizeBytes() int {
396	if n.m != nil {
397		// these slices reference mmap-ed memory
398		return 12 * len(n.m)
399	}
400	return n.bt.SizeBytes()
401}
Configure Feed

Configure Feed