fork of https://github.com/sourcegraph/zoekt
1// Copyright 2021 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "sort"
19)
20
21// shrinkUint32Slice copies slices with excess capacity to precisely sized ones
22// to avoid wasting memory. It should be used on slices with long static durations.
23func shrinkUint32Slice(a []uint32) []uint32 {
24 if cap(a)-len(a) < 32 {
25 return a
26 }
27 out := make([]uint32, len(a))
28 copy(out, a)
29 return out
30}
31
32type topOffset struct {
33 top, off uint32
34}
35
36// arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search
37// to satisfy requests. A three-level trie (over the runes of an ngram) uses 20%
38// more memory than this simple two-level split.
39type arrayNgramOffset struct {
40 // tops specify where the bottom halves of ngrams with the 32-bit top half begin.
41 // The offset of the next value is used to find where the bottom section ends.
42 tops []topOffset
43
44 // bots are bottom halves of an ngram, referenced by tops
45 bots []uint32
46
47 // offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting
48 // adjacent offsets.
49 offsets []uint32
50}
51
52func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset {
53 arr := arrayNgramOffset{
54 bots: make([]uint32, 0, len(ngrams)),
55 }
56 arr.offsets = shrinkUint32Slice(offsets)
57
58 lastTop := uint32(0xffffffff)
59 lastStart := uint32(0)
60 for i, v := range ngrams {
61 curTop := uint32(v >> 32)
62 if curTop != lastTop {
63 if lastTop != 0xffffffff {
64 arr.tops = append(arr.tops, topOffset{lastTop, lastStart})
65 }
66 lastTop = curTop
67 lastStart = uint32(i)
68 }
69 arr.bots = append(arr.bots, uint32(v))
70 }
71 // add a sentinel value to make it simple to compute sizes
72 arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))})
73
74 // shrink arr.tops to minimal size
75 tops := make([]topOffset, len(arr.tops))
76 copy(tops, arr.tops)
77 arr.tops = tops
78
79 return arr
80}
81
82func (a *arrayNgramOffset) Get(gram ngram) simpleSection {
83 if a.tops == nil {
84 return simpleSection{}
85 }
86
87 top, bot := uint32(uint64(gram)>>32), uint32(gram)
88
89 topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top })
90 if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top {
91 return simpleSection{}
92 }
93
94 botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off]
95 botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot })
96 if botIdx == len(botsSec) || botsSec[botIdx] != bot {
97 return simpleSection{}
98 }
99 idx := botIdx + int(a.tops[topIdx].off)
100 return simpleSection{
101 off: a.offsets[idx],
102 sz: a.offsets[idx+1] - a.offsets[idx],
103 }
104}
105
106func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection {
107 m := make(map[ngram]simpleSection, len(a.offsets)-1)
108 for i := 0; i < len(a.tops)-1; i++ {
109 top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off
110 botSec := a.bots[botStart:botEnd]
111 for j, bot := range botSec {
112 idx := int(botStart) + j
113 m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{
114 off: a.offsets[idx],
115 sz: a.offsets[idx+1] - a.offsets[idx],
116 }
117 }
118 }
119 return m
120}
121
122func (a *arrayNgramOffset) SizeBytes() int {
123 return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets)
124}
125
126// combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping,
127// falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths.
128type combinedNgramOffset struct {
129 asc *asciiNgramOffset
130 uni *arrayNgramOffset
131}
132
133func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset {
134 // split ngrams & offsets into ascii ngrams and unicode ngrams,
135 // since ascii ngrams can be represented much more compactly (21b instead of 63b)
136
137 // allocate these arrays based off of rough measurements of what their typical
138 // sizes are-- source code is mostly ASCII with a little bit of Unicode.
139 // Allocating 101% of the total number of ngrams gives a little space for the
140 // duplicate entries used to mark section ends.
141 ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100)
142 offsetsAscii := make([]uint32, 0, len(ngrams)*101/100)
143
144 ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100)
145 offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100)
146
147 for i, ng := range ngrams {
148 if ng&ngramAsciiMask == ng { // is ngram ascii-only?
149 ngp := ngramAsciiToPacked(ng)
150 if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] {
151 // at the end of a section we insert an extra offset with the same ngram,
152 // so the size of the segment can be calculated properly
153 ngramsAscii = append(ngramsAscii, ngp, ngp)
154 offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1])
155 } else {
156 ngramsAscii = append(ngramsAscii, ngp)
157 offsetsAscii = append(offsetsAscii, offsets[i])
158 }
159 // note: len(offsets) == len(ngrams) + 1
160 if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength {
161 // max-length ascii sections can't be represented properly in the ascii mapping,
162 // and are duplicated in the normal unicode entries.
163 ngramsUnicode = append(ngramsUnicode, ng, ng)
164 offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
165 }
166 } else {
167 if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] {
168 ngramsUnicode = append(ngramsUnicode, ng, ng)
169 offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1])
170 } else {
171 ngramsUnicode = append(ngramsUnicode, ng)
172 offsetsUnicode = append(offsetsUnicode, offsets[i])
173 }
174 }
175 }
176
177 // The last segment always has an extra trailing ngram entry that we don't need, and
178 // is only present for spacing and alignment. Trim it.
179 if len(ngramsAscii) > 0 {
180 ngramsAscii = ngramsAscii[:len(ngramsAscii)-1]
181 }
182 if len(ngramsUnicode) > 0 {
183 ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1]
184 }
185
186 asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii)
187 uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode)
188
189 return combinedNgramOffset{asc, &uni}
190}
191
192// Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate
193// offset based on the underlying ASCII or Unicode offset index.
194func (a combinedNgramOffset) Get(gram ngram) simpleSection {
195 if a.asc == nil {
196 return simpleSection{}
197 }
198
199 var sec simpleSection
200 if gram&ngramAsciiMask == gram {
201 sec = a.asc.Get(gram)
202 if sec.sz == ngramAsciiMaxSectionLength {
203 // Fallback: this section's length was too long to store in the
204 // ASCII map, find it in the Unicode map.
205 sec = a.uni.Get(gram)
206 }
207 } else {
208 sec = a.uni.Get(gram)
209 }
210
211 return sec
212}
213
214func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection {
215 m := a.asc.DumpMap()
216 for k, v := range a.uni.DumpMap() {
217 m[k] = v
218 }
219 return m
220}
221
222func (a combinedNgramOffset) SizeBytes() int {
223 return a.asc.SizeBytes() + a.uni.SizeBytes()
224}
225
226const ngramAsciiMask = 127 | 127<<21 | 127<<42
227
228// Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum.
229// We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to
230// a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't
231// seem worth it.
232const ngramAsciiMaxSectionLength = (1 << 11) - 1
233
234type ngramAscii uint32
235
236func ngramAsciiToPacked(ng ngram) ngramAscii {
237 return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14)))
238}
239
240func ngramAsciiPackedToNgram(ng ngramAscii) ngram {
241 return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14)
242}
243
244// asciiNgramOffset stores ascii trigrams packed together with short lengths,
245// using offsets for a chunk of entries to limit the number of lengths that must
246// be summed to compute a section's offset.
247type asciiNgramOffset struct {
248 entries []uint32 // (chara << 25 | charb << 18 | charc << 11 | length)
249 chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength]
250}
251
252// asciiNgramOffsetChunkLength specifies how many entries share one initial offset.
253// It must be a power of 2, and was chosen empirically by measuring RAM usage:
254// 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB
255const asciiNgramOffsetChunkLength = 32
256
257func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset {
258 ao := &asciiNgramOffset{
259 entries: make([]uint32, 0, len(ngrams)),
260 chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength),
261 }
262
263 for i, ng := range ngrams {
264 if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
265 ao.chunkOffsets = append(ao.chunkOffsets, offsets[i])
266 }
267 length := offsets[i+1] - offsets[i]
268
269 for {
270 if length < ngramAsciiMaxSectionLength {
271 ao.entries = append(ao.entries, uint32(ng)<<11|length)
272 break
273 } else {
274 // entries with lengths that are too long can't be represented fully in this
275 // map, but we repeatedly insert offsets to make the next entry's offset computable
276 // by summing the offsets in the preceding entries in the chunk, including
277 // this invalid one.
278 ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength)
279 length -= ngramAsciiMaxSectionLength
280 if len(ao.entries)%asciiNgramOffsetChunkLength == 0 {
281 // We reached the end of the chunk, so there's no need to reach the
282 // offset for the next entry.
283 break
284 }
285 }
286 }
287 }
288
289 ao.entries = shrinkUint32Slice(ao.entries)
290 ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets)
291
292 return ao
293}
294
295// Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength
296// if the length of the ngram is too large for this type and it should cascade to the next entry.
297func (a *asciiNgramOffset) Get(gram ngram) simpleSection {
298 if gram&ngramAsciiMask != gram {
299 return simpleSection{}
300 }
301 g := uint32(ngramAsciiToPacked(gram) << 11)
302
303 idx := sort.Search(len(a.entries), func(i int) bool {
304 return a.entries[i] >= g
305 })
306
307 if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 {
308 return simpleSection{}
309 }
310
311 length := a.entries[idx] & ngramAsciiMaxSectionLength
312 if length == ngramAsciiMaxSectionLength {
313 // this ascii ngram's section length is too large to be represented;
314 // repeate the Get() on the unicode map to get the correct result.
315 return simpleSection{
316 off: 0,
317 sz: ngramAsciiMaxSectionLength,
318 }
319 }
320
321 chunkNum := idx / asciiNgramOffsetChunkLength
322 chunkBase := chunkNum * asciiNgramOffsetChunkLength
323 offset := a.chunkOffsets[chunkNum]
324 for i := chunkBase; i < idx; i++ {
325 offset += a.entries[i] & ngramAsciiMaxSectionLength
326 }
327
328 return simpleSection{
329 off: offset,
330 sz: length,
331 }
332}
333
334func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection {
335 m := make(map[ngram]simpleSection, len(a.entries))
336 off := uint32(0)
337 for i, ent := range a.entries {
338 if i%asciiNgramOffsetChunkLength == 0 {
339 off = a.chunkOffsets[i/asciiNgramOffsetChunkLength]
340 }
341 length := ent & ngramAsciiMaxSectionLength
342 if length == ngramAsciiMaxSectionLength {
343 // This entry is an ascii gram with a section too long
344 // to be represented, so skip the entry.
345 continue
346 }
347 m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{
348 off: off,
349 sz: length,
350 }
351 off += length
352 }
353 return m
354}
355
356func (a *asciiNgramOffset) SizeBytes() int {
357 return 4*len(a.entries) + 4*len(a.chunkOffsets)
358}
359
360type ngramIndex interface {
361 Get(gram ngram) simpleSection
362 DumpMap() map[ngram]simpleSection
363 SizeBytes() int
364}
365
366// This is a temporary type to wrap two very different implementations of the
367// inverted index for the purpose of feature-flagging. We will remove this after
368// we enable the b-tree permanently.
369//
370// Alternatively we could have adapted/extended the interface "ngramIndex".
371// However, adapting the existing implementations and their tests to match the
372// access pattern of map[ngram][]byte seems more cumbersome than this makeshift
373// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a
374// concrete type.
375type fileNameNgrams struct {
376 m map[ngram][]byte
377 bt btreeIndex
378}
379
380func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) {
381 if n.m != nil {
382 return n.m[ng], nil
383 }
384 sec := n.bt.Get(ng)
385 return n.bt.file.Read(sec.off, sec.sz)
386}
387
388func (n fileNameNgrams) Frequency(ng ngram) uint32 {
389 if n.m != nil {
390 return uint32(len(n.m[ng]))
391 }
392 return n.bt.Get(ng).sz
393}
394
395func (n fileNameNgrams) SizeBytes() int {
396 if n.m != nil {
397 // these slices reference mmap-ed memory
398 return 12 * len(n.m)
399 }
400 return n.bt.SizeBytes()
401}