fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "cmp"
19 "encoding/binary"
20 "math"
21 "math/rand/v2"
22 "slices"
23 "sort"
24 "unicode"
25 "unicode/utf8"
26)
27
28func generateCaseNgrams(g ngram) []ngram {
29 asRunes := ngramToRunes(g)
30
31 variants := make([]ngram, 0, 8)
32 cur := asRunes
33 for {
34 for i := 0; i < 3; i++ {
35 next := unicode.SimpleFold(cur[i])
36 cur[i] = next
37 if next != asRunes[i] {
38 break
39 }
40 }
41
42 variants = append(variants, runesToNGram(cur))
43 if cur == asRunes {
44 break
45 }
46 }
47
48 return variants
49}
50
51func toLower(in []byte) []byte {
52 out := make([]byte, 0, len(in))
53 var buf [4]byte
54 for _, c := range string(in) {
55 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
56 out = append(out, buf[:i]...)
57 }
58 return out
59}
60
61// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
62// be larger than 'lower'. Returns whether there was a match, and if
63// yes, the byte size of the match.
64func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
65 matchTotal := 0
66 for len(lower) > 0 && len(mixed) > 0 {
67 lr, lsz := utf8.DecodeRune(lower)
68 lower = lower[lsz:]
69
70 mr, msz := utf8.DecodeRune(mixed)
71 mixed = mixed[msz:]
72 matchTotal += msz
73
74 if lr != unicode.ToLower(mr) {
75 return 0, false
76 }
77 }
78
79 return matchTotal, len(lower) == 0
80}
81
82type ngram uint64
83
84func runesToNGram(b [ngramSize]rune) ngram {
85 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
86}
87
88func bytesToNGram(b []byte) ngram {
89 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
90}
91
92func stringToNGram(s string) ngram {
93 return bytesToNGram([]byte(s))
94}
95
96func ngramToBytes(n ngram) []byte {
97 rs := ngramToRunes(n)
98 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
99}
100
101const runeMask = 1<<21 - 1
102
103func ngramToRunes(n ngram) [ngramSize]rune {
104 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
105}
106
107func (n ngram) String() string {
108 rs := ngramToRunes(n)
109 return string(rs[:])
110}
111
112type runeNgramOff struct {
113 ngram ngram
114 // index is the original index inside of the returned array of splitNGrams
115 index int
116}
117
118func (a runeNgramOff) Compare(b runeNgramOff) int {
119 if a.ngram == b.ngram {
120 return cmp.Compare(a.index, b.index)
121 } else if a.ngram < b.ngram {
122 return -1
123 } else {
124 return 1
125 }
126}
127
128func splitNGrams(str []byte) []runeNgramOff {
129 // len(maxNgrams) >= the number of ngrams in str => no limit
130 return splitNGramsLimit(str, len(str))
131}
132
133func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
134 var runeGram [3]rune
135 var off [3]uint32
136 var runeCount int
137
138 result := make([]runeNgramOff, 0, len(str))
139 var i uint32
140
141 for len(str) > 0 {
142 r, sz := utf8.DecodeRune(str)
143 str = str[sz:]
144 runeGram[0] = runeGram[1]
145 off[0] = off[1]
146 runeGram[1] = runeGram[2]
147 off[1] = off[2]
148 runeGram[2] = r
149 off[2] = uint32(i)
150 i += uint32(sz)
151 runeCount++
152 if runeCount < ngramSize {
153 continue
154 }
155
156 ng := runesToNGram(runeGram)
157 result = append(result, runeNgramOff{
158 ngram: ng,
159 index: len(result),
160 })
161 }
162
163 // We return a random subset of size maxNgrams. This is to prevent the start
164 // of the string biasing ngram selection.
165 if maxNgrams < len(result) {
166 // Deterministic seed for tests. Additionally makes comparing repeated
167 // queries performance easier.
168 r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
169
170 // Pick random subset via a shuffle
171 r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
172 result = result[:maxNgrams]
173
174 // Caller expects ngrams in order of appearance.
175 slices.SortFunc(result, runeNgramOff.Compare)
176 }
177
178 return result
179}
180
181const (
182 _classLowerChar int = iota
183 _classUpperChar
184 _classDigit
185 _classPunct
186 _classOther
187 _classSpace
188)
189
190func byteClass(c byte) int {
191 if c >= 'a' && c <= 'z' {
192 return _classLowerChar
193 }
194 if c >= 'A' && c <= 'Z' {
195 return _classUpperChar
196 }
197 if c >= '0' && c <= '9' {
198 return _classDigit
199 }
200
201 switch c {
202 case ' ', '\n':
203 return _classSpace
204 case '.', ',', ';', '"', '\'':
205 return _classPunct
206 default:
207 return _classOther
208 }
209}
210
211func characterClass(c byte) bool {
212 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
213}
214
215func marshalDocSections(secs []DocumentSection) []byte {
216 ints := make([]uint32, 0, len(secs)*2)
217 for _, s := range secs {
218 ints = append(ints, uint32(s.Start), uint32(s.End))
219 }
220
221 return toSizedDeltas(ints)
222}
223
224func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
225 sz, m := binary.Uvarint(data)
226 data = data[m:]
227
228 if cap(ds) < int(sz)/2 {
229 ds = make([]DocumentSection, 0, sz/2)
230 } else {
231 ds = ds[:0]
232 }
233
234 // Inlining the delta decoding to avoid unnecessary allocations that would come
235 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
236 var last uint32
237 for len(data) > 0 {
238 var d DocumentSection
239
240 delta, m := binary.Uvarint(data)
241 last += uint32(delta)
242 data = data[m:]
243 d.Start = last
244
245 delta, m = binary.Uvarint(data)
246 last += uint32(delta)
247 data = data[m:]
248 d.End = last
249
250 ds = append(ds, d)
251 }
252 return ds
253}
254
255type ngramSlice []ngram
256
257func (p ngramSlice) Len() int { return len(p) }
258
259func (p ngramSlice) Less(i, j int) bool {
260 return p[i] < p[j]
261}
262
263func (p ngramSlice) Swap(i, j int) {
264 p[i], p[j] = p[j], p[i]
265}
266
267func toSizedDeltas(offsets []uint32) []byte {
268 var enc [8]byte
269
270 deltas := make([]byte, 0, len(offsets)*2)
271
272 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
273 deltas = append(deltas, enc[:m]...)
274
275 var last uint32
276 for _, p := range offsets {
277 delta := p - last
278 last = p
279
280 m := binary.PutUvarint(enc[:], uint64(delta))
281 deltas = append(deltas, enc[:m]...)
282 }
283 return deltas
284}
285
286func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
287 sz, m := binary.Uvarint(data)
288 data = data[m:]
289
290 if cap(ps) < int(sz) {
291 ps = make([]uint32, 0, sz)
292 } else {
293 ps = ps[:0]
294 }
295
296 var last uint32
297 for len(data) > 0 {
298 delta, m := binary.Uvarint(data)
299 offset := last + uint32(delta)
300 last = offset
301 data = data[m:]
302 ps = append(ps, offset)
303 }
304 return ps
305}
306
307func toSizedDeltas16(offsets []uint16) []byte {
308 var enc [8]byte
309
310 deltas := make([]byte, 0, len(offsets)*2)
311
312 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
313 deltas = append(deltas, enc[:m]...)
314
315 var last uint16
316 for _, p := range offsets {
317 delta := p - last
318 last = p
319
320 m := binary.PutUvarint(enc[:], uint64(delta))
321 deltas = append(deltas, enc[:m]...)
322 }
323 return deltas
324}
325
326func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
327 sz, m := binary.Uvarint(data)
328 data = data[m:]
329
330 if cap(ps) < int(sz) {
331 ps = make([]uint16, 0, sz)
332 } else {
333 ps = ps[:0]
334 }
335
336 var last uint16
337 for len(data) > 0 {
338 delta, m := binary.Uvarint(data)
339 offset := last + uint16(delta)
340 last = offset
341 data = data[m:]
342 ps = append(ps, offset)
343 }
344 return ps
345}
346
347func fromDeltas(data []byte, buf []uint32) []uint32 {
348 buf = buf[:0]
349 if cap(buf) < len(data)/2 {
350 buf = make([]uint32, 0, len(data)/2)
351 }
352
353 var last uint32
354 for len(data) > 0 {
355 delta, m := binary.Uvarint(data)
356 offset := last + uint32(delta)
357 last = offset
358 data = data[m:]
359 buf = append(buf, offset)
360 }
361 return buf
362}
363
364type runeOffsetCorrection struct {
365 runeOffset, byteOffset uint32
366}
367
368// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
369// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
370// and otherwise interpolating expected byte offsets as one byte per rune.
371//
372// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
373//
374// This is very rarely a slight pessimization on repos where there are frequent
375// non-ASCII characters.
376type runeOffsetMap []runeOffsetCorrection
377
378// makeRuneOffsetMap converts the mostly-predictable runeOffset input
379// into a shorter form tracking the unexpected values.
380//
381// The input is a sequence of y values that we expect to increase by 100 each,
382// so we just store (x, y) points where the expectation is violated.
383func makeRuneOffsetMap(off []uint32) runeOffsetMap {
384 expected := uint32(0)
385 tmp := []runeOffsetCorrection{}
386 for runeOffset, byteOffset := range off {
387 if byteOffset != expected {
388 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
389 expected = byteOffset
390 }
391 expected += runeOffsetFrequency
392 }
393 // copy the slice to ensure it doesn't waste unused trailing capacity
394 out := make([]runeOffsetCorrection, len(tmp))
395 copy(out, tmp)
396 return runeOffsetMap(out)
397}
398
399// lookup converts rune index `off` to a byte offset and a number of additional
400// runes to traverse, given the granularity of runeOffsetFrequency.
401//
402// It does this by finding the nearest point to interpolate from in the map.
403func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
404 left := runeOffset % runeOffsetFrequency
405 runeOffset -= left
406 slen := len(m)
407 if slen == 0 {
408 return runeOffset, left
409 }
410 // sort.Search finds the *first* index for which the predicate is true,
411 // but we want to find the *last* index for which the predicate is true.
412 // This involves some work to reverse the index directions.
413 idx := sort.Search(slen, func(i int) bool {
414 return runeOffset >= m[slen-1-i].runeOffset
415 })
416 idx = slen - 1 - idx
417 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
418 // than the first entry in the map, so no correction is necessary.
419 byteOff := runeOffset
420 if idx >= 0 {
421 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
422 }
423 return byteOff, left
424}
425
426func (m runeOffsetMap) sizeBytes() int {
427 return 8 * len(m)
428}
429
430func epsilonEqualsOne(scoreWeight float64) bool {
431 return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
432}