fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "cmp"
19 "encoding/binary"
20 "math"
21 "sort"
22 "unicode"
23 "unicode/utf8"
24)
25
26func generateCaseNgrams(g ngram) []ngram {
27 asRunes := ngramToRunes(g)
28
29 variants := make([]ngram, 0, 8)
30 cur := asRunes
31 for {
32 for i := 0; i < 3; i++ {
33 next := unicode.SimpleFold(cur[i])
34 cur[i] = next
35 if next != asRunes[i] {
36 break
37 }
38 }
39
40 variants = append(variants, runesToNGram(cur))
41 if cur == asRunes {
42 break
43 }
44 }
45
46 return variants
47}
48
49func toLower(in []byte) []byte {
50 out := make([]byte, 0, len(in))
51 var buf [4]byte
52 for _, c := range string(in) {
53 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
54 out = append(out, buf[:i]...)
55 }
56 return out
57}
58
59// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
60// be larger than 'lower'. Returns whether there was a match, and if
61// yes, the byte size of the match.
62func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
63 matchTotal := 0
64 for len(lower) > 0 && len(mixed) > 0 {
65 lr, lsz := utf8.DecodeRune(lower)
66 lower = lower[lsz:]
67
68 mr, msz := utf8.DecodeRune(mixed)
69 mixed = mixed[msz:]
70 matchTotal += msz
71
72 if lr != unicode.ToLower(mr) {
73 return 0, false
74 }
75 }
76
77 return matchTotal, len(lower) == 0
78}
79
80type ngram uint64
81
82func runesToNGram(b [ngramSize]rune) ngram {
83 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
84}
85
86func bytesToNGram(b []byte) ngram {
87 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
88}
89
90func stringToNGram(s string) ngram {
91 return bytesToNGram([]byte(s))
92}
93
94func ngramToBytes(n ngram) []byte {
95 rs := ngramToRunes(n)
96 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
97}
98
99const runeMask = 1<<21 - 1
100
101func ngramToRunes(n ngram) [ngramSize]rune {
102 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
103}
104
105func (n ngram) String() string {
106 rs := ngramToRunes(n)
107 return string(rs[:])
108}
109
110type runeNgramOff struct {
111 ngram ngram
112 // index is the original index inside of the returned array of splitNGrams
113 index int
114}
115
116func (a runeNgramOff) Compare(b runeNgramOff) int {
117 if a.ngram == b.ngram {
118 return cmp.Compare(a.index, b.index)
119 } else if a.ngram < b.ngram {
120 return -1
121 } else {
122 return 1
123 }
124}
125
126func splitNGrams(str []byte) []runeNgramOff {
127 // len(maxNgrams) >= the number of ngrams in str => no limit
128 return splitNGramsLimit(str, len(str))
129}
130
131func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
132 var runeGram [3]rune
133 var off [3]uint32
134 var runeCount int
135
136 result := make([]runeNgramOff, 0, len(str))
137 var i uint32
138
139 for len(str) > 0 && len(result) < maxNgrams {
140 r, sz := utf8.DecodeRune(str)
141 str = str[sz:]
142 runeGram[0] = runeGram[1]
143 off[0] = off[1]
144 runeGram[1] = runeGram[2]
145 off[1] = off[2]
146 runeGram[2] = r
147 off[2] = uint32(i)
148 i += uint32(sz)
149 runeCount++
150 if runeCount < ngramSize {
151 continue
152 }
153
154 ng := runesToNGram(runeGram)
155 result = append(result, runeNgramOff{
156 ngram: ng,
157 index: len(result),
158 })
159 }
160 return result
161}
162
163const (
164 _classLowerChar int = iota
165 _classUpperChar
166 _classDigit
167 _classPunct
168 _classOther
169 _classSpace
170)
171
172func byteClass(c byte) int {
173 if c >= 'a' && c <= 'z' {
174 return _classLowerChar
175 }
176 if c >= 'A' && c <= 'Z' {
177 return _classUpperChar
178 }
179 if c >= '0' && c <= '9' {
180 return _classDigit
181 }
182
183 switch c {
184 case ' ', '\n':
185 return _classSpace
186 case '.', ',', ';', '"', '\'':
187 return _classPunct
188 default:
189 return _classOther
190 }
191}
192
193func characterClass(c byte) bool {
194 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
195}
196
197func marshalDocSections(secs []DocumentSection) []byte {
198 ints := make([]uint32, 0, len(secs)*2)
199 for _, s := range secs {
200 ints = append(ints, uint32(s.Start), uint32(s.End))
201 }
202
203 return toSizedDeltas(ints)
204}
205
206func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
207 sz, m := binary.Uvarint(data)
208 data = data[m:]
209
210 if cap(ds) < int(sz)/2 {
211 ds = make([]DocumentSection, 0, sz/2)
212 } else {
213 ds = ds[:0]
214 }
215
216 // Inlining the delta decoding to avoid unnecessary allocations that would come
217 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
218 var last uint32
219 for len(data) > 0 {
220 var d DocumentSection
221
222 delta, m := binary.Uvarint(data)
223 last += uint32(delta)
224 data = data[m:]
225 d.Start = last
226
227 delta, m = binary.Uvarint(data)
228 last += uint32(delta)
229 data = data[m:]
230 d.End = last
231
232 ds = append(ds, d)
233 }
234 return ds
235}
236
237type ngramSlice []ngram
238
239func (p ngramSlice) Len() int { return len(p) }
240
241func (p ngramSlice) Less(i, j int) bool {
242 return p[i] < p[j]
243}
244
245func (p ngramSlice) Swap(i, j int) {
246 p[i], p[j] = p[j], p[i]
247}
248
249func toSizedDeltas(offsets []uint32) []byte {
250 var enc [8]byte
251
252 deltas := make([]byte, 0, len(offsets)*2)
253
254 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
255 deltas = append(deltas, enc[:m]...)
256
257 var last uint32
258 for _, p := range offsets {
259 delta := p - last
260 last = p
261
262 m := binary.PutUvarint(enc[:], uint64(delta))
263 deltas = append(deltas, enc[:m]...)
264 }
265 return deltas
266}
267
268func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
269 sz, m := binary.Uvarint(data)
270 data = data[m:]
271
272 if cap(ps) < int(sz) {
273 ps = make([]uint32, 0, sz)
274 } else {
275 ps = ps[:0]
276 }
277
278 var last uint32
279 for len(data) > 0 {
280 delta, m := binary.Uvarint(data)
281 offset := last + uint32(delta)
282 last = offset
283 data = data[m:]
284 ps = append(ps, offset)
285 }
286 return ps
287}
288
289func toSizedDeltas16(offsets []uint16) []byte {
290 var enc [8]byte
291
292 deltas := make([]byte, 0, len(offsets)*2)
293
294 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
295 deltas = append(deltas, enc[:m]...)
296
297 var last uint16
298 for _, p := range offsets {
299 delta := p - last
300 last = p
301
302 m := binary.PutUvarint(enc[:], uint64(delta))
303 deltas = append(deltas, enc[:m]...)
304 }
305 return deltas
306}
307
308func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
309 sz, m := binary.Uvarint(data)
310 data = data[m:]
311
312 if cap(ps) < int(sz) {
313 ps = make([]uint16, 0, sz)
314 } else {
315 ps = ps[:0]
316 }
317
318 var last uint16
319 for len(data) > 0 {
320 delta, m := binary.Uvarint(data)
321 offset := last + uint16(delta)
322 last = offset
323 data = data[m:]
324 ps = append(ps, offset)
325 }
326 return ps
327}
328
329func fromDeltas(data []byte, buf []uint32) []uint32 {
330 buf = buf[:0]
331 if cap(buf) < len(data)/2 {
332 buf = make([]uint32, 0, len(data)/2)
333 }
334
335 var last uint32
336 for len(data) > 0 {
337 delta, m := binary.Uvarint(data)
338 offset := last + uint32(delta)
339 last = offset
340 data = data[m:]
341 buf = append(buf, offset)
342 }
343 return buf
344}
345
346type runeOffsetCorrection struct {
347 runeOffset, byteOffset uint32
348}
349
350// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
351// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
352// and otherwise interpolating expected byte offsets as one byte per rune.
353//
354// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
355//
356// This is very rarely a slight pessimization on repos where there are frequent
357// non-ASCII characters.
358type runeOffsetMap []runeOffsetCorrection
359
360// makeRuneOffsetMap converts the mostly-predictable runeOffset input
361// into a shorter form tracking the unexpected values.
362//
363// The input is a sequence of y values that we expect to increase by 100 each,
364// so we just store (x, y) points where the expectation is violated.
365func makeRuneOffsetMap(off []uint32) runeOffsetMap {
366 expected := uint32(0)
367 tmp := []runeOffsetCorrection{}
368 for runeOffset, byteOffset := range off {
369 if byteOffset != expected {
370 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
371 expected = byteOffset
372 }
373 expected += runeOffsetFrequency
374 }
375 // copy the slice to ensure it doesn't waste unused trailing capacity
376 out := make([]runeOffsetCorrection, len(tmp))
377 copy(out, tmp)
378 return runeOffsetMap(out)
379}
380
381// lookup converts rune index `off` to a byte offset and a number of additional
382// runes to traverse, given the granularity of runeOffsetFrequency.
383//
384// It does this by finding the nearest point to interpolate from in the map.
385func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
386 left := runeOffset % runeOffsetFrequency
387 runeOffset -= left
388 slen := len(m)
389 if slen == 0 {
390 return runeOffset, left
391 }
392 // sort.Search finds the *first* index for which the predicate is true,
393 // but we want to find the *last* index for which the predicate is true.
394 // This involves some work to reverse the index directions.
395 idx := sort.Search(slen, func(i int) bool {
396 return runeOffset >= m[slen-1-i].runeOffset
397 })
398 idx = slen - 1 - idx
399 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
400 // than the first entry in the map, so no correction is necessary.
401 byteOff := runeOffset
402 if idx >= 0 {
403 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
404 }
405 return byteOff, left
406}
407
408func (m runeOffsetMap) sizeBytes() int {
409 return 8 * len(m)
410}
411
412func epsilonEqualsOne(scoreWeight float64) bool {
413 return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
414}