fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "log"
20 "sort"
21 "unicode"
22 "unicode/utf8"
23)
24
25func generateCaseNgrams(g ngram) []ngram {
26 asRunes := ngramToRunes(g)
27
28 variants := make([]ngram, 0, 8)
29 cur := asRunes
30 for {
31 for i := 0; i < 3; i++ {
32 next := unicode.SimpleFold(cur[i])
33 cur[i] = next
34 if next != asRunes[i] {
35 break
36 }
37 }
38
39 variants = append(variants, runesToNGram(cur))
40 if cur == asRunes {
41 break
42 }
43 }
44
45 return variants
46}
47
48func toLower(in []byte) []byte {
49 out := make([]byte, 0, len(in))
50 var buf [4]byte
51 for _, c := range string(in) {
52 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
53 out = append(out, buf[:i]...)
54 }
55 return out
56}
57
58// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
59// be larger than 'lower'. Returns whether there was a match, and if
60// yes, the byte size of the match.
61func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
62 matchTotal := 0
63 for len(lower) > 0 && len(mixed) > 0 {
64 lr, lsz := utf8.DecodeRune(lower)
65 lower = lower[lsz:]
66
67 mr, msz := utf8.DecodeRune(mixed)
68 mixed = mixed[msz:]
69 matchTotal += msz
70
71 if lr != unicode.ToLower(mr) {
72 return 0, false
73 }
74 }
75
76 return matchTotal, len(lower) == 0
77}
78
79type ngram uint64
80
81func runesToNGram(b [ngramSize]rune) ngram {
82 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
83}
84
85func bytesToNGram(b []byte) ngram {
86 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
87}
88
89func stringToNGram(s string) ngram {
90 return bytesToNGram([]byte(s))
91}
92
93func ngramToBytes(n ngram) []byte {
94 rs := ngramToRunes(n)
95 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
96}
97
98const runeMask = 1<<21 - 1
99
100func ngramToRunes(n ngram) [ngramSize]rune {
101 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
102}
103
104func (n ngram) String() string {
105 rs := ngramToRunes(n)
106 return string(rs[:])
107}
108
109type runeNgramOff struct {
110 ngram ngram
111 byteSize uint32 // size of ngram
112 byteOff uint32
113 runeOff uint32
114}
115
116func splitNGrams(str []byte) []runeNgramOff {
117 var runeGram [3]rune
118 var off [3]uint32
119 var runeCount int
120
121 result := make([]runeNgramOff, 0, len(str))
122 var i uint32
123
124 chars := -1
125 for len(str) > 0 {
126 chars++
127 r, sz := utf8.DecodeRune(str)
128 str = str[sz:]
129 runeGram[0] = runeGram[1]
130 off[0] = off[1]
131 runeGram[1] = runeGram[2]
132 off[1] = off[2]
133 runeGram[2] = r
134 off[2] = uint32(i)
135 i += uint32(sz)
136 runeCount++
137 if runeCount < ngramSize {
138 continue
139 }
140
141 ng := runesToNGram(runeGram)
142 result = append(result, runeNgramOff{
143 ngram: ng,
144 byteSize: i - off[0],
145 byteOff: off[0],
146 runeOff: uint32(chars),
147 })
148 }
149 return result
150}
151
152const (
153 _classLowerChar int = iota
154 _classUpperChar
155 _classDigit
156 _classPunct
157 _classOther
158 _classSpace
159)
160
161func byteClass(c byte) int {
162 if c >= 'a' && c <= 'z' {
163 return _classLowerChar
164 }
165 if c >= 'A' && c <= 'Z' {
166 return _classUpperChar
167 }
168 if c >= '0' && c <= '9' {
169 return _classDigit
170 }
171
172 switch c {
173 case ' ', '\n':
174 return _classSpace
175 case '.', ',', ';', '"', '\'':
176 return _classPunct
177 default:
178 return _classOther
179 }
180}
181
182func marshalDocSections(secs []DocumentSection) []byte {
183 ints := make([]uint32, 0, len(secs)*2)
184 for _, s := range secs {
185 ints = append(ints, uint32(s.Start), uint32(s.End))
186 }
187
188 return toSizedDeltas(ints)
189}
190
191func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
192 // Defensive, this shouldn't happen. While we have the feature flag for lazy
193 // doc section decoding let's be extra defensive.
194 if len(data) == 0 {
195 log.Println("WARN unmarshalDocSections received an empty slice to unmarshal")
196 return nil
197 }
198
199 sz, m := binary.Uvarint(data)
200 data = data[m:]
201
202 if cap(ds) < int(sz)/2 {
203 ds = make([]DocumentSection, 0, sz/2)
204 } else {
205 ds = ds[:0]
206 }
207
208 // Inlining the delta decoding to avoid unnecessary allocations that would come
209 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
210 var last uint32
211 for len(data) > 0 {
212 var d DocumentSection
213
214 delta, m := binary.Uvarint(data)
215 last += uint32(delta)
216 data = data[m:]
217 d.Start = last
218
219 delta, m = binary.Uvarint(data)
220 last += uint32(delta)
221 data = data[m:]
222 d.End = last
223
224 ds = append(ds, d)
225 }
226 return ds
227}
228
229type ngramSlice []ngram
230
231func (p ngramSlice) Len() int { return len(p) }
232
233func (p ngramSlice) Less(i, j int) bool {
234 return p[i] < p[j]
235}
236
237func (p ngramSlice) Swap(i, j int) {
238 p[i], p[j] = p[j], p[i]
239}
240
241func toSizedDeltas(offsets []uint32) []byte {
242 var enc [8]byte
243
244 deltas := make([]byte, 0, len(offsets)*2)
245
246 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
247 deltas = append(deltas, enc[:m]...)
248
249 var last uint32
250 for _, p := range offsets {
251 delta := p - last
252 last = p
253
254 m := binary.PutUvarint(enc[:], uint64(delta))
255 deltas = append(deltas, enc[:m]...)
256 }
257 return deltas
258}
259
260func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
261 sz, m := binary.Uvarint(data)
262 data = data[m:]
263
264 if cap(ps) < int(sz) {
265 ps = make([]uint32, 0, sz)
266 } else {
267 ps = ps[:0]
268 }
269
270 var last uint32
271 for len(data) > 0 {
272 delta, m := binary.Uvarint(data)
273 offset := last + uint32(delta)
274 last = offset
275 data = data[m:]
276 ps = append(ps, offset)
277 }
278 return ps
279}
280
281func toSizedDeltas16(offsets []uint16) []byte {
282 var enc [8]byte
283
284 deltas := make([]byte, 0, len(offsets)*2)
285
286 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
287 deltas = append(deltas, enc[:m]...)
288
289 var last uint16
290 for _, p := range offsets {
291 delta := p - last
292 last = p
293
294 m := binary.PutUvarint(enc[:], uint64(delta))
295 deltas = append(deltas, enc[:m]...)
296 }
297 return deltas
298}
299
300func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
301 sz, m := binary.Uvarint(data)
302 data = data[m:]
303
304 if cap(ps) < int(sz) {
305 ps = make([]uint16, 0, sz)
306 } else {
307 ps = ps[:0]
308 }
309
310 var last uint16
311 for len(data) > 0 {
312 delta, m := binary.Uvarint(data)
313 offset := last + uint16(delta)
314 last = offset
315 data = data[m:]
316 ps = append(ps, offset)
317 }
318 return ps
319}
320
321func fromDeltas(data []byte, buf []uint32) []uint32 {
322 buf = buf[:0]
323 if cap(buf) < len(data)/2 {
324 buf = make([]uint32, 0, len(data)/2)
325 }
326
327 var last uint32
328 for len(data) > 0 {
329 delta, m := binary.Uvarint(data)
330 offset := last + uint32(delta)
331 last = offset
332 data = data[m:]
333 buf = append(buf, offset)
334 }
335 return buf
336}
337
338type runeOffsetCorrection struct {
339 runeOffset, byteOffset uint32
340}
341
342// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
343// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
344// and otherwise interpolating expected byte offsets as one byte per rune.
345//
346// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
347//
348// This is very rarely a slight pessimization on repos where there are frequent
349// non-ASCII characters.
350type runeOffsetMap []runeOffsetCorrection
351
352// makeRuneOffsetMap converts the mostly-predictable runeOffset input
353// into a shorter form tracking the unexpected values.
354//
355// The input is a sequence of y values that we expect to increase by 100 each,
356// so we just store (x, y) points where the expectation is violated.
357func makeRuneOffsetMap(off []uint32) runeOffsetMap {
358 expected := uint32(0)
359 tmp := []runeOffsetCorrection{}
360 for runeOffset, byteOffset := range off {
361 if byteOffset != expected {
362 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
363 expected = byteOffset
364 }
365 expected += runeOffsetFrequency
366 }
367 // copy the slice to ensure it doesn't waste unused trailing capacity
368 out := make([]runeOffsetCorrection, len(tmp))
369 copy(out, tmp)
370 return runeOffsetMap(out)
371}
372
373// lookup converts rune index `off` to a byte offset and a number of additional
374// runes to traverse, given the granularity of runeOffsetFrequency.
375//
376// It does this by finding the nearest point to interpolate from in the map.
377func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
378 left := runeOffset % runeOffsetFrequency
379 runeOffset -= left
380 slen := len(m)
381 if slen == 0 {
382 return runeOffset, left
383 }
384 // sort.Search finds the *first* index for which the predicate is true,
385 // but we want to find the *last* index for which the predicate is true.
386 // This involves some work to reverse the index directions.
387 idx := sort.Search(slen, func(i int) bool {
388 return runeOffset >= m[slen-1-i].runeOffset
389 })
390 idx = slen - 1 - idx
391 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
392 // than the first entry in the map, so no correction is necessary.
393 byteOff := runeOffset
394 if idx >= 0 {
395 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
396 }
397 return byteOff, left
398}
399
400func (m runeOffsetMap) sizeBytes() int {
401 return 8 * len(m)
402}