fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "sort"
20 "unicode"
21 "unicode/utf8"
22)
23
24func generateCaseNgrams(g ngram) []ngram {
25 asRunes := ngramToRunes(g)
26
27 variants := make([]ngram, 0, 8)
28 cur := asRunes
29 for {
30 for i := 0; i < 3; i++ {
31 next := unicode.SimpleFold(cur[i])
32 cur[i] = next
33 if next != asRunes[i] {
34 break
35 }
36 }
37
38 variants = append(variants, runesToNGram(cur))
39 if cur == asRunes {
40 break
41 }
42 }
43
44 return variants
45}
46
47func toLower(in []byte) []byte {
48 out := make([]byte, 0, len(in))
49 var buf [4]byte
50 for _, c := range string(in) {
51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
52 out = append(out, buf[:i]...)
53 }
54 return out
55}
56
57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
58// be larger than 'lower'. Returns whether there was a match, and if
59// yes, the byte size of the match.
60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
61 matchTotal := 0
62 for len(lower) > 0 && len(mixed) > 0 {
63 lr, lsz := utf8.DecodeRune(lower)
64 lower = lower[lsz:]
65
66 mr, msz := utf8.DecodeRune(mixed)
67 mixed = mixed[msz:]
68 matchTotal += msz
69
70 if lr != unicode.ToLower(mr) {
71 return 0, false
72 }
73 }
74
75 return matchTotal, len(lower) == 0
76}
77
78type ngram uint64
79
80func runesToNGram(b [ngramSize]rune) ngram {
81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
82}
83
84func bytesToNGram(b []byte) ngram {
85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
86}
87
88func stringToNGram(s string) ngram {
89 return bytesToNGram([]byte(s))
90}
91
92func ngramToBytes(n ngram) []byte {
93 rs := ngramToRunes(n)
94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
95}
96
97const runeMask = 1<<21 - 1
98
99func ngramToRunes(n ngram) [ngramSize]rune {
100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104 rs := ngramToRunes(n)
105 return string(rs[:])
106}
107
108type runeNgramOff struct {
109 ngram ngram
110 // index is the original index inside of the returned array of splitNGrams
111 index uint32
112}
113
114func splitNGrams(str []byte) []runeNgramOff {
115 var runeGram [3]rune
116 var off [3]uint32
117 var runeCount int
118
119 result := make([]runeNgramOff, 0, len(str))
120 var i uint32
121
122 for len(str) > 0 {
123 r, sz := utf8.DecodeRune(str)
124 str = str[sz:]
125 runeGram[0] = runeGram[1]
126 off[0] = off[1]
127 runeGram[1] = runeGram[2]
128 off[1] = off[2]
129 runeGram[2] = r
130 off[2] = uint32(i)
131 i += uint32(sz)
132 runeCount++
133 if runeCount < ngramSize {
134 continue
135 }
136
137 ng := runesToNGram(runeGram)
138 result = append(result, runeNgramOff{
139 ngram: ng,
140 index: uint32(len(result)),
141 })
142 }
143 return result
144}
145
146const (
147 _classLowerChar int = iota
148 _classUpperChar
149 _classDigit
150 _classPunct
151 _classOther
152 _classSpace
153)
154
155func byteClass(c byte) int {
156 if c >= 'a' && c <= 'z' {
157 return _classLowerChar
158 }
159 if c >= 'A' && c <= 'Z' {
160 return _classUpperChar
161 }
162 if c >= '0' && c <= '9' {
163 return _classDigit
164 }
165
166 switch c {
167 case ' ', '\n':
168 return _classSpace
169 case '.', ',', ';', '"', '\'':
170 return _classPunct
171 default:
172 return _classOther
173 }
174}
175
176func characterClass(c byte) bool {
177 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
178}
179
180func marshalDocSections(secs []DocumentSection) []byte {
181 ints := make([]uint32, 0, len(secs)*2)
182 for _, s := range secs {
183 ints = append(ints, uint32(s.Start), uint32(s.End))
184 }
185
186 return toSizedDeltas(ints)
187}
188
189func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
190 sz, m := binary.Uvarint(data)
191 data = data[m:]
192
193 if cap(ds) < int(sz)/2 {
194 ds = make([]DocumentSection, 0, sz/2)
195 } else {
196 ds = ds[:0]
197 }
198
199 // Inlining the delta decoding to avoid unnecessary allocations that would come
200 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
201 var last uint32
202 for len(data) > 0 {
203 var d DocumentSection
204
205 delta, m := binary.Uvarint(data)
206 last += uint32(delta)
207 data = data[m:]
208 d.Start = last
209
210 delta, m = binary.Uvarint(data)
211 last += uint32(delta)
212 data = data[m:]
213 d.End = last
214
215 ds = append(ds, d)
216 }
217 return ds
218}
219
220type ngramSlice []ngram
221
222func (p ngramSlice) Len() int { return len(p) }
223
224func (p ngramSlice) Less(i, j int) bool {
225 return p[i] < p[j]
226}
227
228func (p ngramSlice) Swap(i, j int) {
229 p[i], p[j] = p[j], p[i]
230}
231
232func toSizedDeltas(offsets []uint32) []byte {
233 var enc [8]byte
234
235 deltas := make([]byte, 0, len(offsets)*2)
236
237 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
238 deltas = append(deltas, enc[:m]...)
239
240 var last uint32
241 for _, p := range offsets {
242 delta := p - last
243 last = p
244
245 m := binary.PutUvarint(enc[:], uint64(delta))
246 deltas = append(deltas, enc[:m]...)
247 }
248 return deltas
249}
250
251func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
252 sz, m := binary.Uvarint(data)
253 data = data[m:]
254
255 if cap(ps) < int(sz) {
256 ps = make([]uint32, 0, sz)
257 } else {
258 ps = ps[:0]
259 }
260
261 var last uint32
262 for len(data) > 0 {
263 delta, m := binary.Uvarint(data)
264 offset := last + uint32(delta)
265 last = offset
266 data = data[m:]
267 ps = append(ps, offset)
268 }
269 return ps
270}
271
272func toSizedDeltas16(offsets []uint16) []byte {
273 var enc [8]byte
274
275 deltas := make([]byte, 0, len(offsets)*2)
276
277 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
278 deltas = append(deltas, enc[:m]...)
279
280 var last uint16
281 for _, p := range offsets {
282 delta := p - last
283 last = p
284
285 m := binary.PutUvarint(enc[:], uint64(delta))
286 deltas = append(deltas, enc[:m]...)
287 }
288 return deltas
289}
290
291func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
292 sz, m := binary.Uvarint(data)
293 data = data[m:]
294
295 if cap(ps) < int(sz) {
296 ps = make([]uint16, 0, sz)
297 } else {
298 ps = ps[:0]
299 }
300
301 var last uint16
302 for len(data) > 0 {
303 delta, m := binary.Uvarint(data)
304 offset := last + uint16(delta)
305 last = offset
306 data = data[m:]
307 ps = append(ps, offset)
308 }
309 return ps
310}
311
312func fromDeltas(data []byte, buf []uint32) []uint32 {
313 buf = buf[:0]
314 if cap(buf) < len(data)/2 {
315 buf = make([]uint32, 0, len(data)/2)
316 }
317
318 var last uint32
319 for len(data) > 0 {
320 delta, m := binary.Uvarint(data)
321 offset := last + uint32(delta)
322 last = offset
323 data = data[m:]
324 buf = append(buf, offset)
325 }
326 return buf
327}
328
329type runeOffsetCorrection struct {
330 runeOffset, byteOffset uint32
331}
332
333// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
334// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
335// and otherwise interpolating expected byte offsets as one byte per rune.
336//
337// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
338//
339// This is very rarely a slight pessimization on repos where there are frequent
340// non-ASCII characters.
341type runeOffsetMap []runeOffsetCorrection
342
343// makeRuneOffsetMap converts the mostly-predictable runeOffset input
344// into a shorter form tracking the unexpected values.
345//
346// The input is a sequence of y values that we expect to increase by 100 each,
347// so we just store (x, y) points where the expectation is violated.
348func makeRuneOffsetMap(off []uint32) runeOffsetMap {
349 expected := uint32(0)
350 tmp := []runeOffsetCorrection{}
351 for runeOffset, byteOffset := range off {
352 if byteOffset != expected {
353 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
354 expected = byteOffset
355 }
356 expected += runeOffsetFrequency
357 }
358 // copy the slice to ensure it doesn't waste unused trailing capacity
359 out := make([]runeOffsetCorrection, len(tmp))
360 copy(out, tmp)
361 return runeOffsetMap(out)
362}
363
364// lookup converts rune index `off` to a byte offset and a number of additional
365// runes to traverse, given the granularity of runeOffsetFrequency.
366//
367// It does this by finding the nearest point to interpolate from in the map.
368func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
369 left := runeOffset % runeOffsetFrequency
370 runeOffset -= left
371 slen := len(m)
372 if slen == 0 {
373 return runeOffset, left
374 }
375 // sort.Search finds the *first* index for which the predicate is true,
376 // but we want to find the *last* index for which the predicate is true.
377 // This involves some work to reverse the index directions.
378 idx := sort.Search(slen, func(i int) bool {
379 return runeOffset >= m[slen-1-i].runeOffset
380 })
381 idx = slen - 1 - idx
382 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
383 // than the first entry in the map, so no correction is necessary.
384 byteOff := runeOffset
385 if idx >= 0 {
386 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
387 }
388 return byteOff, left
389}
390
391func (m runeOffsetMap) sizeBytes() int {
392 return 8 * len(m)
393}