fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "sort"
20 "unicode"
21 "unicode/utf8"
22)
23
24func generateCaseNgrams(g ngram) []ngram {
25 asRunes := ngramToRunes(g)
26
27 variants := make([]ngram, 0, 8)
28 cur := asRunes
29 for {
30 for i := 0; i < 3; i++ {
31 next := unicode.SimpleFold(cur[i])
32 cur[i] = next
33 if next != asRunes[i] {
34 break
35 }
36 }
37
38 variants = append(variants, runesToNGram(cur))
39 if cur == asRunes {
40 break
41 }
42 }
43
44 return variants
45}
46
47func toLower(in []byte) []byte {
48 out := make([]byte, 0, len(in))
49 var buf [4]byte
50 for _, c := range string(in) {
51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
52 out = append(out, buf[:i]...)
53 }
54 return out
55}
56
57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
58// be larger than 'lower'. Returns whether there was a match, and if
59// yes, the byte size of the match.
60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
61 matchTotal := 0
62 for len(lower) > 0 && len(mixed) > 0 {
63 lr, lsz := utf8.DecodeRune(lower)
64 lower = lower[lsz:]
65
66 mr, msz := utf8.DecodeRune(mixed)
67 mixed = mixed[msz:]
68 matchTotal += msz
69
70 if lr != unicode.ToLower(mr) {
71 return 0, false
72 }
73 }
74
75 return matchTotal, len(lower) == 0
76}
77
78type ngram uint64
79
80func runesToNGram(b [ngramSize]rune) ngram {
81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
82}
83
84func bytesToNGram(b []byte) ngram {
85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
86}
87
88func stringToNGram(s string) ngram {
89 return bytesToNGram([]byte(s))
90}
91
92func ngramToBytes(n ngram) []byte {
93 rs := ngramToRunes(n)
94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
95}
96
97const runeMask = 1<<21 - 1
98
99func ngramToRunes(n ngram) [ngramSize]rune {
100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104 rs := ngramToRunes(n)
105 return string(rs[:])
106}
107
108type runeNgramOff struct {
109 ngram ngram
110 byteSize uint32 // size of ngram
111 byteOff uint32
112 runeOff uint32
113}
114
115func splitNGrams(str []byte) []runeNgramOff {
116 var runeGram [3]rune
117 var off [3]uint32
118 var runeCount int
119
120 result := make([]runeNgramOff, 0, len(str))
121 var i uint32
122
123 chars := -1
124 for len(str) > 0 {
125 chars++
126 r, sz := utf8.DecodeRune(str)
127 str = str[sz:]
128 runeGram[0] = runeGram[1]
129 off[0] = off[1]
130 runeGram[1] = runeGram[2]
131 off[1] = off[2]
132 runeGram[2] = r
133 off[2] = uint32(i)
134 i += uint32(sz)
135 runeCount++
136 if runeCount < ngramSize {
137 continue
138 }
139
140 ng := runesToNGram(runeGram)
141 result = append(result, runeNgramOff{
142 ngram: ng,
143 byteSize: i - off[0],
144 byteOff: off[0],
145 runeOff: uint32(chars),
146 })
147 }
148 return result
149}
150
151const (
152 _classLowerChar int = iota
153 _classUpperChar
154 _classDigit
155 _classPunct
156 _classOther
157 _classSpace
158)
159
160func byteClass(c byte) int {
161 if c >= 'a' && c <= 'z' {
162 return _classLowerChar
163 }
164 if c >= 'A' && c <= 'Z' {
165 return _classUpperChar
166 }
167 if c >= '0' && c <= '9' {
168 return _classDigit
169 }
170
171 switch c {
172 case ' ', '\n':
173 return _classSpace
174 case '.', ',', ';', '"', '\'':
175 return _classPunct
176 default:
177 return _classOther
178 }
179}
180
181func marshalDocSections(secs []DocumentSection) []byte {
182 ints := make([]uint32, 0, len(secs)*2)
183 for _, s := range secs {
184 ints = append(ints, uint32(s.Start), uint32(s.End))
185 }
186
187 return toSizedDeltas(ints)
188}
189
190func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
191 sz, m := binary.Uvarint(data)
192 data = data[m:]
193
194 if cap(ds) < int(sz)/2 {
195 ds = make([]DocumentSection, 0, sz/2)
196 } else {
197 ds = ds[:0]
198 }
199
200 // Inlining the delta decoding to avoid unnecessary allocations that would come
201 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
202 var last uint32
203 for len(data) > 0 {
204 var d DocumentSection
205
206 delta, m := binary.Uvarint(data)
207 last += uint32(delta)
208 data = data[m:]
209 d.Start = last
210
211 delta, m = binary.Uvarint(data)
212 last += uint32(delta)
213 data = data[m:]
214 d.End = last
215
216 ds = append(ds, d)
217 }
218 return ds
219}
220
221type ngramSlice []ngram
222
223func (p ngramSlice) Len() int { return len(p) }
224
225func (p ngramSlice) Less(i, j int) bool {
226 return p[i] < p[j]
227}
228
229func (p ngramSlice) Swap(i, j int) {
230 p[i], p[j] = p[j], p[i]
231}
232
233func toSizedDeltas(offsets []uint32) []byte {
234 var enc [8]byte
235
236 deltas := make([]byte, 0, len(offsets)*2)
237
238 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
239 deltas = append(deltas, enc[:m]...)
240
241 var last uint32
242 for _, p := range offsets {
243 delta := p - last
244 last = p
245
246 m := binary.PutUvarint(enc[:], uint64(delta))
247 deltas = append(deltas, enc[:m]...)
248 }
249 return deltas
250}
251
252func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
253 sz, m := binary.Uvarint(data)
254 data = data[m:]
255
256 if cap(ps) < int(sz) {
257 ps = make([]uint32, 0, sz)
258 } else {
259 ps = ps[:0]
260 }
261
262 var last uint32
263 for len(data) > 0 {
264 delta, m := binary.Uvarint(data)
265 offset := last + uint32(delta)
266 last = offset
267 data = data[m:]
268 ps = append(ps, offset)
269 }
270 return ps
271}
272
273func toSizedDeltas16(offsets []uint16) []byte {
274 var enc [8]byte
275
276 deltas := make([]byte, 0, len(offsets)*2)
277
278 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
279 deltas = append(deltas, enc[:m]...)
280
281 var last uint16
282 for _, p := range offsets {
283 delta := p - last
284 last = p
285
286 m := binary.PutUvarint(enc[:], uint64(delta))
287 deltas = append(deltas, enc[:m]...)
288 }
289 return deltas
290}
291
292func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
293 sz, m := binary.Uvarint(data)
294 data = data[m:]
295
296 if cap(ps) < int(sz) {
297 ps = make([]uint16, 0, sz)
298 } else {
299 ps = ps[:0]
300 }
301
302 var last uint16
303 for len(data) > 0 {
304 delta, m := binary.Uvarint(data)
305 offset := last + uint16(delta)
306 last = offset
307 data = data[m:]
308 ps = append(ps, offset)
309 }
310 return ps
311}
312
313func fromDeltas(data []byte, buf []uint32) []uint32 {
314 buf = buf[:0]
315 if cap(buf) < len(data)/2 {
316 buf = make([]uint32, 0, len(data)/2)
317 }
318
319 var last uint32
320 for len(data) > 0 {
321 delta, m := binary.Uvarint(data)
322 offset := last + uint32(delta)
323 last = offset
324 data = data[m:]
325 buf = append(buf, offset)
326 }
327 return buf
328}
329
330type runeOffsetCorrection struct {
331 runeOffset, byteOffset uint32
332}
333
334// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
335// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
336// and otherwise interpolating expected byte offsets as one byte per rune.
337//
338// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
339//
340// This is very rarely a slight pessimization on repos where there are frequent
341// non-ASCII characters.
342type runeOffsetMap []runeOffsetCorrection
343
344// makeRuneOffsetMap converts the mostly-predictable runeOffset input
345// into a shorter form tracking the unexpected values.
346//
347// The input is a sequence of y values that we expect to increase by 100 each,
348// so we just store (x, y) points where the expectation is violated.
349func makeRuneOffsetMap(off []uint32) runeOffsetMap {
350 expected := uint32(0)
351 tmp := []runeOffsetCorrection{}
352 for runeOffset, byteOffset := range off {
353 if byteOffset != expected {
354 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
355 expected = byteOffset
356 }
357 expected += runeOffsetFrequency
358 }
359 // copy the slice to ensure it doesn't waste unused trailing capacity
360 out := make([]runeOffsetCorrection, len(tmp))
361 copy(out, tmp)
362 return runeOffsetMap(out)
363}
364
365// lookup converts rune index `off` to a byte offset and a number of additional
366// runes to traverse, given the granularity of runeOffsetFrequency.
367//
368// It does this by finding the nearest point to interpolate from in the map.
369func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
370 left := runeOffset % runeOffsetFrequency
371 runeOffset -= left
372 slen := len(m)
373 if slen == 0 {
374 return runeOffset, left
375 }
376 // sort.Search finds the *first* index for which the predicate is true,
377 // but we want to find the *last* index for which the predicate is true.
378 // This involves some work to reverse the index directions.
379 idx := sort.Search(slen, func(i int) bool {
380 return runeOffset >= m[slen-1-i].runeOffset
381 })
382 idx = slen - 1 - idx
383 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
384 // than the first entry in the map, so no correction is necessary.
385 byteOff := runeOffset
386 if idx >= 0 {
387 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
388 }
389 return byteOff, left
390}
391
392func (m runeOffsetMap) sizeBytes() int {
393 return 8 * len(m)
394}