fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "sort"
20 "unicode"
21 "unicode/utf8"
22)
23
24func generateCaseNgrams(g ngram) []ngram {
25 asRunes := ngramToRunes(g)
26
27 variants := make([]ngram, 0, 8)
28 cur := asRunes
29 for {
30 for i := 0; i < 3; i++ {
31 next := unicode.SimpleFold(cur[i])
32 cur[i] = next
33 if next != asRunes[i] {
34 break
35 }
36 }
37
38 variants = append(variants, runesToNGram(cur))
39 if cur == asRunes {
40 break
41 }
42 }
43
44 return variants
45}
46
47func toLower(in []byte) []byte {
48 out := make([]byte, 0, len(in))
49 var buf [4]byte
50 for _, c := range string(in) {
51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
52 out = append(out, buf[:i]...)
53 }
54 return out
55}
56
57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
58// be larger than 'lower'. Returns whether there was a match, and if
59// yes, the byte size of the match.
60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
61 matchTotal := 0
62 for len(lower) > 0 && len(mixed) > 0 {
63 lr, lsz := utf8.DecodeRune(lower)
64 lower = lower[lsz:]
65
66 mr, msz := utf8.DecodeRune(mixed)
67 mixed = mixed[msz:]
68 matchTotal += msz
69
70 if lr != unicode.ToLower(mr) {
71 return 0, false
72 }
73 }
74
75 return matchTotal, len(lower) == 0
76}
77
78type ngram uint64
79
80func runesToNGram(b [ngramSize]rune) ngram {
81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
82}
83
84func bytesToNGram(b []byte) ngram {
85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
86}
87
88func stringToNGram(s string) ngram {
89 return bytesToNGram([]byte(s))
90}
91
92func ngramToBytes(n ngram) []byte {
93 rs := ngramToRunes(n)
94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
95}
96
97const runeMask = 1<<21 - 1
98
99func ngramToRunes(n ngram) [ngramSize]rune {
100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104 rs := ngramToRunes(n)
105 return string(rs[:])
106}
107
108type runeNgramOff struct {
109 ngram ngram
110 byteSize uint32 // size of ngram
111 byteOff uint32
112 runeOff uint32
113}
114
115func splitNGrams(str []byte) []runeNgramOff {
116 var runeGram [3]rune
117 var off [3]uint32
118 var runeCount int
119
120 result := make([]runeNgramOff, 0, len(str))
121 var i uint32
122
123 chars := -1
124 for len(str) > 0 {
125 chars++
126 r, sz := utf8.DecodeRune(str)
127 str = str[sz:]
128 runeGram[0] = runeGram[1]
129 off[0] = off[1]
130 runeGram[1] = runeGram[2]
131 off[1] = off[2]
132 runeGram[2] = r
133 off[2] = uint32(i)
134 i += uint32(sz)
135 runeCount++
136 if runeCount < ngramSize {
137 continue
138 }
139
140 ng := runesToNGram(runeGram)
141 result = append(result, runeNgramOff{
142 ngram: ng,
143 byteSize: i - off[0],
144 byteOff: off[0],
145 runeOff: uint32(chars),
146 })
147 }
148 return result
149}
150
151const (
152 _classLowerChar int = iota
153 _classUpperChar
154 _classDigit
155 _classPunct
156 _classOther
157 _classSpace
158)
159
160func byteClass(c byte) int {
161 if c >= 'a' && c <= 'z' {
162 return _classLowerChar
163 }
164 if c >= 'A' && c <= 'Z' {
165 return _classUpperChar
166 }
167 if c >= '0' && c <= '9' {
168 return _classDigit
169 }
170
171 switch c {
172 case ' ', '\n':
173 return _classSpace
174 case '.', ',', ';', '"', '\'':
175 return _classPunct
176 default:
177 return _classOther
178 }
179}
180
181func characterClass(c byte) bool {
182 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
183}
184
185func marshalDocSections(secs []DocumentSection) []byte {
186 ints := make([]uint32, 0, len(secs)*2)
187 for _, s := range secs {
188 ints = append(ints, uint32(s.Start), uint32(s.End))
189 }
190
191 return toSizedDeltas(ints)
192}
193
194func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
195 sz, m := binary.Uvarint(data)
196 data = data[m:]
197
198 if cap(ds) < int(sz)/2 {
199 ds = make([]DocumentSection, 0, sz/2)
200 } else {
201 ds = ds[:0]
202 }
203
204 // Inlining the delta decoding to avoid unnecessary allocations that would come
205 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
206 var last uint32
207 for len(data) > 0 {
208 var d DocumentSection
209
210 delta, m := binary.Uvarint(data)
211 last += uint32(delta)
212 data = data[m:]
213 d.Start = last
214
215 delta, m = binary.Uvarint(data)
216 last += uint32(delta)
217 data = data[m:]
218 d.End = last
219
220 ds = append(ds, d)
221 }
222 return ds
223}
224
225type ngramSlice []ngram
226
227func (p ngramSlice) Len() int { return len(p) }
228
229func (p ngramSlice) Less(i, j int) bool {
230 return p[i] < p[j]
231}
232
233func (p ngramSlice) Swap(i, j int) {
234 p[i], p[j] = p[j], p[i]
235}
236
237func toSizedDeltas(offsets []uint32) []byte {
238 var enc [8]byte
239
240 deltas := make([]byte, 0, len(offsets)*2)
241
242 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
243 deltas = append(deltas, enc[:m]...)
244
245 var last uint32
246 for _, p := range offsets {
247 delta := p - last
248 last = p
249
250 m := binary.PutUvarint(enc[:], uint64(delta))
251 deltas = append(deltas, enc[:m]...)
252 }
253 return deltas
254}
255
256func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
257 sz, m := binary.Uvarint(data)
258 data = data[m:]
259
260 if cap(ps) < int(sz) {
261 ps = make([]uint32, 0, sz)
262 } else {
263 ps = ps[:0]
264 }
265
266 var last uint32
267 for len(data) > 0 {
268 delta, m := binary.Uvarint(data)
269 offset := last + uint32(delta)
270 last = offset
271 data = data[m:]
272 ps = append(ps, offset)
273 }
274 return ps
275}
276
277func toSizedDeltas16(offsets []uint16) []byte {
278 var enc [8]byte
279
280 deltas := make([]byte, 0, len(offsets)*2)
281
282 m := binary.PutUvarint(enc[:], uint64(len(offsets)))
283 deltas = append(deltas, enc[:m]...)
284
285 var last uint16
286 for _, p := range offsets {
287 delta := p - last
288 last = p
289
290 m := binary.PutUvarint(enc[:], uint64(delta))
291 deltas = append(deltas, enc[:m]...)
292 }
293 return deltas
294}
295
296func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
297 sz, m := binary.Uvarint(data)
298 data = data[m:]
299
300 if cap(ps) < int(sz) {
301 ps = make([]uint16, 0, sz)
302 } else {
303 ps = ps[:0]
304 }
305
306 var last uint16
307 for len(data) > 0 {
308 delta, m := binary.Uvarint(data)
309 offset := last + uint16(delta)
310 last = offset
311 data = data[m:]
312 ps = append(ps, offset)
313 }
314 return ps
315}
316
317func fromDeltas(data []byte, buf []uint32) []uint32 {
318 buf = buf[:0]
319 if cap(buf) < len(data)/2 {
320 buf = make([]uint32, 0, len(data)/2)
321 }
322
323 var last uint32
324 for len(data) > 0 {
325 delta, m := binary.Uvarint(data)
326 offset := last + uint32(delta)
327 last = offset
328 data = data[m:]
329 buf = append(buf, offset)
330 }
331 return buf
332}
333
334type runeOffsetCorrection struct {
335 runeOffset, byteOffset uint32
336}
337
338// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
339// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
340// and otherwise interpolating expected byte offsets as one byte per rune.
341//
342// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
343//
344// This is very rarely a slight pessimization on repos where there are frequent
345// non-ASCII characters.
346type runeOffsetMap []runeOffsetCorrection
347
348// makeRuneOffsetMap converts the mostly-predictable runeOffset input
349// into a shorter form tracking the unexpected values.
350//
351// The input is a sequence of y values that we expect to increase by 100 each,
352// so we just store (x, y) points where the expectation is violated.
353func makeRuneOffsetMap(off []uint32) runeOffsetMap {
354 expected := uint32(0)
355 tmp := []runeOffsetCorrection{}
356 for runeOffset, byteOffset := range off {
357 if byteOffset != expected {
358 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
359 expected = byteOffset
360 }
361 expected += runeOffsetFrequency
362 }
363 // copy the slice to ensure it doesn't waste unused trailing capacity
364 out := make([]runeOffsetCorrection, len(tmp))
365 copy(out, tmp)
366 return runeOffsetMap(out)
367}
368
369// lookup converts rune index `off` to a byte offset and a number of additional
370// runes to traverse, given the granularity of runeOffsetFrequency.
371//
372// It does this by finding the nearest point to interpolate from in the map.
373func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
374 left := runeOffset % runeOffsetFrequency
375 runeOffset -= left
376 slen := len(m)
377 if slen == 0 {
378 return runeOffset, left
379 }
380 // sort.Search finds the *first* index for which the predicate is true,
381 // but we want to find the *last* index for which the predicate is true.
382 // This involves some work to reverse the index directions.
383 idx := sort.Search(slen, func(i int) bool {
384 return runeOffset >= m[slen-1-i].runeOffset
385 })
386 idx = slen - 1 - idx
387 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
388 // than the first entry in the map, so no correction is necessary.
389 byteOff := runeOffset
390 if idx >= 0 {
391 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
392 }
393 return byteOff, left
394}
395
396func (m runeOffsetMap) sizeBytes() int {
397 return 8 * len(m)
398}