bits.go at e068116194eadd9ef4d619fc53371289bf317d58 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at e068116194eadd9ef4d619fc53371289bf317d58 9.1 kB View raw
Keegan Carruthers-Smith sort ngrams before looking them up (#617) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"sort"
 20	"unicode"
 21	"unicode/utf8"
 22)
 23
 24func generateCaseNgrams(g ngram) []ngram {
 25	asRunes := ngramToRunes(g)
 26
 27	variants := make([]ngram, 0, 8)
 28	cur := asRunes
 29	for {
 30		for i := 0; i < 3; i++ {
 31			next := unicode.SimpleFold(cur[i])
 32			cur[i] = next
 33			if next != asRunes[i] {
 34				break
 35			}
 36		}
 37
 38		variants = append(variants, runesToNGram(cur))
 39		if cur == asRunes {
 40			break
 41		}
 42	}
 43
 44	return variants
 45}
 46
 47func toLower(in []byte) []byte {
 48	out := make([]byte, 0, len(in))
 49	var buf [4]byte
 50	for _, c := range string(in) {
 51		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 52		out = append(out, buf[:i]...)
 53	}
 54	return out
 55}
 56
 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 58// be larger than 'lower'. Returns whether there was a match, and if
 59// yes, the byte size of the match.
 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 61	matchTotal := 0
 62	for len(lower) > 0 && len(mixed) > 0 {
 63		lr, lsz := utf8.DecodeRune(lower)
 64		lower = lower[lsz:]
 65
 66		mr, msz := utf8.DecodeRune(mixed)
 67		mixed = mixed[msz:]
 68		matchTotal += msz
 69
 70		if lr != unicode.ToLower(mr) {
 71			return 0, false
 72		}
 73	}
 74
 75	return matchTotal, len(lower) == 0
 76}
 77
 78type ngram uint64
 79
 80func runesToNGram(b [ngramSize]rune) ngram {
 81	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 82}
 83
 84func bytesToNGram(b []byte) ngram {
 85	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 86}
 87
 88func stringToNGram(s string) ngram {
 89	return bytesToNGram([]byte(s))
 90}
 91
 92func ngramToBytes(n ngram) []byte {
 93	rs := ngramToRunes(n)
 94	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 95}
 96
 97const runeMask = 1<<21 - 1
 98
 99func ngramToRunes(n ngram) [ngramSize]rune {
100	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104	rs := ngramToRunes(n)
105	return string(rs[:])
106}
107
108type runeNgramOff struct {
109	ngram ngram
110	// index is the original index inside of the returned array of splitNGrams
111	index uint32
112}
113
114func splitNGrams(str []byte) []runeNgramOff {
115	var runeGram [3]rune
116	var off [3]uint32
117	var runeCount int
118
119	result := make([]runeNgramOff, 0, len(str))
120	var i uint32
121
122	for len(str) > 0 {
123		r, sz := utf8.DecodeRune(str)
124		str = str[sz:]
125		runeGram[0] = runeGram[1]
126		off[0] = off[1]
127		runeGram[1] = runeGram[2]
128		off[1] = off[2]
129		runeGram[2] = r
130		off[2] = uint32(i)
131		i += uint32(sz)
132		runeCount++
133		if runeCount < ngramSize {
134			continue
135		}
136
137		ng := runesToNGram(runeGram)
138		result = append(result, runeNgramOff{
139			ngram: ng,
140			index: uint32(len(result)),
141		})
142	}
143	return result
144}
145
146const (
147	_classLowerChar int = iota
148	_classUpperChar
149	_classDigit
150	_classPunct
151	_classOther
152	_classSpace
153)
154
155func byteClass(c byte) int {
156	if c >= 'a' && c <= 'z' {
157		return _classLowerChar
158	}
159	if c >= 'A' && c <= 'Z' {
160		return _classUpperChar
161	}
162	if c >= '0' && c <= '9' {
163		return _classDigit
164	}
165
166	switch c {
167	case ' ', '\n':
168		return _classSpace
169	case '.', ',', ';', '"', '\'':
170		return _classPunct
171	default:
172		return _classOther
173	}
174}
175
176func characterClass(c byte) bool {
177	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
178}
179
180func marshalDocSections(secs []DocumentSection) []byte {
181	ints := make([]uint32, 0, len(secs)*2)
182	for _, s := range secs {
183		ints = append(ints, uint32(s.Start), uint32(s.End))
184	}
185
186	return toSizedDeltas(ints)
187}
188
189func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
190	sz, m := binary.Uvarint(data)
191	data = data[m:]
192
193	if cap(ds) < int(sz)/2 {
194		ds = make([]DocumentSection, 0, sz/2)
195	} else {
196		ds = ds[:0]
197	}
198
199	// Inlining the delta decoding to avoid unnecessary allocations that would come
200	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
201	var last uint32
202	for len(data) > 0 {
203		var d DocumentSection
204
205		delta, m := binary.Uvarint(data)
206		last += uint32(delta)
207		data = data[m:]
208		d.Start = last
209
210		delta, m = binary.Uvarint(data)
211		last += uint32(delta)
212		data = data[m:]
213		d.End = last
214
215		ds = append(ds, d)
216	}
217	return ds
218}
219
220type ngramSlice []ngram
221
222func (p ngramSlice) Len() int { return len(p) }
223
224func (p ngramSlice) Less(i, j int) bool {
225	return p[i] < p[j]
226}
227
228func (p ngramSlice) Swap(i, j int) {
229	p[i], p[j] = p[j], p[i]
230}
231
232func toSizedDeltas(offsets []uint32) []byte {
233	var enc [8]byte
234
235	deltas := make([]byte, 0, len(offsets)*2)
236
237	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
238	deltas = append(deltas, enc[:m]...)
239
240	var last uint32
241	for _, p := range offsets {
242		delta := p - last
243		last = p
244
245		m := binary.PutUvarint(enc[:], uint64(delta))
246		deltas = append(deltas, enc[:m]...)
247	}
248	return deltas
249}
250
251func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
252	sz, m := binary.Uvarint(data)
253	data = data[m:]
254
255	if cap(ps) < int(sz) {
256		ps = make([]uint32, 0, sz)
257	} else {
258		ps = ps[:0]
259	}
260
261	var last uint32
262	for len(data) > 0 {
263		delta, m := binary.Uvarint(data)
264		offset := last + uint32(delta)
265		last = offset
266		data = data[m:]
267		ps = append(ps, offset)
268	}
269	return ps
270}
271
272func toSizedDeltas16(offsets []uint16) []byte {
273	var enc [8]byte
274
275	deltas := make([]byte, 0, len(offsets)*2)
276
277	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
278	deltas = append(deltas, enc[:m]...)
279
280	var last uint16
281	for _, p := range offsets {
282		delta := p - last
283		last = p
284
285		m := binary.PutUvarint(enc[:], uint64(delta))
286		deltas = append(deltas, enc[:m]...)
287	}
288	return deltas
289}
290
291func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
292	sz, m := binary.Uvarint(data)
293	data = data[m:]
294
295	if cap(ps) < int(sz) {
296		ps = make([]uint16, 0, sz)
297	} else {
298		ps = ps[:0]
299	}
300
301	var last uint16
302	for len(data) > 0 {
303		delta, m := binary.Uvarint(data)
304		offset := last + uint16(delta)
305		last = offset
306		data = data[m:]
307		ps = append(ps, offset)
308	}
309	return ps
310}
311
312func fromDeltas(data []byte, buf []uint32) []uint32 {
313	buf = buf[:0]
314	if cap(buf) < len(data)/2 {
315		buf = make([]uint32, 0, len(data)/2)
316	}
317
318	var last uint32
319	for len(data) > 0 {
320		delta, m := binary.Uvarint(data)
321		offset := last + uint32(delta)
322		last = offset
323		data = data[m:]
324		buf = append(buf, offset)
325	}
326	return buf
327}
328
329type runeOffsetCorrection struct {
330	runeOffset, byteOffset uint32
331}
332
333// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
334// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
335// and otherwise interpolating expected byte offsets as one byte per rune.
336//
337// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
338//
339// This is very rarely a slight pessimization on repos where there are frequent
340// non-ASCII characters.
341type runeOffsetMap []runeOffsetCorrection
342
343// makeRuneOffsetMap converts the mostly-predictable runeOffset input
344// into a shorter form tracking the unexpected values.
345//
346// The input is a sequence of y values that we expect to increase by 100 each,
347// so we just store (x, y) points where the expectation is violated.
348func makeRuneOffsetMap(off []uint32) runeOffsetMap {
349	expected := uint32(0)
350	tmp := []runeOffsetCorrection{}
351	for runeOffset, byteOffset := range off {
352		if byteOffset != expected {
353			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
354			expected = byteOffset
355		}
356		expected += runeOffsetFrequency
357	}
358	// copy the slice to ensure it doesn't waste unused trailing capacity
359	out := make([]runeOffsetCorrection, len(tmp))
360	copy(out, tmp)
361	return runeOffsetMap(out)
362}
363
364// lookup converts rune index `off` to a byte offset and a number of additional
365// runes to traverse, given the granularity of runeOffsetFrequency.
366//
367// It does this by finding the nearest point to interpolate from in the map.
368func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
369	left := runeOffset % runeOffsetFrequency
370	runeOffset -= left
371	slen := len(m)
372	if slen == 0 {
373		return runeOffset, left
374	}
375	// sort.Search finds the *first* index for which the predicate is true,
376	// but we want to find the *last* index for which the predicate is true.
377	// This involves some work to reverse the index directions.
378	idx := sort.Search(slen, func(i int) bool {
379		return runeOffset >= m[slen-1-i].runeOffset
380	})
381	idx = slen - 1 - idx
382	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
383	// than the first entry in the map, so no correction is necessary.
384	byteOff := runeOffset
385	if idx >= 0 {
386		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
387	}
388	return byteOff, left
389}
390
391func (m runeOffsetMap) sizeBytes() int {
392	return 8 * len(m)
393}
Configure Feed

Configure Feed