bits.go at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c 10 kB View raw
Keegan Carruthers-Smith index: use a random sample of ngrams when limiting (#797) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"cmp"
 19	"encoding/binary"
 20	"math"
 21	"math/rand/v2"
 22	"slices"
 23	"sort"
 24	"unicode"
 25	"unicode/utf8"
 26)
 27
 28func generateCaseNgrams(g ngram) []ngram {
 29	asRunes := ngramToRunes(g)
 30
 31	variants := make([]ngram, 0, 8)
 32	cur := asRunes
 33	for {
 34		for i := 0; i < 3; i++ {
 35			next := unicode.SimpleFold(cur[i])
 36			cur[i] = next
 37			if next != asRunes[i] {
 38				break
 39			}
 40		}
 41
 42		variants = append(variants, runesToNGram(cur))
 43		if cur == asRunes {
 44			break
 45		}
 46	}
 47
 48	return variants
 49}
 50
 51func toLower(in []byte) []byte {
 52	out := make([]byte, 0, len(in))
 53	var buf [4]byte
 54	for _, c := range string(in) {
 55		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 56		out = append(out, buf[:i]...)
 57	}
 58	return out
 59}
 60
 61// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 62// be larger than 'lower'. Returns whether there was a match, and if
 63// yes, the byte size of the match.
 64func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 65	matchTotal := 0
 66	for len(lower) > 0 && len(mixed) > 0 {
 67		lr, lsz := utf8.DecodeRune(lower)
 68		lower = lower[lsz:]
 69
 70		mr, msz := utf8.DecodeRune(mixed)
 71		mixed = mixed[msz:]
 72		matchTotal += msz
 73
 74		if lr != unicode.ToLower(mr) {
 75			return 0, false
 76		}
 77	}
 78
 79	return matchTotal, len(lower) == 0
 80}
 81
 82type ngram uint64
 83
 84func runesToNGram(b [ngramSize]rune) ngram {
 85	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 86}
 87
 88func bytesToNGram(b []byte) ngram {
 89	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 90}
 91
 92func stringToNGram(s string) ngram {
 93	return bytesToNGram([]byte(s))
 94}
 95
 96func ngramToBytes(n ngram) []byte {
 97	rs := ngramToRunes(n)
 98	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 99}
100
101const runeMask = 1<<21 - 1
102
103func ngramToRunes(n ngram) [ngramSize]rune {
104	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
105}
106
107func (n ngram) String() string {
108	rs := ngramToRunes(n)
109	return string(rs[:])
110}
111
112type runeNgramOff struct {
113	ngram ngram
114	// index is the original index inside of the returned array of splitNGrams
115	index int
116}
117
118func (a runeNgramOff) Compare(b runeNgramOff) int {
119	if a.ngram == b.ngram {
120		return cmp.Compare(a.index, b.index)
121	} else if a.ngram < b.ngram {
122		return -1
123	} else {
124		return 1
125	}
126}
127
128func splitNGrams(str []byte) []runeNgramOff {
129	// len(maxNgrams) >= the number of ngrams in str => no limit
130	return splitNGramsLimit(str, len(str))
131}
132
133func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
134	var runeGram [3]rune
135	var off [3]uint32
136	var runeCount int
137
138	result := make([]runeNgramOff, 0, len(str))
139	var i uint32
140
141	for len(str) > 0 {
142		r, sz := utf8.DecodeRune(str)
143		str = str[sz:]
144		runeGram[0] = runeGram[1]
145		off[0] = off[1]
146		runeGram[1] = runeGram[2]
147		off[1] = off[2]
148		runeGram[2] = r
149		off[2] = uint32(i)
150		i += uint32(sz)
151		runeCount++
152		if runeCount < ngramSize {
153			continue
154		}
155
156		ng := runesToNGram(runeGram)
157		result = append(result, runeNgramOff{
158			ngram: ng,
159			index: len(result),
160		})
161	}
162
163	// We return a random subset of size maxNgrams. This is to prevent the start
164	// of the string biasing ngram selection.
165	if maxNgrams < len(result) {
166		//  Deterministic seed for tests. Additionally makes comparing repeated
167		//  queries performance easier.
168		r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
169
170		// Pick random subset via a shuffle
171		r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
172		result = result[:maxNgrams]
173
174		// Caller expects ngrams in order of appearance.
175		slices.SortFunc(result, runeNgramOff.Compare)
176	}
177
178	return result
179}
180
181const (
182	_classLowerChar int = iota
183	_classUpperChar
184	_classDigit
185	_classPunct
186	_classOther
187	_classSpace
188)
189
190func byteClass(c byte) int {
191	if c >= 'a' && c <= 'z' {
192		return _classLowerChar
193	}
194	if c >= 'A' && c <= 'Z' {
195		return _classUpperChar
196	}
197	if c >= '0' && c <= '9' {
198		return _classDigit
199	}
200
201	switch c {
202	case ' ', '\n':
203		return _classSpace
204	case '.', ',', ';', '"', '\'':
205		return _classPunct
206	default:
207		return _classOther
208	}
209}
210
211func characterClass(c byte) bool {
212	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
213}
214
215func marshalDocSections(secs []DocumentSection) []byte {
216	ints := make([]uint32, 0, len(secs)*2)
217	for _, s := range secs {
218		ints = append(ints, uint32(s.Start), uint32(s.End))
219	}
220
221	return toSizedDeltas(ints)
222}
223
224func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
225	sz, m := binary.Uvarint(data)
226	data = data[m:]
227
228	if cap(ds) < int(sz)/2 {
229		ds = make([]DocumentSection, 0, sz/2)
230	} else {
231		ds = ds[:0]
232	}
233
234	// Inlining the delta decoding to avoid unnecessary allocations that would come
235	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
236	var last uint32
237	for len(data) > 0 {
238		var d DocumentSection
239
240		delta, m := binary.Uvarint(data)
241		last += uint32(delta)
242		data = data[m:]
243		d.Start = last
244
245		delta, m = binary.Uvarint(data)
246		last += uint32(delta)
247		data = data[m:]
248		d.End = last
249
250		ds = append(ds, d)
251	}
252	return ds
253}
254
255type ngramSlice []ngram
256
257func (p ngramSlice) Len() int { return len(p) }
258
259func (p ngramSlice) Less(i, j int) bool {
260	return p[i] < p[j]
261}
262
263func (p ngramSlice) Swap(i, j int) {
264	p[i], p[j] = p[j], p[i]
265}
266
267func toSizedDeltas(offsets []uint32) []byte {
268	var enc [8]byte
269
270	deltas := make([]byte, 0, len(offsets)*2)
271
272	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
273	deltas = append(deltas, enc[:m]...)
274
275	var last uint32
276	for _, p := range offsets {
277		delta := p - last
278		last = p
279
280		m := binary.PutUvarint(enc[:], uint64(delta))
281		deltas = append(deltas, enc[:m]...)
282	}
283	return deltas
284}
285
286func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
287	sz, m := binary.Uvarint(data)
288	data = data[m:]
289
290	if cap(ps) < int(sz) {
291		ps = make([]uint32, 0, sz)
292	} else {
293		ps = ps[:0]
294	}
295
296	var last uint32
297	for len(data) > 0 {
298		delta, m := binary.Uvarint(data)
299		offset := last + uint32(delta)
300		last = offset
301		data = data[m:]
302		ps = append(ps, offset)
303	}
304	return ps
305}
306
307func toSizedDeltas16(offsets []uint16) []byte {
308	var enc [8]byte
309
310	deltas := make([]byte, 0, len(offsets)*2)
311
312	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
313	deltas = append(deltas, enc[:m]...)
314
315	var last uint16
316	for _, p := range offsets {
317		delta := p - last
318		last = p
319
320		m := binary.PutUvarint(enc[:], uint64(delta))
321		deltas = append(deltas, enc[:m]...)
322	}
323	return deltas
324}
325
326func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
327	sz, m := binary.Uvarint(data)
328	data = data[m:]
329
330	if cap(ps) < int(sz) {
331		ps = make([]uint16, 0, sz)
332	} else {
333		ps = ps[:0]
334	}
335
336	var last uint16
337	for len(data) > 0 {
338		delta, m := binary.Uvarint(data)
339		offset := last + uint16(delta)
340		last = offset
341		data = data[m:]
342		ps = append(ps, offset)
343	}
344	return ps
345}
346
347func fromDeltas(data []byte, buf []uint32) []uint32 {
348	buf = buf[:0]
349	if cap(buf) < len(data)/2 {
350		buf = make([]uint32, 0, len(data)/2)
351	}
352
353	var last uint32
354	for len(data) > 0 {
355		delta, m := binary.Uvarint(data)
356		offset := last + uint32(delta)
357		last = offset
358		data = data[m:]
359		buf = append(buf, offset)
360	}
361	return buf
362}
363
364type runeOffsetCorrection struct {
365	runeOffset, byteOffset uint32
366}
367
368// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
369// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
370// and otherwise interpolating expected byte offsets as one byte per rune.
371//
372// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
373//
374// This is very rarely a slight pessimization on repos where there are frequent
375// non-ASCII characters.
376type runeOffsetMap []runeOffsetCorrection
377
378// makeRuneOffsetMap converts the mostly-predictable runeOffset input
379// into a shorter form tracking the unexpected values.
380//
381// The input is a sequence of y values that we expect to increase by 100 each,
382// so we just store (x, y) points where the expectation is violated.
383func makeRuneOffsetMap(off []uint32) runeOffsetMap {
384	expected := uint32(0)
385	tmp := []runeOffsetCorrection{}
386	for runeOffset, byteOffset := range off {
387		if byteOffset != expected {
388			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
389			expected = byteOffset
390		}
391		expected += runeOffsetFrequency
392	}
393	// copy the slice to ensure it doesn't waste unused trailing capacity
394	out := make([]runeOffsetCorrection, len(tmp))
395	copy(out, tmp)
396	return runeOffsetMap(out)
397}
398
399// lookup converts rune index `off` to a byte offset and a number of additional
400// runes to traverse, given the granularity of runeOffsetFrequency.
401//
402// It does this by finding the nearest point to interpolate from in the map.
403func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
404	left := runeOffset % runeOffsetFrequency
405	runeOffset -= left
406	slen := len(m)
407	if slen == 0 {
408		return runeOffset, left
409	}
410	// sort.Search finds the *first* index for which the predicate is true,
411	// but we want to find the *last* index for which the predicate is true.
412	// This involves some work to reverse the index directions.
413	idx := sort.Search(slen, func(i int) bool {
414		return runeOffset >= m[slen-1-i].runeOffset
415	})
416	idx = slen - 1 - idx
417	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
418	// than the first entry in the map, so no correction is necessary.
419	byteOff := runeOffset
420	if idx >= 0 {
421		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
422	}
423	return byteOff, left
424}
425
426func (m runeOffsetMap) sizeBytes() int {
427	return 8 * len(m)
428}
429
430func epsilonEqualsOne(scoreWeight float64) bool {
431	return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
432}
Configure Feed

Configure Feed