bits.go at 12ce07a298aed45c4ee9fa92f9acfdcb83f0836f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at 12ce07a298aed45c4ee9fa92f9acfdcb83f0836f 9.6 kB View raw
Keegan Carruthers-Smith index: experiment to limit ngram lookups for large snippets (#795) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"cmp"
 19	"encoding/binary"
 20	"math"
 21	"sort"
 22	"unicode"
 23	"unicode/utf8"
 24)
 25
 26func generateCaseNgrams(g ngram) []ngram {
 27	asRunes := ngramToRunes(g)
 28
 29	variants := make([]ngram, 0, 8)
 30	cur := asRunes
 31	for {
 32		for i := 0; i < 3; i++ {
 33			next := unicode.SimpleFold(cur[i])
 34			cur[i] = next
 35			if next != asRunes[i] {
 36				break
 37			}
 38		}
 39
 40		variants = append(variants, runesToNGram(cur))
 41		if cur == asRunes {
 42			break
 43		}
 44	}
 45
 46	return variants
 47}
 48
 49func toLower(in []byte) []byte {
 50	out := make([]byte, 0, len(in))
 51	var buf [4]byte
 52	for _, c := range string(in) {
 53		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 54		out = append(out, buf[:i]...)
 55	}
 56	return out
 57}
 58
 59// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 60// be larger than 'lower'. Returns whether there was a match, and if
 61// yes, the byte size of the match.
 62func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 63	matchTotal := 0
 64	for len(lower) > 0 && len(mixed) > 0 {
 65		lr, lsz := utf8.DecodeRune(lower)
 66		lower = lower[lsz:]
 67
 68		mr, msz := utf8.DecodeRune(mixed)
 69		mixed = mixed[msz:]
 70		matchTotal += msz
 71
 72		if lr != unicode.ToLower(mr) {
 73			return 0, false
 74		}
 75	}
 76
 77	return matchTotal, len(lower) == 0
 78}
 79
 80type ngram uint64
 81
 82func runesToNGram(b [ngramSize]rune) ngram {
 83	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 84}
 85
 86func bytesToNGram(b []byte) ngram {
 87	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 88}
 89
 90func stringToNGram(s string) ngram {
 91	return bytesToNGram([]byte(s))
 92}
 93
 94func ngramToBytes(n ngram) []byte {
 95	rs := ngramToRunes(n)
 96	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 97}
 98
 99const runeMask = 1<<21 - 1
100
101func ngramToRunes(n ngram) [ngramSize]rune {
102	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
103}
104
105func (n ngram) String() string {
106	rs := ngramToRunes(n)
107	return string(rs[:])
108}
109
110type runeNgramOff struct {
111	ngram ngram
112	// index is the original index inside of the returned array of splitNGrams
113	index int
114}
115
116func (a runeNgramOff) Compare(b runeNgramOff) int {
117	if a.ngram == b.ngram {
118		return cmp.Compare(a.index, b.index)
119	} else if a.ngram < b.ngram {
120		return -1
121	} else {
122		return 1
123	}
124}
125
126func splitNGrams(str []byte) []runeNgramOff {
127	// len(maxNgrams) >= the number of ngrams in str => no limit
128	return splitNGramsLimit(str, len(str))
129}
130
131func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
132	var runeGram [3]rune
133	var off [3]uint32
134	var runeCount int
135
136	result := make([]runeNgramOff, 0, len(str))
137	var i uint32
138
139	for len(str) > 0 && len(result) < maxNgrams {
140		r, sz := utf8.DecodeRune(str)
141		str = str[sz:]
142		runeGram[0] = runeGram[1]
143		off[0] = off[1]
144		runeGram[1] = runeGram[2]
145		off[1] = off[2]
146		runeGram[2] = r
147		off[2] = uint32(i)
148		i += uint32(sz)
149		runeCount++
150		if runeCount < ngramSize {
151			continue
152		}
153
154		ng := runesToNGram(runeGram)
155		result = append(result, runeNgramOff{
156			ngram: ng,
157			index: len(result),
158		})
159	}
160	return result
161}
162
163const (
164	_classLowerChar int = iota
165	_classUpperChar
166	_classDigit
167	_classPunct
168	_classOther
169	_classSpace
170)
171
172func byteClass(c byte) int {
173	if c >= 'a' && c <= 'z' {
174		return _classLowerChar
175	}
176	if c >= 'A' && c <= 'Z' {
177		return _classUpperChar
178	}
179	if c >= '0' && c <= '9' {
180		return _classDigit
181	}
182
183	switch c {
184	case ' ', '\n':
185		return _classSpace
186	case '.', ',', ';', '"', '\'':
187		return _classPunct
188	default:
189		return _classOther
190	}
191}
192
193func characterClass(c byte) bool {
194	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
195}
196
197func marshalDocSections(secs []DocumentSection) []byte {
198	ints := make([]uint32, 0, len(secs)*2)
199	for _, s := range secs {
200		ints = append(ints, uint32(s.Start), uint32(s.End))
201	}
202
203	return toSizedDeltas(ints)
204}
205
206func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
207	sz, m := binary.Uvarint(data)
208	data = data[m:]
209
210	if cap(ds) < int(sz)/2 {
211		ds = make([]DocumentSection, 0, sz/2)
212	} else {
213		ds = ds[:0]
214	}
215
216	// Inlining the delta decoding to avoid unnecessary allocations that would come
217	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
218	var last uint32
219	for len(data) > 0 {
220		var d DocumentSection
221
222		delta, m := binary.Uvarint(data)
223		last += uint32(delta)
224		data = data[m:]
225		d.Start = last
226
227		delta, m = binary.Uvarint(data)
228		last += uint32(delta)
229		data = data[m:]
230		d.End = last
231
232		ds = append(ds, d)
233	}
234	return ds
235}
236
237type ngramSlice []ngram
238
239func (p ngramSlice) Len() int { return len(p) }
240
241func (p ngramSlice) Less(i, j int) bool {
242	return p[i] < p[j]
243}
244
245func (p ngramSlice) Swap(i, j int) {
246	p[i], p[j] = p[j], p[i]
247}
248
249func toSizedDeltas(offsets []uint32) []byte {
250	var enc [8]byte
251
252	deltas := make([]byte, 0, len(offsets)*2)
253
254	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
255	deltas = append(deltas, enc[:m]...)
256
257	var last uint32
258	for _, p := range offsets {
259		delta := p - last
260		last = p
261
262		m := binary.PutUvarint(enc[:], uint64(delta))
263		deltas = append(deltas, enc[:m]...)
264	}
265	return deltas
266}
267
268func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
269	sz, m := binary.Uvarint(data)
270	data = data[m:]
271
272	if cap(ps) < int(sz) {
273		ps = make([]uint32, 0, sz)
274	} else {
275		ps = ps[:0]
276	}
277
278	var last uint32
279	for len(data) > 0 {
280		delta, m := binary.Uvarint(data)
281		offset := last + uint32(delta)
282		last = offset
283		data = data[m:]
284		ps = append(ps, offset)
285	}
286	return ps
287}
288
289func toSizedDeltas16(offsets []uint16) []byte {
290	var enc [8]byte
291
292	deltas := make([]byte, 0, len(offsets)*2)
293
294	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
295	deltas = append(deltas, enc[:m]...)
296
297	var last uint16
298	for _, p := range offsets {
299		delta := p - last
300		last = p
301
302		m := binary.PutUvarint(enc[:], uint64(delta))
303		deltas = append(deltas, enc[:m]...)
304	}
305	return deltas
306}
307
308func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
309	sz, m := binary.Uvarint(data)
310	data = data[m:]
311
312	if cap(ps) < int(sz) {
313		ps = make([]uint16, 0, sz)
314	} else {
315		ps = ps[:0]
316	}
317
318	var last uint16
319	for len(data) > 0 {
320		delta, m := binary.Uvarint(data)
321		offset := last + uint16(delta)
322		last = offset
323		data = data[m:]
324		ps = append(ps, offset)
325	}
326	return ps
327}
328
329func fromDeltas(data []byte, buf []uint32) []uint32 {
330	buf = buf[:0]
331	if cap(buf) < len(data)/2 {
332		buf = make([]uint32, 0, len(data)/2)
333	}
334
335	var last uint32
336	for len(data) > 0 {
337		delta, m := binary.Uvarint(data)
338		offset := last + uint32(delta)
339		last = offset
340		data = data[m:]
341		buf = append(buf, offset)
342	}
343	return buf
344}
345
346type runeOffsetCorrection struct {
347	runeOffset, byteOffset uint32
348}
349
350// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
351// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
352// and otherwise interpolating expected byte offsets as one byte per rune.
353//
354// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
355//
356// This is very rarely a slight pessimization on repos where there are frequent
357// non-ASCII characters.
358type runeOffsetMap []runeOffsetCorrection
359
360// makeRuneOffsetMap converts the mostly-predictable runeOffset input
361// into a shorter form tracking the unexpected values.
362//
363// The input is a sequence of y values that we expect to increase by 100 each,
364// so we just store (x, y) points where the expectation is violated.
365func makeRuneOffsetMap(off []uint32) runeOffsetMap {
366	expected := uint32(0)
367	tmp := []runeOffsetCorrection{}
368	for runeOffset, byteOffset := range off {
369		if byteOffset != expected {
370			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
371			expected = byteOffset
372		}
373		expected += runeOffsetFrequency
374	}
375	// copy the slice to ensure it doesn't waste unused trailing capacity
376	out := make([]runeOffsetCorrection, len(tmp))
377	copy(out, tmp)
378	return runeOffsetMap(out)
379}
380
381// lookup converts rune index `off` to a byte offset and a number of additional
382// runes to traverse, given the granularity of runeOffsetFrequency.
383//
384// It does this by finding the nearest point to interpolate from in the map.
385func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
386	left := runeOffset % runeOffsetFrequency
387	runeOffset -= left
388	slen := len(m)
389	if slen == 0 {
390		return runeOffset, left
391	}
392	// sort.Search finds the *first* index for which the predicate is true,
393	// but we want to find the *last* index for which the predicate is true.
394	// This involves some work to reverse the index directions.
395	idx := sort.Search(slen, func(i int) bool {
396		return runeOffset >= m[slen-1-i].runeOffset
397	})
398	idx = slen - 1 - idx
399	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
400	// than the first entry in the map, so no correction is necessary.
401	byteOff := runeOffset
402	if idx >= 0 {
403		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
404	}
405	return byteOff, left
406}
407
408func (m runeOffsetMap) sizeBytes() int {
409	return 8 * len(m)
410}
411
412func epsilonEqualsOne(scoreWeight float64) bool {
413	return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
414}
Configure Feed

Configure Feed