bits.go at edf0c8b4e978fbb8ab0428de2b15f077ed777c0d · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at edf0c8b4e978fbb8ab0428de2b15f077ed777c0d 9.2 kB View raw
Keegan Carruthers-Smith matchtree: special case word search (#526) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"sort"
 20	"unicode"
 21	"unicode/utf8"
 22)
 23
 24func generateCaseNgrams(g ngram) []ngram {
 25	asRunes := ngramToRunes(g)
 26
 27	variants := make([]ngram, 0, 8)
 28	cur := asRunes
 29	for {
 30		for i := 0; i < 3; i++ {
 31			next := unicode.SimpleFold(cur[i])
 32			cur[i] = next
 33			if next != asRunes[i] {
 34				break
 35			}
 36		}
 37
 38		variants = append(variants, runesToNGram(cur))
 39		if cur == asRunes {
 40			break
 41		}
 42	}
 43
 44	return variants
 45}
 46
 47func toLower(in []byte) []byte {
 48	out := make([]byte, 0, len(in))
 49	var buf [4]byte
 50	for _, c := range string(in) {
 51		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 52		out = append(out, buf[:i]...)
 53	}
 54	return out
 55}
 56
 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 58// be larger than 'lower'. Returns whether there was a match, and if
 59// yes, the byte size of the match.
 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 61	matchTotal := 0
 62	for len(lower) > 0 && len(mixed) > 0 {
 63		lr, lsz := utf8.DecodeRune(lower)
 64		lower = lower[lsz:]
 65
 66		mr, msz := utf8.DecodeRune(mixed)
 67		mixed = mixed[msz:]
 68		matchTotal += msz
 69
 70		if lr != unicode.ToLower(mr) {
 71			return 0, false
 72		}
 73	}
 74
 75	return matchTotal, len(lower) == 0
 76}
 77
 78type ngram uint64
 79
 80func runesToNGram(b [ngramSize]rune) ngram {
 81	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 82}
 83
 84func bytesToNGram(b []byte) ngram {
 85	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 86}
 87
 88func stringToNGram(s string) ngram {
 89	return bytesToNGram([]byte(s))
 90}
 91
 92func ngramToBytes(n ngram) []byte {
 93	rs := ngramToRunes(n)
 94	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 95}
 96
 97const runeMask = 1<<21 - 1
 98
 99func ngramToRunes(n ngram) [ngramSize]rune {
100	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104	rs := ngramToRunes(n)
105	return string(rs[:])
106}
107
108type runeNgramOff struct {
109	ngram    ngram
110	byteSize uint32 // size of ngram
111	byteOff  uint32
112	runeOff  uint32
113}
114
115func splitNGrams(str []byte) []runeNgramOff {
116	var runeGram [3]rune
117	var off [3]uint32
118	var runeCount int
119
120	result := make([]runeNgramOff, 0, len(str))
121	var i uint32
122
123	chars := -1
124	for len(str) > 0 {
125		chars++
126		r, sz := utf8.DecodeRune(str)
127		str = str[sz:]
128		runeGram[0] = runeGram[1]
129		off[0] = off[1]
130		runeGram[1] = runeGram[2]
131		off[1] = off[2]
132		runeGram[2] = r
133		off[2] = uint32(i)
134		i += uint32(sz)
135		runeCount++
136		if runeCount < ngramSize {
137			continue
138		}
139
140		ng := runesToNGram(runeGram)
141		result = append(result, runeNgramOff{
142			ngram:    ng,
143			byteSize: i - off[0],
144			byteOff:  off[0],
145			runeOff:  uint32(chars),
146		})
147	}
148	return result
149}
150
151const (
152	_classLowerChar int = iota
153	_classUpperChar
154	_classDigit
155	_classPunct
156	_classOther
157	_classSpace
158)
159
160func byteClass(c byte) int {
161	if c >= 'a' && c <= 'z' {
162		return _classLowerChar
163	}
164	if c >= 'A' && c <= 'Z' {
165		return _classUpperChar
166	}
167	if c >= '0' && c <= '9' {
168		return _classDigit
169	}
170
171	switch c {
172	case ' ', '\n':
173		return _classSpace
174	case '.', ',', ';', '"', '\'':
175		return _classPunct
176	default:
177		return _classOther
178	}
179}
180
181func characterClass(c byte) bool {
182	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
183}
184
185func marshalDocSections(secs []DocumentSection) []byte {
186	ints := make([]uint32, 0, len(secs)*2)
187	for _, s := range secs {
188		ints = append(ints, uint32(s.Start), uint32(s.End))
189	}
190
191	return toSizedDeltas(ints)
192}
193
194func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
195	sz, m := binary.Uvarint(data)
196	data = data[m:]
197
198	if cap(ds) < int(sz)/2 {
199		ds = make([]DocumentSection, 0, sz/2)
200	} else {
201		ds = ds[:0]
202	}
203
204	// Inlining the delta decoding to avoid unnecessary allocations that would come
205	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
206	var last uint32
207	for len(data) > 0 {
208		var d DocumentSection
209
210		delta, m := binary.Uvarint(data)
211		last += uint32(delta)
212		data = data[m:]
213		d.Start = last
214
215		delta, m = binary.Uvarint(data)
216		last += uint32(delta)
217		data = data[m:]
218		d.End = last
219
220		ds = append(ds, d)
221	}
222	return ds
223}
224
225type ngramSlice []ngram
226
227func (p ngramSlice) Len() int { return len(p) }
228
229func (p ngramSlice) Less(i, j int) bool {
230	return p[i] < p[j]
231}
232
233func (p ngramSlice) Swap(i, j int) {
234	p[i], p[j] = p[j], p[i]
235}
236
237func toSizedDeltas(offsets []uint32) []byte {
238	var enc [8]byte
239
240	deltas := make([]byte, 0, len(offsets)*2)
241
242	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
243	deltas = append(deltas, enc[:m]...)
244
245	var last uint32
246	for _, p := range offsets {
247		delta := p - last
248		last = p
249
250		m := binary.PutUvarint(enc[:], uint64(delta))
251		deltas = append(deltas, enc[:m]...)
252	}
253	return deltas
254}
255
256func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
257	sz, m := binary.Uvarint(data)
258	data = data[m:]
259
260	if cap(ps) < int(sz) {
261		ps = make([]uint32, 0, sz)
262	} else {
263		ps = ps[:0]
264	}
265
266	var last uint32
267	for len(data) > 0 {
268		delta, m := binary.Uvarint(data)
269		offset := last + uint32(delta)
270		last = offset
271		data = data[m:]
272		ps = append(ps, offset)
273	}
274	return ps
275}
276
277func toSizedDeltas16(offsets []uint16) []byte {
278	var enc [8]byte
279
280	deltas := make([]byte, 0, len(offsets)*2)
281
282	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
283	deltas = append(deltas, enc[:m]...)
284
285	var last uint16
286	for _, p := range offsets {
287		delta := p - last
288		last = p
289
290		m := binary.PutUvarint(enc[:], uint64(delta))
291		deltas = append(deltas, enc[:m]...)
292	}
293	return deltas
294}
295
296func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
297	sz, m := binary.Uvarint(data)
298	data = data[m:]
299
300	if cap(ps) < int(sz) {
301		ps = make([]uint16, 0, sz)
302	} else {
303		ps = ps[:0]
304	}
305
306	var last uint16
307	for len(data) > 0 {
308		delta, m := binary.Uvarint(data)
309		offset := last + uint16(delta)
310		last = offset
311		data = data[m:]
312		ps = append(ps, offset)
313	}
314	return ps
315}
316
317func fromDeltas(data []byte, buf []uint32) []uint32 {
318	buf = buf[:0]
319	if cap(buf) < len(data)/2 {
320		buf = make([]uint32, 0, len(data)/2)
321	}
322
323	var last uint32
324	for len(data) > 0 {
325		delta, m := binary.Uvarint(data)
326		offset := last + uint32(delta)
327		last = offset
328		data = data[m:]
329		buf = append(buf, offset)
330	}
331	return buf
332}
333
334type runeOffsetCorrection struct {
335	runeOffset, byteOffset uint32
336}
337
338// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
339// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
340// and otherwise interpolating expected byte offsets as one byte per rune.
341//
342// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
343//
344// This is very rarely a slight pessimization on repos where there are frequent
345// non-ASCII characters.
346type runeOffsetMap []runeOffsetCorrection
347
348// makeRuneOffsetMap converts the mostly-predictable runeOffset input
349// into a shorter form tracking the unexpected values.
350//
351// The input is a sequence of y values that we expect to increase by 100 each,
352// so we just store (x, y) points where the expectation is violated.
353func makeRuneOffsetMap(off []uint32) runeOffsetMap {
354	expected := uint32(0)
355	tmp := []runeOffsetCorrection{}
356	for runeOffset, byteOffset := range off {
357		if byteOffset != expected {
358			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
359			expected = byteOffset
360		}
361		expected += runeOffsetFrequency
362	}
363	// copy the slice to ensure it doesn't waste unused trailing capacity
364	out := make([]runeOffsetCorrection, len(tmp))
365	copy(out, tmp)
366	return runeOffsetMap(out)
367}
368
369// lookup converts rune index `off` to a byte offset and a number of additional
370// runes to traverse, given the granularity of runeOffsetFrequency.
371//
372// It does this by finding the nearest point to interpolate from in the map.
373func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
374	left := runeOffset % runeOffsetFrequency
375	runeOffset -= left
376	slen := len(m)
377	if slen == 0 {
378		return runeOffset, left
379	}
380	// sort.Search finds the *first* index for which the predicate is true,
381	// but we want to find the *last* index for which the predicate is true.
382	// This involves some work to reverse the index directions.
383	idx := sort.Search(slen, func(i int) bool {
384		return runeOffset >= m[slen-1-i].runeOffset
385	})
386	idx = slen - 1 - idx
387	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
388	// than the first entry in the map, so no correction is necessary.
389	byteOff := runeOffset
390	if idx >= 0 {
391		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
392	}
393	return byteOff, left
394}
395
396func (m runeOffsetMap) sizeBytes() int {
397	return 8 * len(m)
398}
Configure Feed

Configure Feed