bits.go at a303c6c51d3b753afa39bf7ccf339f1743f341f2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at a303c6c51d3b753afa39bf7ccf339f1743f341f2 9.1 kB View raw
Keegan Carruthers-Smith default to eager runeDocSection (#503) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"sort"
 20	"unicode"
 21	"unicode/utf8"
 22)
 23
 24func generateCaseNgrams(g ngram) []ngram {
 25	asRunes := ngramToRunes(g)
 26
 27	variants := make([]ngram, 0, 8)
 28	cur := asRunes
 29	for {
 30		for i := 0; i < 3; i++ {
 31			next := unicode.SimpleFold(cur[i])
 32			cur[i] = next
 33			if next != asRunes[i] {
 34				break
 35			}
 36		}
 37
 38		variants = append(variants, runesToNGram(cur))
 39		if cur == asRunes {
 40			break
 41		}
 42	}
 43
 44	return variants
 45}
 46
 47func toLower(in []byte) []byte {
 48	out := make([]byte, 0, len(in))
 49	var buf [4]byte
 50	for _, c := range string(in) {
 51		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 52		out = append(out, buf[:i]...)
 53	}
 54	return out
 55}
 56
 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 58// be larger than 'lower'. Returns whether there was a match, and if
 59// yes, the byte size of the match.
 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 61	matchTotal := 0
 62	for len(lower) > 0 && len(mixed) > 0 {
 63		lr, lsz := utf8.DecodeRune(lower)
 64		lower = lower[lsz:]
 65
 66		mr, msz := utf8.DecodeRune(mixed)
 67		mixed = mixed[msz:]
 68		matchTotal += msz
 69
 70		if lr != unicode.ToLower(mr) {
 71			return 0, false
 72		}
 73	}
 74
 75	return matchTotal, len(lower) == 0
 76}
 77
 78type ngram uint64
 79
 80func runesToNGram(b [ngramSize]rune) ngram {
 81	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 82}
 83
 84func bytesToNGram(b []byte) ngram {
 85	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 86}
 87
 88func stringToNGram(s string) ngram {
 89	return bytesToNGram([]byte(s))
 90}
 91
 92func ngramToBytes(n ngram) []byte {
 93	rs := ngramToRunes(n)
 94	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 95}
 96
 97const runeMask = 1<<21 - 1
 98
 99func ngramToRunes(n ngram) [ngramSize]rune {
100	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
101}
102
103func (n ngram) String() string {
104	rs := ngramToRunes(n)
105	return string(rs[:])
106}
107
108type runeNgramOff struct {
109	ngram    ngram
110	byteSize uint32 // size of ngram
111	byteOff  uint32
112	runeOff  uint32
113}
114
115func splitNGrams(str []byte) []runeNgramOff {
116	var runeGram [3]rune
117	var off [3]uint32
118	var runeCount int
119
120	result := make([]runeNgramOff, 0, len(str))
121	var i uint32
122
123	chars := -1
124	for len(str) > 0 {
125		chars++
126		r, sz := utf8.DecodeRune(str)
127		str = str[sz:]
128		runeGram[0] = runeGram[1]
129		off[0] = off[1]
130		runeGram[1] = runeGram[2]
131		off[1] = off[2]
132		runeGram[2] = r
133		off[2] = uint32(i)
134		i += uint32(sz)
135		runeCount++
136		if runeCount < ngramSize {
137			continue
138		}
139
140		ng := runesToNGram(runeGram)
141		result = append(result, runeNgramOff{
142			ngram:    ng,
143			byteSize: i - off[0],
144			byteOff:  off[0],
145			runeOff:  uint32(chars),
146		})
147	}
148	return result
149}
150
151const (
152	_classLowerChar int = iota
153	_classUpperChar
154	_classDigit
155	_classPunct
156	_classOther
157	_classSpace
158)
159
160func byteClass(c byte) int {
161	if c >= 'a' && c <= 'z' {
162		return _classLowerChar
163	}
164	if c >= 'A' && c <= 'Z' {
165		return _classUpperChar
166	}
167	if c >= '0' && c <= '9' {
168		return _classDigit
169	}
170
171	switch c {
172	case ' ', '\n':
173		return _classSpace
174	case '.', ',', ';', '"', '\'':
175		return _classPunct
176	default:
177		return _classOther
178	}
179}
180
181func marshalDocSections(secs []DocumentSection) []byte {
182	ints := make([]uint32, 0, len(secs)*2)
183	for _, s := range secs {
184		ints = append(ints, uint32(s.Start), uint32(s.End))
185	}
186
187	return toSizedDeltas(ints)
188}
189
190func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
191	sz, m := binary.Uvarint(data)
192	data = data[m:]
193
194	if cap(ds) < int(sz)/2 {
195		ds = make([]DocumentSection, 0, sz/2)
196	} else {
197		ds = ds[:0]
198	}
199
200	// Inlining the delta decoding to avoid unnecessary allocations that would come
201	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
202	var last uint32
203	for len(data) > 0 {
204		var d DocumentSection
205
206		delta, m := binary.Uvarint(data)
207		last += uint32(delta)
208		data = data[m:]
209		d.Start = last
210
211		delta, m = binary.Uvarint(data)
212		last += uint32(delta)
213		data = data[m:]
214		d.End = last
215
216		ds = append(ds, d)
217	}
218	return ds
219}
220
221type ngramSlice []ngram
222
223func (p ngramSlice) Len() int { return len(p) }
224
225func (p ngramSlice) Less(i, j int) bool {
226	return p[i] < p[j]
227}
228
229func (p ngramSlice) Swap(i, j int) {
230	p[i], p[j] = p[j], p[i]
231}
232
233func toSizedDeltas(offsets []uint32) []byte {
234	var enc [8]byte
235
236	deltas := make([]byte, 0, len(offsets)*2)
237
238	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
239	deltas = append(deltas, enc[:m]...)
240
241	var last uint32
242	for _, p := range offsets {
243		delta := p - last
244		last = p
245
246		m := binary.PutUvarint(enc[:], uint64(delta))
247		deltas = append(deltas, enc[:m]...)
248	}
249	return deltas
250}
251
252func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
253	sz, m := binary.Uvarint(data)
254	data = data[m:]
255
256	if cap(ps) < int(sz) {
257		ps = make([]uint32, 0, sz)
258	} else {
259		ps = ps[:0]
260	}
261
262	var last uint32
263	for len(data) > 0 {
264		delta, m := binary.Uvarint(data)
265		offset := last + uint32(delta)
266		last = offset
267		data = data[m:]
268		ps = append(ps, offset)
269	}
270	return ps
271}
272
273func toSizedDeltas16(offsets []uint16) []byte {
274	var enc [8]byte
275
276	deltas := make([]byte, 0, len(offsets)*2)
277
278	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
279	deltas = append(deltas, enc[:m]...)
280
281	var last uint16
282	for _, p := range offsets {
283		delta := p - last
284		last = p
285
286		m := binary.PutUvarint(enc[:], uint64(delta))
287		deltas = append(deltas, enc[:m]...)
288	}
289	return deltas
290}
291
292func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
293	sz, m := binary.Uvarint(data)
294	data = data[m:]
295
296	if cap(ps) < int(sz) {
297		ps = make([]uint16, 0, sz)
298	} else {
299		ps = ps[:0]
300	}
301
302	var last uint16
303	for len(data) > 0 {
304		delta, m := binary.Uvarint(data)
305		offset := last + uint16(delta)
306		last = offset
307		data = data[m:]
308		ps = append(ps, offset)
309	}
310	return ps
311}
312
313func fromDeltas(data []byte, buf []uint32) []uint32 {
314	buf = buf[:0]
315	if cap(buf) < len(data)/2 {
316		buf = make([]uint32, 0, len(data)/2)
317	}
318
319	var last uint32
320	for len(data) > 0 {
321		delta, m := binary.Uvarint(data)
322		offset := last + uint32(delta)
323		last = offset
324		data = data[m:]
325		buf = append(buf, offset)
326	}
327	return buf
328}
329
330type runeOffsetCorrection struct {
331	runeOffset, byteOffset uint32
332}
333
334// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
335// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
336// and otherwise interpolating expected byte offsets as one byte per rune.
337//
338// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
339//
340// This is very rarely a slight pessimization on repos where there are frequent
341// non-ASCII characters.
342type runeOffsetMap []runeOffsetCorrection
343
344// makeRuneOffsetMap converts the mostly-predictable runeOffset input
345// into a shorter form tracking the unexpected values.
346//
347// The input is a sequence of y values that we expect to increase by 100 each,
348// so we just store (x, y) points where the expectation is violated.
349func makeRuneOffsetMap(off []uint32) runeOffsetMap {
350	expected := uint32(0)
351	tmp := []runeOffsetCorrection{}
352	for runeOffset, byteOffset := range off {
353		if byteOffset != expected {
354			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
355			expected = byteOffset
356		}
357		expected += runeOffsetFrequency
358	}
359	// copy the slice to ensure it doesn't waste unused trailing capacity
360	out := make([]runeOffsetCorrection, len(tmp))
361	copy(out, tmp)
362	return runeOffsetMap(out)
363}
364
365// lookup converts rune index `off` to a byte offset and a number of additional
366// runes to traverse, given the granularity of runeOffsetFrequency.
367//
368// It does this by finding the nearest point to interpolate from in the map.
369func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
370	left := runeOffset % runeOffsetFrequency
371	runeOffset -= left
372	slen := len(m)
373	if slen == 0 {
374		return runeOffset, left
375	}
376	// sort.Search finds the *first* index for which the predicate is true,
377	// but we want to find the *last* index for which the predicate is true.
378	// This involves some work to reverse the index directions.
379	idx := sort.Search(slen, func(i int) bool {
380		return runeOffset >= m[slen-1-i].runeOffset
381	})
382	idx = slen - 1 - idx
383	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
384	// than the first entry in the map, so no correction is necessary.
385	byteOff := runeOffset
386	if idx >= 0 {
387		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
388	}
389	return byteOff, left
390}
391
392func (m runeOffsetMap) sizeBytes() int {
393	return 8 * len(m)
394}
Configure Feed

Configure Feed