bits.go at c301e5c82b6e1632dce5c39902691c359559852e · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / bits.go
at c301e5c82b6e1632dce5c39902691c359559852e 9.3 kB View raw
Stefan Hengl ranking: boost camelCase (#472) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"log"
 20	"sort"
 21	"unicode"
 22	"unicode/utf8"
 23)
 24
 25func generateCaseNgrams(g ngram) []ngram {
 26	asRunes := ngramToRunes(g)
 27
 28	variants := make([]ngram, 0, 8)
 29	cur := asRunes
 30	for {
 31		for i := 0; i < 3; i++ {
 32			next := unicode.SimpleFold(cur[i])
 33			cur[i] = next
 34			if next != asRunes[i] {
 35				break
 36			}
 37		}
 38
 39		variants = append(variants, runesToNGram(cur))
 40		if cur == asRunes {
 41			break
 42		}
 43	}
 44
 45	return variants
 46}
 47
 48func toLower(in []byte) []byte {
 49	out := make([]byte, 0, len(in))
 50	var buf [4]byte
 51	for _, c := range string(in) {
 52		i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
 53		out = append(out, buf[:i]...)
 54	}
 55	return out
 56}
 57
 58// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
 59// be larger than 'lower'. Returns whether there was a match, and if
 60// yes, the byte size of the match.
 61func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
 62	matchTotal := 0
 63	for len(lower) > 0 && len(mixed) > 0 {
 64		lr, lsz := utf8.DecodeRune(lower)
 65		lower = lower[lsz:]
 66
 67		mr, msz := utf8.DecodeRune(mixed)
 68		mixed = mixed[msz:]
 69		matchTotal += msz
 70
 71		if lr != unicode.ToLower(mr) {
 72			return 0, false
 73		}
 74	}
 75
 76	return matchTotal, len(lower) == 0
 77}
 78
 79type ngram uint64
 80
 81func runesToNGram(b [ngramSize]rune) ngram {
 82	return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
 83}
 84
 85func bytesToNGram(b []byte) ngram {
 86	return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
 87}
 88
 89func stringToNGram(s string) ngram {
 90	return bytesToNGram([]byte(s))
 91}
 92
 93func ngramToBytes(n ngram) []byte {
 94	rs := ngramToRunes(n)
 95	return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
 96}
 97
 98const runeMask = 1<<21 - 1
 99
100func ngramToRunes(n ngram) [ngramSize]rune {
101	return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
102}
103
104func (n ngram) String() string {
105	rs := ngramToRunes(n)
106	return string(rs[:])
107}
108
109type runeNgramOff struct {
110	ngram    ngram
111	byteSize uint32 // size of ngram
112	byteOff  uint32
113	runeOff  uint32
114}
115
116func splitNGrams(str []byte) []runeNgramOff {
117	var runeGram [3]rune
118	var off [3]uint32
119	var runeCount int
120
121	result := make([]runeNgramOff, 0, len(str))
122	var i uint32
123
124	chars := -1
125	for len(str) > 0 {
126		chars++
127		r, sz := utf8.DecodeRune(str)
128		str = str[sz:]
129		runeGram[0] = runeGram[1]
130		off[0] = off[1]
131		runeGram[1] = runeGram[2]
132		off[1] = off[2]
133		runeGram[2] = r
134		off[2] = uint32(i)
135		i += uint32(sz)
136		runeCount++
137		if runeCount < ngramSize {
138			continue
139		}
140
141		ng := runesToNGram(runeGram)
142		result = append(result, runeNgramOff{
143			ngram:    ng,
144			byteSize: i - off[0],
145			byteOff:  off[0],
146			runeOff:  uint32(chars),
147		})
148	}
149	return result
150}
151
152const (
153	_classLowerChar int = iota
154	_classUpperChar
155	_classDigit
156	_classPunct
157	_classOther
158	_classSpace
159)
160
161func byteClass(c byte) int {
162	if c >= 'a' && c <= 'z' {
163		return _classLowerChar
164	}
165	if c >= 'A' && c <= 'Z' {
166		return _classUpperChar
167	}
168	if c >= '0' && c <= '9' {
169		return _classDigit
170	}
171
172	switch c {
173	case ' ', '\n':
174		return _classSpace
175	case '.', ',', ';', '"', '\'':
176		return _classPunct
177	default:
178		return _classOther
179	}
180}
181
182func marshalDocSections(secs []DocumentSection) []byte {
183	ints := make([]uint32, 0, len(secs)*2)
184	for _, s := range secs {
185		ints = append(ints, uint32(s.Start), uint32(s.End))
186	}
187
188	return toSizedDeltas(ints)
189}
190
191func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection {
192	// Defensive, this shouldn't happen. While we have the feature flag for lazy
193	// doc section decoding let's be extra defensive.
194	if len(data) == 0 {
195		log.Println("WARN unmarshalDocSections received an empty slice to unmarshal")
196		return nil
197	}
198
199	sz, m := binary.Uvarint(data)
200	data = data[m:]
201
202	if cap(ds) < int(sz)/2 {
203		ds = make([]DocumentSection, 0, sz/2)
204	} else {
205		ds = ds[:0]
206	}
207
208	// Inlining the delta decoding to avoid unnecessary allocations that would come
209	// from the straightforward implementation, i.e. packing the result of fromSizedDeltas.
210	var last uint32
211	for len(data) > 0 {
212		var d DocumentSection
213
214		delta, m := binary.Uvarint(data)
215		last += uint32(delta)
216		data = data[m:]
217		d.Start = last
218
219		delta, m = binary.Uvarint(data)
220		last += uint32(delta)
221		data = data[m:]
222		d.End = last
223
224		ds = append(ds, d)
225	}
226	return ds
227}
228
229type ngramSlice []ngram
230
231func (p ngramSlice) Len() int { return len(p) }
232
233func (p ngramSlice) Less(i, j int) bool {
234	return p[i] < p[j]
235}
236
237func (p ngramSlice) Swap(i, j int) {
238	p[i], p[j] = p[j], p[i]
239}
240
241func toSizedDeltas(offsets []uint32) []byte {
242	var enc [8]byte
243
244	deltas := make([]byte, 0, len(offsets)*2)
245
246	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
247	deltas = append(deltas, enc[:m]...)
248
249	var last uint32
250	for _, p := range offsets {
251		delta := p - last
252		last = p
253
254		m := binary.PutUvarint(enc[:], uint64(delta))
255		deltas = append(deltas, enc[:m]...)
256	}
257	return deltas
258}
259
260func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
261	sz, m := binary.Uvarint(data)
262	data = data[m:]
263
264	if cap(ps) < int(sz) {
265		ps = make([]uint32, 0, sz)
266	} else {
267		ps = ps[:0]
268	}
269
270	var last uint32
271	for len(data) > 0 {
272		delta, m := binary.Uvarint(data)
273		offset := last + uint32(delta)
274		last = offset
275		data = data[m:]
276		ps = append(ps, offset)
277	}
278	return ps
279}
280
281func toSizedDeltas16(offsets []uint16) []byte {
282	var enc [8]byte
283
284	deltas := make([]byte, 0, len(offsets)*2)
285
286	m := binary.PutUvarint(enc[:], uint64(len(offsets)))
287	deltas = append(deltas, enc[:m]...)
288
289	var last uint16
290	for _, p := range offsets {
291		delta := p - last
292		last = p
293
294		m := binary.PutUvarint(enc[:], uint64(delta))
295		deltas = append(deltas, enc[:m]...)
296	}
297	return deltas
298}
299
300func fromSizedDeltas16(data []byte, ps []uint16) []uint16 {
301	sz, m := binary.Uvarint(data)
302	data = data[m:]
303
304	if cap(ps) < int(sz) {
305		ps = make([]uint16, 0, sz)
306	} else {
307		ps = ps[:0]
308	}
309
310	var last uint16
311	for len(data) > 0 {
312		delta, m := binary.Uvarint(data)
313		offset := last + uint16(delta)
314		last = offset
315		data = data[m:]
316		ps = append(ps, offset)
317	}
318	return ps
319}
320
321func fromDeltas(data []byte, buf []uint32) []uint32 {
322	buf = buf[:0]
323	if cap(buf) < len(data)/2 {
324		buf = make([]uint32, 0, len(data)/2)
325	}
326
327	var last uint32
328	for len(data) > 0 {
329		delta, m := binary.Uvarint(data)
330		offset := last + uint32(delta)
331		last = offset
332		data = data[m:]
333		buf = append(buf, offset)
334	}
335	return buf
336}
337
338type runeOffsetCorrection struct {
339	runeOffset, byteOffset uint32
340}
341
342// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency)
343// to byte offsets, by tracking only the points where a span of runes is non-ASCII,
344// and otherwise interpolating expected byte offsets as one byte per rune.
345//
346// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}].
347//
348// This is very rarely a slight pessimization on repos where there are frequent
349// non-ASCII characters.
350type runeOffsetMap []runeOffsetCorrection
351
352// makeRuneOffsetMap converts the mostly-predictable runeOffset input
353// into a shorter form tracking the unexpected values.
354//
355// The input is a sequence of y values that we expect to increase by 100 each,
356// so we just store (x, y) points where the expectation is violated.
357func makeRuneOffsetMap(off []uint32) runeOffsetMap {
358	expected := uint32(0)
359	tmp := []runeOffsetCorrection{}
360	for runeOffset, byteOffset := range off {
361		if byteOffset != expected {
362			tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset})
363			expected = byteOffset
364		}
365		expected += runeOffsetFrequency
366	}
367	// copy the slice to ensure it doesn't waste unused trailing capacity
368	out := make([]runeOffsetCorrection, len(tmp))
369	copy(out, tmp)
370	return runeOffsetMap(out)
371}
372
373// lookup converts rune index `off` to a byte offset and a number of additional
374// runes to traverse, given the granularity of runeOffsetFrequency.
375//
376// It does this by finding the nearest point to interpolate from in the map.
377func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
378	left := runeOffset % runeOffsetFrequency
379	runeOffset -= left
380	slen := len(m)
381	if slen == 0 {
382		return runeOffset, left
383	}
384	// sort.Search finds the *first* index for which the predicate is true,
385	// but we want to find the *last* index for which the predicate is true.
386	// This involves some work to reverse the index directions.
387	idx := sort.Search(slen, func(i int) bool {
388		return runeOffset >= m[slen-1-i].runeOffset
389	})
390	idx = slen - 1 - idx
391	// idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller
392	// than the first entry in the map, so no correction is necessary.
393	byteOff := runeOffset
394	if idx >= 0 {
395		byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset
396	}
397	return byteOff, left
398}
399
400func (m runeOffsetMap) sizeBytes() int {
401	return 8 * len(m)
402}
Configure Feed

Configure Feed