fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "cmp" 19 "encoding/binary" 20 "math" 21 "sort" 22 "unicode" 23 "unicode/utf8" 24) 25 26func generateCaseNgrams(g ngram) []ngram { 27 asRunes := ngramToRunes(g) 28 29 variants := make([]ngram, 0, 8) 30 cur := asRunes 31 for { 32 for i := 0; i < 3; i++ { 33 next := unicode.SimpleFold(cur[i]) 34 cur[i] = next 35 if next != asRunes[i] { 36 break 37 } 38 } 39 40 variants = append(variants, runesToNGram(cur)) 41 if cur == asRunes { 42 break 43 } 44 } 45 46 return variants 47} 48 49func toLower(in []byte) []byte { 50 out := make([]byte, 0, len(in)) 51 var buf [4]byte 52 for _, c := range string(in) { 53 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 54 out = append(out, buf[:i]...) 55 } 56 return out 57} 58 59// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 60// be larger than 'lower'. Returns whether there was a match, and if 61// yes, the byte size of the match. 62func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 63 matchTotal := 0 64 for len(lower) > 0 && len(mixed) > 0 { 65 lr, lsz := utf8.DecodeRune(lower) 66 lower = lower[lsz:] 67 68 mr, msz := utf8.DecodeRune(mixed) 69 mixed = mixed[msz:] 70 matchTotal += msz 71 72 if lr != unicode.ToLower(mr) { 73 return 0, false 74 } 75 } 76 77 return matchTotal, len(lower) == 0 78} 79 80type ngram uint64 81 82func runesToNGram(b [ngramSize]rune) ngram { 83 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 84} 85 86func bytesToNGram(b []byte) ngram { 87 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 88} 89 90func stringToNGram(s string) ngram { 91 return bytesToNGram([]byte(s)) 92} 93 94func ngramToBytes(n ngram) []byte { 95 rs := ngramToRunes(n) 96 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 97} 98 99const runeMask = 1<<21 - 1 100 101func ngramToRunes(n ngram) [ngramSize]rune { 102 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 103} 104 105func (n ngram) String() string { 106 rs := ngramToRunes(n) 107 return string(rs[:]) 108} 109 110type runeNgramOff struct { 111 ngram ngram 112 // index is the original index inside of the returned array of splitNGrams 113 index int 114} 115 116func (a runeNgramOff) Compare(b runeNgramOff) int { 117 if a.ngram == b.ngram { 118 return cmp.Compare(a.index, b.index) 119 } else if a.ngram < b.ngram { 120 return -1 121 } else { 122 return 1 123 } 124} 125 126func splitNGrams(str []byte) []runeNgramOff { 127 // len(maxNgrams) >= the number of ngrams in str => no limit 128 return splitNGramsLimit(str, len(str)) 129} 130 131func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff { 132 var runeGram [3]rune 133 var off [3]uint32 134 var runeCount int 135 136 result := make([]runeNgramOff, 0, len(str)) 137 var i uint32 138 139 for len(str) > 0 && len(result) < maxNgrams { 140 r, sz := utf8.DecodeRune(str) 141 str = str[sz:] 142 runeGram[0] = runeGram[1] 143 off[0] = off[1] 144 runeGram[1] = runeGram[2] 145 off[1] = off[2] 146 runeGram[2] = r 147 off[2] = uint32(i) 148 i += uint32(sz) 149 runeCount++ 150 if runeCount < ngramSize { 151 continue 152 } 153 154 ng := runesToNGram(runeGram) 155 result = append(result, runeNgramOff{ 156 ngram: ng, 157 index: len(result), 158 }) 159 } 160 return result 161} 162 163const ( 164 _classLowerChar int = iota 165 _classUpperChar 166 _classDigit 167 _classPunct 168 _classOther 169 _classSpace 170) 171 172func byteClass(c byte) int { 173 if c >= 'a' && c <= 'z' { 174 return _classLowerChar 175 } 176 if c >= 'A' && c <= 'Z' { 177 return _classUpperChar 178 } 179 if c >= '0' && c <= '9' { 180 return _classDigit 181 } 182 183 switch c { 184 case ' ', '\n': 185 return _classSpace 186 case '.', ',', ';', '"', '\'': 187 return _classPunct 188 default: 189 return _classOther 190 } 191} 192 193func characterClass(c byte) bool { 194 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 195} 196 197func marshalDocSections(secs []DocumentSection) []byte { 198 ints := make([]uint32, 0, len(secs)*2) 199 for _, s := range secs { 200 ints = append(ints, uint32(s.Start), uint32(s.End)) 201 } 202 203 return toSizedDeltas(ints) 204} 205 206func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 207 sz, m := binary.Uvarint(data) 208 data = data[m:] 209 210 if cap(ds) < int(sz)/2 { 211 ds = make([]DocumentSection, 0, sz/2) 212 } else { 213 ds = ds[:0] 214 } 215 216 // Inlining the delta decoding to avoid unnecessary allocations that would come 217 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 218 var last uint32 219 for len(data) > 0 { 220 var d DocumentSection 221 222 delta, m := binary.Uvarint(data) 223 last += uint32(delta) 224 data = data[m:] 225 d.Start = last 226 227 delta, m = binary.Uvarint(data) 228 last += uint32(delta) 229 data = data[m:] 230 d.End = last 231 232 ds = append(ds, d) 233 } 234 return ds 235} 236 237type ngramSlice []ngram 238 239func (p ngramSlice) Len() int { return len(p) } 240 241func (p ngramSlice) Less(i, j int) bool { 242 return p[i] < p[j] 243} 244 245func (p ngramSlice) Swap(i, j int) { 246 p[i], p[j] = p[j], p[i] 247} 248 249func toSizedDeltas(offsets []uint32) []byte { 250 var enc [8]byte 251 252 deltas := make([]byte, 0, len(offsets)*2) 253 254 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 255 deltas = append(deltas, enc[:m]...) 256 257 var last uint32 258 for _, p := range offsets { 259 delta := p - last 260 last = p 261 262 m := binary.PutUvarint(enc[:], uint64(delta)) 263 deltas = append(deltas, enc[:m]...) 264 } 265 return deltas 266} 267 268func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 269 sz, m := binary.Uvarint(data) 270 data = data[m:] 271 272 if cap(ps) < int(sz) { 273 ps = make([]uint32, 0, sz) 274 } else { 275 ps = ps[:0] 276 } 277 278 var last uint32 279 for len(data) > 0 { 280 delta, m := binary.Uvarint(data) 281 offset := last + uint32(delta) 282 last = offset 283 data = data[m:] 284 ps = append(ps, offset) 285 } 286 return ps 287} 288 289func toSizedDeltas16(offsets []uint16) []byte { 290 var enc [8]byte 291 292 deltas := make([]byte, 0, len(offsets)*2) 293 294 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 295 deltas = append(deltas, enc[:m]...) 296 297 var last uint16 298 for _, p := range offsets { 299 delta := p - last 300 last = p 301 302 m := binary.PutUvarint(enc[:], uint64(delta)) 303 deltas = append(deltas, enc[:m]...) 304 } 305 return deltas 306} 307 308func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 309 sz, m := binary.Uvarint(data) 310 data = data[m:] 311 312 if cap(ps) < int(sz) { 313 ps = make([]uint16, 0, sz) 314 } else { 315 ps = ps[:0] 316 } 317 318 var last uint16 319 for len(data) > 0 { 320 delta, m := binary.Uvarint(data) 321 offset := last + uint16(delta) 322 last = offset 323 data = data[m:] 324 ps = append(ps, offset) 325 } 326 return ps 327} 328 329func fromDeltas(data []byte, buf []uint32) []uint32 { 330 buf = buf[:0] 331 if cap(buf) < len(data)/2 { 332 buf = make([]uint32, 0, len(data)/2) 333 } 334 335 var last uint32 336 for len(data) > 0 { 337 delta, m := binary.Uvarint(data) 338 offset := last + uint32(delta) 339 last = offset 340 data = data[m:] 341 buf = append(buf, offset) 342 } 343 return buf 344} 345 346type runeOffsetCorrection struct { 347 runeOffset, byteOffset uint32 348} 349 350// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 351// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 352// and otherwise interpolating expected byte offsets as one byte per rune. 353// 354// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 355// 356// This is very rarely a slight pessimization on repos where there are frequent 357// non-ASCII characters. 358type runeOffsetMap []runeOffsetCorrection 359 360// makeRuneOffsetMap converts the mostly-predictable runeOffset input 361// into a shorter form tracking the unexpected values. 362// 363// The input is a sequence of y values that we expect to increase by 100 each, 364// so we just store (x, y) points where the expectation is violated. 365func makeRuneOffsetMap(off []uint32) runeOffsetMap { 366 expected := uint32(0) 367 tmp := []runeOffsetCorrection{} 368 for runeOffset, byteOffset := range off { 369 if byteOffset != expected { 370 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 371 expected = byteOffset 372 } 373 expected += runeOffsetFrequency 374 } 375 // copy the slice to ensure it doesn't waste unused trailing capacity 376 out := make([]runeOffsetCorrection, len(tmp)) 377 copy(out, tmp) 378 return runeOffsetMap(out) 379} 380 381// lookup converts rune index `off` to a byte offset and a number of additional 382// runes to traverse, given the granularity of runeOffsetFrequency. 383// 384// It does this by finding the nearest point to interpolate from in the map. 385func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 386 left := runeOffset % runeOffsetFrequency 387 runeOffset -= left 388 slen := len(m) 389 if slen == 0 { 390 return runeOffset, left 391 } 392 // sort.Search finds the *first* index for which the predicate is true, 393 // but we want to find the *last* index for which the predicate is true. 394 // This involves some work to reverse the index directions. 395 idx := sort.Search(slen, func(i int) bool { 396 return runeOffset >= m[slen-1-i].runeOffset 397 }) 398 idx = slen - 1 - idx 399 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 400 // than the first entry in the map, so no correction is necessary. 401 byteOff := runeOffset 402 if idx >= 0 { 403 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 404 } 405 return byteOff, left 406} 407 408func (m runeOffsetMap) sizeBytes() int { 409 return 8 * len(m) 410} 411 412func epsilonEqualsOne(scoreWeight float64) bool { 413 return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9 414}