fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "cmp" 19 "encoding/binary" 20 "math" 21 "math/rand/v2" 22 "slices" 23 "sort" 24 "unicode" 25 "unicode/utf8" 26) 27 28func generateCaseNgrams(g ngram) []ngram { 29 asRunes := ngramToRunes(g) 30 31 variants := make([]ngram, 0, 8) 32 cur := asRunes 33 for { 34 for i := 0; i < 3; i++ { 35 next := unicode.SimpleFold(cur[i]) 36 cur[i] = next 37 if next != asRunes[i] { 38 break 39 } 40 } 41 42 variants = append(variants, runesToNGram(cur)) 43 if cur == asRunes { 44 break 45 } 46 } 47 48 return variants 49} 50 51func toLower(in []byte) []byte { 52 out := make([]byte, 0, len(in)) 53 var buf [4]byte 54 for _, c := range string(in) { 55 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 56 out = append(out, buf[:i]...) 57 } 58 return out 59} 60 61// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 62// be larger than 'lower'. Returns whether there was a match, and if 63// yes, the byte size of the match. 64func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 65 matchTotal := 0 66 for len(lower) > 0 && len(mixed) > 0 { 67 lr, lsz := utf8.DecodeRune(lower) 68 lower = lower[lsz:] 69 70 mr, msz := utf8.DecodeRune(mixed) 71 mixed = mixed[msz:] 72 matchTotal += msz 73 74 if lr != unicode.ToLower(mr) { 75 return 0, false 76 } 77 } 78 79 return matchTotal, len(lower) == 0 80} 81 82type ngram uint64 83 84func runesToNGram(b [ngramSize]rune) ngram { 85 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 86} 87 88func bytesToNGram(b []byte) ngram { 89 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 90} 91 92func stringToNGram(s string) ngram { 93 return bytesToNGram([]byte(s)) 94} 95 96func ngramToBytes(n ngram) []byte { 97 rs := ngramToRunes(n) 98 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 99} 100 101const runeMask = 1<<21 - 1 102 103func ngramToRunes(n ngram) [ngramSize]rune { 104 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 105} 106 107func (n ngram) String() string { 108 rs := ngramToRunes(n) 109 return string(rs[:]) 110} 111 112type runeNgramOff struct { 113 ngram ngram 114 // index is the original index inside of the returned array of splitNGrams 115 index int 116} 117 118func (a runeNgramOff) Compare(b runeNgramOff) int { 119 if a.ngram == b.ngram { 120 return cmp.Compare(a.index, b.index) 121 } else if a.ngram < b.ngram { 122 return -1 123 } else { 124 return 1 125 } 126} 127 128func splitNGrams(str []byte) []runeNgramOff { 129 // len(maxNgrams) >= the number of ngrams in str => no limit 130 return splitNGramsLimit(str, len(str)) 131} 132 133func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff { 134 var runeGram [3]rune 135 var off [3]uint32 136 var runeCount int 137 138 result := make([]runeNgramOff, 0, len(str)) 139 var i uint32 140 141 for len(str) > 0 { 142 r, sz := utf8.DecodeRune(str) 143 str = str[sz:] 144 runeGram[0] = runeGram[1] 145 off[0] = off[1] 146 runeGram[1] = runeGram[2] 147 off[1] = off[2] 148 runeGram[2] = r 149 off[2] = uint32(i) 150 i += uint32(sz) 151 runeCount++ 152 if runeCount < ngramSize { 153 continue 154 } 155 156 ng := runesToNGram(runeGram) 157 result = append(result, runeNgramOff{ 158 ngram: ng, 159 index: len(result), 160 }) 161 } 162 163 // We return a random subset of size maxNgrams. This is to prevent the start 164 // of the string biasing ngram selection. 165 if maxNgrams < len(result) { 166 // Deterministic seed for tests. Additionally makes comparing repeated 167 // queries performance easier. 168 r := rand.New(rand.NewPCG(uint64(maxNgrams), 0)) 169 170 // Pick random subset via a shuffle 171 r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] }) 172 result = result[:maxNgrams] 173 174 // Caller expects ngrams in order of appearance. 175 slices.SortFunc(result, runeNgramOff.Compare) 176 } 177 178 return result 179} 180 181const ( 182 _classLowerChar int = iota 183 _classUpperChar 184 _classDigit 185 _classPunct 186 _classOther 187 _classSpace 188) 189 190func byteClass(c byte) int { 191 if c >= 'a' && c <= 'z' { 192 return _classLowerChar 193 } 194 if c >= 'A' && c <= 'Z' { 195 return _classUpperChar 196 } 197 if c >= '0' && c <= '9' { 198 return _classDigit 199 } 200 201 switch c { 202 case ' ', '\n': 203 return _classSpace 204 case '.', ',', ';', '"', '\'': 205 return _classPunct 206 default: 207 return _classOther 208 } 209} 210 211func characterClass(c byte) bool { 212 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 213} 214 215func marshalDocSections(secs []DocumentSection) []byte { 216 ints := make([]uint32, 0, len(secs)*2) 217 for _, s := range secs { 218 ints = append(ints, uint32(s.Start), uint32(s.End)) 219 } 220 221 return toSizedDeltas(ints) 222} 223 224func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 225 sz, m := binary.Uvarint(data) 226 data = data[m:] 227 228 if cap(ds) < int(sz)/2 { 229 ds = make([]DocumentSection, 0, sz/2) 230 } else { 231 ds = ds[:0] 232 } 233 234 // Inlining the delta decoding to avoid unnecessary allocations that would come 235 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 236 var last uint32 237 for len(data) > 0 { 238 var d DocumentSection 239 240 delta, m := binary.Uvarint(data) 241 last += uint32(delta) 242 data = data[m:] 243 d.Start = last 244 245 delta, m = binary.Uvarint(data) 246 last += uint32(delta) 247 data = data[m:] 248 d.End = last 249 250 ds = append(ds, d) 251 } 252 return ds 253} 254 255type ngramSlice []ngram 256 257func (p ngramSlice) Len() int { return len(p) } 258 259func (p ngramSlice) Less(i, j int) bool { 260 return p[i] < p[j] 261} 262 263func (p ngramSlice) Swap(i, j int) { 264 p[i], p[j] = p[j], p[i] 265} 266 267func toSizedDeltas(offsets []uint32) []byte { 268 var enc [8]byte 269 270 deltas := make([]byte, 0, len(offsets)*2) 271 272 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 273 deltas = append(deltas, enc[:m]...) 274 275 var last uint32 276 for _, p := range offsets { 277 delta := p - last 278 last = p 279 280 m := binary.PutUvarint(enc[:], uint64(delta)) 281 deltas = append(deltas, enc[:m]...) 282 } 283 return deltas 284} 285 286func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 287 sz, m := binary.Uvarint(data) 288 data = data[m:] 289 290 if cap(ps) < int(sz) { 291 ps = make([]uint32, 0, sz) 292 } else { 293 ps = ps[:0] 294 } 295 296 var last uint32 297 for len(data) > 0 { 298 delta, m := binary.Uvarint(data) 299 offset := last + uint32(delta) 300 last = offset 301 data = data[m:] 302 ps = append(ps, offset) 303 } 304 return ps 305} 306 307func toSizedDeltas16(offsets []uint16) []byte { 308 var enc [8]byte 309 310 deltas := make([]byte, 0, len(offsets)*2) 311 312 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 313 deltas = append(deltas, enc[:m]...) 314 315 var last uint16 316 for _, p := range offsets { 317 delta := p - last 318 last = p 319 320 m := binary.PutUvarint(enc[:], uint64(delta)) 321 deltas = append(deltas, enc[:m]...) 322 } 323 return deltas 324} 325 326func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 327 sz, m := binary.Uvarint(data) 328 data = data[m:] 329 330 if cap(ps) < int(sz) { 331 ps = make([]uint16, 0, sz) 332 } else { 333 ps = ps[:0] 334 } 335 336 var last uint16 337 for len(data) > 0 { 338 delta, m := binary.Uvarint(data) 339 offset := last + uint16(delta) 340 last = offset 341 data = data[m:] 342 ps = append(ps, offset) 343 } 344 return ps 345} 346 347func fromDeltas(data []byte, buf []uint32) []uint32 { 348 buf = buf[:0] 349 if cap(buf) < len(data)/2 { 350 buf = make([]uint32, 0, len(data)/2) 351 } 352 353 var last uint32 354 for len(data) > 0 { 355 delta, m := binary.Uvarint(data) 356 offset := last + uint32(delta) 357 last = offset 358 data = data[m:] 359 buf = append(buf, offset) 360 } 361 return buf 362} 363 364type runeOffsetCorrection struct { 365 runeOffset, byteOffset uint32 366} 367 368// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 369// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 370// and otherwise interpolating expected byte offsets as one byte per rune. 371// 372// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 373// 374// This is very rarely a slight pessimization on repos where there are frequent 375// non-ASCII characters. 376type runeOffsetMap []runeOffsetCorrection 377 378// makeRuneOffsetMap converts the mostly-predictable runeOffset input 379// into a shorter form tracking the unexpected values. 380// 381// The input is a sequence of y values that we expect to increase by 100 each, 382// so we just store (x, y) points where the expectation is violated. 383func makeRuneOffsetMap(off []uint32) runeOffsetMap { 384 expected := uint32(0) 385 tmp := []runeOffsetCorrection{} 386 for runeOffset, byteOffset := range off { 387 if byteOffset != expected { 388 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 389 expected = byteOffset 390 } 391 expected += runeOffsetFrequency 392 } 393 // copy the slice to ensure it doesn't waste unused trailing capacity 394 out := make([]runeOffsetCorrection, len(tmp)) 395 copy(out, tmp) 396 return runeOffsetMap(out) 397} 398 399// lookup converts rune index `off` to a byte offset and a number of additional 400// runes to traverse, given the granularity of runeOffsetFrequency. 401// 402// It does this by finding the nearest point to interpolate from in the map. 403func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 404 left := runeOffset % runeOffsetFrequency 405 runeOffset -= left 406 slen := len(m) 407 if slen == 0 { 408 return runeOffset, left 409 } 410 // sort.Search finds the *first* index for which the predicate is true, 411 // but we want to find the *last* index for which the predicate is true. 412 // This involves some work to reverse the index directions. 413 idx := sort.Search(slen, func(i int) bool { 414 return runeOffset >= m[slen-1-i].runeOffset 415 }) 416 idx = slen - 1 - idx 417 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 418 // than the first entry in the map, so no correction is necessary. 419 byteOff := runeOffset 420 if idx >= 0 { 421 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 422 } 423 return byteOff, left 424} 425 426func (m runeOffsetMap) sizeBytes() int { 427 return 8 * len(m) 428} 429 430func epsilonEqualsOne(scoreWeight float64) bool { 431 return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9 432}