fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 9.4 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "cmp" 19 "encoding/binary" 20 "math" 21 "sort" 22 "unicode" 23 "unicode/utf8" 24) 25 26func generateCaseNgrams(g ngram) []ngram { 27 asRunes := ngramToRunes(g) 28 29 variants := make([]ngram, 0, 8) 30 cur := asRunes 31 for { 32 for i := range 3 { 33 next := unicode.SimpleFold(cur[i]) 34 cur[i] = next 35 if next != asRunes[i] { 36 break 37 } 38 } 39 40 variants = append(variants, runesToNGram(cur)) 41 if cur == asRunes { 42 break 43 } 44 } 45 46 return variants 47} 48 49func toLower(in []byte) []byte { 50 out := make([]byte, 0, len(in)) 51 var buf [4]byte 52 for _, c := range string(in) { 53 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 54 out = append(out, buf[:i]...) 55 } 56 return out 57} 58 59// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 60// be larger than 'lower'. Returns whether there was a match, and if 61// yes, the byte size of the match. 62func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 63 matchTotal := 0 64 for len(lower) > 0 && len(mixed) > 0 { 65 lr, lsz := utf8.DecodeRune(lower) 66 lower = lower[lsz:] 67 68 mr, msz := utf8.DecodeRune(mixed) 69 mixed = mixed[msz:] 70 matchTotal += msz 71 72 if lr != unicode.ToLower(mr) { 73 return 0, false 74 } 75 } 76 77 return matchTotal, len(lower) == 0 78} 79 80type ngram uint64 81 82func runesToNGram(b [ngramSize]rune) ngram { 83 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 84} 85 86func bytesToNGram(b []byte) ngram { 87 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 88} 89 90func stringToNGram(s string) ngram { 91 return bytesToNGram([]byte(s)) 92} 93 94func ngramToBytes(n ngram) []byte { 95 rs := ngramToRunes(n) 96 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 97} 98 99const runeMask = 1<<21 - 1 100 101func ngramToRunes(n ngram) [ngramSize]rune { 102 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 103} 104 105func (n ngram) String() string { 106 rs := ngramToRunes(n) 107 return string(rs[:]) 108} 109 110type runeNgramOff struct { 111 ngram ngram 112 // index is the original index inside of the returned array of splitNGrams 113 index int 114} 115 116func (a runeNgramOff) Compare(b runeNgramOff) int { 117 if a.ngram == b.ngram { 118 return cmp.Compare(a.index, b.index) 119 } else if a.ngram < b.ngram { 120 return -1 121 } else { 122 return 1 123 } 124} 125 126func splitNGrams(str []byte) []runeNgramOff { 127 var runeGram [3]rune 128 var off [3]uint32 129 var runeCount int 130 131 result := make([]runeNgramOff, 0, len(str)) 132 var i uint32 133 134 for len(str) > 0 { 135 r, sz := utf8.DecodeRune(str) 136 str = str[sz:] 137 runeGram[0] = runeGram[1] 138 off[0] = off[1] 139 runeGram[1] = runeGram[2] 140 off[1] = off[2] 141 runeGram[2] = r 142 off[2] = uint32(i) 143 i += uint32(sz) 144 runeCount++ 145 if runeCount < ngramSize { 146 continue 147 } 148 149 ng := runesToNGram(runeGram) 150 result = append(result, runeNgramOff{ 151 ngram: ng, 152 index: len(result), 153 }) 154 } 155 156 return result 157} 158 159const ( 160 _classLowerChar int = iota 161 _classUpperChar 162 _classDigit 163 _classPunct 164 _classOther 165 _classSpace 166) 167 168func byteClass(c byte) int { 169 if c >= 'a' && c <= 'z' { 170 return _classLowerChar 171 } 172 if c >= 'A' && c <= 'Z' { 173 return _classUpperChar 174 } 175 if c >= '0' && c <= '9' { 176 return _classDigit 177 } 178 179 switch c { 180 case ' ', '\n': 181 return _classSpace 182 case '.', ',', ';', '"', '\'': 183 return _classPunct 184 default: 185 return _classOther 186 } 187} 188 189func characterClass(c byte) bool { 190 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 191} 192 193func marshalDocSections(secs []DocumentSection) []byte { 194 ints := make([]uint32, 0, len(secs)*2) 195 for _, s := range secs { 196 ints = append(ints, uint32(s.Start), uint32(s.End)) 197 } 198 199 return toSizedDeltas(ints) 200} 201 202func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 203 sz, m := binary.Uvarint(data) 204 data = data[m:] 205 206 if cap(ds) < int(sz)/2 { 207 ds = make([]DocumentSection, 0, sz/2) 208 } else { 209 ds = ds[:0] 210 } 211 212 // Inlining the delta decoding to avoid unnecessary allocations that would come 213 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 214 var last uint32 215 for len(data) > 0 { 216 var d DocumentSection 217 218 delta, m := binary.Uvarint(data) 219 last += uint32(delta) 220 data = data[m:] 221 d.Start = last 222 223 delta, m = binary.Uvarint(data) 224 last += uint32(delta) 225 data = data[m:] 226 d.End = last 227 228 ds = append(ds, d) 229 } 230 return ds 231} 232 233type ngramSlice []ngram 234 235func (p ngramSlice) Len() int { return len(p) } 236 237func (p ngramSlice) Less(i, j int) bool { 238 return p[i] < p[j] 239} 240 241func (p ngramSlice) Swap(i, j int) { 242 p[i], p[j] = p[j], p[i] 243} 244 245func toSizedDeltas(offsets []uint32) []byte { 246 var enc [8]byte 247 248 deltas := make([]byte, 0, len(offsets)*2) 249 250 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 251 deltas = append(deltas, enc[:m]...) 252 253 var last uint32 254 for _, p := range offsets { 255 delta := p - last 256 last = p 257 258 m := binary.PutUvarint(enc[:], uint64(delta)) 259 deltas = append(deltas, enc[:m]...) 260 } 261 return deltas 262} 263 264func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 265 sz, m := binary.Uvarint(data) 266 data = data[m:] 267 268 if cap(ps) < int(sz) { 269 ps = make([]uint32, 0, sz) 270 } else { 271 ps = ps[:0] 272 } 273 274 var last uint32 275 for len(data) > 0 { 276 delta, m := binary.Uvarint(data) 277 offset := last + uint32(delta) 278 last = offset 279 data = data[m:] 280 ps = append(ps, offset) 281 } 282 return ps 283} 284 285func toSizedDeltas16(offsets []uint16) []byte { 286 var enc [8]byte 287 288 deltas := make([]byte, 0, len(offsets)*2) 289 290 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 291 deltas = append(deltas, enc[:m]...) 292 293 var last uint16 294 for _, p := range offsets { 295 delta := p - last 296 last = p 297 298 m := binary.PutUvarint(enc[:], uint64(delta)) 299 deltas = append(deltas, enc[:m]...) 300 } 301 return deltas 302} 303 304func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 305 sz, m := binary.Uvarint(data) 306 data = data[m:] 307 308 if cap(ps) < int(sz) { 309 ps = make([]uint16, 0, sz) 310 } else { 311 ps = ps[:0] 312 } 313 314 var last uint16 315 for len(data) > 0 { 316 delta, m := binary.Uvarint(data) 317 offset := last + uint16(delta) 318 last = offset 319 data = data[m:] 320 ps = append(ps, offset) 321 } 322 return ps 323} 324 325func fromDeltas(data []byte, buf []uint32) []uint32 { 326 buf = buf[:0] 327 if cap(buf) < len(data)/2 { 328 buf = make([]uint32, 0, len(data)/2) 329 } 330 331 var last uint32 332 for len(data) > 0 { 333 delta, m := binary.Uvarint(data) 334 offset := last + uint32(delta) 335 last = offset 336 data = data[m:] 337 buf = append(buf, offset) 338 } 339 return buf 340} 341 342type runeOffsetCorrection struct { 343 runeOffset, byteOffset uint32 344} 345 346// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 347// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 348// and otherwise interpolating expected byte offsets as one byte per rune. 349// 350// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 351// 352// This is very rarely a slight pessimization on repos where there are frequent 353// non-ASCII characters. 354type runeOffsetMap []runeOffsetCorrection 355 356// makeRuneOffsetMap converts the mostly-predictable runeOffset input 357// into a shorter form tracking the unexpected values. 358// 359// The input is a sequence of y values that we expect to increase by 100 each, 360// so we just store (x, y) points where the expectation is violated. 361func makeRuneOffsetMap(off []uint32) runeOffsetMap { 362 expected := uint32(0) 363 tmp := []runeOffsetCorrection{} 364 for runeOffset, byteOffset := range off { 365 if byteOffset != expected { 366 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 367 expected = byteOffset 368 } 369 expected += runeOffsetFrequency 370 } 371 // copy the slice to ensure it doesn't waste unused trailing capacity 372 out := make([]runeOffsetCorrection, len(tmp)) 373 copy(out, tmp) 374 return runeOffsetMap(out) 375} 376 377// lookup converts rune index `off` to a byte offset and a number of additional 378// runes to traverse, given the granularity of runeOffsetFrequency. 379// 380// It does this by finding the nearest point to interpolate from in the map. 381func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 382 left := runeOffset % runeOffsetFrequency 383 runeOffset -= left 384 slen := len(m) 385 if slen == 0 { 386 return runeOffset, left 387 } 388 // sort.Search finds the *first* index for which the predicate is true, 389 // but we want to find the *last* index for which the predicate is true. 390 // This involves some work to reverse the index directions. 391 idx := sort.Search(slen, func(i int) bool { 392 return runeOffset >= m[slen-1-i].runeOffset 393 }) 394 idx = slen - 1 - idx 395 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 396 // than the first entry in the map, so no correction is necessary. 397 byteOff := runeOffset 398 if idx >= 0 { 399 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 400 } 401 return byteOff, left 402} 403 404func (m runeOffsetMap) sizeBytes() int { 405 return 8 * len(m) 406} 407 408func epsilonEqualsOne(scoreWeight float64) bool { 409 return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9 410}