fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "log" 20 "sort" 21 "unicode" 22 "unicode/utf8" 23) 24 25func generateCaseNgrams(g ngram) []ngram { 26 asRunes := ngramToRunes(g) 27 28 variants := make([]ngram, 0, 8) 29 cur := asRunes 30 for { 31 for i := 0; i < 3; i++ { 32 next := unicode.SimpleFold(cur[i]) 33 cur[i] = next 34 if next != asRunes[i] { 35 break 36 } 37 } 38 39 variants = append(variants, runesToNGram(cur)) 40 if cur == asRunes { 41 break 42 } 43 } 44 45 return variants 46} 47 48func toLower(in []byte) []byte { 49 out := make([]byte, 0, len(in)) 50 var buf [4]byte 51 for _, c := range string(in) { 52 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 53 out = append(out, buf[:i]...) 54 } 55 return out 56} 57 58// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 59// be larger than 'lower'. Returns whether there was a match, and if 60// yes, the byte size of the match. 61func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 62 matchTotal := 0 63 for len(lower) > 0 && len(mixed) > 0 { 64 lr, lsz := utf8.DecodeRune(lower) 65 lower = lower[lsz:] 66 67 mr, msz := utf8.DecodeRune(mixed) 68 mixed = mixed[msz:] 69 matchTotal += msz 70 71 if lr != unicode.ToLower(mr) { 72 return 0, false 73 } 74 } 75 76 return matchTotal, len(lower) == 0 77} 78 79type ngram uint64 80 81func runesToNGram(b [ngramSize]rune) ngram { 82 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 83} 84 85func bytesToNGram(b []byte) ngram { 86 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 87} 88 89func stringToNGram(s string) ngram { 90 return bytesToNGram([]byte(s)) 91} 92 93func ngramToBytes(n ngram) []byte { 94 rs := ngramToRunes(n) 95 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 96} 97 98const runeMask = 1<<21 - 1 99 100func ngramToRunes(n ngram) [ngramSize]rune { 101 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 102} 103 104func (n ngram) String() string { 105 rs := ngramToRunes(n) 106 return string(rs[:]) 107} 108 109type runeNgramOff struct { 110 ngram ngram 111 byteSize uint32 // size of ngram 112 byteOff uint32 113 runeOff uint32 114} 115 116func splitNGrams(str []byte) []runeNgramOff { 117 var runeGram [3]rune 118 var off [3]uint32 119 var runeCount int 120 121 result := make([]runeNgramOff, 0, len(str)) 122 var i uint32 123 124 chars := -1 125 for len(str) > 0 { 126 chars++ 127 r, sz := utf8.DecodeRune(str) 128 str = str[sz:] 129 runeGram[0] = runeGram[1] 130 off[0] = off[1] 131 runeGram[1] = runeGram[2] 132 off[1] = off[2] 133 runeGram[2] = r 134 off[2] = uint32(i) 135 i += uint32(sz) 136 runeCount++ 137 if runeCount < ngramSize { 138 continue 139 } 140 141 ng := runesToNGram(runeGram) 142 result = append(result, runeNgramOff{ 143 ngram: ng, 144 byteSize: i - off[0], 145 byteOff: off[0], 146 runeOff: uint32(chars), 147 }) 148 } 149 return result 150} 151 152const ( 153 _classLowerChar int = iota 154 _classUpperChar 155 _classDigit 156 _classPunct 157 _classOther 158 _classSpace 159) 160 161func byteClass(c byte) int { 162 if c >= 'a' && c <= 'z' { 163 return _classLowerChar 164 } 165 if c >= 'A' && c <= 'Z' { 166 return _classUpperChar 167 } 168 if c >= '0' && c <= '9' { 169 return _classDigit 170 } 171 172 switch c { 173 case ' ', '\n': 174 return _classSpace 175 case '.', ',', ';', '"', '\'': 176 return _classPunct 177 default: 178 return _classOther 179 } 180} 181 182func marshalDocSections(secs []DocumentSection) []byte { 183 ints := make([]uint32, 0, len(secs)*2) 184 for _, s := range secs { 185 ints = append(ints, uint32(s.Start), uint32(s.End)) 186 } 187 188 return toSizedDeltas(ints) 189} 190 191func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 192 // Defensive, this shouldn't happen. While we have the feature flag for lazy 193 // doc section decoding let's be extra defensive. 194 if len(data) == 0 { 195 log.Println("WARN unmarshalDocSections received an empty slice to unmarshal") 196 return nil 197 } 198 199 sz, m := binary.Uvarint(data) 200 data = data[m:] 201 202 if cap(ds) < int(sz)/2 { 203 ds = make([]DocumentSection, 0, sz/2) 204 } else { 205 ds = ds[:0] 206 } 207 208 // Inlining the delta decoding to avoid unnecessary allocations that would come 209 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 210 var last uint32 211 for len(data) > 0 { 212 var d DocumentSection 213 214 delta, m := binary.Uvarint(data) 215 last += uint32(delta) 216 data = data[m:] 217 d.Start = last 218 219 delta, m = binary.Uvarint(data) 220 last += uint32(delta) 221 data = data[m:] 222 d.End = last 223 224 ds = append(ds, d) 225 } 226 return ds 227} 228 229type ngramSlice []ngram 230 231func (p ngramSlice) Len() int { return len(p) } 232 233func (p ngramSlice) Less(i, j int) bool { 234 return p[i] < p[j] 235} 236 237func (p ngramSlice) Swap(i, j int) { 238 p[i], p[j] = p[j], p[i] 239} 240 241func toSizedDeltas(offsets []uint32) []byte { 242 var enc [8]byte 243 244 deltas := make([]byte, 0, len(offsets)*2) 245 246 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 247 deltas = append(deltas, enc[:m]...) 248 249 var last uint32 250 for _, p := range offsets { 251 delta := p - last 252 last = p 253 254 m := binary.PutUvarint(enc[:], uint64(delta)) 255 deltas = append(deltas, enc[:m]...) 256 } 257 return deltas 258} 259 260func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 261 sz, m := binary.Uvarint(data) 262 data = data[m:] 263 264 if cap(ps) < int(sz) { 265 ps = make([]uint32, 0, sz) 266 } else { 267 ps = ps[:0] 268 } 269 270 var last uint32 271 for len(data) > 0 { 272 delta, m := binary.Uvarint(data) 273 offset := last + uint32(delta) 274 last = offset 275 data = data[m:] 276 ps = append(ps, offset) 277 } 278 return ps 279} 280 281func toSizedDeltas16(offsets []uint16) []byte { 282 var enc [8]byte 283 284 deltas := make([]byte, 0, len(offsets)*2) 285 286 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 287 deltas = append(deltas, enc[:m]...) 288 289 var last uint16 290 for _, p := range offsets { 291 delta := p - last 292 last = p 293 294 m := binary.PutUvarint(enc[:], uint64(delta)) 295 deltas = append(deltas, enc[:m]...) 296 } 297 return deltas 298} 299 300func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 301 sz, m := binary.Uvarint(data) 302 data = data[m:] 303 304 if cap(ps) < int(sz) { 305 ps = make([]uint16, 0, sz) 306 } else { 307 ps = ps[:0] 308 } 309 310 var last uint16 311 for len(data) > 0 { 312 delta, m := binary.Uvarint(data) 313 offset := last + uint16(delta) 314 last = offset 315 data = data[m:] 316 ps = append(ps, offset) 317 } 318 return ps 319} 320 321func fromDeltas(data []byte, buf []uint32) []uint32 { 322 buf = buf[:0] 323 if cap(buf) < len(data)/2 { 324 buf = make([]uint32, 0, len(data)/2) 325 } 326 327 var last uint32 328 for len(data) > 0 { 329 delta, m := binary.Uvarint(data) 330 offset := last + uint32(delta) 331 last = offset 332 data = data[m:] 333 buf = append(buf, offset) 334 } 335 return buf 336} 337 338type runeOffsetCorrection struct { 339 runeOffset, byteOffset uint32 340} 341 342// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 343// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 344// and otherwise interpolating expected byte offsets as one byte per rune. 345// 346// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 347// 348// This is very rarely a slight pessimization on repos where there are frequent 349// non-ASCII characters. 350type runeOffsetMap []runeOffsetCorrection 351 352// makeRuneOffsetMap converts the mostly-predictable runeOffset input 353// into a shorter form tracking the unexpected values. 354// 355// The input is a sequence of y values that we expect to increase by 100 each, 356// so we just store (x, y) points where the expectation is violated. 357func makeRuneOffsetMap(off []uint32) runeOffsetMap { 358 expected := uint32(0) 359 tmp := []runeOffsetCorrection{} 360 for runeOffset, byteOffset := range off { 361 if byteOffset != expected { 362 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 363 expected = byteOffset 364 } 365 expected += runeOffsetFrequency 366 } 367 // copy the slice to ensure it doesn't waste unused trailing capacity 368 out := make([]runeOffsetCorrection, len(tmp)) 369 copy(out, tmp) 370 return runeOffsetMap(out) 371} 372 373// lookup converts rune index `off` to a byte offset and a number of additional 374// runes to traverse, given the granularity of runeOffsetFrequency. 375// 376// It does this by finding the nearest point to interpolate from in the map. 377func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 378 left := runeOffset % runeOffsetFrequency 379 runeOffset -= left 380 slen := len(m) 381 if slen == 0 { 382 return runeOffset, left 383 } 384 // sort.Search finds the *first* index for which the predicate is true, 385 // but we want to find the *last* index for which the predicate is true. 386 // This involves some work to reverse the index directions. 387 idx := sort.Search(slen, func(i int) bool { 388 return runeOffset >= m[slen-1-i].runeOffset 389 }) 390 idx = slen - 1 - idx 391 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 392 // than the first entry in the map, so no correction is necessary. 393 byteOff := runeOffset 394 if idx >= 0 { 395 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 396 } 397 return byteOff, left 398} 399 400func (m runeOffsetMap) sizeBytes() int { 401 return 8 * len(m) 402}