fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "sort" 20 "unicode" 21 "unicode/utf8" 22) 23 24func generateCaseNgrams(g ngram) []ngram { 25 asRunes := ngramToRunes(g) 26 27 variants := make([]ngram, 0, 8) 28 cur := asRunes 29 for { 30 for i := 0; i < 3; i++ { 31 next := unicode.SimpleFold(cur[i]) 32 cur[i] = next 33 if next != asRunes[i] { 34 break 35 } 36 } 37 38 variants = append(variants, runesToNGram(cur)) 39 if cur == asRunes { 40 break 41 } 42 } 43 44 return variants 45} 46 47func toLower(in []byte) []byte { 48 out := make([]byte, 0, len(in)) 49 var buf [4]byte 50 for _, c := range string(in) { 51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 52 out = append(out, buf[:i]...) 53 } 54 return out 55} 56 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 58// be larger than 'lower'. Returns whether there was a match, and if 59// yes, the byte size of the match. 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 61 matchTotal := 0 62 for len(lower) > 0 && len(mixed) > 0 { 63 lr, lsz := utf8.DecodeRune(lower) 64 lower = lower[lsz:] 65 66 mr, msz := utf8.DecodeRune(mixed) 67 mixed = mixed[msz:] 68 matchTotal += msz 69 70 if lr != unicode.ToLower(mr) { 71 return 0, false 72 } 73 } 74 75 return matchTotal, len(lower) == 0 76} 77 78type ngram uint64 79 80func runesToNGram(b [ngramSize]rune) ngram { 81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 82} 83 84func bytesToNGram(b []byte) ngram { 85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 86} 87 88func stringToNGram(s string) ngram { 89 return bytesToNGram([]byte(s)) 90} 91 92func ngramToBytes(n ngram) []byte { 93 rs := ngramToRunes(n) 94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 95} 96 97const runeMask = 1<<21 - 1 98 99func ngramToRunes(n ngram) [ngramSize]rune { 100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 101} 102 103func (n ngram) String() string { 104 rs := ngramToRunes(n) 105 return string(rs[:]) 106} 107 108type runeNgramOff struct { 109 ngram ngram 110 byteSize uint32 // size of ngram 111 byteOff uint32 112 runeOff uint32 113} 114 115func splitNGrams(str []byte) []runeNgramOff { 116 var runeGram [3]rune 117 var off [3]uint32 118 var runeCount int 119 120 result := make([]runeNgramOff, 0, len(str)) 121 var i uint32 122 123 chars := -1 124 for len(str) > 0 { 125 chars++ 126 r, sz := utf8.DecodeRune(str) 127 str = str[sz:] 128 runeGram[0] = runeGram[1] 129 off[0] = off[1] 130 runeGram[1] = runeGram[2] 131 off[1] = off[2] 132 runeGram[2] = r 133 off[2] = uint32(i) 134 i += uint32(sz) 135 runeCount++ 136 if runeCount < ngramSize { 137 continue 138 } 139 140 ng := runesToNGram(runeGram) 141 result = append(result, runeNgramOff{ 142 ngram: ng, 143 byteSize: i - off[0], 144 byteOff: off[0], 145 runeOff: uint32(chars), 146 }) 147 } 148 return result 149} 150 151const ( 152 _classLowerChar int = iota 153 _classUpperChar 154 _classDigit 155 _classPunct 156 _classOther 157 _classSpace 158) 159 160func byteClass(c byte) int { 161 if c >= 'a' && c <= 'z' { 162 return _classLowerChar 163 } 164 if c >= 'A' && c <= 'Z' { 165 return _classUpperChar 166 } 167 if c >= '0' && c <= '9' { 168 return _classDigit 169 } 170 171 switch c { 172 case ' ', '\n': 173 return _classSpace 174 case '.', ',', ';', '"', '\'': 175 return _classPunct 176 default: 177 return _classOther 178 } 179} 180 181func characterClass(c byte) bool { 182 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 183} 184 185func marshalDocSections(secs []DocumentSection) []byte { 186 ints := make([]uint32, 0, len(secs)*2) 187 for _, s := range secs { 188 ints = append(ints, uint32(s.Start), uint32(s.End)) 189 } 190 191 return toSizedDeltas(ints) 192} 193 194func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 195 sz, m := binary.Uvarint(data) 196 data = data[m:] 197 198 if cap(ds) < int(sz)/2 { 199 ds = make([]DocumentSection, 0, sz/2) 200 } else { 201 ds = ds[:0] 202 } 203 204 // Inlining the delta decoding to avoid unnecessary allocations that would come 205 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 206 var last uint32 207 for len(data) > 0 { 208 var d DocumentSection 209 210 delta, m := binary.Uvarint(data) 211 last += uint32(delta) 212 data = data[m:] 213 d.Start = last 214 215 delta, m = binary.Uvarint(data) 216 last += uint32(delta) 217 data = data[m:] 218 d.End = last 219 220 ds = append(ds, d) 221 } 222 return ds 223} 224 225type ngramSlice []ngram 226 227func (p ngramSlice) Len() int { return len(p) } 228 229func (p ngramSlice) Less(i, j int) bool { 230 return p[i] < p[j] 231} 232 233func (p ngramSlice) Swap(i, j int) { 234 p[i], p[j] = p[j], p[i] 235} 236 237func toSizedDeltas(offsets []uint32) []byte { 238 var enc [8]byte 239 240 deltas := make([]byte, 0, len(offsets)*2) 241 242 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 243 deltas = append(deltas, enc[:m]...) 244 245 var last uint32 246 for _, p := range offsets { 247 delta := p - last 248 last = p 249 250 m := binary.PutUvarint(enc[:], uint64(delta)) 251 deltas = append(deltas, enc[:m]...) 252 } 253 return deltas 254} 255 256func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 257 sz, m := binary.Uvarint(data) 258 data = data[m:] 259 260 if cap(ps) < int(sz) { 261 ps = make([]uint32, 0, sz) 262 } else { 263 ps = ps[:0] 264 } 265 266 var last uint32 267 for len(data) > 0 { 268 delta, m := binary.Uvarint(data) 269 offset := last + uint32(delta) 270 last = offset 271 data = data[m:] 272 ps = append(ps, offset) 273 } 274 return ps 275} 276 277func toSizedDeltas16(offsets []uint16) []byte { 278 var enc [8]byte 279 280 deltas := make([]byte, 0, len(offsets)*2) 281 282 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 283 deltas = append(deltas, enc[:m]...) 284 285 var last uint16 286 for _, p := range offsets { 287 delta := p - last 288 last = p 289 290 m := binary.PutUvarint(enc[:], uint64(delta)) 291 deltas = append(deltas, enc[:m]...) 292 } 293 return deltas 294} 295 296func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 297 sz, m := binary.Uvarint(data) 298 data = data[m:] 299 300 if cap(ps) < int(sz) { 301 ps = make([]uint16, 0, sz) 302 } else { 303 ps = ps[:0] 304 } 305 306 var last uint16 307 for len(data) > 0 { 308 delta, m := binary.Uvarint(data) 309 offset := last + uint16(delta) 310 last = offset 311 data = data[m:] 312 ps = append(ps, offset) 313 } 314 return ps 315} 316 317func fromDeltas(data []byte, buf []uint32) []uint32 { 318 buf = buf[:0] 319 if cap(buf) < len(data)/2 { 320 buf = make([]uint32, 0, len(data)/2) 321 } 322 323 var last uint32 324 for len(data) > 0 { 325 delta, m := binary.Uvarint(data) 326 offset := last + uint32(delta) 327 last = offset 328 data = data[m:] 329 buf = append(buf, offset) 330 } 331 return buf 332} 333 334type runeOffsetCorrection struct { 335 runeOffset, byteOffset uint32 336} 337 338// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 339// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 340// and otherwise interpolating expected byte offsets as one byte per rune. 341// 342// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 343// 344// This is very rarely a slight pessimization on repos where there are frequent 345// non-ASCII characters. 346type runeOffsetMap []runeOffsetCorrection 347 348// makeRuneOffsetMap converts the mostly-predictable runeOffset input 349// into a shorter form tracking the unexpected values. 350// 351// The input is a sequence of y values that we expect to increase by 100 each, 352// so we just store (x, y) points where the expectation is violated. 353func makeRuneOffsetMap(off []uint32) runeOffsetMap { 354 expected := uint32(0) 355 tmp := []runeOffsetCorrection{} 356 for runeOffset, byteOffset := range off { 357 if byteOffset != expected { 358 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 359 expected = byteOffset 360 } 361 expected += runeOffsetFrequency 362 } 363 // copy the slice to ensure it doesn't waste unused trailing capacity 364 out := make([]runeOffsetCorrection, len(tmp)) 365 copy(out, tmp) 366 return runeOffsetMap(out) 367} 368 369// lookup converts rune index `off` to a byte offset and a number of additional 370// runes to traverse, given the granularity of runeOffsetFrequency. 371// 372// It does this by finding the nearest point to interpolate from in the map. 373func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 374 left := runeOffset % runeOffsetFrequency 375 runeOffset -= left 376 slen := len(m) 377 if slen == 0 { 378 return runeOffset, left 379 } 380 // sort.Search finds the *first* index for which the predicate is true, 381 // but we want to find the *last* index for which the predicate is true. 382 // This involves some work to reverse the index directions. 383 idx := sort.Search(slen, func(i int) bool { 384 return runeOffset >= m[slen-1-i].runeOffset 385 }) 386 idx = slen - 1 - idx 387 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 388 // than the first entry in the map, so no correction is necessary. 389 byteOff := runeOffset 390 if idx >= 0 { 391 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 392 } 393 return byteOff, left 394} 395 396func (m runeOffsetMap) sizeBytes() int { 397 return 8 * len(m) 398}