fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "sort" 20 "unicode" 21 "unicode/utf8" 22) 23 24func generateCaseNgrams(g ngram) []ngram { 25 asRunes := ngramToRunes(g) 26 27 variants := make([]ngram, 0, 8) 28 cur := asRunes 29 for { 30 for i := 0; i < 3; i++ { 31 next := unicode.SimpleFold(cur[i]) 32 cur[i] = next 33 if next != asRunes[i] { 34 break 35 } 36 } 37 38 variants = append(variants, runesToNGram(cur)) 39 if cur == asRunes { 40 break 41 } 42 } 43 44 return variants 45} 46 47func toLower(in []byte) []byte { 48 out := make([]byte, 0, len(in)) 49 var buf [4]byte 50 for _, c := range string(in) { 51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 52 out = append(out, buf[:i]...) 53 } 54 return out 55} 56 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 58// be larger than 'lower'. Returns whether there was a match, and if 59// yes, the byte size of the match. 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 61 matchTotal := 0 62 for len(lower) > 0 && len(mixed) > 0 { 63 lr, lsz := utf8.DecodeRune(lower) 64 lower = lower[lsz:] 65 66 mr, msz := utf8.DecodeRune(mixed) 67 mixed = mixed[msz:] 68 matchTotal += msz 69 70 if lr != unicode.ToLower(mr) { 71 return 0, false 72 } 73 } 74 75 return matchTotal, len(lower) == 0 76} 77 78type ngram uint64 79 80func runesToNGram(b [ngramSize]rune) ngram { 81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 82} 83 84func bytesToNGram(b []byte) ngram { 85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 86} 87 88func stringToNGram(s string) ngram { 89 return bytesToNGram([]byte(s)) 90} 91 92func ngramToBytes(n ngram) []byte { 93 rs := ngramToRunes(n) 94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 95} 96 97const runeMask = 1<<21 - 1 98 99func ngramToRunes(n ngram) [ngramSize]rune { 100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 101} 102 103func (n ngram) String() string { 104 rs := ngramToRunes(n) 105 return string(rs[:]) 106} 107 108type runeNgramOff struct { 109 ngram ngram 110 // index is the original index inside of the returned array of splitNGrams 111 index uint32 112} 113 114func splitNGrams(str []byte) []runeNgramOff { 115 var runeGram [3]rune 116 var off [3]uint32 117 var runeCount int 118 119 result := make([]runeNgramOff, 0, len(str)) 120 var i uint32 121 122 for len(str) > 0 { 123 r, sz := utf8.DecodeRune(str) 124 str = str[sz:] 125 runeGram[0] = runeGram[1] 126 off[0] = off[1] 127 runeGram[1] = runeGram[2] 128 off[1] = off[2] 129 runeGram[2] = r 130 off[2] = uint32(i) 131 i += uint32(sz) 132 runeCount++ 133 if runeCount < ngramSize { 134 continue 135 } 136 137 ng := runesToNGram(runeGram) 138 result = append(result, runeNgramOff{ 139 ngram: ng, 140 index: uint32(len(result)), 141 }) 142 } 143 return result 144} 145 146const ( 147 _classLowerChar int = iota 148 _classUpperChar 149 _classDigit 150 _classPunct 151 _classOther 152 _classSpace 153) 154 155func byteClass(c byte) int { 156 if c >= 'a' && c <= 'z' { 157 return _classLowerChar 158 } 159 if c >= 'A' && c <= 'Z' { 160 return _classUpperChar 161 } 162 if c >= '0' && c <= '9' { 163 return _classDigit 164 } 165 166 switch c { 167 case ' ', '\n': 168 return _classSpace 169 case '.', ',', ';', '"', '\'': 170 return _classPunct 171 default: 172 return _classOther 173 } 174} 175 176func characterClass(c byte) bool { 177 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 178} 179 180func marshalDocSections(secs []DocumentSection) []byte { 181 ints := make([]uint32, 0, len(secs)*2) 182 for _, s := range secs { 183 ints = append(ints, uint32(s.Start), uint32(s.End)) 184 } 185 186 return toSizedDeltas(ints) 187} 188 189func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 190 sz, m := binary.Uvarint(data) 191 data = data[m:] 192 193 if cap(ds) < int(sz)/2 { 194 ds = make([]DocumentSection, 0, sz/2) 195 } else { 196 ds = ds[:0] 197 } 198 199 // Inlining the delta decoding to avoid unnecessary allocations that would come 200 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 201 var last uint32 202 for len(data) > 0 { 203 var d DocumentSection 204 205 delta, m := binary.Uvarint(data) 206 last += uint32(delta) 207 data = data[m:] 208 d.Start = last 209 210 delta, m = binary.Uvarint(data) 211 last += uint32(delta) 212 data = data[m:] 213 d.End = last 214 215 ds = append(ds, d) 216 } 217 return ds 218} 219 220type ngramSlice []ngram 221 222func (p ngramSlice) Len() int { return len(p) } 223 224func (p ngramSlice) Less(i, j int) bool { 225 return p[i] < p[j] 226} 227 228func (p ngramSlice) Swap(i, j int) { 229 p[i], p[j] = p[j], p[i] 230} 231 232func toSizedDeltas(offsets []uint32) []byte { 233 var enc [8]byte 234 235 deltas := make([]byte, 0, len(offsets)*2) 236 237 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 238 deltas = append(deltas, enc[:m]...) 239 240 var last uint32 241 for _, p := range offsets { 242 delta := p - last 243 last = p 244 245 m := binary.PutUvarint(enc[:], uint64(delta)) 246 deltas = append(deltas, enc[:m]...) 247 } 248 return deltas 249} 250 251func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 252 sz, m := binary.Uvarint(data) 253 data = data[m:] 254 255 if cap(ps) < int(sz) { 256 ps = make([]uint32, 0, sz) 257 } else { 258 ps = ps[:0] 259 } 260 261 var last uint32 262 for len(data) > 0 { 263 delta, m := binary.Uvarint(data) 264 offset := last + uint32(delta) 265 last = offset 266 data = data[m:] 267 ps = append(ps, offset) 268 } 269 return ps 270} 271 272func toSizedDeltas16(offsets []uint16) []byte { 273 var enc [8]byte 274 275 deltas := make([]byte, 0, len(offsets)*2) 276 277 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 278 deltas = append(deltas, enc[:m]...) 279 280 var last uint16 281 for _, p := range offsets { 282 delta := p - last 283 last = p 284 285 m := binary.PutUvarint(enc[:], uint64(delta)) 286 deltas = append(deltas, enc[:m]...) 287 } 288 return deltas 289} 290 291func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 292 sz, m := binary.Uvarint(data) 293 data = data[m:] 294 295 if cap(ps) < int(sz) { 296 ps = make([]uint16, 0, sz) 297 } else { 298 ps = ps[:0] 299 } 300 301 var last uint16 302 for len(data) > 0 { 303 delta, m := binary.Uvarint(data) 304 offset := last + uint16(delta) 305 last = offset 306 data = data[m:] 307 ps = append(ps, offset) 308 } 309 return ps 310} 311 312func fromDeltas(data []byte, buf []uint32) []uint32 { 313 buf = buf[:0] 314 if cap(buf) < len(data)/2 { 315 buf = make([]uint32, 0, len(data)/2) 316 } 317 318 var last uint32 319 for len(data) > 0 { 320 delta, m := binary.Uvarint(data) 321 offset := last + uint32(delta) 322 last = offset 323 data = data[m:] 324 buf = append(buf, offset) 325 } 326 return buf 327} 328 329type runeOffsetCorrection struct { 330 runeOffset, byteOffset uint32 331} 332 333// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 334// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 335// and otherwise interpolating expected byte offsets as one byte per rune. 336// 337// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 338// 339// This is very rarely a slight pessimization on repos where there are frequent 340// non-ASCII characters. 341type runeOffsetMap []runeOffsetCorrection 342 343// makeRuneOffsetMap converts the mostly-predictable runeOffset input 344// into a shorter form tracking the unexpected values. 345// 346// The input is a sequence of y values that we expect to increase by 100 each, 347// so we just store (x, y) points where the expectation is violated. 348func makeRuneOffsetMap(off []uint32) runeOffsetMap { 349 expected := uint32(0) 350 tmp := []runeOffsetCorrection{} 351 for runeOffset, byteOffset := range off { 352 if byteOffset != expected { 353 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 354 expected = byteOffset 355 } 356 expected += runeOffsetFrequency 357 } 358 // copy the slice to ensure it doesn't waste unused trailing capacity 359 out := make([]runeOffsetCorrection, len(tmp)) 360 copy(out, tmp) 361 return runeOffsetMap(out) 362} 363 364// lookup converts rune index `off` to a byte offset and a number of additional 365// runes to traverse, given the granularity of runeOffsetFrequency. 366// 367// It does this by finding the nearest point to interpolate from in the map. 368func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 369 left := runeOffset % runeOffsetFrequency 370 runeOffset -= left 371 slen := len(m) 372 if slen == 0 { 373 return runeOffset, left 374 } 375 // sort.Search finds the *first* index for which the predicate is true, 376 // but we want to find the *last* index for which the predicate is true. 377 // This involves some work to reverse the index directions. 378 idx := sort.Search(slen, func(i int) bool { 379 return runeOffset >= m[slen-1-i].runeOffset 380 }) 381 idx = slen - 1 - idx 382 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 383 // than the first entry in the map, so no correction is necessary. 384 byteOff := runeOffset 385 if idx >= 0 { 386 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 387 } 388 return byteOff, left 389} 390 391func (m runeOffsetMap) sizeBytes() int { 392 return 8 * len(m) 393}