fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "sort" 20 "unicode" 21 "unicode/utf8" 22) 23 24func generateCaseNgrams(g ngram) []ngram { 25 asRunes := ngramToRunes(g) 26 27 variants := make([]ngram, 0, 8) 28 cur := asRunes 29 for { 30 for i := 0; i < 3; i++ { 31 next := unicode.SimpleFold(cur[i]) 32 cur[i] = next 33 if next != asRunes[i] { 34 break 35 } 36 } 37 38 variants = append(variants, runesToNGram(cur)) 39 if cur == asRunes { 40 break 41 } 42 } 43 44 return variants 45} 46 47func toLower(in []byte) []byte { 48 out := make([]byte, 0, len(in)) 49 var buf [4]byte 50 for _, c := range string(in) { 51 i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) 52 out = append(out, buf[:i]...) 53 } 54 return out 55} 56 57// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may 58// be larger than 'lower'. Returns whether there was a match, and if 59// yes, the byte size of the match. 60func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { 61 matchTotal := 0 62 for len(lower) > 0 && len(mixed) > 0 { 63 lr, lsz := utf8.DecodeRune(lower) 64 lower = lower[lsz:] 65 66 mr, msz := utf8.DecodeRune(mixed) 67 mixed = mixed[msz:] 68 matchTotal += msz 69 70 if lr != unicode.ToLower(mr) { 71 return 0, false 72 } 73 } 74 75 return matchTotal, len(lower) == 0 76} 77 78type ngram uint64 79 80func runesToNGram(b [ngramSize]rune) ngram { 81 return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) 82} 83 84func bytesToNGram(b []byte) ngram { 85 return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) 86} 87 88func stringToNGram(s string) ngram { 89 return bytesToNGram([]byte(s)) 90} 91 92func ngramToBytes(n ngram) []byte { 93 rs := ngramToRunes(n) 94 return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} 95} 96 97const runeMask = 1<<21 - 1 98 99func ngramToRunes(n ngram) [ngramSize]rune { 100 return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} 101} 102 103func (n ngram) String() string { 104 rs := ngramToRunes(n) 105 return string(rs[:]) 106} 107 108type runeNgramOff struct { 109 ngram ngram 110 byteSize uint32 // size of ngram 111 byteOff uint32 112 runeOff uint32 113} 114 115func splitNGrams(str []byte) []runeNgramOff { 116 var runeGram [3]rune 117 var off [3]uint32 118 var runeCount int 119 120 result := make([]runeNgramOff, 0, len(str)) 121 var i uint32 122 123 chars := -1 124 for len(str) > 0 { 125 chars++ 126 r, sz := utf8.DecodeRune(str) 127 str = str[sz:] 128 runeGram[0] = runeGram[1] 129 off[0] = off[1] 130 runeGram[1] = runeGram[2] 131 off[1] = off[2] 132 runeGram[2] = r 133 off[2] = uint32(i) 134 i += uint32(sz) 135 runeCount++ 136 if runeCount < ngramSize { 137 continue 138 } 139 140 ng := runesToNGram(runeGram) 141 result = append(result, runeNgramOff{ 142 ngram: ng, 143 byteSize: i - off[0], 144 byteOff: off[0], 145 runeOff: uint32(chars), 146 }) 147 } 148 return result 149} 150 151const ( 152 _classLowerChar int = iota 153 _classUpperChar 154 _classDigit 155 _classPunct 156 _classOther 157 _classSpace 158) 159 160func byteClass(c byte) int { 161 if c >= 'a' && c <= 'z' { 162 return _classLowerChar 163 } 164 if c >= 'A' && c <= 'Z' { 165 return _classUpperChar 166 } 167 if c >= '0' && c <= '9' { 168 return _classDigit 169 } 170 171 switch c { 172 case ' ', '\n': 173 return _classSpace 174 case '.', ',', ';', '"', '\'': 175 return _classPunct 176 default: 177 return _classOther 178 } 179} 180 181func marshalDocSections(secs []DocumentSection) []byte { 182 ints := make([]uint32, 0, len(secs)*2) 183 for _, s := range secs { 184 ints = append(ints, uint32(s.Start), uint32(s.End)) 185 } 186 187 return toSizedDeltas(ints) 188} 189 190func unmarshalDocSections(data []byte, ds []DocumentSection) []DocumentSection { 191 sz, m := binary.Uvarint(data) 192 data = data[m:] 193 194 if cap(ds) < int(sz)/2 { 195 ds = make([]DocumentSection, 0, sz/2) 196 } else { 197 ds = ds[:0] 198 } 199 200 // Inlining the delta decoding to avoid unnecessary allocations that would come 201 // from the straightforward implementation, i.e. packing the result of fromSizedDeltas. 202 var last uint32 203 for len(data) > 0 { 204 var d DocumentSection 205 206 delta, m := binary.Uvarint(data) 207 last += uint32(delta) 208 data = data[m:] 209 d.Start = last 210 211 delta, m = binary.Uvarint(data) 212 last += uint32(delta) 213 data = data[m:] 214 d.End = last 215 216 ds = append(ds, d) 217 } 218 return ds 219} 220 221type ngramSlice []ngram 222 223func (p ngramSlice) Len() int { return len(p) } 224 225func (p ngramSlice) Less(i, j int) bool { 226 return p[i] < p[j] 227} 228 229func (p ngramSlice) Swap(i, j int) { 230 p[i], p[j] = p[j], p[i] 231} 232 233func toSizedDeltas(offsets []uint32) []byte { 234 var enc [8]byte 235 236 deltas := make([]byte, 0, len(offsets)*2) 237 238 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 239 deltas = append(deltas, enc[:m]...) 240 241 var last uint32 242 for _, p := range offsets { 243 delta := p - last 244 last = p 245 246 m := binary.PutUvarint(enc[:], uint64(delta)) 247 deltas = append(deltas, enc[:m]...) 248 } 249 return deltas 250} 251 252func fromSizedDeltas(data []byte, ps []uint32) []uint32 { 253 sz, m := binary.Uvarint(data) 254 data = data[m:] 255 256 if cap(ps) < int(sz) { 257 ps = make([]uint32, 0, sz) 258 } else { 259 ps = ps[:0] 260 } 261 262 var last uint32 263 for len(data) > 0 { 264 delta, m := binary.Uvarint(data) 265 offset := last + uint32(delta) 266 last = offset 267 data = data[m:] 268 ps = append(ps, offset) 269 } 270 return ps 271} 272 273func toSizedDeltas16(offsets []uint16) []byte { 274 var enc [8]byte 275 276 deltas := make([]byte, 0, len(offsets)*2) 277 278 m := binary.PutUvarint(enc[:], uint64(len(offsets))) 279 deltas = append(deltas, enc[:m]...) 280 281 var last uint16 282 for _, p := range offsets { 283 delta := p - last 284 last = p 285 286 m := binary.PutUvarint(enc[:], uint64(delta)) 287 deltas = append(deltas, enc[:m]...) 288 } 289 return deltas 290} 291 292func fromSizedDeltas16(data []byte, ps []uint16) []uint16 { 293 sz, m := binary.Uvarint(data) 294 data = data[m:] 295 296 if cap(ps) < int(sz) { 297 ps = make([]uint16, 0, sz) 298 } else { 299 ps = ps[:0] 300 } 301 302 var last uint16 303 for len(data) > 0 { 304 delta, m := binary.Uvarint(data) 305 offset := last + uint16(delta) 306 last = offset 307 data = data[m:] 308 ps = append(ps, offset) 309 } 310 return ps 311} 312 313func fromDeltas(data []byte, buf []uint32) []uint32 { 314 buf = buf[:0] 315 if cap(buf) < len(data)/2 { 316 buf = make([]uint32, 0, len(data)/2) 317 } 318 319 var last uint32 320 for len(data) > 0 { 321 delta, m := binary.Uvarint(data) 322 offset := last + uint32(delta) 323 last = offset 324 data = data[m:] 325 buf = append(buf, offset) 326 } 327 return buf 328} 329 330type runeOffsetCorrection struct { 331 runeOffset, byteOffset uint32 332} 333 334// runeOffsetMap converts from rune offsets (with granularity runeOffsetFrequency) 335// to byte offsets, by tracking only the points where a span of runes is non-ASCII, 336// and otherwise interpolating expected byte offsets as one byte per rune. 337// 338// Instead of storing [100, 205, 305], it stores [{x: 200, y: 205}]. 339// 340// This is very rarely a slight pessimization on repos where there are frequent 341// non-ASCII characters. 342type runeOffsetMap []runeOffsetCorrection 343 344// makeRuneOffsetMap converts the mostly-predictable runeOffset input 345// into a shorter form tracking the unexpected values. 346// 347// The input is a sequence of y values that we expect to increase by 100 each, 348// so we just store (x, y) points where the expectation is violated. 349func makeRuneOffsetMap(off []uint32) runeOffsetMap { 350 expected := uint32(0) 351 tmp := []runeOffsetCorrection{} 352 for runeOffset, byteOffset := range off { 353 if byteOffset != expected { 354 tmp = append(tmp, runeOffsetCorrection{uint32(runeOffset) * runeOffsetFrequency, byteOffset}) 355 expected = byteOffset 356 } 357 expected += runeOffsetFrequency 358 } 359 // copy the slice to ensure it doesn't waste unused trailing capacity 360 out := make([]runeOffsetCorrection, len(tmp)) 361 copy(out, tmp) 362 return runeOffsetMap(out) 363} 364 365// lookup converts rune index `off` to a byte offset and a number of additional 366// runes to traverse, given the granularity of runeOffsetFrequency. 367// 368// It does this by finding the nearest point to interpolate from in the map. 369func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { 370 left := runeOffset % runeOffsetFrequency 371 runeOffset -= left 372 slen := len(m) 373 if slen == 0 { 374 return runeOffset, left 375 } 376 // sort.Search finds the *first* index for which the predicate is true, 377 // but we want to find the *last* index for which the predicate is true. 378 // This involves some work to reverse the index directions. 379 idx := sort.Search(slen, func(i int) bool { 380 return runeOffset >= m[slen-1-i].runeOffset 381 }) 382 idx = slen - 1 - idx 383 // idx is now in the range [-1, len(m))-- -1 indicates that the offset is smaller 384 // than the first entry in the map, so no correction is necessary. 385 byteOff := runeOffset 386 if idx >= 0 { 387 byteOff = m[idx].byteOffset + runeOffset - m[idx].runeOffset 388 } 389 return byteOff, left 390} 391 392func (m runeOffsetMap) sizeBytes() int { 393 return 8 * len(m) 394}