fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math/bits" 24 "os" 25 "slices" 26 "strconv" 27 "unicode/utf8" 28 29 "github.com/sourcegraph/zoekt/query" 30) 31 32// indexData holds the pattern-independent data that we have to have 33// in memory to search. Most of the memory is taken up by the ngram => 34// offset index. 35type indexData struct { 36 symbols symbolData 37 38 file IndexFile 39 40 contentNgrams btreeIndex 41 42 newlinesStart uint32 43 newlinesIndex []uint32 44 45 docSectionsStart uint32 46 docSectionsIndex []uint32 47 48 runeDocSections []DocumentSection 49 50 // rune offset=>byte offset mapping, relative to the start of the content corpus 51 runeOffsets runeOffsetMap 52 53 // offsets of file contents; includes end of last file 54 boundariesStart uint32 55 boundaries []uint32 56 57 // rune offsets for the file content boundaries 58 fileEndRunes []uint32 59 60 fileNameContent []byte 61 fileNameIndex []uint32 62 fileNameNgrams btreeIndex 63 64 // fileEndSymbol[i] is the index of the first symbol for document i. 65 fileEndSymbol []uint32 66 67 // rune offset=>byte offset mapping, relative to the start of the filename corpus 68 fileNameRuneOffsets runeOffsetMap 69 70 // rune offsets for the file name boundaries 71 fileNameEndRunes []uint32 72 73 fileBranchMasks []uint64 74 75 // mask (power of 2) => name 76 branchNames []map[uint]string 77 78 // name => mask (power of 2) 79 branchIDs []map[string]uint 80 81 metaData IndexMetadata 82 repoMetaData []Repository 83 84 subRepos []uint32 85 subRepoPaths [][]string 86 87 // Checksums for all the files, at 8-byte intervals 88 checksums []byte 89 90 // languages for all the files. 91 languages []byte 92 93 // inverse of LanguageMap in metaData 94 languageMap map[uint16]string 95 96 repoListEntry []RepoListEntry 97 98 // repository indexes for all the files 99 repos []uint16 100 101 // Experimental: docID => rank vec 102 ranks [][]float64 103 104 // rawConfigMasks contains the encoded RawConfig for each repository 105 rawConfigMasks []uint8 106} 107 108type symbolData struct { 109 // symContent stores Symbol.Sym and Symbol.Parent. 110 // TODO we don't need to store Symbol.Sym. 111 symContent []byte 112 symIndex []byte 113 // symKindContent is an enum of sym.Kind and sym.ParentKind 114 symKindContent []byte 115 symKindIndex []uint32 116 // symMetadata is [4]uint32 0 Kind Parent ParentKind 117 symMetaData []byte 118} 119 120func uint32SliceAt(a []byte, n uint32) uint32 { 121 return binary.BigEndian.Uint32(a[n*4:]) 122} 123 124func uint32SliceLen(a []byte) uint32 { 125 return uint32(len(a) / 4) 126} 127 128// parent returns index i of the parent enum 129func (d *symbolData) parent(i uint32) []byte { 130 delta := uint32SliceAt(d.symIndex, 0) 131 start := uint32SliceAt(d.symIndex, i) - delta 132 var end uint32 133 if i+1 == uint32SliceLen(d.symIndex) { 134 end = uint32(len(d.symContent)) 135 } else { 136 end = uint32SliceAt(d.symIndex, i+1) - delta 137 } 138 return d.symContent[start:end] 139} 140 141// kind returns index i of the kind enum 142func (d *symbolData) kind(i uint32) []byte { 143 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 144} 145 146// data returns the symbol at index i 147func (d *symbolData) data(i uint32) *Symbol { 148 size := uint32(4 * 4) // 4 uint32s 149 offset := i * size 150 if offset >= uint32(len(d.symMetaData)) { 151 return nil 152 } 153 154 metadata := d.symMetaData[offset : offset+size] 155 sym := &Symbol{} 156 key := uint32SliceAt(metadata, 1) 157 sym.Kind = string(d.kind(key)) 158 key = uint32SliceAt(metadata, 2) 159 sym.Parent = string(d.parent(key)) 160 key = uint32SliceAt(metadata, 3) 161 sym.ParentKind = string(d.kind(key)) 162 return sym 163} 164 165func (d *indexData) getChecksum(idx uint32) []byte { 166 start := crc64.Size * idx 167 return d.checksums[start : start+crc64.Size] 168} 169 170func (d *indexData) getLanguage(idx uint32) uint16 { 171 if d.metaData.IndexFeatureVersion < 12 { 172 // older zoekt files had 8-bit language entries 173 return uint16(d.languages[idx]) 174 } 175 // newer zoekt files have 16-bit language entries 176 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 177} 178 179// calculates stats for files in the range [start, end). 180func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 181 if start >= end { 182 // An empty shard for an empty repository. 183 return RepoStats{ 184 Shards: 1, 185 } 186 } 187 188 bytesContent := d.boundaries[end] - d.boundaries[start] 189 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 190 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 191 192 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 193 // that we don't change anything, except we have 194 // []Repository. Alternatively, things we can divide up we do (like 195 // here). Right now I don't like that these numbers are not true, especially 196 // after aggregation. For now I will move forward with this until we can 197 // chat more. 198 return RepoStats{ 199 ContentBytes: int64(bytesContent) + int64(bytesFN), 200 Documents: int(end - start), 201 // CR keegan for stefan: our shard count is going to go out of whack, 202 // since we will aggregate these. So we will report more shards than are 203 // present on disk. What should we do? 204 Shards: 1, 205 206 // Sourcegraph specific 207 NewLinesCount: count, 208 DefaultBranchNewLinesCount: defaultCount, 209 OtherBranchesNewLinesCount: otherCount, 210 } 211} 212 213func (d *indexData) calculateStats() error { 214 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 215 var start, end uint32 216 217 for repoID, md := range d.repoMetaData { 218 // determine the file range for repo i 219 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 220 end++ 221 } 222 if start < end && d.repos[start] != uint16(repoID) { 223 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 224 } 225 226 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 227 Repository: md, 228 IndexMetadata: d.metaData, 229 Stats: d.calculateStatsForFileRange(start, end), 230 }) 231 start = end 232 } 233 234 // All repos in a compound shard share memoryUse. So we average out the 235 // memoryUse per shard in our reporting. This has the benefit that when you 236 // aggregate the IndexBytes you get back the actual memoryUse. 237 // 238 // TODO take into account tombstones for aggregation. Even better, adjust 239 // API to be shard centric not repo centric. 240 if len(d.repoListEntry) > 0 { 241 indexBytes := d.memoryUse() 242 indexBytesChunk := indexBytes / len(d.repoListEntry) 243 for i := range d.repoListEntry { 244 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 245 indexBytes -= indexBytesChunk 246 } 247 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 248 } 249 250 return nil 251} 252 253// calculateNewLinesStats computes some Sourcegraph specific statistics for files 254// in the range [start, end). These are not as efficient to calculate as the 255// normal statistics. We experimentally measured about a 10% slower shard load 256// time. However, we find these values very useful to track and computing them 257// outside of load time introduces a lot of complexity. 258func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 259 for i := start; i < end; i++ { 260 // branchMask is a bitmask of the branches for a document. Zoekt by 261 // convention represents the default branch as the lowest bit. 262 branchMask := d.fileBranchMasks[i] 263 isDefault := (branchMask & 1) == 1 264 others := uint64(bits.OnesCount64(branchMask >> 1)) 265 266 // this is readNewlines but only reading the size of each section which 267 // corresponds to the number of newlines. 268 sec := simpleSection{ 269 off: d.newlinesStart + d.newlinesIndex[i], 270 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 271 } 272 // We are only reading the first varint which is the size. So we don't 273 // need to read more than MaxVarintLen64 bytes. 274 if sec.sz > binary.MaxVarintLen64 { 275 sec.sz = binary.MaxVarintLen64 276 } 277 blob, err := d.readSectionBlob(sec) 278 if err != nil { 279 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 280 continue 281 } 282 sz, _ := binary.Uvarint(blob) 283 284 count += sz 285 if isDefault { 286 defaultCount += sz 287 } 288 otherCount += (others * sz) 289 } 290 291 return 292} 293 294func (d *indexData) String() string { 295 return fmt.Sprintf("shard(%s)", d.file.Name()) 296} 297 298// calculates an approximate size of indexData in memory in bytes. 299func (d *indexData) memoryUse() int { 300 sz := 0 301 for _, a := range [][]uint32{ 302 d.newlinesIndex, d.docSectionsIndex, 303 d.boundaries, d.fileNameIndex, 304 d.fileEndRunes, d.fileNameEndRunes, 305 d.fileEndSymbol, d.symbols.symKindIndex, 306 d.subRepos, 307 } { 308 sz += 4 * len(a) 309 } 310 sz += d.runeOffsets.sizeBytes() 311 sz += d.fileNameRuneOffsets.sizeBytes() 312 sz += len(d.languages) 313 sz += len(d.checksums) 314 sz += 2 * len(d.repos) 315 if len(d.ranks) > 0 { 316 sz += 8 * len(d.ranks) * len(d.ranks[0]) 317 } 318 sz += 8 * len(d.runeDocSections) 319 sz += 8 * len(d.fileBranchMasks) 320 sz += d.contentNgrams.SizeBytes() 321 sz += d.fileNameNgrams.SizeBytes() 322 return sz 323} 324 325// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 326// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids 327// overlapping trigrams to keep their intersection as small as possible. 328// 329// Invariant: first will always have a smaller index than last. 330func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 331 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 332 333 // If the trigrams are overlapping, then try to shift one to reduce overlap. 334 // This is guaranteed to produce a smaller intersection. 335 if last.index-first.index < ngramSize { 336 newFirstIndex := max(last.index-ngramSize, 0) 337 if newFirstIndex != first.index { 338 first = ngramOffs[indexMap[newFirstIndex]] 339 } 340 341 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 342 if newLastIndex != last.index { 343 last = ngramOffs[indexMap[newLastIndex]] 344 } 345 } 346 return 347} 348 349const maxUInt32 = 0xffffffff 350 351func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 352 // Find the two lowest frequency ngrams. 353 idx0, idx1 := 0, 0 354 min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 355 for i, x := range frequencies { 356 if x <= min0 { 357 idx0, idx1 = i, idx0 358 min0, min1 = x, min0 359 } else if x <= min1 { 360 idx1 = i 361 min1 = x 362 } 363 } 364 365 first = ngramOffs[idx0] 366 last = ngramOffs[idx1] 367 368 // Ensure first appears before last as a helpful invariant. 369 if first.index > last.index { 370 last, first = first, last 371 } 372 return 373} 374 375func (data *indexData) ngrams(filename bool) btreeIndex { 376 if filename { 377 return data.fileNameNgrams 378 } 379 return data.contentNgrams 380} 381 382type ngramIterationResults struct { 383 matchIterator 384 385 caseSensitive bool 386 fileName bool 387 substrBytes []byte 388 substrLowered []byte 389} 390 391func (r *ngramIterationResults) String() string { 392 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 393} 394 395func (r *ngramIterationResults) candidates() []*candidateMatch { 396 cs := r.matchIterator.candidates() 397 for _, c := range cs { 398 c.caseSensitive = r.caseSensitive 399 c.fileName = r.fileName 400 c.substrBytes = r.substrBytes 401 c.substrLowered = r.substrLowered 402 } 403 return cs 404} 405 406// experimentIterateNgramLookupLimit when non-zero will only lookup this many 407// ngrams from a query string. Note: that if case-insensitive, this only 408// limits the input. So we will still lookup the case folding. 409// 410// This experiment is targetting looking up large snippets. If it is 411// successful, we will likely hardcode the value we use in production. 412// 413// Future note: if we find cases where this works badly, we can consider only 414// searching a random subset of the query string to avoid bad strings. 415var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT") 416 417func getEnvInt(k string) int { 418 v, _ := strconv.Atoi(os.Getenv(k)) 419 if v != 0 { 420 log.Printf("%s = %d\n", k, v) 421 } 422 return v 423} 424 425func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 426 str := query.Pattern 427 428 // Find the 2 least common ngrams from the string. 429 var ngramOffs []runeNgramOff 430 if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 { 431 // Note: we can't just do str = str[:ngramLimit] due to utf-8 and str 432 // length is asked later on for other optimizations. 433 ngramOffs = splitNGramsLimit([]byte(str), ngramLimit) 434 } else { 435 ngramOffs = splitNGrams([]byte(str)) 436 } 437 438 // protect against accidental searching of empty strings 439 if len(ngramOffs) == 0 { 440 return nil, errors.New("iterateNgrams needs non empty string") 441 } 442 443 // PERF: Sort to increase the chances adjacent checks are in the same btree 444 // bucket (which can cause disk IO). 445 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 446 frequencies := make([]uint32, 0, len(ngramOffs)) 447 indexMap := make([]int, len(ngramOffs)) 448 ngramLookups := 0 449 ngrams := d.ngrams(query.FileName) 450 for i, o := range ngramOffs { 451 var freq uint32 452 if query.CaseSensitive { 453 freq = ngrams.Get(o.ngram).sz 454 ngramLookups++ 455 } else { 456 for _, v := range generateCaseNgrams(o.ngram) { 457 freq += ngrams.Get(v).sz 458 ngramLookups++ 459 } 460 } 461 462 if freq == 0 { 463 return &ngramIterationResults{ 464 matchIterator: &noMatchTree{ 465 Why: "freq=0", 466 Stats: Stats{ 467 NgramLookups: ngramLookups, 468 }, 469 }, 470 }, nil 471 } 472 473 frequencies = append(frequencies, freq) 474 indexMap[o.index] = i 475 } 476 477 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 478 479 iter := &ngramDocIterator{ 480 leftPad: uint32(first.index), 481 rightPad: uint32(utf8.RuneCountInString(str) - first.index), 482 ngramLookups: ngramLookups, 483 } 484 if query.FileName { 485 iter.ends = d.fileNameEndRunes 486 } else { 487 iter.ends = d.fileEndRunes 488 } 489 490 if first != last { 491 runeDist := uint32(last.index - first.index) 492 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 493 if err != nil { 494 return nil, err 495 } 496 497 iter.iter = i 498 } else { 499 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 500 if err != nil { 501 return nil, err 502 } 503 iter.iter = hitIter 504 } 505 506 patBytes := []byte(query.Pattern) 507 lowerPatBytes := toLower(patBytes) 508 509 return &ngramIterationResults{ 510 matchIterator: iter, 511 caseSensitive: query.CaseSensitive, 512 fileName: query.FileName, 513 substrBytes: patBytes, 514 substrLowered: lowerPatBytes, 515 }, nil 516} 517 518func (d *indexData) fileName(i uint32) []byte { 519 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 520} 521 522func (d *indexData) numDocs() uint32 { 523 return uint32(len(d.fileBranchMasks)) 524} 525 526func (s *indexData) Close() { 527 s.file.Close() 528} 529 530const ( 531 rawConfigYes = 1 532 rawConfigNo = 2 533) 534 535// encodeRawConfig encodes a rawConfig map into a uint8 mask. 536func encodeRawConfig(rawConfig map[string]string) uint8 { 537 var encoded uint8 538 for i, f := range []string{"public", "fork", "archived"} { 539 var e uint8 540 v, ok := rawConfig[f] 541 if ok && v == "1" { 542 e |= rawConfigYes 543 } else { 544 e |= rawConfigNo 545 } 546 encoded = encoded | e<<(2*i) 547 } 548 return encoded 549}