fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math" 24 "math/bits" 25 "slices" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt" 29 "github.com/sourcegraph/zoekt/query" 30) 31 32// indexData holds the pattern-independent data that we have to have 33// in memory to search. Most of the memory is taken up by the ngram => 34// offset index. 35type indexData struct { 36 symbols symbolData 37 38 file IndexFile 39 40 contentNgrams btreeIndex 41 42 newlinesStart uint32 43 newlinesIndex []uint32 44 45 docSectionsStart uint32 46 docSectionsIndex []uint32 47 48 runeDocSections []DocumentSection 49 50 // rune offset=>byte offset mapping, relative to the start of the content corpus 51 runeOffsets runeOffsetMap 52 53 // offsets of file contents; includes end of last file 54 boundariesStart uint32 55 boundaries []uint32 56 57 // rune offsets for the file content boundaries 58 fileEndRunes []uint32 59 60 fileNameContent []byte 61 fileNameIndex []uint32 62 fileNameNgrams btreeIndex 63 64 // fileEndSymbol[i] is the index of the first symbol for document i. 65 fileEndSymbol []uint32 66 67 // rune offset=>byte offset mapping, relative to the start of the filename corpus 68 fileNameRuneOffsets runeOffsetMap 69 70 // rune offsets for the file name boundaries 71 fileNameEndRunes []uint32 72 73 fileBranchMasks []uint64 74 75 // mask (power of 2) => name 76 branchNames []map[uint]string 77 78 // name => mask (power of 2) 79 branchIDs []map[string]uint 80 81 metaData zoekt.IndexMetadata 82 repoMetaData []zoekt.Repository 83 84 subRepos []uint32 85 subRepoPaths [][]string 86 87 // Checksums for all the files, at 8-byte intervals 88 checksums []byte 89 90 // languages for all the files. 91 languages []byte 92 93 // inverse of LanguageMap in metaData 94 languageMap map[uint16]string 95 96 // file categories for all the files. 97 categories []byte 98 99 repoListEntry []zoekt.RepoListEntry 100 101 // repository indexes for all the files 102 repos []uint16 103 104 // rawConfigMasks contains the encoded RawConfig for each repository 105 rawConfigMasks []uint8 106 107 // Cache for docMatchTree objects 108 docMatchTreeCache *docMatchTreeCache 109} 110 111type symbolData struct { 112 // symContent stores Symbol.Sym and Symbol.Parent. 113 // TODO we don't need to store Symbol.Sym. 114 symContent []byte 115 symIndex []byte 116 // symKindContent is an enum of sym.Kind and sym.ParentKind 117 symKindContent []byte 118 symKindIndex []uint32 119 // symMetadata is [4]uint32 0 Kind Parent ParentKind 120 symMetaData []byte 121} 122 123func uint32SliceAt(a []byte, n uint32) uint32 { 124 return binary.BigEndian.Uint32(a[n*4:]) 125} 126 127func uint32SliceLen(a []byte) uint32 { 128 return uint32(len(a) / 4) 129} 130 131// parent returns index i of the parent enum 132func (d *symbolData) parent(i uint32) []byte { 133 delta := uint32SliceAt(d.symIndex, 0) 134 start := uint32SliceAt(d.symIndex, i) - delta 135 var end uint32 136 if i+1 == uint32SliceLen(d.symIndex) { 137 end = uint32(len(d.symContent)) 138 } else { 139 end = uint32SliceAt(d.symIndex, i+1) - delta 140 } 141 return d.symContent[start:end] 142} 143 144// kind returns index i of the kind enum 145func (d *symbolData) kind(i uint32) []byte { 146 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 147} 148 149// data returns the symbol at index i 150func (d *symbolData) data(i uint32) *zoekt.Symbol { 151 size := uint32(4 * 4) // 4 uint32s 152 offset := i * size 153 if offset >= uint32(len(d.symMetaData)) { 154 return nil 155 } 156 157 metadata := d.symMetaData[offset : offset+size] 158 sym := &zoekt.Symbol{} 159 key := uint32SliceAt(metadata, 1) 160 sym.Kind = string(d.kind(key)) 161 key = uint32SliceAt(metadata, 2) 162 sym.Parent = string(d.parent(key)) 163 key = uint32SliceAt(metadata, 3) 164 sym.ParentKind = string(d.kind(key)) 165 return sym 166} 167 168func (d *indexData) getChecksum(idx uint32) []byte { 169 start := crc64.Size * idx 170 return d.checksums[start : start+crc64.Size] 171} 172 173func (d *indexData) getLanguage(idx uint32) uint16 { 174 if d.metaData.IndexFeatureVersion < 12 { 175 // older zoekt files had 8-bit language entries 176 return uint16(d.languages[idx]) 177 } 178 // newer zoekt files have 16-bit language entries 179 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 180} 181 182func (d *indexData) getCategory(idx uint32) FileCategory { 183 if len(d.categories) == 0 { 184 // This means we're reading an older index, so return 'missing' 185 return FileCategoryMissing 186 } 187 category, err := decodeCategory(d.categories[idx]) 188 if err != nil { 189 return FileCategoryMissing 190 } 191 return category 192} 193 194// calculates stats for files in the range [start, end). 195func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats { 196 if start >= end { 197 // An empty shard for an empty repository. 198 return zoekt.RepoStats{ 199 Shards: 1, 200 } 201 } 202 203 bytesContent := d.boundaries[end] - d.boundaries[start] 204 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 205 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 206 207 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 208 // that we don't change anything, except we have 209 // []Repository. Alternatively, things we can divide up we do (like 210 // here). Right now I don't like that these numbers are not true, especially 211 // after aggregation. For now I will move forward with this until we can 212 // chat more. 213 return zoekt.RepoStats{ 214 ContentBytes: int64(bytesContent) + int64(bytesFN), 215 Documents: int(end - start), 216 // CR keegan for stefan: our shard count is going to go out of whack, 217 // since we will aggregate these. So we will report more shards than are 218 // present on disk. What should we do? 219 Shards: 1, 220 221 // Sourcegraph specific 222 NewLinesCount: count, 223 DefaultBranchNewLinesCount: defaultCount, 224 OtherBranchesNewLinesCount: otherCount, 225 } 226} 227 228func (d *indexData) calculateStats() error { 229 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData)) 230 var start, end uint32 231 232 for repoID, md := range d.repoMetaData { 233 // determine the file range for repo i 234 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 235 end++ 236 } 237 if start < end && d.repos[start] != uint16(repoID) { 238 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 239 } 240 241 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{ 242 Repository: md, 243 IndexMetadata: d.metaData, 244 Stats: d.calculateStatsForFileRange(start, end), 245 }) 246 start = end 247 } 248 249 // All repos in a compound shard share memoryUse. So we average out the 250 // memoryUse per shard in our reporting. This has the benefit that when you 251 // aggregate the IndexBytes you get back the actual memoryUse. 252 // 253 // TODO take into account tombstones for aggregation. Even better, adjust 254 // API to be shard centric not repo centric. 255 if len(d.repoListEntry) > 0 { 256 indexBytes := d.memoryUse() 257 indexBytesChunk := indexBytes / len(d.repoListEntry) 258 for i := range d.repoListEntry { 259 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 260 indexBytes -= indexBytesChunk 261 } 262 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 263 } 264 265 return nil 266} 267 268// calculateNewLinesStats computes some Sourcegraph specific statistics for files 269// in the range [start, end). These are not as efficient to calculate as the 270// normal statistics. We experimentally measured about a 10% slower shard load 271// time. However, we find these values very useful to track and computing them 272// outside of load time introduces a lot of complexity. 273func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 274 for i := start; i < end; i++ { 275 // branchMask is a bitmask of the branches for a document. Zoekt by 276 // convention represents the default branch as the lowest bit. 277 branchMask := d.fileBranchMasks[i] 278 isDefault := (branchMask & 1) == 1 279 others := uint64(bits.OnesCount64(branchMask >> 1)) 280 281 // this is readNewlines but only reading the size of each section which 282 // corresponds to the number of newlines. 283 sec := simpleSection{ 284 off: d.newlinesStart + d.newlinesIndex[i], 285 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 286 } 287 // We are only reading the first varint which is the size. So we don't 288 // need to read more than MaxVarintLen64 bytes. 289 if sec.sz > binary.MaxVarintLen64 { 290 sec.sz = binary.MaxVarintLen64 291 } 292 blob, err := d.readSectionBlob(sec) 293 if err != nil { 294 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 295 continue 296 } 297 sz, _ := binary.Uvarint(blob) 298 299 count += sz 300 if isDefault { 301 defaultCount += sz 302 } 303 otherCount += (others * sz) 304 } 305 306 return 307} 308 309func (d *indexData) String() string { 310 return fmt.Sprintf("shard(%s)", d.file.Name()) 311} 312 313// calculates an approximate size of indexData in memory in bytes. 314func (d *indexData) memoryUse() int { 315 sz := 0 316 for _, a := range [][]uint32{ 317 d.newlinesIndex, d.docSectionsIndex, 318 d.boundaries, d.fileNameIndex, 319 d.fileEndRunes, d.fileNameEndRunes, 320 d.fileEndSymbol, d.symbols.symKindIndex, 321 d.subRepos, 322 } { 323 sz += 4 * len(a) 324 } 325 sz += d.runeOffsets.sizeBytes() 326 sz += d.fileNameRuneOffsets.sizeBytes() 327 sz += len(d.languages) 328 sz += len(d.checksums) 329 sz += 2 * len(d.repos) 330 sz += 8 * len(d.runeDocSections) 331 sz += 8 * len(d.fileBranchMasks) 332 sz += d.contentNgrams.SizeBytes() 333 sz += d.fileNameNgrams.SizeBytes() 334 return sz 335} 336 337// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 338// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids 339// overlapping trigrams to keep their intersection as small as possible. 340// 341// Invariant: first will always have a smaller index than last. 342func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 343 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 344 345 // If the trigrams are overlapping, then try to shift one to reduce overlap. 346 // This is guaranteed to produce a smaller intersection. 347 if last.index-first.index < ngramSize { 348 newFirstIndex := max(last.index-ngramSize, 0) 349 if newFirstIndex != first.index { 350 first = ngramOffs[indexMap[newFirstIndex]] 351 } 352 353 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 354 if newLastIndex != last.index { 355 last = ngramOffs[indexMap[newLastIndex]] 356 } 357 } 358 return 359} 360 361func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 362 // Find the two lowest frequency ngrams. 363 idx0, idx1 := 0, 0 364 min0, min1 := uint32(math.MaxUint32), uint32(math.MaxUint32) 365 for i, x := range frequencies { 366 if x <= min0 { 367 idx0, idx1 = i, idx0 368 min0, min1 = x, min0 369 } else if x <= min1 { 370 idx1 = i 371 min1 = x 372 } 373 } 374 375 first = ngramOffs[idx0] 376 last = ngramOffs[idx1] 377 378 // Ensure first appears before last as a helpful invariant. 379 if first.index > last.index { 380 last, first = first, last 381 } 382 return 383} 384 385func (data *indexData) ngrams(filename bool) btreeIndex { 386 if filename { 387 return data.fileNameNgrams 388 } 389 return data.contentNgrams 390} 391 392type ngramIterationResults struct { 393 matchIterator 394 395 caseSensitive bool 396 fileName bool 397 substrBytes []byte 398 substrLowered []byte 399} 400 401func (r *ngramIterationResults) String() string { 402 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 403} 404 405func (r *ngramIterationResults) candidates() []*candidateMatch { 406 cs := r.matchIterator.candidates() 407 for _, c := range cs { 408 c.caseSensitive = r.caseSensitive 409 c.fileName = r.fileName 410 c.substrBytes = r.substrBytes 411 c.substrLowered = r.substrLowered 412 } 413 return cs 414} 415 416func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 417 str := query.Pattern 418 419 // Find the 2 least common ngrams from the string. 420 ngramOffs := splitNGrams([]byte(str)) 421 422 // protect against accidental searching of empty strings 423 if len(ngramOffs) == 0 { 424 return nil, errors.New("iterateNgrams needs non empty string") 425 } 426 427 // PERF: Sort to increase the chances adjacent checks are in the same btree 428 // bucket (which can cause disk IO). 429 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 430 frequencies := make([]uint32, 0, len(ngramOffs)) 431 indexMap := make([]int, len(ngramOffs)) 432 ngramLookups := 0 433 ngrams := d.ngrams(query.FileName) 434 for i, o := range ngramOffs { 435 var freq uint32 436 if query.CaseSensitive { 437 freq = ngrams.Get(o.ngram).sz 438 ngramLookups++ 439 } else { 440 for _, v := range generateCaseNgrams(o.ngram) { 441 freq += ngrams.Get(v).sz 442 ngramLookups++ 443 } 444 } 445 446 if freq == 0 { 447 return &ngramIterationResults{ 448 matchIterator: &noMatchTree{ 449 Why: "freq=0", 450 Stats: zoekt.Stats{ 451 NgramLookups: ngramLookups, 452 }, 453 }, 454 }, nil 455 } 456 457 frequencies = append(frequencies, freq) 458 indexMap[o.index] = i 459 } 460 461 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 462 463 iter := &ngramDocIterator{ 464 leftPad: uint32(first.index), 465 rightPad: uint32(utf8.RuneCountInString(str) - first.index), 466 ngramLookups: ngramLookups, 467 } 468 if query.FileName { 469 iter.ends = d.fileNameEndRunes 470 } else { 471 iter.ends = d.fileEndRunes 472 } 473 474 if first != last { 475 runeDist := uint32(last.index - first.index) 476 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 477 if err != nil { 478 return nil, err 479 } 480 481 iter.iter = i 482 } else { 483 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 484 if err != nil { 485 return nil, err 486 } 487 iter.iter = hitIter 488 } 489 490 patBytes := []byte(query.Pattern) 491 lowerPatBytes := toLower(patBytes) 492 493 return &ngramIterationResults{ 494 matchIterator: iter, 495 caseSensitive: query.CaseSensitive, 496 fileName: query.FileName, 497 substrBytes: patBytes, 498 substrLowered: lowerPatBytes, 499 }, nil 500} 501 502func (d *indexData) fileName(i uint32) []byte { 503 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 504} 505 506func (d *indexData) numDocs() uint32 { 507 return uint32(len(d.fileBranchMasks)) 508} 509 510func (s *indexData) Close() { 511 s.file.Close() 512} 513 514const ( 515 rawConfigYes = 1 516 rawConfigNo = 2 517) 518 519// encodeRawConfig encodes a rawConfig map into a uint8 mask. 520func encodeRawConfig(rawConfig map[string]string) uint8 { 521 var encoded uint8 522 for i, f := range []string{"public", "fork", "archived"} { 523 var e uint8 524 v, ok := rawConfig[f] 525 if ok && v == "1" { 526 e |= rawConfigYes 527 } else { 528 e |= rawConfigNo 529 } 530 encoded = encoded | e<<(2*i) 531 } 532 return encoded 533}