fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math/bits" 24 "slices" 25 "unicode/utf8" 26 27 "github.com/sourcegraph/zoekt" 28 "github.com/sourcegraph/zoekt/query" 29) 30 31// indexData holds the pattern-independent data that we have to have 32// in memory to search. Most of the memory is taken up by the ngram => 33// offset index. 34type indexData struct { 35 symbols symbolData 36 37 file IndexFile 38 39 contentNgrams btreeIndex 40 41 newlinesStart uint32 42 newlinesIndex []uint32 43 44 docSectionsStart uint32 45 docSectionsIndex []uint32 46 47 runeDocSections []DocumentSection 48 49 // rune offset=>byte offset mapping, relative to the start of the content corpus 50 runeOffsets runeOffsetMap 51 52 // offsets of file contents; includes end of last file 53 boundariesStart uint32 54 boundaries []uint32 55 56 // rune offsets for the file content boundaries 57 fileEndRunes []uint32 58 59 fileNameContent []byte 60 fileNameIndex []uint32 61 fileNameNgrams btreeIndex 62 63 // fileEndSymbol[i] is the index of the first symbol for document i. 64 fileEndSymbol []uint32 65 66 // rune offset=>byte offset mapping, relative to the start of the filename corpus 67 fileNameRuneOffsets runeOffsetMap 68 69 // rune offsets for the file name boundaries 70 fileNameEndRunes []uint32 71 72 fileBranchMasks []uint64 73 74 // mask (power of 2) => name 75 branchNames []map[uint]string 76 77 // name => mask (power of 2) 78 branchIDs []map[string]uint 79 80 metaData zoekt.IndexMetadata 81 repoMetaData []zoekt.Repository 82 83 subRepos []uint32 84 subRepoPaths [][]string 85 86 // Checksums for all the files, at 8-byte intervals 87 checksums []byte 88 89 // languages for all the files. 90 languages []byte 91 92 // inverse of LanguageMap in metaData 93 languageMap map[uint16]string 94 95 // file categories for all the files. 96 categories []byte 97 98 repoListEntry []zoekt.RepoListEntry 99 100 // repository indexes for all the files 101 repos []uint16 102 103 // rawConfigMasks contains the encoded RawConfig for each repository 104 rawConfigMasks []uint8 105} 106 107type symbolData struct { 108 // symContent stores Symbol.Sym and Symbol.Parent. 109 // TODO we don't need to store Symbol.Sym. 110 symContent []byte 111 symIndex []byte 112 // symKindContent is an enum of sym.Kind and sym.ParentKind 113 symKindContent []byte 114 symKindIndex []uint32 115 // symMetadata is [4]uint32 0 Kind Parent ParentKind 116 symMetaData []byte 117} 118 119func uint32SliceAt(a []byte, n uint32) uint32 { 120 return binary.BigEndian.Uint32(a[n*4:]) 121} 122 123func uint32SliceLen(a []byte) uint32 { 124 return uint32(len(a) / 4) 125} 126 127// parent returns index i of the parent enum 128func (d *symbolData) parent(i uint32) []byte { 129 delta := uint32SliceAt(d.symIndex, 0) 130 start := uint32SliceAt(d.symIndex, i) - delta 131 var end uint32 132 if i+1 == uint32SliceLen(d.symIndex) { 133 end = uint32(len(d.symContent)) 134 } else { 135 end = uint32SliceAt(d.symIndex, i+1) - delta 136 } 137 return d.symContent[start:end] 138} 139 140// kind returns index i of the kind enum 141func (d *symbolData) kind(i uint32) []byte { 142 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 143} 144 145// data returns the symbol at index i 146func (d *symbolData) data(i uint32) *zoekt.Symbol { 147 size := uint32(4 * 4) // 4 uint32s 148 offset := i * size 149 if offset >= uint32(len(d.symMetaData)) { 150 return nil 151 } 152 153 metadata := d.symMetaData[offset : offset+size] 154 sym := &zoekt.Symbol{} 155 key := uint32SliceAt(metadata, 1) 156 sym.Kind = string(d.kind(key)) 157 key = uint32SliceAt(metadata, 2) 158 sym.Parent = string(d.parent(key)) 159 key = uint32SliceAt(metadata, 3) 160 sym.ParentKind = string(d.kind(key)) 161 return sym 162} 163 164func (d *indexData) getChecksum(idx uint32) []byte { 165 start := crc64.Size * idx 166 return d.checksums[start : start+crc64.Size] 167} 168 169func (d *indexData) getLanguage(idx uint32) uint16 { 170 if d.metaData.IndexFeatureVersion < 12 { 171 // older zoekt files had 8-bit language entries 172 return uint16(d.languages[idx]) 173 } 174 // newer zoekt files have 16-bit language entries 175 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 176} 177 178func (d *indexData) getCategory(idx uint32) FileCategory { 179 if len(d.categories) == 0 { 180 // This means we're reading an older index, so return 'missing' 181 return FileCategoryMissing 182 } 183 category, err := decodeCategory(d.categories[idx]) 184 if err != nil { 185 return FileCategoryMissing 186 } 187 return category 188} 189 190// calculates stats for files in the range [start, end). 191func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats { 192 if start >= end { 193 // An empty shard for an empty repository. 194 return zoekt.RepoStats{ 195 Shards: 1, 196 } 197 } 198 199 bytesContent := d.boundaries[end] - d.boundaries[start] 200 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 201 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 202 203 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 204 // that we don't change anything, except we have 205 // []Repository. Alternatively, things we can divide up we do (like 206 // here). Right now I don't like that these numbers are not true, especially 207 // after aggregation. For now I will move forward with this until we can 208 // chat more. 209 return zoekt.RepoStats{ 210 ContentBytes: int64(bytesContent) + int64(bytesFN), 211 Documents: int(end - start), 212 // CR keegan for stefan: our shard count is going to go out of whack, 213 // since we will aggregate these. So we will report more shards than are 214 // present on disk. What should we do? 215 Shards: 1, 216 217 // Sourcegraph specific 218 NewLinesCount: count, 219 DefaultBranchNewLinesCount: defaultCount, 220 OtherBranchesNewLinesCount: otherCount, 221 } 222} 223 224func (d *indexData) calculateStats() error { 225 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData)) 226 var start, end uint32 227 228 for repoID, md := range d.repoMetaData { 229 // determine the file range for repo i 230 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 231 end++ 232 } 233 if start < end && d.repos[start] != uint16(repoID) { 234 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 235 } 236 237 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{ 238 Repository: md, 239 IndexMetadata: d.metaData, 240 Stats: d.calculateStatsForFileRange(start, end), 241 }) 242 start = end 243 } 244 245 // All repos in a compound shard share memoryUse. So we average out the 246 // memoryUse per shard in our reporting. This has the benefit that when you 247 // aggregate the IndexBytes you get back the actual memoryUse. 248 // 249 // TODO take into account tombstones for aggregation. Even better, adjust 250 // API to be shard centric not repo centric. 251 if len(d.repoListEntry) > 0 { 252 indexBytes := d.memoryUse() 253 indexBytesChunk := indexBytes / len(d.repoListEntry) 254 for i := range d.repoListEntry { 255 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 256 indexBytes -= indexBytesChunk 257 } 258 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 259 } 260 261 return nil 262} 263 264// calculateNewLinesStats computes some Sourcegraph specific statistics for files 265// in the range [start, end). These are not as efficient to calculate as the 266// normal statistics. We experimentally measured about a 10% slower shard load 267// time. However, we find these values very useful to track and computing them 268// outside of load time introduces a lot of complexity. 269func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 270 for i := start; i < end; i++ { 271 // branchMask is a bitmask of the branches for a document. Zoekt by 272 // convention represents the default branch as the lowest bit. 273 branchMask := d.fileBranchMasks[i] 274 isDefault := (branchMask & 1) == 1 275 others := uint64(bits.OnesCount64(branchMask >> 1)) 276 277 // this is readNewlines but only reading the size of each section which 278 // corresponds to the number of newlines. 279 sec := simpleSection{ 280 off: d.newlinesStart + d.newlinesIndex[i], 281 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 282 } 283 // We are only reading the first varint which is the size. So we don't 284 // need to read more than MaxVarintLen64 bytes. 285 if sec.sz > binary.MaxVarintLen64 { 286 sec.sz = binary.MaxVarintLen64 287 } 288 blob, err := d.readSectionBlob(sec) 289 if err != nil { 290 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 291 continue 292 } 293 sz, _ := binary.Uvarint(blob) 294 295 count += sz 296 if isDefault { 297 defaultCount += sz 298 } 299 otherCount += (others * sz) 300 } 301 302 return 303} 304 305func (d *indexData) String() string { 306 return fmt.Sprintf("shard(%s)", d.file.Name()) 307} 308 309// calculates an approximate size of indexData in memory in bytes. 310func (d *indexData) memoryUse() int { 311 sz := 0 312 for _, a := range [][]uint32{ 313 d.newlinesIndex, d.docSectionsIndex, 314 d.boundaries, d.fileNameIndex, 315 d.fileEndRunes, d.fileNameEndRunes, 316 d.fileEndSymbol, d.symbols.symKindIndex, 317 d.subRepos, 318 } { 319 sz += 4 * len(a) 320 } 321 sz += d.runeOffsets.sizeBytes() 322 sz += d.fileNameRuneOffsets.sizeBytes() 323 sz += len(d.languages) 324 sz += len(d.checksums) 325 sz += 2 * len(d.repos) 326 sz += 8 * len(d.runeDocSections) 327 sz += 8 * len(d.fileBranchMasks) 328 sz += d.contentNgrams.SizeBytes() 329 sz += d.fileNameNgrams.SizeBytes() 330 return sz 331} 332 333// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 334// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids 335// overlapping trigrams to keep their intersection as small as possible. 336// 337// Invariant: first will always have a smaller index than last. 338func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 339 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 340 341 // If the trigrams are overlapping, then try to shift one to reduce overlap. 342 // This is guaranteed to produce a smaller intersection. 343 if last.index-first.index < ngramSize { 344 newFirstIndex := max(last.index-ngramSize, 0) 345 if newFirstIndex != first.index { 346 first = ngramOffs[indexMap[newFirstIndex]] 347 } 348 349 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 350 if newLastIndex != last.index { 351 last = ngramOffs[indexMap[newLastIndex]] 352 } 353 } 354 return 355} 356 357const maxUInt32 = 0xffffffff 358 359func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 360 // Find the two lowest frequency ngrams. 361 idx0, idx1 := 0, 0 362 min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 363 for i, x := range frequencies { 364 if x <= min0 { 365 idx0, idx1 = i, idx0 366 min0, min1 = x, min0 367 } else if x <= min1 { 368 idx1 = i 369 min1 = x 370 } 371 } 372 373 first = ngramOffs[idx0] 374 last = ngramOffs[idx1] 375 376 // Ensure first appears before last as a helpful invariant. 377 if first.index > last.index { 378 last, first = first, last 379 } 380 return 381} 382 383func (data *indexData) ngrams(filename bool) btreeIndex { 384 if filename { 385 return data.fileNameNgrams 386 } 387 return data.contentNgrams 388} 389 390type ngramIterationResults struct { 391 matchIterator 392 393 caseSensitive bool 394 fileName bool 395 substrBytes []byte 396 substrLowered []byte 397} 398 399func (r *ngramIterationResults) String() string { 400 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 401} 402 403func (r *ngramIterationResults) candidates() []*candidateMatch { 404 cs := r.matchIterator.candidates() 405 for _, c := range cs { 406 c.caseSensitive = r.caseSensitive 407 c.fileName = r.fileName 408 c.substrBytes = r.substrBytes 409 c.substrLowered = r.substrLowered 410 } 411 return cs 412} 413 414func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 415 str := query.Pattern 416 417 // Find the 2 least common ngrams from the string. 418 ngramOffs := splitNGrams([]byte(str)) 419 420 // protect against accidental searching of empty strings 421 if len(ngramOffs) == 0 { 422 return nil, errors.New("iterateNgrams needs non empty string") 423 } 424 425 // PERF: Sort to increase the chances adjacent checks are in the same btree 426 // bucket (which can cause disk IO). 427 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 428 frequencies := make([]uint32, 0, len(ngramOffs)) 429 indexMap := make([]int, len(ngramOffs)) 430 ngramLookups := 0 431 ngrams := d.ngrams(query.FileName) 432 for i, o := range ngramOffs { 433 var freq uint32 434 if query.CaseSensitive { 435 freq = ngrams.Get(o.ngram).sz 436 ngramLookups++ 437 } else { 438 for _, v := range generateCaseNgrams(o.ngram) { 439 freq += ngrams.Get(v).sz 440 ngramLookups++ 441 } 442 } 443 444 if freq == 0 { 445 return &ngramIterationResults{ 446 matchIterator: &noMatchTree{ 447 Why: "freq=0", 448 Stats: zoekt.Stats{ 449 NgramLookups: ngramLookups, 450 }, 451 }, 452 }, nil 453 } 454 455 frequencies = append(frequencies, freq) 456 indexMap[o.index] = i 457 } 458 459 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 460 461 iter := &ngramDocIterator{ 462 leftPad: uint32(first.index), 463 rightPad: uint32(utf8.RuneCountInString(str) - first.index), 464 ngramLookups: ngramLookups, 465 } 466 if query.FileName { 467 iter.ends = d.fileNameEndRunes 468 } else { 469 iter.ends = d.fileEndRunes 470 } 471 472 if first != last { 473 runeDist := uint32(last.index - first.index) 474 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 475 if err != nil { 476 return nil, err 477 } 478 479 iter.iter = i 480 } else { 481 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 482 if err != nil { 483 return nil, err 484 } 485 iter.iter = hitIter 486 } 487 488 patBytes := []byte(query.Pattern) 489 lowerPatBytes := toLower(patBytes) 490 491 return &ngramIterationResults{ 492 matchIterator: iter, 493 caseSensitive: query.CaseSensitive, 494 fileName: query.FileName, 495 substrBytes: patBytes, 496 substrLowered: lowerPatBytes, 497 }, nil 498} 499 500func (d *indexData) fileName(i uint32) []byte { 501 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 502} 503 504func (d *indexData) numDocs() uint32 { 505 return uint32(len(d.fileBranchMasks)) 506} 507 508func (s *indexData) Close() { 509 s.file.Close() 510} 511 512const ( 513 rawConfigYes = 1 514 rawConfigNo = 2 515) 516 517// encodeRawConfig encodes a rawConfig map into a uint8 mask. 518func encodeRawConfig(rawConfig map[string]string) uint8 { 519 var encoded uint8 520 for i, f := range []string{"public", "fork", "archived"} { 521 var e uint8 522 v, ok := rawConfig[f] 523 if ok && v == "1" { 524 e |= rawConfigYes 525 } else { 526 e |= rawConfigNo 527 } 528 encoded = encoded | e<<(2*i) 529 } 530 return encoded 531}