fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math/bits" 24 "slices" 25 "unicode/utf8" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// indexData holds the pattern-independent data that we have to have 31// in memory to search. Most of the memory is taken up by the ngram => 32// offset index. 33type indexData struct { 34 symbols symbolData 35 36 file IndexFile 37 38 contentNgrams btreeIndex 39 40 newlinesStart uint32 41 newlinesIndex []uint32 42 43 docSectionsStart uint32 44 docSectionsIndex []uint32 45 46 runeDocSections []DocumentSection 47 48 // rune offset=>byte offset mapping, relative to the start of the content corpus 49 runeOffsets runeOffsetMap 50 51 // offsets of file contents; includes end of last file 52 boundariesStart uint32 53 boundaries []uint32 54 55 // rune offsets for the file content boundaries 56 fileEndRunes []uint32 57 58 fileNameContent []byte 59 fileNameIndex []uint32 60 fileNameNgrams btreeIndex 61 62 // fileEndSymbol[i] is the index of the first symbol for document i. 63 fileEndSymbol []uint32 64 65 // rune offset=>byte offset mapping, relative to the start of the filename corpus 66 fileNameRuneOffsets runeOffsetMap 67 68 // rune offsets for the file name boundaries 69 fileNameEndRunes []uint32 70 71 fileBranchMasks []uint64 72 73 // mask (power of 2) => name 74 branchNames []map[uint]string 75 76 // name => mask (power of 2) 77 branchIDs []map[string]uint 78 79 metaData IndexMetadata 80 repoMetaData []Repository 81 82 subRepos []uint32 83 subRepoPaths [][]string 84 85 // Checksums for all the files, at 8-byte intervals 86 checksums []byte 87 88 // languages for all the files. 89 languages []byte 90 91 // inverse of LanguageMap in metaData 92 languageMap map[uint16]string 93 94 repoListEntry []RepoListEntry 95 96 // repository indexes for all the files 97 repos []uint16 98 99 // rawConfigMasks contains the encoded RawConfig for each repository 100 rawConfigMasks []uint8 101} 102 103type symbolData struct { 104 // symContent stores Symbol.Sym and Symbol.Parent. 105 // TODO we don't need to store Symbol.Sym. 106 symContent []byte 107 symIndex []byte 108 // symKindContent is an enum of sym.Kind and sym.ParentKind 109 symKindContent []byte 110 symKindIndex []uint32 111 // symMetadata is [4]uint32 0 Kind Parent ParentKind 112 symMetaData []byte 113} 114 115func uint32SliceAt(a []byte, n uint32) uint32 { 116 return binary.BigEndian.Uint32(a[n*4:]) 117} 118 119func uint32SliceLen(a []byte) uint32 { 120 return uint32(len(a) / 4) 121} 122 123// parent returns index i of the parent enum 124func (d *symbolData) parent(i uint32) []byte { 125 delta := uint32SliceAt(d.symIndex, 0) 126 start := uint32SliceAt(d.symIndex, i) - delta 127 var end uint32 128 if i+1 == uint32SliceLen(d.symIndex) { 129 end = uint32(len(d.symContent)) 130 } else { 131 end = uint32SliceAt(d.symIndex, i+1) - delta 132 } 133 return d.symContent[start:end] 134} 135 136// kind returns index i of the kind enum 137func (d *symbolData) kind(i uint32) []byte { 138 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 139} 140 141// data returns the symbol at index i 142func (d *symbolData) data(i uint32) *Symbol { 143 size := uint32(4 * 4) // 4 uint32s 144 offset := i * size 145 if offset >= uint32(len(d.symMetaData)) { 146 return nil 147 } 148 149 metadata := d.symMetaData[offset : offset+size] 150 sym := &Symbol{} 151 key := uint32SliceAt(metadata, 1) 152 sym.Kind = string(d.kind(key)) 153 key = uint32SliceAt(metadata, 2) 154 sym.Parent = string(d.parent(key)) 155 key = uint32SliceAt(metadata, 3) 156 sym.ParentKind = string(d.kind(key)) 157 return sym 158} 159 160func (d *indexData) getChecksum(idx uint32) []byte { 161 start := crc64.Size * idx 162 return d.checksums[start : start+crc64.Size] 163} 164 165func (d *indexData) getLanguage(idx uint32) uint16 { 166 if d.metaData.IndexFeatureVersion < 12 { 167 // older zoekt files had 8-bit language entries 168 return uint16(d.languages[idx]) 169 } 170 // newer zoekt files have 16-bit language entries 171 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 172} 173 174// calculates stats for files in the range [start, end). 175func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 176 if start >= end { 177 // An empty shard for an empty repository. 178 return RepoStats{ 179 Shards: 1, 180 } 181 } 182 183 bytesContent := d.boundaries[end] - d.boundaries[start] 184 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 185 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 186 187 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 188 // that we don't change anything, except we have 189 // []Repository. Alternatively, things we can divide up we do (like 190 // here). Right now I don't like that these numbers are not true, especially 191 // after aggregation. For now I will move forward with this until we can 192 // chat more. 193 return RepoStats{ 194 ContentBytes: int64(bytesContent) + int64(bytesFN), 195 Documents: int(end - start), 196 // CR keegan for stefan: our shard count is going to go out of whack, 197 // since we will aggregate these. So we will report more shards than are 198 // present on disk. What should we do? 199 Shards: 1, 200 201 // Sourcegraph specific 202 NewLinesCount: count, 203 DefaultBranchNewLinesCount: defaultCount, 204 OtherBranchesNewLinesCount: otherCount, 205 } 206} 207 208func (d *indexData) calculateStats() error { 209 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 210 var start, end uint32 211 212 for repoID, md := range d.repoMetaData { 213 // determine the file range for repo i 214 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 215 end++ 216 } 217 if start < end && d.repos[start] != uint16(repoID) { 218 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 219 } 220 221 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 222 Repository: md, 223 IndexMetadata: d.metaData, 224 Stats: d.calculateStatsForFileRange(start, end), 225 }) 226 start = end 227 } 228 229 // All repos in a compound shard share memoryUse. So we average out the 230 // memoryUse per shard in our reporting. This has the benefit that when you 231 // aggregate the IndexBytes you get back the actual memoryUse. 232 // 233 // TODO take into account tombstones for aggregation. Even better, adjust 234 // API to be shard centric not repo centric. 235 if len(d.repoListEntry) > 0 { 236 indexBytes := d.memoryUse() 237 indexBytesChunk := indexBytes / len(d.repoListEntry) 238 for i := range d.repoListEntry { 239 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 240 indexBytes -= indexBytesChunk 241 } 242 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 243 } 244 245 return nil 246} 247 248// calculateNewLinesStats computes some Sourcegraph specific statistics for files 249// in the range [start, end). These are not as efficient to calculate as the 250// normal statistics. We experimentally measured about a 10% slower shard load 251// time. However, we find these values very useful to track and computing them 252// outside of load time introduces a lot of complexity. 253func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 254 for i := start; i < end; i++ { 255 // branchMask is a bitmask of the branches for a document. Zoekt by 256 // convention represents the default branch as the lowest bit. 257 branchMask := d.fileBranchMasks[i] 258 isDefault := (branchMask & 1) == 1 259 others := uint64(bits.OnesCount64(branchMask >> 1)) 260 261 // this is readNewlines but only reading the size of each section which 262 // corresponds to the number of newlines. 263 sec := simpleSection{ 264 off: d.newlinesStart + d.newlinesIndex[i], 265 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 266 } 267 // We are only reading the first varint which is the size. So we don't 268 // need to read more than MaxVarintLen64 bytes. 269 if sec.sz > binary.MaxVarintLen64 { 270 sec.sz = binary.MaxVarintLen64 271 } 272 blob, err := d.readSectionBlob(sec) 273 if err != nil { 274 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 275 continue 276 } 277 sz, _ := binary.Uvarint(blob) 278 279 count += sz 280 if isDefault { 281 defaultCount += sz 282 } 283 otherCount += (others * sz) 284 } 285 286 return 287} 288 289func (d *indexData) String() string { 290 return fmt.Sprintf("shard(%s)", d.file.Name()) 291} 292 293// calculates an approximate size of indexData in memory in bytes. 294func (d *indexData) memoryUse() int { 295 sz := 0 296 for _, a := range [][]uint32{ 297 d.newlinesIndex, d.docSectionsIndex, 298 d.boundaries, d.fileNameIndex, 299 d.fileEndRunes, d.fileNameEndRunes, 300 d.fileEndSymbol, d.symbols.symKindIndex, 301 d.subRepos, 302 } { 303 sz += 4 * len(a) 304 } 305 sz += d.runeOffsets.sizeBytes() 306 sz += d.fileNameRuneOffsets.sizeBytes() 307 sz += len(d.languages) 308 sz += len(d.checksums) 309 sz += 2 * len(d.repos) 310 sz += 8 * len(d.runeDocSections) 311 sz += 8 * len(d.fileBranchMasks) 312 sz += d.contentNgrams.SizeBytes() 313 sz += d.fileNameNgrams.SizeBytes() 314 return sz 315} 316 317// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to 318// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids 319// overlapping trigrams to keep their intersection as small as possible. 320// 321// Invariant: first will always have a smaller index than last. 322func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) { 323 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies) 324 325 // If the trigrams are overlapping, then try to shift one to reduce overlap. 326 // This is guaranteed to produce a smaller intersection. 327 if last.index-first.index < ngramSize { 328 newFirstIndex := max(last.index-ngramSize, 0) 329 if newFirstIndex != first.index { 330 first = ngramOffs[indexMap[newFirstIndex]] 331 } 332 333 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1) 334 if newLastIndex != last.index { 335 last = ngramOffs[indexMap[newLastIndex]] 336 } 337 } 338 return 339} 340 341const maxUInt32 = 0xffffffff 342 343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 344 // Find the two lowest frequency ngrams. 345 idx0, idx1 := 0, 0 346 min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 347 for i, x := range frequencies { 348 if x <= min0 { 349 idx0, idx1 = i, idx0 350 min0, min1 = x, min0 351 } else if x <= min1 { 352 idx1 = i 353 min1 = x 354 } 355 } 356 357 first = ngramOffs[idx0] 358 last = ngramOffs[idx1] 359 360 // Ensure first appears before last as a helpful invariant. 361 if first.index > last.index { 362 last, first = first, last 363 } 364 return 365} 366 367func (data *indexData) ngrams(filename bool) btreeIndex { 368 if filename { 369 return data.fileNameNgrams 370 } 371 return data.contentNgrams 372} 373 374type ngramIterationResults struct { 375 matchIterator 376 377 caseSensitive bool 378 fileName bool 379 substrBytes []byte 380 substrLowered []byte 381} 382 383func (r *ngramIterationResults) String() string { 384 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 385} 386 387func (r *ngramIterationResults) candidates() []*candidateMatch { 388 cs := r.matchIterator.candidates() 389 for _, c := range cs { 390 c.caseSensitive = r.caseSensitive 391 c.fileName = r.fileName 392 c.substrBytes = r.substrBytes 393 c.substrLowered = r.substrLowered 394 } 395 return cs 396} 397 398func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 399 str := query.Pattern 400 401 // Find the 2 least common ngrams from the string. 402 ngramOffs := splitNGrams([]byte(str)) 403 404 // protect against accidental searching of empty strings 405 if len(ngramOffs) == 0 { 406 return nil, errors.New("iterateNgrams needs non empty string") 407 } 408 409 // PERF: Sort to increase the chances adjacent checks are in the same btree 410 // bucket (which can cause disk IO). 411 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 412 frequencies := make([]uint32, 0, len(ngramOffs)) 413 indexMap := make([]int, len(ngramOffs)) 414 ngramLookups := 0 415 ngrams := d.ngrams(query.FileName) 416 for i, o := range ngramOffs { 417 var freq uint32 418 if query.CaseSensitive { 419 freq = ngrams.Get(o.ngram).sz 420 ngramLookups++ 421 } else { 422 for _, v := range generateCaseNgrams(o.ngram) { 423 freq += ngrams.Get(v).sz 424 ngramLookups++ 425 } 426 } 427 428 if freq == 0 { 429 return &ngramIterationResults{ 430 matchIterator: &noMatchTree{ 431 Why: "freq=0", 432 Stats: Stats{ 433 NgramLookups: ngramLookups, 434 }, 435 }, 436 }, nil 437 } 438 439 frequencies = append(frequencies, freq) 440 indexMap[o.index] = i 441 } 442 443 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies) 444 445 iter := &ngramDocIterator{ 446 leftPad: uint32(first.index), 447 rightPad: uint32(utf8.RuneCountInString(str) - first.index), 448 ngramLookups: ngramLookups, 449 } 450 if query.FileName { 451 iter.ends = d.fileNameEndRunes 452 } else { 453 iter.ends = d.fileEndRunes 454 } 455 456 if first != last { 457 runeDist := uint32(last.index - first.index) 458 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 459 if err != nil { 460 return nil, err 461 } 462 463 iter.iter = i 464 } else { 465 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 466 if err != nil { 467 return nil, err 468 } 469 iter.iter = hitIter 470 } 471 472 patBytes := []byte(query.Pattern) 473 lowerPatBytes := toLower(patBytes) 474 475 return &ngramIterationResults{ 476 matchIterator: iter, 477 caseSensitive: query.CaseSensitive, 478 fileName: query.FileName, 479 substrBytes: patBytes, 480 substrLowered: lowerPatBytes, 481 }, nil 482} 483 484func (d *indexData) fileName(i uint32) []byte { 485 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 486} 487 488func (d *indexData) numDocs() uint32 { 489 return uint32(len(d.fileBranchMasks)) 490} 491 492func (s *indexData) Close() { 493 s.file.Close() 494} 495 496const ( 497 rawConfigYes = 1 498 rawConfigNo = 2 499) 500 501// encodeRawConfig encodes a rawConfig map into a uint8 mask. 502func encodeRawConfig(rawConfig map[string]string) uint8 { 503 var encoded uint8 504 for i, f := range []string{"public", "fork", "archived"} { 505 var e uint8 506 v, ok := rawConfig[f] 507 if ok && v == "1" { 508 e |= rawConfigYes 509 } else { 510 e |= rawConfigNo 511 } 512 encoded = encoded | e<<(2*i) 513 } 514 return encoded 515}