fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math/bits" 24 "slices" 25 "unicode/utf8" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// indexData holds the pattern-independent data that we have to have 31// in memory to search. Most of the memory is taken up by the ngram => 32// offset index. 33type indexData struct { 34 symbols symbolData 35 36 file IndexFile 37 38 contentNgrams btreeIndex 39 40 newlinesStart uint32 41 newlinesIndex []uint32 42 43 docSectionsStart uint32 44 docSectionsIndex []uint32 45 46 runeDocSections []DocumentSection 47 48 // rune offset=>byte offset mapping, relative to the start of the content corpus 49 runeOffsets runeOffsetMap 50 51 // offsets of file contents; includes end of last file 52 boundariesStart uint32 53 boundaries []uint32 54 55 // rune offsets for the file content boundaries 56 fileEndRunes []uint32 57 58 fileNameContent []byte 59 fileNameIndex []uint32 60 fileNameNgrams btreeIndex 61 62 // fileEndSymbol[i] is the index of the first symbol for document i. 63 fileEndSymbol []uint32 64 65 // rune offset=>byte offset mapping, relative to the start of the filename corpus 66 fileNameRuneOffsets runeOffsetMap 67 68 // rune offsets for the file name boundaries 69 fileNameEndRunes []uint32 70 71 fileBranchMasks []uint64 72 73 // mask (power of 2) => name 74 branchNames []map[uint]string 75 76 // name => mask (power of 2) 77 branchIDs []map[string]uint 78 79 metaData IndexMetadata 80 repoMetaData []Repository 81 82 subRepos []uint32 83 subRepoPaths [][]string 84 85 // Checksums for all the files, at 8-byte intervals 86 checksums []byte 87 88 // languages for all the files. 89 languages []byte 90 91 // inverse of LanguageMap in metaData 92 languageMap map[uint16]string 93 94 repoListEntry []RepoListEntry 95 96 // repository indexes for all the files 97 repos []uint16 98 99 // Experimental: docID => rank vec 100 ranks [][]float64 101 102 // rawConfigMasks contains the encoded RawConfig for each repository 103 rawConfigMasks []uint8 104} 105 106type symbolData struct { 107 // symContent stores Symbol.Sym and Symbol.Parent. 108 // TODO we don't need to store Symbol.Sym. 109 symContent []byte 110 symIndex []byte 111 // symKindContent is an enum of sym.Kind and sym.ParentKind 112 symKindContent []byte 113 symKindIndex []uint32 114 // symMetadata is [4]uint32 0 Kind Parent ParentKind 115 symMetaData []byte 116} 117 118func uint32SliceAt(a []byte, n uint32) uint32 { 119 return binary.BigEndian.Uint32(a[n*4:]) 120} 121 122func uint32SliceLen(a []byte) uint32 { 123 return uint32(len(a) / 4) 124} 125 126// parent returns index i of the parent enum 127func (d *symbolData) parent(i uint32) []byte { 128 delta := uint32SliceAt(d.symIndex, 0) 129 start := uint32SliceAt(d.symIndex, i) - delta 130 var end uint32 131 if i+1 == uint32SliceLen(d.symIndex) { 132 end = uint32(len(d.symContent)) 133 } else { 134 end = uint32SliceAt(d.symIndex, i+1) - delta 135 } 136 return d.symContent[start:end] 137} 138 139// kind returns index i of the kind enum 140func (d *symbolData) kind(i uint32) []byte { 141 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 142} 143 144// data returns the symbol at index i 145func (d *symbolData) data(i uint32) *Symbol { 146 size := uint32(4 * 4) // 4 uint32s 147 offset := i * size 148 if offset >= uint32(len(d.symMetaData)) { 149 return nil 150 } 151 152 metadata := d.symMetaData[offset : offset+size] 153 sym := &Symbol{} 154 key := uint32SliceAt(metadata, 1) 155 sym.Kind = string(d.kind(key)) 156 key = uint32SliceAt(metadata, 2) 157 sym.Parent = string(d.parent(key)) 158 key = uint32SliceAt(metadata, 3) 159 sym.ParentKind = string(d.kind(key)) 160 return sym 161} 162 163func (d *indexData) getChecksum(idx uint32) []byte { 164 start := crc64.Size * idx 165 return d.checksums[start : start+crc64.Size] 166} 167 168func (d *indexData) getLanguage(idx uint32) uint16 { 169 if d.metaData.IndexFeatureVersion < 12 { 170 // older zoekt files had 8-bit language entries 171 return uint16(d.languages[idx]) 172 } 173 // newer zoekt files have 16-bit language entries 174 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 175} 176 177// calculates stats for files in the range [start, end). 178func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 179 if start >= end { 180 // An empty shard for an empty repository. 181 return RepoStats{ 182 Shards: 1, 183 } 184 } 185 186 bytesContent := d.boundaries[end] - d.boundaries[start] 187 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 188 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 189 190 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 191 // that we don't change anything, except we have 192 // []Repository. Alternatively, things we can divide up we do (like 193 // here). Right now I don't like that these numbers are not true, especially 194 // after aggregation. For now I will move forward with this until we can 195 // chat more. 196 return RepoStats{ 197 ContentBytes: int64(bytesContent) + int64(bytesFN), 198 Documents: int(end - start), 199 // CR keegan for stefan: our shard count is going to go out of whack, 200 // since we will aggregate these. So we will report more shards than are 201 // present on disk. What should we do? 202 Shards: 1, 203 204 // Sourcegraph specific 205 NewLinesCount: count, 206 DefaultBranchNewLinesCount: defaultCount, 207 OtherBranchesNewLinesCount: otherCount, 208 } 209} 210 211func (d *indexData) calculateStats() error { 212 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 213 var start, end uint32 214 215 for repoID, md := range d.repoMetaData { 216 // determine the file range for repo i 217 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 218 end++ 219 } 220 if start < end && d.repos[start] != uint16(repoID) { 221 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 222 } 223 224 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 225 Repository: md, 226 IndexMetadata: d.metaData, 227 Stats: d.calculateStatsForFileRange(start, end), 228 }) 229 start = end 230 } 231 232 // All repos in a compound shard share memoryUse. So we average out the 233 // memoryUse per shard in our reporting. This has the benefit that when you 234 // aggregate the IndexBytes you get back the actual memoryUse. 235 // 236 // TODO take into account tombstones for aggregation. Even better, adjust 237 // API to be shard centric not repo centric. 238 if len(d.repoListEntry) > 0 { 239 indexBytes := d.memoryUse() 240 indexBytesChunk := indexBytes / len(d.repoListEntry) 241 for i := range d.repoListEntry { 242 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 243 indexBytes -= indexBytesChunk 244 } 245 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 246 } 247 248 return nil 249} 250 251// calculateNewLinesStats computes some Sourcegraph specific statistics for files 252// in the range [start, end). These are not as efficient to calculate as the 253// normal statistics. We experimentally measured about a 10% slower shard load 254// time. However, we find these values very useful to track and computing them 255// outside of load time introduces a lot of complexity. 256func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 257 for i := start; i < end; i++ { 258 // branchMask is a bitmask of the branches for a document. Zoekt by 259 // convention represents the default branch as the lowest bit. 260 branchMask := d.fileBranchMasks[i] 261 isDefault := (branchMask & 1) == 1 262 others := uint64(bits.OnesCount64(branchMask >> 1)) 263 264 // this is readNewlines but only reading the size of each section which 265 // corresponds to the number of newlines. 266 sec := simpleSection{ 267 off: d.newlinesStart + d.newlinesIndex[i], 268 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 269 } 270 // We are only reading the first varint which is the size. So we don't 271 // need to read more than MaxVarintLen64 bytes. 272 if sec.sz > binary.MaxVarintLen64 { 273 sec.sz = binary.MaxVarintLen64 274 } 275 blob, err := d.readSectionBlob(sec) 276 if err != nil { 277 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 278 continue 279 } 280 sz, _ := binary.Uvarint(blob) 281 282 count += sz 283 if isDefault { 284 defaultCount += sz 285 } 286 otherCount += (others * sz) 287 } 288 289 return 290} 291 292func (d *indexData) String() string { 293 return fmt.Sprintf("shard(%s)", d.file.Name()) 294} 295 296// calculates an approximate size of indexData in memory in bytes. 297func (d *indexData) memoryUse() int { 298 sz := 0 299 for _, a := range [][]uint32{ 300 d.newlinesIndex, d.docSectionsIndex, 301 d.boundaries, d.fileNameIndex, 302 d.fileEndRunes, d.fileNameEndRunes, 303 d.fileEndSymbol, d.symbols.symKindIndex, 304 d.subRepos, 305 } { 306 sz += 4 * len(a) 307 } 308 sz += d.runeOffsets.sizeBytes() 309 sz += d.fileNameRuneOffsets.sizeBytes() 310 sz += len(d.languages) 311 sz += len(d.checksums) 312 sz += 2 * len(d.repos) 313 if len(d.ranks) > 0 { 314 sz += 8 * len(d.ranks) * len(d.ranks[0]) 315 } 316 sz += 8 * len(d.runeDocSections) 317 sz += 8 * len(d.fileBranchMasks) 318 sz += d.contentNgrams.SizeBytes() 319 sz += d.fileNameNgrams.SizeBytes() 320 return sz 321} 322 323const maxUInt32 = 0xffffffff 324 325func min2Index(xs []uint32) (idx0, idx1 int) { 326 min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 327 for i, x := range xs { 328 if x <= min0 { 329 idx0, idx1 = i, idx0 330 min0, min1 = x, min0 331 } else if x <= min1 { 332 idx1 = i 333 min1 = x 334 } 335 } 336 return 337} 338 339// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to 340// the distance iterator. If they have the same frequency, we maximise the 341// distance between them. first will always have a smaller index than last. 342func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 343 firstI, lastI := min2Index(frequencies) 344 // If the frequencies are equal lets maximise distance in the query 345 // string. This optimization normally triggers for long repeated trigrams 346 // in a string, eg a query like "AAAAA..." 347 if frequencies[firstI] == frequencies[lastI] { 348 for i, freq := range frequencies { 349 if freq != frequencies[firstI] { 350 continue 351 } 352 if ngramOffs[i].index < ngramOffs[firstI].index { 353 firstI = i 354 } 355 if ngramOffs[i].index > ngramOffs[lastI].index { 356 lastI = i 357 } 358 } 359 } 360 first = ngramOffs[firstI] 361 last = ngramOffs[lastI] 362 // Ensure first appears before last to make distance logic below clean. 363 if first.index > last.index { 364 last, first = first, last 365 } 366 return first, last 367} 368 369func (data *indexData) ngrams(filename bool) btreeIndex { 370 if filename { 371 return data.fileNameNgrams 372 } 373 return data.contentNgrams 374} 375 376type ngramIterationResults struct { 377 matchIterator 378 379 caseSensitive bool 380 fileName bool 381 substrBytes []byte 382 substrLowered []byte 383} 384 385func (r *ngramIterationResults) String() string { 386 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 387} 388 389func (r *ngramIterationResults) candidates() []*candidateMatch { 390 cs := r.matchIterator.candidates() 391 for _, c := range cs { 392 c.caseSensitive = r.caseSensitive 393 c.fileName = r.fileName 394 c.substrBytes = r.substrBytes 395 c.substrLowered = r.substrLowered 396 } 397 return cs 398} 399 400func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 401 str := query.Pattern 402 403 // Find the 2 least common ngrams from the string. 404 ngramOffs := splitNGrams([]byte(query.Pattern)) 405 406 // protect against accidental searching of empty strings 407 if len(ngramOffs) == 0 { 408 return nil, errors.New("iterateNgrams needs non empty string") 409 } 410 411 // PERF: Sort to increase the chances adjacent checks are in the same btree 412 // bucket (which can cause disk IO). 413 slices.SortFunc(ngramOffs, runeNgramOff.Compare) 414 frequencies := make([]uint32, 0, len(ngramOffs)) 415 ngramLookups := 0 416 ngrams := d.ngrams(query.FileName) 417 for _, o := range ngramOffs { 418 var freq uint32 419 if query.CaseSensitive { 420 freq = ngrams.Get(o.ngram).sz 421 ngramLookups++ 422 } else { 423 for _, v := range generateCaseNgrams(o.ngram) { 424 freq += ngrams.Get(v).sz 425 ngramLookups++ 426 } 427 } 428 429 if freq == 0 { 430 return &ngramIterationResults{ 431 matchIterator: &noMatchTree{ 432 Why: "freq=0", 433 Stats: Stats{ 434 NgramLookups: ngramLookups, 435 }, 436 }, 437 }, nil 438 } 439 440 frequencies = append(frequencies, freq) 441 } 442 443 // first and last are now the smallest trigram posting lists to iterate 444 // through. 445 first, last := minFrequencyNgramOffsets(ngramOffs, frequencies) 446 447 iter := &ngramDocIterator{ 448 leftPad: first.index, 449 rightPad: uint32(utf8.RuneCountInString(str)) - first.index, 450 ngramLookups: ngramLookups, 451 } 452 if query.FileName { 453 iter.ends = d.fileNameEndRunes 454 } else { 455 iter.ends = d.fileEndRunes 456 } 457 458 if first != last { 459 runeDist := last.index - first.index 460 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 461 if err != nil { 462 return nil, err 463 } 464 465 iter.iter = i 466 } else { 467 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 468 if err != nil { 469 return nil, err 470 } 471 iter.iter = hitIter 472 } 473 474 patBytes := []byte(query.Pattern) 475 lowerPatBytes := toLower(patBytes) 476 477 return &ngramIterationResults{ 478 matchIterator: iter, 479 caseSensitive: query.CaseSensitive, 480 fileName: query.FileName, 481 substrBytes: patBytes, 482 substrLowered: lowerPatBytes, 483 }, nil 484} 485 486func (d *indexData) fileName(i uint32) []byte { 487 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 488} 489 490func (d *indexData) numDocs() uint32 { 491 return uint32(len(d.fileBranchMasks)) 492} 493 494func (s *indexData) Close() { 495 s.file.Close() 496} 497 498const ( 499 rawConfigYes = 1 500 rawConfigNo = 2 501) 502 503// encodeRawConfig encodes a rawConfig map into a uint8 mask. 504func encodeRawConfig(rawConfig map[string]string) uint8 { 505 var encoded uint8 506 for i, f := range []string{"public", "fork", "archived"} { 507 var e uint8 508 v, ok := rawConfig[f] 509 if ok && v == "1" { 510 e |= rawConfigYes 511 } else { 512 e |= rawConfigNo 513 } 514 encoded = encoded | e<<(2*i) 515 } 516 return encoded 517}