fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "errors" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "math/bits" 24 "unicode/utf8" 25 26 "github.com/sourcegraph/zoekt/query" 27 "golang.org/x/exp/slices" 28) 29 30// indexData holds the pattern-independent data that we have to have 31// in memory to search. Most of the memory is taken up by the ngram => 32// offset index. 33type indexData struct { 34 symbols symbolData 35 36 file IndexFile 37 38 ngrams btreeIndex 39 40 newlinesStart uint32 41 newlinesIndex []uint32 42 43 docSectionsStart uint32 44 docSectionsIndex []uint32 45 46 runeDocSections []DocumentSection 47 runeDocSectionsRaw []byte 48 49 // rune offset=>byte offset mapping, relative to the start of the content corpus 50 runeOffsets runeOffsetMap 51 52 // offsets of file contents; includes end of last file 53 boundariesStart uint32 54 boundaries []uint32 55 56 // rune offsets for the file content boundaries 57 fileEndRunes []uint32 58 59 fileNameContent []byte 60 fileNameIndex []uint32 61 fileNameNgrams btreeIndex 62 63 // fileEndSymbol[i] is the index of the first symbol for document i. 64 fileEndSymbol []uint32 65 66 // rune offset=>byte offset mapping, relative to the start of the filename corpus 67 fileNameRuneOffsets runeOffsetMap 68 69 // rune offsets for the file name boundaries 70 fileNameEndRunes []uint32 71 72 fileBranchMasks []uint64 73 74 // mask (power of 2) => name 75 branchNames []map[uint]string 76 77 // name => mask (power of 2) 78 branchIDs []map[string]uint 79 80 metaData IndexMetadata 81 repoMetaData []Repository 82 83 subRepos []uint32 84 subRepoPaths [][]string 85 86 // Checksums for all the files, at 8-byte intervals 87 checksums []byte 88 89 // languages for all the files. 90 languages []byte 91 92 // inverse of LanguageMap in metaData 93 languageMap map[uint16]string 94 95 repoListEntry []RepoListEntry 96 97 // repository indexes for all the files 98 repos []uint16 99 100 // Experimental: docID => rank vec 101 ranks [][]float64 102 103 // rawConfigMasks contains the encoded RawConfig for each repository 104 rawConfigMasks []uint8 105} 106 107type symbolData struct { 108 // symContent stores Symbol.Sym and Symbol.Parent. 109 // TODO we don't need to store Symbol.Sym. 110 symContent []byte 111 symIndex []byte 112 // symKindContent is an enum of sym.Kind and sym.ParentKind 113 symKindContent []byte 114 symKindIndex []uint32 115 // symMetadata is [4]uint32 0 Kind Parent ParentKind 116 symMetaData []byte 117} 118 119func uint32SliceAt(a []byte, n uint32) uint32 { 120 return binary.BigEndian.Uint32(a[n*4:]) 121} 122 123func uint32SliceLen(a []byte) uint32 { 124 return uint32(len(a) / 4) 125} 126 127// parent returns index i of the parent enum 128func (d *symbolData) parent(i uint32) []byte { 129 delta := uint32SliceAt(d.symIndex, 0) 130 start := uint32SliceAt(d.symIndex, i) - delta 131 var end uint32 132 if i+1 == uint32SliceLen(d.symIndex) { 133 end = uint32(len(d.symContent)) 134 } else { 135 end = uint32SliceAt(d.symIndex, i+1) - delta 136 } 137 return d.symContent[start:end] 138} 139 140// kind returns index i of the kind enum 141func (d *symbolData) kind(i uint32) []byte { 142 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 143} 144 145// data returns the symbol at index i 146func (d *symbolData) data(i uint32) *Symbol { 147 size := uint32(4 * 4) // 4 uint32s 148 offset := i * size 149 if offset >= uint32(len(d.symMetaData)) { 150 return nil 151 } 152 153 metadata := d.symMetaData[offset : offset+size] 154 sym := &Symbol{} 155 key := uint32SliceAt(metadata, 1) 156 sym.Kind = string(d.kind(key)) 157 key = uint32SliceAt(metadata, 2) 158 sym.Parent = string(d.parent(key)) 159 key = uint32SliceAt(metadata, 3) 160 sym.ParentKind = string(d.kind(key)) 161 return sym 162} 163 164func (d *indexData) getChecksum(idx uint32) []byte { 165 start := crc64.Size * idx 166 return d.checksums[start : start+crc64.Size] 167} 168 169func (d *indexData) getLanguage(idx uint32) uint16 { 170 if d.metaData.IndexFeatureVersion < 12 { 171 // older zoekt files had 8-bit language entries 172 return uint16(d.languages[idx]) 173 } 174 // newer zoekt files have 16-bit language entries 175 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 176} 177 178// calculates stats for files in the range [start, end). 179func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 180 if start >= end { 181 // An empty shard for an empty repository. 182 return RepoStats{ 183 Shards: 1, 184 } 185 } 186 187 bytesContent := d.boundaries[end] - d.boundaries[start] 188 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 189 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 190 191 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 192 // that we don't change anything, except we have 193 // []Repository. Alternatively, things we can divide up we do (like 194 // here). Right now I don't like that these numbers are not true, especially 195 // after aggregation. For now I will move forward with this until we can 196 // chat more. 197 return RepoStats{ 198 ContentBytes: int64(bytesContent) + int64(bytesFN), 199 Documents: int(end - start), 200 // CR keegan for stefan: our shard count is going to go out of whack, 201 // since we will aggregate these. So we will report more shards than are 202 // present on disk. What should we do? 203 Shards: 1, 204 205 // Sourcegraph specific 206 NewLinesCount: count, 207 DefaultBranchNewLinesCount: defaultCount, 208 OtherBranchesNewLinesCount: otherCount, 209 } 210} 211 212func (d *indexData) calculateStats() error { 213 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 214 var start, end uint32 215 216 for repoID, md := range d.repoMetaData { 217 // determine the file range for repo i 218 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 219 end++ 220 } 221 if start < end && d.repos[start] != uint16(repoID) { 222 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 223 } 224 225 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 226 Repository: md, 227 IndexMetadata: d.metaData, 228 Stats: d.calculateStatsForFileRange(start, end), 229 }) 230 start = end 231 } 232 233 // All repos in a compound shard share memoryUse. So we average out the 234 // memoryUse per shard in our reporting. This has the benefit that when you 235 // aggregate the IndexBytes you get back the actual memoryUse. 236 // 237 // TODO take into account tombstones for aggregation. Even better, adjust 238 // API to be shard centric not repo centric. 239 if len(d.repoListEntry) > 0 { 240 indexBytes := d.memoryUse() 241 indexBytesChunk := indexBytes / len(d.repoListEntry) 242 for i := range d.repoListEntry { 243 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 244 indexBytes -= indexBytesChunk 245 } 246 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 247 } 248 249 return nil 250} 251 252// calculateNewLinesStats computes some Sourcegraph specific statistics for files 253// in the range [start, end). These are not as efficient to calculate as the 254// normal statistics. We experimentally measured about a 10% slower shard load 255// time. However, we find these values very useful to track and computing them 256// outside of load time introduces a lot of complexity. 257func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 258 for i := start; i < end; i++ { 259 // branchMask is a bitmask of the branches for a document. Zoekt by 260 // convention represents the default branch as the lowest bit. 261 branchMask := d.fileBranchMasks[i] 262 isDefault := (branchMask & 1) == 1 263 others := uint64(bits.OnesCount64(branchMask >> 1)) 264 265 // this is readNewlines but only reading the size of each section which 266 // corresponds to the number of newlines. 267 sec := simpleSection{ 268 off: d.newlinesStart + d.newlinesIndex[i], 269 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 270 } 271 // We are only reading the first varint which is the size. So we don't 272 // need to read more than MaxVarintLen64 bytes. 273 if sec.sz > binary.MaxVarintLen64 { 274 sec.sz = binary.MaxVarintLen64 275 } 276 blob, err := d.readSectionBlob(sec) 277 if err != nil { 278 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 279 continue 280 } 281 sz, _ := binary.Uvarint(blob) 282 283 count += sz 284 if isDefault { 285 defaultCount += sz 286 } 287 otherCount += (others * sz) 288 } 289 290 return 291} 292 293func (d *indexData) String() string { 294 return fmt.Sprintf("shard(%s)", d.file.Name()) 295} 296 297// calculates an approximate size of indexData in memory in bytes. 298func (d *indexData) memoryUse() int { 299 sz := 0 300 for _, a := range [][]uint32{ 301 d.newlinesIndex, d.docSectionsIndex, 302 d.boundaries, d.fileNameIndex, 303 d.fileEndRunes, d.fileNameEndRunes, 304 d.fileEndSymbol, d.symbols.symKindIndex, 305 d.subRepos, 306 } { 307 sz += 4 * len(a) 308 } 309 sz += d.runeOffsets.sizeBytes() 310 sz += d.fileNameRuneOffsets.sizeBytes() 311 sz += len(d.languages) 312 sz += len(d.checksums) 313 sz += 2 * len(d.repos) 314 if len(d.ranks) > 0 { 315 sz += 8 * len(d.ranks) * len(d.ranks[0]) 316 } 317 sz += 8 * len(d.runeDocSections) 318 sz += 8 * len(d.fileBranchMasks) 319 sz += d.ngrams.SizeBytes() 320 sz += d.fileNameNgrams.SizeBytes() 321 return sz 322} 323 324const maxUInt32 = 0xffffffff 325 326func min2Index(xs []uint32) (idx0, idx1 int) { 327 min0, min1 := uint32(maxUInt32), uint32(maxUInt32) 328 for i, x := range xs { 329 if x <= min0 { 330 idx0, idx1 = i, idx0 331 min0, min1 = x, min0 332 } else if x <= min1 { 333 idx1 = i 334 min1 = x 335 } 336 } 337 return 338} 339 340// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to 341// the distance iterator. If they have the same frequency, we maximise the 342// distance between them. first will always have a smaller index than last. 343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) { 344 firstI, lastI := min2Index(frequencies) 345 // If the frequencies are equal lets maximise distance in the query 346 // string. This optimization normally triggers for long repeated trigrams 347 // in a string, eg a query like "AAAAA..." 348 if frequencies[firstI] == frequencies[lastI] { 349 for i, freq := range frequencies { 350 if freq != frequencies[firstI] { 351 continue 352 } 353 if ngramOffs[i].index < ngramOffs[firstI].index { 354 firstI = i 355 } 356 if ngramOffs[i].index > ngramOffs[lastI].index { 357 lastI = i 358 } 359 } 360 } 361 first = ngramOffs[firstI] 362 last = ngramOffs[lastI] 363 // Ensure first appears before last to make distance logic below clean. 364 if first.index > last.index { 365 last, first = first, last 366 } 367 return first, last 368} 369 370func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { 371 if filename { 372 return data.fileNameNgrams.Get(ng).sz 373 } 374 return data.ngrams.Get(ng).sz 375} 376 377type ngramIterationResults struct { 378 matchIterator 379 380 caseSensitive bool 381 fileName bool 382 substrBytes []byte 383 substrLowered []byte 384} 385 386func (r *ngramIterationResults) String() string { 387 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 388} 389 390func (r *ngramIterationResults) candidates() []*candidateMatch { 391 cs := r.matchIterator.candidates() 392 for _, c := range cs { 393 c.caseSensitive = r.caseSensitive 394 c.fileName = r.fileName 395 c.substrBytes = r.substrBytes 396 c.substrLowered = r.substrLowered 397 } 398 return cs 399} 400 401func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 402 str := query.Pattern 403 404 // Find the 2 least common ngrams from the string. 405 ngramOffs := splitNGrams([]byte(query.Pattern)) 406 407 // protect against accidental searching of empty strings 408 if len(ngramOffs) == 0 { 409 return nil, errors.New("iterateNgrams needs non empty string") 410 } 411 412 // PERF: Sort to increase the chances adjacent checks are in the same btree 413 // bucket (which can cause disk IO). 414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 415 return a.ngram < b.ngram 416 }) 417 frequencies := make([]uint32, 0, len(ngramOffs)) 418 ngramLookups := 0 419 for _, o := range ngramOffs { 420 var freq uint32 421 if query.CaseSensitive { 422 freq = d.ngramFrequency(o.ngram, query.FileName) 423 ngramLookups++ 424 } else { 425 for _, v := range generateCaseNgrams(o.ngram) { 426 freq += d.ngramFrequency(v, query.FileName) 427 ngramLookups++ 428 } 429 } 430 431 if freq == 0 { 432 return &ngramIterationResults{ 433 matchIterator: &noMatchTree{ 434 Why: "freq=0", 435 Stats: Stats{ 436 NgramLookups: ngramLookups, 437 }, 438 }, 439 }, nil 440 } 441 442 frequencies = append(frequencies, freq) 443 } 444 445 // first and last are now the smallest trigram posting lists to iterate 446 // through. 447 first, last := minFrequencyNgramOffsets(ngramOffs, frequencies) 448 449 iter := &ngramDocIterator{ 450 leftPad: first.index, 451 rightPad: uint32(utf8.RuneCountInString(str)) - first.index, 452 ngramLookups: ngramLookups, 453 } 454 if query.FileName { 455 iter.ends = d.fileNameEndRunes 456 } else { 457 iter.ends = d.fileEndRunes 458 } 459 460 if first != last { 461 runeDist := last.index - first.index 462 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 463 if err != nil { 464 return nil, err 465 } 466 467 iter.iter = i 468 } else { 469 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 470 if err != nil { 471 return nil, err 472 } 473 iter.iter = hitIter 474 } 475 476 patBytes := []byte(query.Pattern) 477 lowerPatBytes := toLower(patBytes) 478 479 return &ngramIterationResults{ 480 matchIterator: iter, 481 caseSensitive: query.CaseSensitive, 482 fileName: query.FileName, 483 substrBytes: patBytes, 484 substrLowered: lowerPatBytes, 485 }, nil 486} 487 488func (d *indexData) fileName(i uint32) []byte { 489 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 490} 491 492func (d *indexData) numDocs() uint32 { 493 return uint32(len(d.fileBranchMasks)) 494} 495 496func (s *indexData) Close() { 497 s.file.Close() 498} 499 500const ( 501 rawConfigYes = 1 502 rawConfigNo = 2 503) 504 505// encodeRawConfig encodes a rawConfig map into a uint8 mask. 506func encodeRawConfig(rawConfig map[string]string) uint8 { 507 var encoded uint8 508 for i, f := range []string{"public", "fork", "archived"} { 509 var e uint8 510 v, ok := rawConfig[f] 511 if ok && v == "1" { 512 e |= rawConfigYes 513 } else { 514 e |= rawConfigNo 515 } 516 encoded = encoded | e<<(2*i) 517 } 518 return encoded 519}