fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "fmt" 20 "hash/crc64" 21 "log" 22 "math/bits" 23 "unicode/utf8" 24 25 "github.com/sourcegraph/zoekt/query" 26) 27 28// indexData holds the pattern-independent data that we have to have 29// in memory to search. Most of the memory is taken up by the ngram => 30// offset index. 31type indexData struct { 32 symbols symbolData 33 34 file IndexFile 35 36 ngrams ngramMap 37 38 newlinesStart uint32 39 newlinesIndex []uint32 40 41 docSectionsStart uint32 42 docSectionsIndex []uint32 43 44 runeDocSections []DocumentSection 45 runeDocSectionsRaw []byte 46 47 // rune offset=>byte offset mapping, relative to the start of the content corpus 48 runeOffsets runeOffsetMap 49 50 // offsets of file contents; includes end of last file 51 boundariesStart uint32 52 boundaries []uint32 53 54 // rune offsets for the file content boundaries 55 fileEndRunes []uint32 56 57 fileNameContent []byte 58 fileNameIndex []uint32 59 fileNameNgrams map[ngram][]byte 60 61 // fileEndSymbol[i] is the index of the first symbol for document i. 62 fileEndSymbol []uint32 63 64 // rune offset=>byte offset mapping, relative to the start of the filename corpus 65 fileNameRuneOffsets runeOffsetMap 66 67 // rune offsets for the file name boundaries 68 fileNameEndRunes []uint32 69 70 fileBranchMasks []uint64 71 72 // mask (power of 2) => name 73 branchNames []map[uint]string 74 75 // name => mask (power of 2) 76 branchIDs []map[string]uint 77 78 metaData IndexMetadata 79 repoMetaData []Repository 80 81 subRepos []uint32 82 subRepoPaths [][]string 83 84 // Checksums for all the files, at 8-byte intervals 85 checksums []byte 86 87 // languages for all the files. 88 languages []byte 89 90 // inverse of LanguageMap in metaData 91 languageMap map[uint16]string 92 93 repoListEntry []RepoListEntry 94 95 // repository indexes for all the files 96 repos []uint16 97 98 // Experimental: docID => rank vec 99 ranks [][]float64 100 101 // rawConfigMasks contains the encoded RawConfig for each repository 102 rawConfigMasks []uint8 103} 104 105type symbolData struct { 106 // symContent stores Symbol.Sym and Symbol.Parent. 107 // TODO we don't need to store Symbol.Sym. 108 symContent []byte 109 symIndex []byte 110 // symKindContent is an enum of sym.Kind and sym.ParentKind 111 symKindContent []byte 112 symKindIndex []uint32 113 // symMetadata is [4]uint32 0 Kind Parent ParentKind 114 symMetaData []byte 115} 116 117func uint32SliceAt(a []byte, n uint32) uint32 { 118 return binary.BigEndian.Uint32(a[n*4:]) 119} 120 121func uint32SliceLen(a []byte) uint32 { 122 return uint32(len(a) / 4) 123} 124 125// parent returns index i of the parent enum 126func (d *symbolData) parent(i uint32) []byte { 127 delta := uint32SliceAt(d.symIndex, 0) 128 start := uint32SliceAt(d.symIndex, i) - delta 129 var end uint32 130 if i+1 == uint32SliceLen(d.symIndex) { 131 end = uint32(len(d.symContent)) 132 } else { 133 end = uint32SliceAt(d.symIndex, i+1) - delta 134 } 135 return d.symContent[start:end] 136} 137 138// kind returns index i of the kind enum 139func (d *symbolData) kind(i uint32) []byte { 140 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 141} 142 143// data returns the symbol at index i 144func (d *symbolData) data(i uint32) *Symbol { 145 size := uint32(4 * 4) // 4 uint32s 146 offset := i * size 147 if offset >= uint32(len(d.symMetaData)) { 148 return nil 149 } 150 151 metadata := d.symMetaData[offset : offset+size] 152 sym := &Symbol{} 153 key := uint32SliceAt(metadata, 1) 154 sym.Kind = string(d.kind(key)) 155 key = uint32SliceAt(metadata, 2) 156 sym.Parent = string(d.parent(key)) 157 key = uint32SliceAt(metadata, 3) 158 sym.ParentKind = string(d.kind(key)) 159 return sym 160} 161 162func (d *indexData) getChecksum(idx uint32) []byte { 163 start := crc64.Size * idx 164 return d.checksums[start : start+crc64.Size] 165} 166 167func (d *indexData) getLanguage(idx uint32) uint16 { 168 if d.metaData.IndexFeatureVersion < 12 { 169 // older zoekt files had 8-bit language entries 170 return uint16(d.languages[idx]) 171 } 172 // newer zoekt files have 16-bit language entries 173 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 174} 175 176// calculates stats for files in the range [start, end). 177func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 178 if start >= end { 179 // An empty shard for an empty repository. 180 return RepoStats{ 181 Shards: 1, 182 } 183 } 184 185 bytesContent := d.boundaries[end] - d.boundaries[start] 186 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 187 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 188 189 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 190 // that we don't change anything, except we have 191 // []Repository. Alternatively, things we can divide up we do (like 192 // here). Right now I don't like that these numbers are not true, especially 193 // after aggregation. For now I will move forward with this until we can 194 // chat more. 195 return RepoStats{ 196 ContentBytes: int64(bytesContent) + int64(bytesFN), 197 Documents: int(end - start), 198 // CR keegan for stefan: our shard count is going to go out of whack, 199 // since we will aggregate these. So we will report more shards than are 200 // present on disk. What should we do? 201 Shards: 1, 202 203 // Sourcegraph specific 204 NewLinesCount: count, 205 DefaultBranchNewLinesCount: defaultCount, 206 OtherBranchesNewLinesCount: otherCount, 207 } 208} 209 210func (d *indexData) calculateStats() error { 211 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 212 var start, end uint32 213 214 for repoID, md := range d.repoMetaData { 215 // determine the file range for repo i 216 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 217 end++ 218 } 219 if start < end && d.repos[start] != uint16(repoID) { 220 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 221 } 222 223 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 224 Repository: md, 225 IndexMetadata: d.metaData, 226 Stats: d.calculateStatsForFileRange(start, end), 227 }) 228 start = end 229 } 230 231 // All repos in a compound shard share memoryUse. So we average out the 232 // memoryUse per shard in our reporting. This has the benefit that when you 233 // aggregate the IndexBytes you get back the actual memoryUse. 234 // 235 // TODO take into account tombstones for aggregation. Even better, adjust 236 // API to be shard centric not repo centric. 237 if len(d.repoListEntry) > 0 { 238 indexBytes := d.memoryUse() 239 indexBytesChunk := indexBytes / len(d.repoListEntry) 240 for i := range d.repoListEntry { 241 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 242 indexBytes -= indexBytesChunk 243 } 244 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 245 } 246 247 return nil 248} 249 250// calculateNewLinesStats computes some Sourcegraph specific statistics for files 251// in the range [start, end). These are not as efficient to calculate as the 252// normal statistics. We experimentally measured about a 10% slower shard load 253// time. However, we find these values very useful to track and computing them 254// outside of load time introduces a lot of complexity. 255func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 256 for i := start; i < end; i++ { 257 // branchMask is a bitmask of the branches for a document. Zoekt by 258 // convention represents the default branch as the lowest bit. 259 branchMask := d.fileBranchMasks[i] 260 isDefault := (branchMask & 1) == 1 261 others := uint64(bits.OnesCount64(branchMask >> 1)) 262 263 // this is readNewlines but only reading the size of each section which 264 // corresponds to the number of newlines. 265 sec := simpleSection{ 266 off: d.newlinesStart + d.newlinesIndex[i], 267 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 268 } 269 // We are only reading the first varint which is the size. So we don't 270 // need to read more than MaxVarintLen64 bytes. 271 if sec.sz > binary.MaxVarintLen64 { 272 sec.sz = binary.MaxVarintLen64 273 } 274 blob, err := d.readSectionBlob(sec) 275 if err != nil { 276 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 277 continue 278 } 279 sz, _ := binary.Uvarint(blob) 280 281 count += sz 282 if isDefault { 283 defaultCount += sz 284 } 285 otherCount += (others * sz) 286 } 287 288 return 289} 290 291func (d *indexData) String() string { 292 return fmt.Sprintf("shard(%s)", d.file.Name()) 293} 294 295// calculates an approximate size of indexData in memory in bytes. 296func (d *indexData) memoryUse() int { 297 sz := 0 298 for _, a := range [][]uint32{ 299 d.newlinesIndex, d.docSectionsIndex, 300 d.boundaries, d.fileNameIndex, 301 d.fileEndRunes, d.fileNameEndRunes, 302 d.fileEndSymbol, d.symbols.symKindIndex, 303 d.subRepos, 304 } { 305 sz += 4 * len(a) 306 } 307 sz += d.runeOffsets.sizeBytes() 308 sz += d.fileNameRuneOffsets.sizeBytes() 309 sz += len(d.languages) 310 sz += len(d.checksums) 311 sz += 2 * len(d.repos) 312 if len(d.ranks) > 0 { 313 sz += 8 * len(d.ranks) * len(d.ranks[0]) 314 } 315 sz += 8 * len(d.runeDocSections) 316 sz += 8 * len(d.fileBranchMasks) 317 sz += d.ngrams.SizeBytes() 318 sz += 12 * len(d.fileNameNgrams) // these slices reference mmap-ed memory 319 return sz 320} 321 322const maxUInt32 = 0xffffffff 323 324func firstMinarg(xs []uint32) uint32 { 325 m := uint32(maxUInt32) 326 j := len(xs) 327 for i, x := range xs { 328 if x < m { 329 m = x 330 j = i 331 } 332 } 333 return uint32(j) 334} 335 336func lastMinarg(xs []uint32) uint32 { 337 m := uint32(maxUInt32) 338 j := len(xs) 339 for i, x := range xs { 340 if x <= m { 341 m = x 342 j = i 343 } 344 } 345 return uint32(j) 346} 347 348func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { 349 if filename { 350 return uint32(len(data.fileNameNgrams[ng])) 351 } 352 353 return data.ngrams.Get(ng).sz 354} 355 356type ngramIterationResults struct { 357 matchIterator 358 359 caseSensitive bool 360 fileName bool 361 substrBytes []byte 362 substrLowered []byte 363} 364 365func (r *ngramIterationResults) String() string { 366 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 367} 368 369func (r *ngramIterationResults) candidates() []*candidateMatch { 370 cs := r.matchIterator.candidates() 371 for _, c := range cs { 372 c.caseSensitive = r.caseSensitive 373 c.fileName = r.fileName 374 c.substrBytes = r.substrBytes 375 c.substrLowered = r.substrLowered 376 } 377 return cs 378} 379 380func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 381 str := query.Pattern 382 383 // Find the 2 least common ngrams from the string. 384 ngramOffs := splitNGrams([]byte(query.Pattern)) 385 frequencies := make([]uint32, 0, len(ngramOffs)) 386 for _, o := range ngramOffs { 387 var freq uint32 388 if query.CaseSensitive { 389 freq = d.ngramFrequency(o.ngram, query.FileName) 390 } else { 391 for _, v := range generateCaseNgrams(o.ngram) { 392 freq += d.ngramFrequency(v, query.FileName) 393 } 394 } 395 396 if freq == 0 { 397 return &ngramIterationResults{ 398 matchIterator: &noMatchTree{ 399 Why: "freq=0", 400 }, 401 }, nil 402 } 403 404 frequencies = append(frequencies, freq) 405 } 406 firstI := firstMinarg(frequencies) 407 frequencies[firstI] = maxUInt32 408 lastI := lastMinarg(frequencies) 409 if firstI > lastI { 410 lastI, firstI = firstI, lastI 411 } 412 413 firstNG := ngramOffs[firstI].ngram 414 lastNG := ngramOffs[lastI].ngram 415 iter := &ngramDocIterator{ 416 leftPad: firstI, 417 rightPad: uint32(utf8.RuneCountInString(str)) - firstI, 418 } 419 if query.FileName { 420 iter.ends = d.fileNameEndRunes 421 } else { 422 iter.ends = d.fileEndRunes 423 } 424 425 if firstI != lastI { 426 i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName) 427 if err != nil { 428 return nil, err 429 } 430 431 iter.iter = i 432 } else { 433 hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName) 434 if err != nil { 435 return nil, err 436 } 437 iter.iter = hitIter 438 } 439 440 patBytes := []byte(query.Pattern) 441 lowerPatBytes := toLower(patBytes) 442 443 return &ngramIterationResults{ 444 matchIterator: iter, 445 caseSensitive: query.CaseSensitive, 446 fileName: query.FileName, 447 substrBytes: patBytes, 448 substrLowered: lowerPatBytes, 449 }, nil 450} 451 452func (d *indexData) fileName(i uint32) []byte { 453 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 454} 455 456func (d *indexData) numDocs() uint32 { 457 return uint32(len(d.fileBranchMasks)) 458} 459 460func (s *indexData) Close() { 461 s.file.Close() 462} 463 464const ( 465 rawConfigYes = 1 466 rawConfigNo = 2 467) 468 469// encodeRawConfig encodes a rawConfig map into a uint8 mask. 470func encodeRawConfig(rawConfig map[string]string) uint8 { 471 var encoded uint8 472 for i, f := range []string{"public", "fork", "archived"} { 473 var e uint8 474 v, ok := rawConfig[f] 475 if ok && v == "1" { 476 e |= rawConfigYes 477 } else { 478 e |= rawConfigNo 479 } 480 encoded = encoded | e<<(2*i) 481 } 482 return encoded 483}