fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "fmt" 20 "hash/crc64" 21 "log" 22 "math/bits" 23 "unicode/utf8" 24 25 "github.com/sourcegraph/zoekt/query" 26) 27 28// indexData holds the pattern-independent data that we have to have 29// in memory to search. Most of the memory is taken up by the ngram => 30// offset index. 31type indexData struct { 32 symbols symbolData 33 34 file IndexFile 35 36 ngrams ngramIndex 37 38 newlinesStart uint32 39 newlinesIndex []uint32 40 41 docSectionsStart uint32 42 docSectionsIndex []uint32 43 44 runeDocSections []DocumentSection 45 runeDocSectionsRaw []byte 46 47 // rune offset=>byte offset mapping, relative to the start of the content corpus 48 runeOffsets runeOffsetMap 49 50 // offsets of file contents; includes end of last file 51 boundariesStart uint32 52 boundaries []uint32 53 54 // rune offsets for the file content boundaries 55 fileEndRunes []uint32 56 57 fileNameContent []byte 58 fileNameIndex []uint32 59 fileNameNgrams fileNameNgrams 60 61 // fileEndSymbol[i] is the index of the first symbol for document i. 62 fileEndSymbol []uint32 63 64 // rune offset=>byte offset mapping, relative to the start of the filename corpus 65 fileNameRuneOffsets runeOffsetMap 66 67 // rune offsets for the file name boundaries 68 fileNameEndRunes []uint32 69 70 fileBranchMasks []uint64 71 72 // mask (power of 2) => name 73 branchNames []map[uint]string 74 75 // name => mask (power of 2) 76 branchIDs []map[string]uint 77 78 metaData IndexMetadata 79 repoMetaData []Repository 80 81 subRepos []uint32 82 subRepoPaths [][]string 83 84 // Checksums for all the files, at 8-byte intervals 85 checksums []byte 86 87 // languages for all the files. 88 languages []byte 89 90 // inverse of LanguageMap in metaData 91 languageMap map[uint16]string 92 93 repoListEntry []RepoListEntry 94 95 // repository indexes for all the files 96 repos []uint16 97 98 // Experimental: docID => rank vec 99 ranks [][]float64 100 101 // rawConfigMasks contains the encoded RawConfig for each repository 102 rawConfigMasks []uint8 103} 104 105type symbolData struct { 106 // symContent stores Symbol.Sym and Symbol.Parent. 107 // TODO we don't need to store Symbol.Sym. 108 symContent []byte 109 symIndex []byte 110 // symKindContent is an enum of sym.Kind and sym.ParentKind 111 symKindContent []byte 112 symKindIndex []uint32 113 // symMetadata is [4]uint32 0 Kind Parent ParentKind 114 symMetaData []byte 115} 116 117func uint32SliceAt(a []byte, n uint32) uint32 { 118 return binary.BigEndian.Uint32(a[n*4:]) 119} 120 121func uint32SliceLen(a []byte) uint32 { 122 return uint32(len(a) / 4) 123} 124 125// parent returns index i of the parent enum 126func (d *symbolData) parent(i uint32) []byte { 127 delta := uint32SliceAt(d.symIndex, 0) 128 start := uint32SliceAt(d.symIndex, i) - delta 129 var end uint32 130 if i+1 == uint32SliceLen(d.symIndex) { 131 end = uint32(len(d.symContent)) 132 } else { 133 end = uint32SliceAt(d.symIndex, i+1) - delta 134 } 135 return d.symContent[start:end] 136} 137 138// kind returns index i of the kind enum 139func (d *symbolData) kind(i uint32) []byte { 140 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]] 141} 142 143// data returns the symbol at index i 144func (d *symbolData) data(i uint32) *Symbol { 145 size := uint32(4 * 4) // 4 uint32s 146 offset := i * size 147 if offset >= uint32(len(d.symMetaData)) { 148 return nil 149 } 150 151 metadata := d.symMetaData[offset : offset+size] 152 sym := &Symbol{} 153 key := uint32SliceAt(metadata, 1) 154 sym.Kind = string(d.kind(key)) 155 key = uint32SliceAt(metadata, 2) 156 sym.Parent = string(d.parent(key)) 157 key = uint32SliceAt(metadata, 3) 158 sym.ParentKind = string(d.kind(key)) 159 return sym 160} 161 162func (d *indexData) getChecksum(idx uint32) []byte { 163 start := crc64.Size * idx 164 return d.checksums[start : start+crc64.Size] 165} 166 167func (d *indexData) getLanguage(idx uint32) uint16 { 168 if d.metaData.IndexFeatureVersion < 12 { 169 // older zoekt files had 8-bit language entries 170 return uint16(d.languages[idx]) 171 } 172 // newer zoekt files have 16-bit language entries 173 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 174} 175 176// calculates stats for files in the range [start, end). 177func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 178 if start >= end { 179 // An empty shard for an empty repository. 180 return RepoStats{ 181 Shards: 1, 182 } 183 } 184 185 bytesContent := d.boundaries[end] - d.boundaries[start] 186 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 187 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 188 189 // CR keegan for stefan: I think we may want to restructure RepoListEntry so 190 // that we don't change anything, except we have 191 // []Repository. Alternatively, things we can divide up we do (like 192 // here). Right now I don't like that these numbers are not true, especially 193 // after aggregation. For now I will move forward with this until we can 194 // chat more. 195 return RepoStats{ 196 ContentBytes: int64(bytesContent) + int64(bytesFN), 197 Documents: int(end - start), 198 // CR keegan for stefan: our shard count is going to go out of whack, 199 // since we will aggregate these. So we will report more shards than are 200 // present on disk. What should we do? 201 Shards: 1, 202 203 // Sourcegraph specific 204 NewLinesCount: count, 205 DefaultBranchNewLinesCount: defaultCount, 206 OtherBranchesNewLinesCount: otherCount, 207 } 208} 209 210func (d *indexData) calculateStats() error { 211 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 212 var start, end uint32 213 214 for repoID, md := range d.repoMetaData { 215 // determine the file range for repo i 216 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 217 end++ 218 } 219 if start < end && d.repos[start] != uint16(repoID) { 220 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 221 } 222 223 d.repoListEntry = append(d.repoListEntry, RepoListEntry{ 224 Repository: md, 225 IndexMetadata: d.metaData, 226 Stats: d.calculateStatsForFileRange(start, end), 227 }) 228 start = end 229 } 230 231 // All repos in a compound shard share memoryUse. So we average out the 232 // memoryUse per shard in our reporting. This has the benefit that when you 233 // aggregate the IndexBytes you get back the actual memoryUse. 234 // 235 // TODO take into account tombstones for aggregation. Even better, adjust 236 // API to be shard centric not repo centric. 237 if len(d.repoListEntry) > 0 { 238 indexBytes := d.memoryUse() 239 indexBytesChunk := indexBytes / len(d.repoListEntry) 240 for i := range d.repoListEntry { 241 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk) 242 indexBytes -= indexBytesChunk 243 } 244 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes) 245 } 246 247 return nil 248} 249 250// calculateNewLinesStats computes some Sourcegraph specific statistics for files 251// in the range [start, end). These are not as efficient to calculate as the 252// normal statistics. We experimentally measured about a 10% slower shard load 253// time. However, we find these values very useful to track and computing them 254// outside of load time introduces a lot of complexity. 255func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) { 256 for i := start; i < end; i++ { 257 // branchMask is a bitmask of the branches for a document. Zoekt by 258 // convention represents the default branch as the lowest bit. 259 branchMask := d.fileBranchMasks[i] 260 isDefault := (branchMask & 1) == 1 261 others := uint64(bits.OnesCount64(branchMask >> 1)) 262 263 // this is readNewlines but only reading the size of each section which 264 // corresponds to the number of newlines. 265 sec := simpleSection{ 266 off: d.newlinesStart + d.newlinesIndex[i], 267 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 268 } 269 // We are only reading the first varint which is the size. So we don't 270 // need to read more than MaxVarintLen64 bytes. 271 if sec.sz > binary.MaxVarintLen64 { 272 sec.sz = binary.MaxVarintLen64 273 } 274 blob, err := d.readSectionBlob(sec) 275 if err != nil { 276 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err) 277 continue 278 } 279 sz, _ := binary.Uvarint(blob) 280 281 count += sz 282 if isDefault { 283 defaultCount += sz 284 } 285 otherCount += (others * sz) 286 } 287 288 return 289} 290 291func (d *indexData) String() string { 292 return fmt.Sprintf("shard(%s)", d.file.Name()) 293} 294 295// calculates an approximate size of indexData in memory in bytes. 296func (d *indexData) memoryUse() int { 297 sz := 0 298 for _, a := range [][]uint32{ 299 d.newlinesIndex, d.docSectionsIndex, 300 d.boundaries, d.fileNameIndex, 301 d.fileEndRunes, d.fileNameEndRunes, 302 d.fileEndSymbol, d.symbols.symKindIndex, 303 d.subRepos, 304 } { 305 sz += 4 * len(a) 306 } 307 sz += d.runeOffsets.sizeBytes() 308 sz += d.fileNameRuneOffsets.sizeBytes() 309 sz += len(d.languages) 310 sz += len(d.checksums) 311 sz += 2 * len(d.repos) 312 if len(d.ranks) > 0 { 313 sz += 8 * len(d.ranks) * len(d.ranks[0]) 314 } 315 sz += 8 * len(d.runeDocSections) 316 sz += 8 * len(d.fileBranchMasks) 317 if d.ngrams != nil { 318 sz += d.ngrams.SizeBytes() 319 } 320 sz += d.fileNameNgrams.SizeBytes() 321 return sz 322} 323 324const maxUInt32 = 0xffffffff 325 326func firstMinarg(xs []uint32) uint32 { 327 m := uint32(maxUInt32) 328 j := len(xs) 329 for i, x := range xs { 330 if x < m { 331 m = x 332 j = i 333 } 334 } 335 return uint32(j) 336} 337 338func lastMinarg(xs []uint32) uint32 { 339 m := uint32(maxUInt32) 340 j := len(xs) 341 for i, x := range xs { 342 if x <= m { 343 m = x 344 j = i 345 } 346 } 347 return uint32(j) 348} 349 350func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { 351 if filename { 352 return data.fileNameNgrams.Frequency(ng) 353 } 354 355 if data.ngrams == nil { 356 return 0 357 } 358 359 return data.ngrams.Get(ng).sz 360} 361 362type ngramIterationResults struct { 363 matchIterator 364 365 caseSensitive bool 366 fileName bool 367 substrBytes []byte 368 substrLowered []byte 369} 370 371func (r *ngramIterationResults) String() string { 372 return fmt.Sprintf("wrapper(%v)", r.matchIterator) 373} 374 375func (r *ngramIterationResults) candidates() []*candidateMatch { 376 cs := r.matchIterator.candidates() 377 for _, c := range cs { 378 c.caseSensitive = r.caseSensitive 379 c.fileName = r.fileName 380 c.substrBytes = r.substrBytes 381 c.substrLowered = r.substrLowered 382 } 383 return cs 384} 385 386func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 387 str := query.Pattern 388 389 // Find the 2 least common ngrams from the string. 390 ngramOffs := splitNGrams([]byte(query.Pattern)) 391 frequencies := make([]uint32, 0, len(ngramOffs)) 392 for _, o := range ngramOffs { 393 var freq uint32 394 if query.CaseSensitive { 395 freq = d.ngramFrequency(o.ngram, query.FileName) 396 } else { 397 for _, v := range generateCaseNgrams(o.ngram) { 398 freq += d.ngramFrequency(v, query.FileName) 399 } 400 } 401 402 if freq == 0 { 403 return &ngramIterationResults{ 404 matchIterator: &noMatchTree{ 405 Why: "freq=0", 406 }, 407 }, nil 408 } 409 410 frequencies = append(frequencies, freq) 411 } 412 firstI := firstMinarg(frequencies) 413 frequencies[firstI] = maxUInt32 414 lastI := lastMinarg(frequencies) 415 if firstI > lastI { 416 lastI, firstI = firstI, lastI 417 } 418 419 firstNG := ngramOffs[firstI].ngram 420 lastNG := ngramOffs[lastI].ngram 421 iter := &ngramDocIterator{ 422 leftPad: firstI, 423 rightPad: uint32(utf8.RuneCountInString(str)) - firstI, 424 } 425 if query.FileName { 426 iter.ends = d.fileNameEndRunes 427 } else { 428 iter.ends = d.fileEndRunes 429 } 430 431 if firstI != lastI { 432 i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName) 433 if err != nil { 434 return nil, err 435 } 436 437 iter.iter = i 438 } else { 439 hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName) 440 if err != nil { 441 return nil, err 442 } 443 iter.iter = hitIter 444 } 445 446 patBytes := []byte(query.Pattern) 447 lowerPatBytes := toLower(patBytes) 448 449 return &ngramIterationResults{ 450 matchIterator: iter, 451 caseSensitive: query.CaseSensitive, 452 fileName: query.FileName, 453 substrBytes: patBytes, 454 substrLowered: lowerPatBytes, 455 }, nil 456} 457 458func (d *indexData) fileName(i uint32) []byte { 459 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] 460} 461 462func (d *indexData) numDocs() uint32 { 463 return uint32(len(d.fileBranchMasks)) 464} 465 466func (s *indexData) Close() { 467 s.file.Close() 468} 469 470const ( 471 rawConfigYes = 1 472 rawConfigNo = 2 473) 474 475// encodeRawConfig encodes a rawConfig map into a uint8 mask. 476func encodeRawConfig(rawConfig map[string]string) uint8 { 477 var encoded uint8 478 for i, f := range []string{"public", "fork", "archived"} { 479 var e uint8 480 v, ok := rawConfig[f] 481 if ok && v == "1" { 482 e |= rawConfigYes 483 } else { 484 e |= rawConfigNo 485 } 486 encoded = encoded | e<<(2*i) 487 } 488 return encoded 489}