fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math"
24 "math/bits"
25 "slices"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt"
29 "github.com/sourcegraph/zoekt/query"
30)
31
32// indexData holds the pattern-independent data that we have to have
33// in memory to search. Most of the memory is taken up by the ngram =>
34// offset index.
35type indexData struct {
36 symbols symbolData
37
38 file IndexFile
39
40 contentNgrams btreeIndex
41
42 newlinesStart uint32
43 newlinesIndex []uint32
44
45 docSectionsStart uint32
46 docSectionsIndex []uint32
47
48 runeDocSections []DocumentSection
49
50 // rune offset=>byte offset mapping, relative to the start of the content corpus
51 runeOffsets runeOffsetMap
52
53 // offsets of file contents; includes end of last file
54 boundariesStart uint32
55 boundaries []uint32
56
57 // rune offsets for the file content boundaries
58 fileEndRunes []uint32
59
60 fileNameContent []byte
61 fileNameIndex []uint32
62 fileNameNgrams btreeIndex
63
64 // fileEndSymbol[i] is the index of the first symbol for document i.
65 fileEndSymbol []uint32
66
67 // rune offset=>byte offset mapping, relative to the start of the filename corpus
68 fileNameRuneOffsets runeOffsetMap
69
70 // rune offsets for the file name boundaries
71 fileNameEndRunes []uint32
72
73 fileBranchMasks []uint64
74
75 // mask (power of 2) => name
76 branchNames []map[uint]string
77
78 // name => mask (power of 2)
79 branchIDs []map[string]uint
80
81 metaData zoekt.IndexMetadata
82 repoMetaData []zoekt.Repository
83
84 subRepos []uint32
85 subRepoPaths [][]string
86
87 // Checksums for all the files, at 8-byte intervals
88 checksums []byte
89
90 // languages for all the files.
91 languages []byte
92
93 // inverse of LanguageMap in metaData
94 languageMap map[uint16]string
95
96 // file categories for all the files.
97 categories []byte
98
99 repoListEntry []zoekt.RepoListEntry
100
101 // repository indexes for all the files
102 repos []uint16
103
104 // rawConfigMasks contains the encoded RawConfig for each repository
105 rawConfigMasks []uint8
106
107 // Cache for docMatchTree objects
108 docMatchTreeCache *docMatchTreeCache
109}
110
111type symbolData struct {
112 // symContent stores Symbol.Sym and Symbol.Parent.
113 // TODO we don't need to store Symbol.Sym.
114 symContent []byte
115 symIndex []byte
116 // symKindContent is an enum of sym.Kind and sym.ParentKind
117 symKindContent []byte
118 symKindIndex []uint32
119 // symMetadata is [4]uint32 0 Kind Parent ParentKind
120 symMetaData []byte
121}
122
123func uint32SliceAt(a []byte, n uint32) uint32 {
124 return binary.BigEndian.Uint32(a[n*4:])
125}
126
127func uint32SliceLen(a []byte) uint32 {
128 return uint32(len(a) / 4)
129}
130
131// parent returns index i of the parent enum
132func (d *symbolData) parent(i uint32) []byte {
133 delta := uint32SliceAt(d.symIndex, 0)
134 start := uint32SliceAt(d.symIndex, i) - delta
135 var end uint32
136 if i+1 == uint32SliceLen(d.symIndex) {
137 end = uint32(len(d.symContent))
138 } else {
139 end = uint32SliceAt(d.symIndex, i+1) - delta
140 }
141 return d.symContent[start:end]
142}
143
144// kind returns index i of the kind enum
145func (d *symbolData) kind(i uint32) []byte {
146 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
147}
148
149// data returns the symbol at index i
150func (d *symbolData) data(i uint32) *zoekt.Symbol {
151 size := uint32(4 * 4) // 4 uint32s
152 offset := i * size
153 if offset >= uint32(len(d.symMetaData)) {
154 return nil
155 }
156
157 metadata := d.symMetaData[offset : offset+size]
158 sym := &zoekt.Symbol{}
159 key := uint32SliceAt(metadata, 1)
160 sym.Kind = string(d.kind(key))
161 key = uint32SliceAt(metadata, 2)
162 sym.Parent = string(d.parent(key))
163 key = uint32SliceAt(metadata, 3)
164 sym.ParentKind = string(d.kind(key))
165 return sym
166}
167
168func (d *indexData) getChecksum(idx uint32) []byte {
169 start := crc64.Size * idx
170 return d.checksums[start : start+crc64.Size]
171}
172
173func (d *indexData) getLanguage(idx uint32) uint16 {
174 if d.metaData.IndexFeatureVersion < 12 {
175 // older zoekt files had 8-bit language entries
176 return uint16(d.languages[idx])
177 }
178 // newer zoekt files have 16-bit language entries
179 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
180}
181
182func (d *indexData) getCategory(idx uint32) FileCategory {
183 if len(d.categories) == 0 {
184 // This means we're reading an older index, so return 'missing'
185 return FileCategoryMissing
186 }
187 category, err := decodeCategory(d.categories[idx])
188 if err != nil {
189 return FileCategoryMissing
190 }
191 return category
192}
193
194// calculates stats for files in the range [start, end).
195func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
196 if start >= end {
197 // An empty shard for an empty repository.
198 return zoekt.RepoStats{
199 Shards: 1,
200 }
201 }
202
203 bytesContent := d.boundaries[end] - d.boundaries[start]
204 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
205 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
206
207 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
208 // that we don't change anything, except we have
209 // []Repository. Alternatively, things we can divide up we do (like
210 // here). Right now I don't like that these numbers are not true, especially
211 // after aggregation. For now I will move forward with this until we can
212 // chat more.
213 return zoekt.RepoStats{
214 ContentBytes: int64(bytesContent) + int64(bytesFN),
215 Documents: int(end - start),
216 // CR keegan for stefan: our shard count is going to go out of whack,
217 // since we will aggregate these. So we will report more shards than are
218 // present on disk. What should we do?
219 Shards: 1,
220
221 // Sourcegraph specific
222 NewLinesCount: count,
223 DefaultBranchNewLinesCount: defaultCount,
224 OtherBranchesNewLinesCount: otherCount,
225 }
226}
227
228func (d *indexData) calculateStats() error {
229 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
230 var start, end uint32
231
232 for repoID, md := range d.repoMetaData {
233 // determine the file range for repo i
234 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
235 end++
236 }
237 if start < end && d.repos[start] != uint16(repoID) {
238 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
239 }
240
241 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
242 Repository: md,
243 IndexMetadata: d.metaData,
244 Stats: d.calculateStatsForFileRange(start, end),
245 })
246 start = end
247 }
248
249 // All repos in a compound shard share memoryUse. So we average out the
250 // memoryUse per shard in our reporting. This has the benefit that when you
251 // aggregate the IndexBytes you get back the actual memoryUse.
252 //
253 // TODO take into account tombstones for aggregation. Even better, adjust
254 // API to be shard centric not repo centric.
255 if len(d.repoListEntry) > 0 {
256 indexBytes := d.memoryUse()
257 indexBytesChunk := indexBytes / len(d.repoListEntry)
258 for i := range d.repoListEntry {
259 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
260 indexBytes -= indexBytesChunk
261 }
262 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
263 }
264
265 return nil
266}
267
268// calculateNewLinesStats computes some Sourcegraph specific statistics for files
269// in the range [start, end). These are not as efficient to calculate as the
270// normal statistics. We experimentally measured about a 10% slower shard load
271// time. However, we find these values very useful to track and computing them
272// outside of load time introduces a lot of complexity.
273func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
274 for i := start; i < end; i++ {
275 // branchMask is a bitmask of the branches for a document. Zoekt by
276 // convention represents the default branch as the lowest bit.
277 branchMask := d.fileBranchMasks[i]
278 isDefault := (branchMask & 1) == 1
279 others := uint64(bits.OnesCount64(branchMask >> 1))
280
281 // this is readNewlines but only reading the size of each section which
282 // corresponds to the number of newlines.
283 sec := simpleSection{
284 off: d.newlinesStart + d.newlinesIndex[i],
285 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
286 }
287 // We are only reading the first varint which is the size. So we don't
288 // need to read more than MaxVarintLen64 bytes.
289 if sec.sz > binary.MaxVarintLen64 {
290 sec.sz = binary.MaxVarintLen64
291 }
292 blob, err := d.readSectionBlob(sec)
293 if err != nil {
294 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
295 continue
296 }
297 sz, _ := binary.Uvarint(blob)
298
299 count += sz
300 if isDefault {
301 defaultCount += sz
302 }
303 otherCount += (others * sz)
304 }
305
306 return
307}
308
309func (d *indexData) String() string {
310 return fmt.Sprintf("shard(%s)", d.file.Name())
311}
312
313// calculates an approximate size of indexData in memory in bytes.
314func (d *indexData) memoryUse() int {
315 sz := 0
316 for _, a := range [][]uint32{
317 d.newlinesIndex, d.docSectionsIndex,
318 d.boundaries, d.fileNameIndex,
319 d.fileEndRunes, d.fileNameEndRunes,
320 d.fileEndSymbol, d.symbols.symKindIndex,
321 d.subRepos,
322 } {
323 sz += 4 * len(a)
324 }
325 sz += d.runeOffsets.sizeBytes()
326 sz += d.fileNameRuneOffsets.sizeBytes()
327 sz += len(d.languages)
328 sz += len(d.checksums)
329 sz += 2 * len(d.repos)
330 sz += 8 * len(d.runeDocSections)
331 sz += 8 * len(d.fileBranchMasks)
332 sz += d.contentNgrams.SizeBytes()
333 sz += d.fileNameNgrams.SizeBytes()
334 return sz
335}
336
337// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
338// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
339// overlapping trigrams to keep their intersection as small as possible.
340//
341// Invariant: first will always have a smaller index than last.
342func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
343 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
344
345 // If the trigrams are overlapping, then try to shift one to reduce overlap.
346 // This is guaranteed to produce a smaller intersection.
347 if last.index-first.index < ngramSize {
348 newFirstIndex := max(last.index-ngramSize, 0)
349 if newFirstIndex != first.index {
350 first = ngramOffs[indexMap[newFirstIndex]]
351 }
352
353 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
354 if newLastIndex != last.index {
355 last = ngramOffs[indexMap[newLastIndex]]
356 }
357 }
358 return
359}
360
361func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
362 // Find the two lowest frequency ngrams.
363 idx0, idx1 := 0, 0
364 min0, min1 := uint32(math.MaxUint32), uint32(math.MaxUint32)
365 for i, x := range frequencies {
366 if x <= min0 {
367 idx0, idx1 = i, idx0
368 min0, min1 = x, min0
369 } else if x <= min1 {
370 idx1 = i
371 min1 = x
372 }
373 }
374
375 first = ngramOffs[idx0]
376 last = ngramOffs[idx1]
377
378 // Ensure first appears before last as a helpful invariant.
379 if first.index > last.index {
380 last, first = first, last
381 }
382 return
383}
384
385func (data *indexData) ngrams(filename bool) btreeIndex {
386 if filename {
387 return data.fileNameNgrams
388 }
389 return data.contentNgrams
390}
391
392type ngramIterationResults struct {
393 matchIterator
394
395 caseSensitive bool
396 fileName bool
397 substrBytes []byte
398 substrLowered []byte
399}
400
401func (r *ngramIterationResults) String() string {
402 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
403}
404
405func (r *ngramIterationResults) candidates() []*candidateMatch {
406 cs := r.matchIterator.candidates()
407 for _, c := range cs {
408 c.caseSensitive = r.caseSensitive
409 c.fileName = r.fileName
410 c.substrBytes = r.substrBytes
411 c.substrLowered = r.substrLowered
412 }
413 return cs
414}
415
416func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
417 str := query.Pattern
418
419 // Find the 2 least common ngrams from the string.
420 ngramOffs := splitNGrams([]byte(str))
421
422 // protect against accidental searching of empty strings
423 if len(ngramOffs) == 0 {
424 return nil, errors.New("iterateNgrams needs non empty string")
425 }
426
427 // PERF: Sort to increase the chances adjacent checks are in the same btree
428 // bucket (which can cause disk IO).
429 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
430 frequencies := make([]uint32, 0, len(ngramOffs))
431 indexMap := make([]int, len(ngramOffs))
432 ngramLookups := 0
433 ngrams := d.ngrams(query.FileName)
434 for i, o := range ngramOffs {
435 var freq uint32
436 if query.CaseSensitive {
437 freq = ngrams.Get(o.ngram).sz
438 ngramLookups++
439 } else {
440 for _, v := range generateCaseNgrams(o.ngram) {
441 freq += ngrams.Get(v).sz
442 ngramLookups++
443 }
444 }
445
446 if freq == 0 {
447 return &ngramIterationResults{
448 matchIterator: &noMatchTree{
449 Why: "freq=0",
450 Stats: zoekt.Stats{
451 NgramLookups: ngramLookups,
452 },
453 },
454 }, nil
455 }
456
457 frequencies = append(frequencies, freq)
458 indexMap[o.index] = i
459 }
460
461 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
462
463 iter := &ngramDocIterator{
464 leftPad: uint32(first.index),
465 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
466 ngramLookups: ngramLookups,
467 }
468 if query.FileName {
469 iter.ends = d.fileNameEndRunes
470 } else {
471 iter.ends = d.fileEndRunes
472 }
473
474 if first != last {
475 runeDist := uint32(last.index - first.index)
476 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
477 if err != nil {
478 return nil, err
479 }
480
481 iter.iter = i
482 } else {
483 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
484 if err != nil {
485 return nil, err
486 }
487 iter.iter = hitIter
488 }
489
490 patBytes := []byte(query.Pattern)
491 lowerPatBytes := toLower(patBytes)
492
493 return &ngramIterationResults{
494 matchIterator: iter,
495 caseSensitive: query.CaseSensitive,
496 fileName: query.FileName,
497 substrBytes: patBytes,
498 substrLowered: lowerPatBytes,
499 }, nil
500}
501
502func (d *indexData) fileName(i uint32) []byte {
503 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
504}
505
506func (d *indexData) numDocs() uint32 {
507 return uint32(len(d.fileBranchMasks))
508}
509
510func (s *indexData) Close() {
511 s.file.Close()
512}
513
514const (
515 rawConfigYes = 1
516 rawConfigNo = 2
517)
518
519// encodeRawConfig encodes a rawConfig map into a uint8 mask.
520func encodeRawConfig(rawConfig map[string]string) uint8 {
521 var encoded uint8
522 for i, f := range []string{"public", "fork", "archived"} {
523 var e uint8
524 v, ok := rawConfig[f]
525 if ok && v == "1" {
526 e |= rawConfigYes
527 } else {
528 e |= rawConfigNo
529 }
530 encoded = encoded | e<<(2*i)
531 }
532 return encoded
533}