fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math"
24 "math/bits"
25 "slices"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt"
29 "github.com/sourcegraph/zoekt/query"
30)
31
32// indexData holds the pattern-independent data that we have to have
33// in memory to search. Most of the memory is taken up by the ngram =>
34// offset index.
35type indexData struct {
36 symbols symbolData
37
38 file IndexFile
39
40 contentNgrams btreeIndex
41
42 newlinesStart uint32
43 newlinesIndex []uint32
44
45 docSectionsStart uint32
46 docSectionsIndex []uint32
47
48 runeDocSections []DocumentSection
49
50 // rune offset=>byte offset mapping, relative to the start of the content corpus
51 runeOffsets runeOffsetMap
52
53 // offsets of file contents; includes end of last file
54 boundariesStart uint32
55 boundaries []uint32
56
57 // rune offsets for the file content boundaries
58 fileEndRunes []uint32
59
60 fileNameContent []byte
61 fileNameIndex []uint32
62 fileNameNgrams btreeIndex
63
64 // fileEndSymbol[i] is the index of the first symbol for document i.
65 fileEndSymbol []uint32
66
67 // rune offset=>byte offset mapping, relative to the start of the filename corpus
68 fileNameRuneOffsets runeOffsetMap
69
70 // rune offsets for the file name boundaries
71 fileNameEndRunes []uint32
72
73 fileBranchMasks []uint64
74
75 // mask (power of 2) => name
76 branchNames []map[uint]string
77
78 // name => mask (power of 2)
79 branchIDs []map[string]uint
80
81 metaData zoekt.IndexMetadata
82 repoMetaData []zoekt.Repository
83
84 subRepos []uint32
85 subRepoPaths [][]string
86
87 // Checksums for all the files, at 8-byte intervals
88 checksums []byte
89
90 // languages for all the files.
91 languages []byte
92
93 // inverse of LanguageMap in metaData
94 languageMap map[uint16]string
95
96 // file categories for all the files.
97 categories []byte
98
99 repoListEntry []zoekt.RepoListEntry
100
101 // repository indexes for all the files
102 repos []uint16
103
104 // rawConfigMasks contains the encoded RawConfig for each repository
105 rawConfigMasks []uint8
106}
107
108type symbolData struct {
109 // symContent stores Symbol.Sym and Symbol.Parent.
110 // TODO we don't need to store Symbol.Sym.
111 symContent []byte
112 symIndex []byte
113 // symKindContent is an enum of sym.Kind and sym.ParentKind
114 symKindContent []byte
115 symKindIndex []uint32
116 // symMetadata is [4]uint32 0 Kind Parent ParentKind
117 symMetaData []byte
118}
119
120func uint32SliceAt(a []byte, n uint32) uint32 {
121 return binary.BigEndian.Uint32(a[n*4:])
122}
123
124func uint32SliceLen(a []byte) uint32 {
125 return uint32(len(a) / 4)
126}
127
128// parent returns index i of the parent enum
129func (d *symbolData) parent(i uint32) []byte {
130 delta := uint32SliceAt(d.symIndex, 0)
131 start := uint32SliceAt(d.symIndex, i) - delta
132 var end uint32
133 if i+1 == uint32SliceLen(d.symIndex) {
134 end = uint32(len(d.symContent))
135 } else {
136 end = uint32SliceAt(d.symIndex, i+1) - delta
137 }
138 return d.symContent[start:end]
139}
140
141// kind returns index i of the kind enum
142func (d *symbolData) kind(i uint32) []byte {
143 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
144}
145
146// data returns the symbol at index i
147func (d *symbolData) data(i uint32) *zoekt.Symbol {
148 size := uint32(4 * 4) // 4 uint32s
149 offset := i * size
150 if offset >= uint32(len(d.symMetaData)) {
151 return nil
152 }
153
154 metadata := d.symMetaData[offset : offset+size]
155 sym := &zoekt.Symbol{}
156 key := uint32SliceAt(metadata, 1)
157 sym.Kind = string(d.kind(key))
158 key = uint32SliceAt(metadata, 2)
159 sym.Parent = string(d.parent(key))
160 key = uint32SliceAt(metadata, 3)
161 sym.ParentKind = string(d.kind(key))
162 return sym
163}
164
165func (d *indexData) getChecksum(idx uint32) []byte {
166 start := crc64.Size * idx
167 return d.checksums[start : start+crc64.Size]
168}
169
170func (d *indexData) getLanguage(idx uint32) uint16 {
171 if d.metaData.IndexFeatureVersion < 12 {
172 // older zoekt files had 8-bit language entries
173 return uint16(d.languages[idx])
174 }
175 // newer zoekt files have 16-bit language entries
176 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
177}
178
179func (d *indexData) getCategory(idx uint32) FileCategory {
180 if len(d.categories) == 0 {
181 // This means we're reading an older index, so return 'missing'
182 return FileCategoryMissing
183 }
184 category, err := decodeCategory(d.categories[idx])
185 if err != nil {
186 return FileCategoryMissing
187 }
188 return category
189}
190
191// calculates stats for files in the range [start, end).
192func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
193 if start >= end {
194 // An empty shard for an empty repository.
195 return zoekt.RepoStats{
196 Shards: 1,
197 }
198 }
199
200 bytesContent := d.boundaries[end] - d.boundaries[start]
201 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
202 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
203
204 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
205 // that we don't change anything, except we have
206 // []Repository. Alternatively, things we can divide up we do (like
207 // here). Right now I don't like that these numbers are not true, especially
208 // after aggregation. For now I will move forward with this until we can
209 // chat more.
210 return zoekt.RepoStats{
211 ContentBytes: int64(bytesContent) + int64(bytesFN),
212 Documents: int(end - start),
213 // CR keegan for stefan: our shard count is going to go out of whack,
214 // since we will aggregate these. So we will report more shards than are
215 // present on disk. What should we do?
216 Shards: 1,
217
218 // Sourcegraph specific
219 NewLinesCount: count,
220 DefaultBranchNewLinesCount: defaultCount,
221 OtherBranchesNewLinesCount: otherCount,
222 }
223}
224
225func (d *indexData) calculateStats() error {
226 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
227 var start, end uint32
228
229 for repoID, md := range d.repoMetaData {
230 // determine the file range for repo i
231 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
232 end++
233 }
234 if start < end && d.repos[start] != uint16(repoID) {
235 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
236 }
237
238 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
239 Repository: md,
240 IndexMetadata: d.metaData,
241 Stats: d.calculateStatsForFileRange(start, end),
242 })
243 start = end
244 }
245
246 // All repos in a compound shard share memoryUse. So we average out the
247 // memoryUse per shard in our reporting. This has the benefit that when you
248 // aggregate the IndexBytes you get back the actual memoryUse.
249 //
250 // TODO take into account tombstones for aggregation. Even better, adjust
251 // API to be shard centric not repo centric.
252 if len(d.repoListEntry) > 0 {
253 indexBytes := d.memoryUse()
254 indexBytesChunk := indexBytes / len(d.repoListEntry)
255 for i := range d.repoListEntry {
256 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
257 indexBytes -= indexBytesChunk
258 }
259 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
260 }
261
262 return nil
263}
264
265// calculateNewLinesStats computes some Sourcegraph specific statistics for files
266// in the range [start, end). These are not as efficient to calculate as the
267// normal statistics. We experimentally measured about a 10% slower shard load
268// time. However, we find these values very useful to track and computing them
269// outside of load time introduces a lot of complexity.
270func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
271 for i := start; i < end; i++ {
272 // branchMask is a bitmask of the branches for a document. Zoekt by
273 // convention represents the default branch as the lowest bit.
274 branchMask := d.fileBranchMasks[i]
275 isDefault := (branchMask & 1) == 1
276 others := uint64(bits.OnesCount64(branchMask >> 1))
277
278 // this is readNewlines but only reading the size of each section which
279 // corresponds to the number of newlines.
280 sec := simpleSection{
281 off: d.newlinesStart + d.newlinesIndex[i],
282 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
283 }
284 // We are only reading the first varint which is the size. So we don't
285 // need to read more than MaxVarintLen64 bytes.
286 if sec.sz > binary.MaxVarintLen64 {
287 sec.sz = binary.MaxVarintLen64
288 }
289 blob, err := d.readSectionBlob(sec)
290 if err != nil {
291 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
292 continue
293 }
294 sz, _ := binary.Uvarint(blob)
295
296 count += sz
297 if isDefault {
298 defaultCount += sz
299 }
300 otherCount += (others * sz)
301 }
302
303 return
304}
305
306func (d *indexData) String() string {
307 return fmt.Sprintf("shard(%s)", d.file.Name())
308}
309
310// calculates an approximate size of indexData in memory in bytes.
311func (d *indexData) memoryUse() int {
312 sz := 0
313 for _, a := range [][]uint32{
314 d.newlinesIndex, d.docSectionsIndex,
315 d.boundaries, d.fileNameIndex,
316 d.fileEndRunes, d.fileNameEndRunes,
317 d.fileEndSymbol, d.symbols.symKindIndex,
318 d.subRepos,
319 } {
320 sz += 4 * len(a)
321 }
322 sz += d.runeOffsets.sizeBytes()
323 sz += d.fileNameRuneOffsets.sizeBytes()
324 sz += len(d.languages)
325 sz += len(d.checksums)
326 sz += 2 * len(d.repos)
327 sz += 8 * len(d.runeDocSections)
328 sz += 8 * len(d.fileBranchMasks)
329 sz += d.contentNgrams.SizeBytes()
330 sz += d.fileNameNgrams.SizeBytes()
331 return sz
332}
333
334// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
335// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
336// overlapping trigrams to keep their intersection as small as possible.
337//
338// Invariant: first will always have a smaller index than last.
339func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
340 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
341
342 // If the trigrams are overlapping, then try to shift one to reduce overlap.
343 // This is guaranteed to produce a smaller intersection.
344 if last.index-first.index < ngramSize {
345 newFirstIndex := max(last.index-ngramSize, 0)
346 if newFirstIndex != first.index {
347 first = ngramOffs[indexMap[newFirstIndex]]
348 }
349
350 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
351 if newLastIndex != last.index {
352 last = ngramOffs[indexMap[newLastIndex]]
353 }
354 }
355 return
356}
357
358func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
359 // Find the two lowest frequency ngrams.
360 idx0, idx1 := 0, 0
361 min0, min1 := uint32(math.MaxUint32), uint32(math.MaxUint32)
362 for i, x := range frequencies {
363 if x <= min0 {
364 idx0, idx1 = i, idx0
365 min0, min1 = x, min0
366 } else if x <= min1 {
367 idx1 = i
368 min1 = x
369 }
370 }
371
372 first = ngramOffs[idx0]
373 last = ngramOffs[idx1]
374
375 // Ensure first appears before last as a helpful invariant.
376 if first.index > last.index {
377 last, first = first, last
378 }
379 return
380}
381
382func (data *indexData) ngrams(filename bool) btreeIndex {
383 if filename {
384 return data.fileNameNgrams
385 }
386 return data.contentNgrams
387}
388
389type ngramIterationResults struct {
390 matchIterator
391
392 caseSensitive bool
393 fileName bool
394 substrBytes []byte
395 substrLowered []byte
396}
397
398func (r *ngramIterationResults) String() string {
399 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
400}
401
402func (r *ngramIterationResults) candidates() []*candidateMatch {
403 cs := r.matchIterator.candidates()
404 for _, c := range cs {
405 c.caseSensitive = r.caseSensitive
406 c.fileName = r.fileName
407 c.substrBytes = r.substrBytes
408 c.substrLowered = r.substrLowered
409 }
410 return cs
411}
412
413func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
414 str := query.Pattern
415
416 // Find the 2 least common ngrams from the string.
417 ngramOffs := splitNGrams([]byte(str))
418
419 // protect against accidental searching of empty strings
420 if len(ngramOffs) == 0 {
421 return nil, errors.New("iterateNgrams needs non empty string")
422 }
423
424 // PERF: Sort to increase the chances adjacent checks are in the same btree
425 // bucket (which can cause disk IO).
426 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
427 frequencies := make([]uint32, 0, len(ngramOffs))
428 indexMap := make([]int, len(ngramOffs))
429 ngramLookups := 0
430 ngrams := d.ngrams(query.FileName)
431 for i, o := range ngramOffs {
432 var freq uint32
433 if query.CaseSensitive {
434 freq = ngrams.Get(o.ngram).sz
435 ngramLookups++
436 } else {
437 for _, v := range generateCaseNgrams(o.ngram) {
438 freq += ngrams.Get(v).sz
439 ngramLookups++
440 }
441 }
442
443 if freq == 0 {
444 return &ngramIterationResults{
445 matchIterator: &noMatchTree{
446 Why: "freq=0",
447 Stats: zoekt.Stats{
448 NgramLookups: ngramLookups,
449 },
450 },
451 }, nil
452 }
453
454 frequencies = append(frequencies, freq)
455 indexMap[o.index] = i
456 }
457
458 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
459
460 iter := &ngramDocIterator{
461 leftPad: uint32(first.index),
462 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
463 ngramLookups: ngramLookups,
464 }
465 if query.FileName {
466 iter.ends = d.fileNameEndRunes
467 } else {
468 iter.ends = d.fileEndRunes
469 }
470
471 if first != last {
472 runeDist := uint32(last.index - first.index)
473 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
474 if err != nil {
475 return nil, err
476 }
477
478 iter.iter = i
479 } else {
480 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
481 if err != nil {
482 return nil, err
483 }
484 iter.iter = hitIter
485 }
486
487 patBytes := []byte(query.Pattern)
488 lowerPatBytes := toLower(patBytes)
489
490 return &ngramIterationResults{
491 matchIterator: iter,
492 caseSensitive: query.CaseSensitive,
493 fileName: query.FileName,
494 substrBytes: patBytes,
495 substrLowered: lowerPatBytes,
496 }, nil
497}
498
499func (d *indexData) fileName(i uint32) []byte {
500 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
501}
502
503func (d *indexData) numDocs() uint32 {
504 return uint32(len(d.fileBranchMasks))
505}
506
507func (s *indexData) Close() {
508 s.file.Close()
509}
510
511const (
512 rawConfigYes = 1
513 rawConfigNo = 2
514)
515
516// encodeRawConfig encodes a rawConfig map into a uint8 mask.
517func encodeRawConfig(rawConfig map[string]string) uint8 {
518 var encoded uint8
519 for i, f := range []string{"public", "fork", "archived"} {
520 var e uint8
521 v, ok := rawConfig[f]
522 if ok && v == "1" {
523 e |= rawConfigYes
524 } else {
525 e |= rawConfigNo
526 }
527 encoded = encoded | e<<(2*i)
528 }
529 return encoded
530}