fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math/bits"
24 "slices"
25 "unicode/utf8"
26
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/query"
29)
30
31// indexData holds the pattern-independent data that we have to have
32// in memory to search. Most of the memory is taken up by the ngram =>
33// offset index.
34type indexData struct {
35 symbols symbolData
36
37 file IndexFile
38
39 contentNgrams btreeIndex
40
41 newlinesStart uint32
42 newlinesIndex []uint32
43
44 docSectionsStart uint32
45 docSectionsIndex []uint32
46
47 runeDocSections []DocumentSection
48
49 // rune offset=>byte offset mapping, relative to the start of the content corpus
50 runeOffsets runeOffsetMap
51
52 // offsets of file contents; includes end of last file
53 boundariesStart uint32
54 boundaries []uint32
55
56 // rune offsets for the file content boundaries
57 fileEndRunes []uint32
58
59 fileNameContent []byte
60 fileNameIndex []uint32
61 fileNameNgrams btreeIndex
62
63 // fileEndSymbol[i] is the index of the first symbol for document i.
64 fileEndSymbol []uint32
65
66 // rune offset=>byte offset mapping, relative to the start of the filename corpus
67 fileNameRuneOffsets runeOffsetMap
68
69 // rune offsets for the file name boundaries
70 fileNameEndRunes []uint32
71
72 fileBranchMasks []uint64
73
74 // mask (power of 2) => name
75 branchNames []map[uint]string
76
77 // name => mask (power of 2)
78 branchIDs []map[string]uint
79
80 metaData zoekt.IndexMetadata
81 repoMetaData []zoekt.Repository
82
83 subRepos []uint32
84 subRepoPaths [][]string
85
86 // Checksums for all the files, at 8-byte intervals
87 checksums []byte
88
89 // languages for all the files.
90 languages []byte
91
92 // inverse of LanguageMap in metaData
93 languageMap map[uint16]string
94
95 // file categories for all the files.
96 categories []byte
97
98 repoListEntry []zoekt.RepoListEntry
99
100 // repository indexes for all the files
101 repos []uint16
102
103 // rawConfigMasks contains the encoded RawConfig for each repository
104 rawConfigMasks []uint8
105}
106
107type symbolData struct {
108 // symContent stores Symbol.Sym and Symbol.Parent.
109 // TODO we don't need to store Symbol.Sym.
110 symContent []byte
111 symIndex []byte
112 // symKindContent is an enum of sym.Kind and sym.ParentKind
113 symKindContent []byte
114 symKindIndex []uint32
115 // symMetadata is [4]uint32 0 Kind Parent ParentKind
116 symMetaData []byte
117}
118
119func uint32SliceAt(a []byte, n uint32) uint32 {
120 return binary.BigEndian.Uint32(a[n*4:])
121}
122
123func uint32SliceLen(a []byte) uint32 {
124 return uint32(len(a) / 4)
125}
126
127// parent returns index i of the parent enum
128func (d *symbolData) parent(i uint32) []byte {
129 delta := uint32SliceAt(d.symIndex, 0)
130 start := uint32SliceAt(d.symIndex, i) - delta
131 var end uint32
132 if i+1 == uint32SliceLen(d.symIndex) {
133 end = uint32(len(d.symContent))
134 } else {
135 end = uint32SliceAt(d.symIndex, i+1) - delta
136 }
137 return d.symContent[start:end]
138}
139
140// kind returns index i of the kind enum
141func (d *symbolData) kind(i uint32) []byte {
142 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
143}
144
145// data returns the symbol at index i
146func (d *symbolData) data(i uint32) *zoekt.Symbol {
147 size := uint32(4 * 4) // 4 uint32s
148 offset := i * size
149 if offset >= uint32(len(d.symMetaData)) {
150 return nil
151 }
152
153 metadata := d.symMetaData[offset : offset+size]
154 sym := &zoekt.Symbol{}
155 key := uint32SliceAt(metadata, 1)
156 sym.Kind = string(d.kind(key))
157 key = uint32SliceAt(metadata, 2)
158 sym.Parent = string(d.parent(key))
159 key = uint32SliceAt(metadata, 3)
160 sym.ParentKind = string(d.kind(key))
161 return sym
162}
163
164func (d *indexData) getChecksum(idx uint32) []byte {
165 start := crc64.Size * idx
166 return d.checksums[start : start+crc64.Size]
167}
168
169func (d *indexData) getLanguage(idx uint32) uint16 {
170 if d.metaData.IndexFeatureVersion < 12 {
171 // older zoekt files had 8-bit language entries
172 return uint16(d.languages[idx])
173 }
174 // newer zoekt files have 16-bit language entries
175 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
176}
177
178func (d *indexData) getCategory(idx uint32) FileCategory {
179 if len(d.categories) == 0 {
180 // This means we're reading an older index, so return 'missing'
181 return FileCategoryMissing
182 }
183 category, err := decodeCategory(d.categories[idx])
184 if err != nil {
185 return FileCategoryMissing
186 }
187 return category
188}
189
190// calculates stats for files in the range [start, end).
191func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
192 if start >= end {
193 // An empty shard for an empty repository.
194 return zoekt.RepoStats{
195 Shards: 1,
196 }
197 }
198
199 bytesContent := d.boundaries[end] - d.boundaries[start]
200 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
201 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
202
203 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
204 // that we don't change anything, except we have
205 // []Repository. Alternatively, things we can divide up we do (like
206 // here). Right now I don't like that these numbers are not true, especially
207 // after aggregation. For now I will move forward with this until we can
208 // chat more.
209 return zoekt.RepoStats{
210 ContentBytes: int64(bytesContent) + int64(bytesFN),
211 Documents: int(end - start),
212 // CR keegan for stefan: our shard count is going to go out of whack,
213 // since we will aggregate these. So we will report more shards than are
214 // present on disk. What should we do?
215 Shards: 1,
216
217 // Sourcegraph specific
218 NewLinesCount: count,
219 DefaultBranchNewLinesCount: defaultCount,
220 OtherBranchesNewLinesCount: otherCount,
221 }
222}
223
224func (d *indexData) calculateStats() error {
225 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
226 var start, end uint32
227
228 for repoID, md := range d.repoMetaData {
229 // determine the file range for repo i
230 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
231 end++
232 }
233 if start < end && d.repos[start] != uint16(repoID) {
234 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
235 }
236
237 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
238 Repository: md,
239 IndexMetadata: d.metaData,
240 Stats: d.calculateStatsForFileRange(start, end),
241 })
242 start = end
243 }
244
245 // All repos in a compound shard share memoryUse. So we average out the
246 // memoryUse per shard in our reporting. This has the benefit that when you
247 // aggregate the IndexBytes you get back the actual memoryUse.
248 //
249 // TODO take into account tombstones for aggregation. Even better, adjust
250 // API to be shard centric not repo centric.
251 if len(d.repoListEntry) > 0 {
252 indexBytes := d.memoryUse()
253 indexBytesChunk := indexBytes / len(d.repoListEntry)
254 for i := range d.repoListEntry {
255 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
256 indexBytes -= indexBytesChunk
257 }
258 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
259 }
260
261 return nil
262}
263
264// calculateNewLinesStats computes some Sourcegraph specific statistics for files
265// in the range [start, end). These are not as efficient to calculate as the
266// normal statistics. We experimentally measured about a 10% slower shard load
267// time. However, we find these values very useful to track and computing them
268// outside of load time introduces a lot of complexity.
269func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
270 for i := start; i < end; i++ {
271 // branchMask is a bitmask of the branches for a document. Zoekt by
272 // convention represents the default branch as the lowest bit.
273 branchMask := d.fileBranchMasks[i]
274 isDefault := (branchMask & 1) == 1
275 others := uint64(bits.OnesCount64(branchMask >> 1))
276
277 // this is readNewlines but only reading the size of each section which
278 // corresponds to the number of newlines.
279 sec := simpleSection{
280 off: d.newlinesStart + d.newlinesIndex[i],
281 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
282 }
283 // We are only reading the first varint which is the size. So we don't
284 // need to read more than MaxVarintLen64 bytes.
285 if sec.sz > binary.MaxVarintLen64 {
286 sec.sz = binary.MaxVarintLen64
287 }
288 blob, err := d.readSectionBlob(sec)
289 if err != nil {
290 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
291 continue
292 }
293 sz, _ := binary.Uvarint(blob)
294
295 count += sz
296 if isDefault {
297 defaultCount += sz
298 }
299 otherCount += (others * sz)
300 }
301
302 return
303}
304
305func (d *indexData) String() string {
306 return fmt.Sprintf("shard(%s)", d.file.Name())
307}
308
309// calculates an approximate size of indexData in memory in bytes.
310func (d *indexData) memoryUse() int {
311 sz := 0
312 for _, a := range [][]uint32{
313 d.newlinesIndex, d.docSectionsIndex,
314 d.boundaries, d.fileNameIndex,
315 d.fileEndRunes, d.fileNameEndRunes,
316 d.fileEndSymbol, d.symbols.symKindIndex,
317 d.subRepos,
318 } {
319 sz += 4 * len(a)
320 }
321 sz += d.runeOffsets.sizeBytes()
322 sz += d.fileNameRuneOffsets.sizeBytes()
323 sz += len(d.languages)
324 sz += len(d.checksums)
325 sz += 2 * len(d.repos)
326 sz += 8 * len(d.runeDocSections)
327 sz += 8 * len(d.fileBranchMasks)
328 sz += d.contentNgrams.SizeBytes()
329 sz += d.fileNameNgrams.SizeBytes()
330 return sz
331}
332
333// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
334// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
335// overlapping trigrams to keep their intersection as small as possible.
336//
337// Invariant: first will always have a smaller index than last.
338func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
339 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
340
341 // If the trigrams are overlapping, then try to shift one to reduce overlap.
342 // This is guaranteed to produce a smaller intersection.
343 if last.index-first.index < ngramSize {
344 newFirstIndex := max(last.index-ngramSize, 0)
345 if newFirstIndex != first.index {
346 first = ngramOffs[indexMap[newFirstIndex]]
347 }
348
349 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
350 if newLastIndex != last.index {
351 last = ngramOffs[indexMap[newLastIndex]]
352 }
353 }
354 return
355}
356
357const maxUInt32 = 0xffffffff
358
359func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
360 // Find the two lowest frequency ngrams.
361 idx0, idx1 := 0, 0
362 min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
363 for i, x := range frequencies {
364 if x <= min0 {
365 idx0, idx1 = i, idx0
366 min0, min1 = x, min0
367 } else if x <= min1 {
368 idx1 = i
369 min1 = x
370 }
371 }
372
373 first = ngramOffs[idx0]
374 last = ngramOffs[idx1]
375
376 // Ensure first appears before last as a helpful invariant.
377 if first.index > last.index {
378 last, first = first, last
379 }
380 return
381}
382
383func (data *indexData) ngrams(filename bool) btreeIndex {
384 if filename {
385 return data.fileNameNgrams
386 }
387 return data.contentNgrams
388}
389
390type ngramIterationResults struct {
391 matchIterator
392
393 caseSensitive bool
394 fileName bool
395 substrBytes []byte
396 substrLowered []byte
397}
398
399func (r *ngramIterationResults) String() string {
400 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
401}
402
403func (r *ngramIterationResults) candidates() []*candidateMatch {
404 cs := r.matchIterator.candidates()
405 for _, c := range cs {
406 c.caseSensitive = r.caseSensitive
407 c.fileName = r.fileName
408 c.substrBytes = r.substrBytes
409 c.substrLowered = r.substrLowered
410 }
411 return cs
412}
413
414func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
415 str := query.Pattern
416
417 // Find the 2 least common ngrams from the string.
418 ngramOffs := splitNGrams([]byte(str))
419
420 // protect against accidental searching of empty strings
421 if len(ngramOffs) == 0 {
422 return nil, errors.New("iterateNgrams needs non empty string")
423 }
424
425 // PERF: Sort to increase the chances adjacent checks are in the same btree
426 // bucket (which can cause disk IO).
427 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
428 frequencies := make([]uint32, 0, len(ngramOffs))
429 indexMap := make([]int, len(ngramOffs))
430 ngramLookups := 0
431 ngrams := d.ngrams(query.FileName)
432 for i, o := range ngramOffs {
433 var freq uint32
434 if query.CaseSensitive {
435 freq = ngrams.Get(o.ngram).sz
436 ngramLookups++
437 } else {
438 for _, v := range generateCaseNgrams(o.ngram) {
439 freq += ngrams.Get(v).sz
440 ngramLookups++
441 }
442 }
443
444 if freq == 0 {
445 return &ngramIterationResults{
446 matchIterator: &noMatchTree{
447 Why: "freq=0",
448 Stats: zoekt.Stats{
449 NgramLookups: ngramLookups,
450 },
451 },
452 }, nil
453 }
454
455 frequencies = append(frequencies, freq)
456 indexMap[o.index] = i
457 }
458
459 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
460
461 iter := &ngramDocIterator{
462 leftPad: uint32(first.index),
463 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
464 ngramLookups: ngramLookups,
465 }
466 if query.FileName {
467 iter.ends = d.fileNameEndRunes
468 } else {
469 iter.ends = d.fileEndRunes
470 }
471
472 if first != last {
473 runeDist := uint32(last.index - first.index)
474 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
475 if err != nil {
476 return nil, err
477 }
478
479 iter.iter = i
480 } else {
481 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
482 if err != nil {
483 return nil, err
484 }
485 iter.iter = hitIter
486 }
487
488 patBytes := []byte(query.Pattern)
489 lowerPatBytes := toLower(patBytes)
490
491 return &ngramIterationResults{
492 matchIterator: iter,
493 caseSensitive: query.CaseSensitive,
494 fileName: query.FileName,
495 substrBytes: patBytes,
496 substrLowered: lowerPatBytes,
497 }, nil
498}
499
500func (d *indexData) fileName(i uint32) []byte {
501 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
502}
503
504func (d *indexData) numDocs() uint32 {
505 return uint32(len(d.fileBranchMasks))
506}
507
508func (s *indexData) Close() {
509 s.file.Close()
510}
511
512const (
513 rawConfigYes = 1
514 rawConfigNo = 2
515)
516
517// encodeRawConfig encodes a rawConfig map into a uint8 mask.
518func encodeRawConfig(rawConfig map[string]string) uint8 {
519 var encoded uint8
520 for i, f := range []string{"public", "fork", "archived"} {
521 var e uint8
522 v, ok := rawConfig[f]
523 if ok && v == "1" {
524 e |= rawConfigYes
525 } else {
526 e |= rawConfigNo
527 }
528 encoded = encoded | e<<(2*i)
529 }
530 return encoded
531}