fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math/bits"
24 "slices"
25 "unicode/utf8"
26
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/query"
29)
30
31// indexData holds the pattern-independent data that we have to have
32// in memory to search. Most of the memory is taken up by the ngram =>
33// offset index.
34type indexData struct {
35 symbols symbolData
36
37 file IndexFile
38
39 contentNgrams btreeIndex
40
41 newlinesStart uint32
42 newlinesIndex []uint32
43
44 docSectionsStart uint32
45 docSectionsIndex []uint32
46
47 runeDocSections []DocumentSection
48
49 // rune offset=>byte offset mapping, relative to the start of the content corpus
50 runeOffsets runeOffsetMap
51
52 // offsets of file contents; includes end of last file
53 boundariesStart uint32
54 boundaries []uint32
55
56 // rune offsets for the file content boundaries
57 fileEndRunes []uint32
58
59 fileNameContent []byte
60 fileNameIndex []uint32
61 fileNameNgrams btreeIndex
62
63 // fileEndSymbol[i] is the index of the first symbol for document i.
64 fileEndSymbol []uint32
65
66 // rune offset=>byte offset mapping, relative to the start of the filename corpus
67 fileNameRuneOffsets runeOffsetMap
68
69 // rune offsets for the file name boundaries
70 fileNameEndRunes []uint32
71
72 fileBranchMasks []uint64
73
74 // mask (power of 2) => name
75 branchNames []map[uint]string
76
77 // name => mask (power of 2)
78 branchIDs []map[string]uint
79
80 metaData zoekt.IndexMetadata
81 repoMetaData []zoekt.Repository
82
83 subRepos []uint32
84 subRepoPaths [][]string
85
86 // Checksums for all the files, at 8-byte intervals
87 checksums []byte
88
89 // languages for all the files.
90 languages []byte
91
92 // inverse of LanguageMap in metaData
93 languageMap map[uint16]string
94
95 repoListEntry []zoekt.RepoListEntry
96
97 // repository indexes for all the files
98 repos []uint16
99
100 // rawConfigMasks contains the encoded RawConfig for each repository
101 rawConfigMasks []uint8
102}
103
104type symbolData struct {
105 // symContent stores Symbol.Sym and Symbol.Parent.
106 // TODO we don't need to store Symbol.Sym.
107 symContent []byte
108 symIndex []byte
109 // symKindContent is an enum of sym.Kind and sym.ParentKind
110 symKindContent []byte
111 symKindIndex []uint32
112 // symMetadata is [4]uint32 0 Kind Parent ParentKind
113 symMetaData []byte
114}
115
116func uint32SliceAt(a []byte, n uint32) uint32 {
117 return binary.BigEndian.Uint32(a[n*4:])
118}
119
120func uint32SliceLen(a []byte) uint32 {
121 return uint32(len(a) / 4)
122}
123
124// parent returns index i of the parent enum
125func (d *symbolData) parent(i uint32) []byte {
126 delta := uint32SliceAt(d.symIndex, 0)
127 start := uint32SliceAt(d.symIndex, i) - delta
128 var end uint32
129 if i+1 == uint32SliceLen(d.symIndex) {
130 end = uint32(len(d.symContent))
131 } else {
132 end = uint32SliceAt(d.symIndex, i+1) - delta
133 }
134 return d.symContent[start:end]
135}
136
137// kind returns index i of the kind enum
138func (d *symbolData) kind(i uint32) []byte {
139 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
140}
141
142// data returns the symbol at index i
143func (d *symbolData) data(i uint32) *zoekt.Symbol {
144 size := uint32(4 * 4) // 4 uint32s
145 offset := i * size
146 if offset >= uint32(len(d.symMetaData)) {
147 return nil
148 }
149
150 metadata := d.symMetaData[offset : offset+size]
151 sym := &zoekt.Symbol{}
152 key := uint32SliceAt(metadata, 1)
153 sym.Kind = string(d.kind(key))
154 key = uint32SliceAt(metadata, 2)
155 sym.Parent = string(d.parent(key))
156 key = uint32SliceAt(metadata, 3)
157 sym.ParentKind = string(d.kind(key))
158 return sym
159}
160
161func (d *indexData) getChecksum(idx uint32) []byte {
162 start := crc64.Size * idx
163 return d.checksums[start : start+crc64.Size]
164}
165
166func (d *indexData) getLanguage(idx uint32) uint16 {
167 if d.metaData.IndexFeatureVersion < 12 {
168 // older zoekt files had 8-bit language entries
169 return uint16(d.languages[idx])
170 }
171 // newer zoekt files have 16-bit language entries
172 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
173}
174
175// calculates stats for files in the range [start, end).
176func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
177 if start >= end {
178 // An empty shard for an empty repository.
179 return zoekt.RepoStats{
180 Shards: 1,
181 }
182 }
183
184 bytesContent := d.boundaries[end] - d.boundaries[start]
185 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
186 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
187
188 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
189 // that we don't change anything, except we have
190 // []Repository. Alternatively, things we can divide up we do (like
191 // here). Right now I don't like that these numbers are not true, especially
192 // after aggregation. For now I will move forward with this until we can
193 // chat more.
194 return zoekt.RepoStats{
195 ContentBytes: int64(bytesContent) + int64(bytesFN),
196 Documents: int(end - start),
197 // CR keegan for stefan: our shard count is going to go out of whack,
198 // since we will aggregate these. So we will report more shards than are
199 // present on disk. What should we do?
200 Shards: 1,
201
202 // Sourcegraph specific
203 NewLinesCount: count,
204 DefaultBranchNewLinesCount: defaultCount,
205 OtherBranchesNewLinesCount: otherCount,
206 }
207}
208
209func (d *indexData) calculateStats() error {
210 d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
211 var start, end uint32
212
213 for repoID, md := range d.repoMetaData {
214 // determine the file range for repo i
215 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
216 end++
217 }
218 if start < end && d.repos[start] != uint16(repoID) {
219 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
220 }
221
222 d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
223 Repository: md,
224 IndexMetadata: d.metaData,
225 Stats: d.calculateStatsForFileRange(start, end),
226 })
227 start = end
228 }
229
230 // All repos in a compound shard share memoryUse. So we average out the
231 // memoryUse per shard in our reporting. This has the benefit that when you
232 // aggregate the IndexBytes you get back the actual memoryUse.
233 //
234 // TODO take into account tombstones for aggregation. Even better, adjust
235 // API to be shard centric not repo centric.
236 if len(d.repoListEntry) > 0 {
237 indexBytes := d.memoryUse()
238 indexBytesChunk := indexBytes / len(d.repoListEntry)
239 for i := range d.repoListEntry {
240 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
241 indexBytes -= indexBytesChunk
242 }
243 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
244 }
245
246 return nil
247}
248
249// calculateNewLinesStats computes some Sourcegraph specific statistics for files
250// in the range [start, end). These are not as efficient to calculate as the
251// normal statistics. We experimentally measured about a 10% slower shard load
252// time. However, we find these values very useful to track and computing them
253// outside of load time introduces a lot of complexity.
254func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
255 for i := start; i < end; i++ {
256 // branchMask is a bitmask of the branches for a document. Zoekt by
257 // convention represents the default branch as the lowest bit.
258 branchMask := d.fileBranchMasks[i]
259 isDefault := (branchMask & 1) == 1
260 others := uint64(bits.OnesCount64(branchMask >> 1))
261
262 // this is readNewlines but only reading the size of each section which
263 // corresponds to the number of newlines.
264 sec := simpleSection{
265 off: d.newlinesStart + d.newlinesIndex[i],
266 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
267 }
268 // We are only reading the first varint which is the size. So we don't
269 // need to read more than MaxVarintLen64 bytes.
270 if sec.sz > binary.MaxVarintLen64 {
271 sec.sz = binary.MaxVarintLen64
272 }
273 blob, err := d.readSectionBlob(sec)
274 if err != nil {
275 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
276 continue
277 }
278 sz, _ := binary.Uvarint(blob)
279
280 count += sz
281 if isDefault {
282 defaultCount += sz
283 }
284 otherCount += (others * sz)
285 }
286
287 return
288}
289
290func (d *indexData) String() string {
291 return fmt.Sprintf("shard(%s)", d.file.Name())
292}
293
294// calculates an approximate size of indexData in memory in bytes.
295func (d *indexData) memoryUse() int {
296 sz := 0
297 for _, a := range [][]uint32{
298 d.newlinesIndex, d.docSectionsIndex,
299 d.boundaries, d.fileNameIndex,
300 d.fileEndRunes, d.fileNameEndRunes,
301 d.fileEndSymbol, d.symbols.symKindIndex,
302 d.subRepos,
303 } {
304 sz += 4 * len(a)
305 }
306 sz += d.runeOffsets.sizeBytes()
307 sz += d.fileNameRuneOffsets.sizeBytes()
308 sz += len(d.languages)
309 sz += len(d.checksums)
310 sz += 2 * len(d.repos)
311 sz += 8 * len(d.runeDocSections)
312 sz += 8 * len(d.fileBranchMasks)
313 sz += d.contentNgrams.SizeBytes()
314 sz += d.fileNameNgrams.SizeBytes()
315 return sz
316}
317
318// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
319// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
320// overlapping trigrams to keep their intersection as small as possible.
321//
322// Invariant: first will always have a smaller index than last.
323func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
324 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
325
326 // If the trigrams are overlapping, then try to shift one to reduce overlap.
327 // This is guaranteed to produce a smaller intersection.
328 if last.index-first.index < ngramSize {
329 newFirstIndex := max(last.index-ngramSize, 0)
330 if newFirstIndex != first.index {
331 first = ngramOffs[indexMap[newFirstIndex]]
332 }
333
334 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
335 if newLastIndex != last.index {
336 last = ngramOffs[indexMap[newLastIndex]]
337 }
338 }
339 return
340}
341
342const maxUInt32 = 0xffffffff
343
344func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
345 // Find the two lowest frequency ngrams.
346 idx0, idx1 := 0, 0
347 min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
348 for i, x := range frequencies {
349 if x <= min0 {
350 idx0, idx1 = i, idx0
351 min0, min1 = x, min0
352 } else if x <= min1 {
353 idx1 = i
354 min1 = x
355 }
356 }
357
358 first = ngramOffs[idx0]
359 last = ngramOffs[idx1]
360
361 // Ensure first appears before last as a helpful invariant.
362 if first.index > last.index {
363 last, first = first, last
364 }
365 return
366}
367
368func (data *indexData) ngrams(filename bool) btreeIndex {
369 if filename {
370 return data.fileNameNgrams
371 }
372 return data.contentNgrams
373}
374
375type ngramIterationResults struct {
376 matchIterator
377
378 caseSensitive bool
379 fileName bool
380 substrBytes []byte
381 substrLowered []byte
382}
383
384func (r *ngramIterationResults) String() string {
385 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
386}
387
388func (r *ngramIterationResults) candidates() []*candidateMatch {
389 cs := r.matchIterator.candidates()
390 for _, c := range cs {
391 c.caseSensitive = r.caseSensitive
392 c.fileName = r.fileName
393 c.substrBytes = r.substrBytes
394 c.substrLowered = r.substrLowered
395 }
396 return cs
397}
398
399func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
400 str := query.Pattern
401
402 // Find the 2 least common ngrams from the string.
403 ngramOffs := splitNGrams([]byte(str))
404
405 // protect against accidental searching of empty strings
406 if len(ngramOffs) == 0 {
407 return nil, errors.New("iterateNgrams needs non empty string")
408 }
409
410 // PERF: Sort to increase the chances adjacent checks are in the same btree
411 // bucket (which can cause disk IO).
412 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
413 frequencies := make([]uint32, 0, len(ngramOffs))
414 indexMap := make([]int, len(ngramOffs))
415 ngramLookups := 0
416 ngrams := d.ngrams(query.FileName)
417 for i, o := range ngramOffs {
418 var freq uint32
419 if query.CaseSensitive {
420 freq = ngrams.Get(o.ngram).sz
421 ngramLookups++
422 } else {
423 for _, v := range generateCaseNgrams(o.ngram) {
424 freq += ngrams.Get(v).sz
425 ngramLookups++
426 }
427 }
428
429 if freq == 0 {
430 return &ngramIterationResults{
431 matchIterator: &noMatchTree{
432 Why: "freq=0",
433 Stats: zoekt.Stats{
434 NgramLookups: ngramLookups,
435 },
436 },
437 }, nil
438 }
439
440 frequencies = append(frequencies, freq)
441 indexMap[o.index] = i
442 }
443
444 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
445
446 iter := &ngramDocIterator{
447 leftPad: uint32(first.index),
448 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
449 ngramLookups: ngramLookups,
450 }
451 if query.FileName {
452 iter.ends = d.fileNameEndRunes
453 } else {
454 iter.ends = d.fileEndRunes
455 }
456
457 if first != last {
458 runeDist := uint32(last.index - first.index)
459 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
460 if err != nil {
461 return nil, err
462 }
463
464 iter.iter = i
465 } else {
466 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
467 if err != nil {
468 return nil, err
469 }
470 iter.iter = hitIter
471 }
472
473 patBytes := []byte(query.Pattern)
474 lowerPatBytes := toLower(patBytes)
475
476 return &ngramIterationResults{
477 matchIterator: iter,
478 caseSensitive: query.CaseSensitive,
479 fileName: query.FileName,
480 substrBytes: patBytes,
481 substrLowered: lowerPatBytes,
482 }, nil
483}
484
485func (d *indexData) fileName(i uint32) []byte {
486 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
487}
488
489func (d *indexData) numDocs() uint32 {
490 return uint32(len(d.fileBranchMasks))
491}
492
493func (s *indexData) Close() {
494 s.file.Close()
495}
496
497const (
498 rawConfigYes = 1
499 rawConfigNo = 2
500)
501
502// encodeRawConfig encodes a rawConfig map into a uint8 mask.
503func encodeRawConfig(rawConfig map[string]string) uint8 {
504 var encoded uint8
505 for i, f := range []string{"public", "fork", "archived"} {
506 var e uint8
507 v, ok := rawConfig[f]
508 if ok && v == "1" {
509 e |= rawConfigYes
510 } else {
511 e |= rawConfigNo
512 }
513 encoded = encoded | e<<(2*i)
514 }
515 return encoded
516}