fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math/bits"
24 "slices"
25 "unicode/utf8"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// indexData holds the pattern-independent data that we have to have
31// in memory to search. Most of the memory is taken up by the ngram =>
32// offset index.
33type indexData struct {
34 symbols symbolData
35
36 file IndexFile
37
38 contentNgrams btreeIndex
39
40 newlinesStart uint32
41 newlinesIndex []uint32
42
43 docSectionsStart uint32
44 docSectionsIndex []uint32
45
46 runeDocSections []DocumentSection
47
48 // rune offset=>byte offset mapping, relative to the start of the content corpus
49 runeOffsets runeOffsetMap
50
51 // offsets of file contents; includes end of last file
52 boundariesStart uint32
53 boundaries []uint32
54
55 // rune offsets for the file content boundaries
56 fileEndRunes []uint32
57
58 fileNameContent []byte
59 fileNameIndex []uint32
60 fileNameNgrams btreeIndex
61
62 // fileEndSymbol[i] is the index of the first symbol for document i.
63 fileEndSymbol []uint32
64
65 // rune offset=>byte offset mapping, relative to the start of the filename corpus
66 fileNameRuneOffsets runeOffsetMap
67
68 // rune offsets for the file name boundaries
69 fileNameEndRunes []uint32
70
71 fileBranchMasks []uint64
72
73 // mask (power of 2) => name
74 branchNames []map[uint]string
75
76 // name => mask (power of 2)
77 branchIDs []map[string]uint
78
79 metaData IndexMetadata
80 repoMetaData []Repository
81
82 subRepos []uint32
83 subRepoPaths [][]string
84
85 // Checksums for all the files, at 8-byte intervals
86 checksums []byte
87
88 // languages for all the files.
89 languages []byte
90
91 // inverse of LanguageMap in metaData
92 languageMap map[uint16]string
93
94 repoListEntry []RepoListEntry
95
96 // repository indexes for all the files
97 repos []uint16
98
99 // rawConfigMasks contains the encoded RawConfig for each repository
100 rawConfigMasks []uint8
101}
102
103type symbolData struct {
104 // symContent stores Symbol.Sym and Symbol.Parent.
105 // TODO we don't need to store Symbol.Sym.
106 symContent []byte
107 symIndex []byte
108 // symKindContent is an enum of sym.Kind and sym.ParentKind
109 symKindContent []byte
110 symKindIndex []uint32
111 // symMetadata is [4]uint32 0 Kind Parent ParentKind
112 symMetaData []byte
113}
114
115func uint32SliceAt(a []byte, n uint32) uint32 {
116 return binary.BigEndian.Uint32(a[n*4:])
117}
118
119func uint32SliceLen(a []byte) uint32 {
120 return uint32(len(a) / 4)
121}
122
123// parent returns index i of the parent enum
124func (d *symbolData) parent(i uint32) []byte {
125 delta := uint32SliceAt(d.symIndex, 0)
126 start := uint32SliceAt(d.symIndex, i) - delta
127 var end uint32
128 if i+1 == uint32SliceLen(d.symIndex) {
129 end = uint32(len(d.symContent))
130 } else {
131 end = uint32SliceAt(d.symIndex, i+1) - delta
132 }
133 return d.symContent[start:end]
134}
135
136// kind returns index i of the kind enum
137func (d *symbolData) kind(i uint32) []byte {
138 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
139}
140
141// data returns the symbol at index i
142func (d *symbolData) data(i uint32) *Symbol {
143 size := uint32(4 * 4) // 4 uint32s
144 offset := i * size
145 if offset >= uint32(len(d.symMetaData)) {
146 return nil
147 }
148
149 metadata := d.symMetaData[offset : offset+size]
150 sym := &Symbol{}
151 key := uint32SliceAt(metadata, 1)
152 sym.Kind = string(d.kind(key))
153 key = uint32SliceAt(metadata, 2)
154 sym.Parent = string(d.parent(key))
155 key = uint32SliceAt(metadata, 3)
156 sym.ParentKind = string(d.kind(key))
157 return sym
158}
159
160func (d *indexData) getChecksum(idx uint32) []byte {
161 start := crc64.Size * idx
162 return d.checksums[start : start+crc64.Size]
163}
164
165func (d *indexData) getLanguage(idx uint32) uint16 {
166 if d.metaData.IndexFeatureVersion < 12 {
167 // older zoekt files had 8-bit language entries
168 return uint16(d.languages[idx])
169 }
170 // newer zoekt files have 16-bit language entries
171 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
172}
173
174// calculates stats for files in the range [start, end).
175func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
176 if start >= end {
177 // An empty shard for an empty repository.
178 return RepoStats{
179 Shards: 1,
180 }
181 }
182
183 bytesContent := d.boundaries[end] - d.boundaries[start]
184 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
185 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
186
187 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
188 // that we don't change anything, except we have
189 // []Repository. Alternatively, things we can divide up we do (like
190 // here). Right now I don't like that these numbers are not true, especially
191 // after aggregation. For now I will move forward with this until we can
192 // chat more.
193 return RepoStats{
194 ContentBytes: int64(bytesContent) + int64(bytesFN),
195 Documents: int(end - start),
196 // CR keegan for stefan: our shard count is going to go out of whack,
197 // since we will aggregate these. So we will report more shards than are
198 // present on disk. What should we do?
199 Shards: 1,
200
201 // Sourcegraph specific
202 NewLinesCount: count,
203 DefaultBranchNewLinesCount: defaultCount,
204 OtherBranchesNewLinesCount: otherCount,
205 }
206}
207
208func (d *indexData) calculateStats() error {
209 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
210 var start, end uint32
211
212 for repoID, md := range d.repoMetaData {
213 // determine the file range for repo i
214 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
215 end++
216 }
217 if start < end && d.repos[start] != uint16(repoID) {
218 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
219 }
220
221 d.repoListEntry = append(d.repoListEntry, RepoListEntry{
222 Repository: md,
223 IndexMetadata: d.metaData,
224 Stats: d.calculateStatsForFileRange(start, end),
225 })
226 start = end
227 }
228
229 // All repos in a compound shard share memoryUse. So we average out the
230 // memoryUse per shard in our reporting. This has the benefit that when you
231 // aggregate the IndexBytes you get back the actual memoryUse.
232 //
233 // TODO take into account tombstones for aggregation. Even better, adjust
234 // API to be shard centric not repo centric.
235 if len(d.repoListEntry) > 0 {
236 indexBytes := d.memoryUse()
237 indexBytesChunk := indexBytes / len(d.repoListEntry)
238 for i := range d.repoListEntry {
239 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
240 indexBytes -= indexBytesChunk
241 }
242 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
243 }
244
245 return nil
246}
247
248// calculateNewLinesStats computes some Sourcegraph specific statistics for files
249// in the range [start, end). These are not as efficient to calculate as the
250// normal statistics. We experimentally measured about a 10% slower shard load
251// time. However, we find these values very useful to track and computing them
252// outside of load time introduces a lot of complexity.
253func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
254 for i := start; i < end; i++ {
255 // branchMask is a bitmask of the branches for a document. Zoekt by
256 // convention represents the default branch as the lowest bit.
257 branchMask := d.fileBranchMasks[i]
258 isDefault := (branchMask & 1) == 1
259 others := uint64(bits.OnesCount64(branchMask >> 1))
260
261 // this is readNewlines but only reading the size of each section which
262 // corresponds to the number of newlines.
263 sec := simpleSection{
264 off: d.newlinesStart + d.newlinesIndex[i],
265 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
266 }
267 // We are only reading the first varint which is the size. So we don't
268 // need to read more than MaxVarintLen64 bytes.
269 if sec.sz > binary.MaxVarintLen64 {
270 sec.sz = binary.MaxVarintLen64
271 }
272 blob, err := d.readSectionBlob(sec)
273 if err != nil {
274 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
275 continue
276 }
277 sz, _ := binary.Uvarint(blob)
278
279 count += sz
280 if isDefault {
281 defaultCount += sz
282 }
283 otherCount += (others * sz)
284 }
285
286 return
287}
288
289func (d *indexData) String() string {
290 return fmt.Sprintf("shard(%s)", d.file.Name())
291}
292
293// calculates an approximate size of indexData in memory in bytes.
294func (d *indexData) memoryUse() int {
295 sz := 0
296 for _, a := range [][]uint32{
297 d.newlinesIndex, d.docSectionsIndex,
298 d.boundaries, d.fileNameIndex,
299 d.fileEndRunes, d.fileNameEndRunes,
300 d.fileEndSymbol, d.symbols.symKindIndex,
301 d.subRepos,
302 } {
303 sz += 4 * len(a)
304 }
305 sz += d.runeOffsets.sizeBytes()
306 sz += d.fileNameRuneOffsets.sizeBytes()
307 sz += len(d.languages)
308 sz += len(d.checksums)
309 sz += 2 * len(d.repos)
310 sz += 8 * len(d.runeDocSections)
311 sz += 8 * len(d.fileBranchMasks)
312 sz += d.contentNgrams.SizeBytes()
313 sz += d.fileNameNgrams.SizeBytes()
314 return sz
315}
316
317// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
318// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
319// overlapping trigrams to keep their intersection as small as possible.
320//
321// Invariant: first will always have a smaller index than last.
322func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
323 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
324
325 // If the trigrams are overlapping, then try to shift one to reduce overlap.
326 // This is guaranteed to produce a smaller intersection.
327 if last.index-first.index < ngramSize {
328 newFirstIndex := max(last.index-ngramSize, 0)
329 if newFirstIndex != first.index {
330 first = ngramOffs[indexMap[newFirstIndex]]
331 }
332
333 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
334 if newLastIndex != last.index {
335 last = ngramOffs[indexMap[newLastIndex]]
336 }
337 }
338 return
339}
340
341const maxUInt32 = 0xffffffff
342
343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
344 // Find the two lowest frequency ngrams.
345 idx0, idx1 := 0, 0
346 min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
347 for i, x := range frequencies {
348 if x <= min0 {
349 idx0, idx1 = i, idx0
350 min0, min1 = x, min0
351 } else if x <= min1 {
352 idx1 = i
353 min1 = x
354 }
355 }
356
357 first = ngramOffs[idx0]
358 last = ngramOffs[idx1]
359
360 // Ensure first appears before last as a helpful invariant.
361 if first.index > last.index {
362 last, first = first, last
363 }
364 return
365}
366
367func (data *indexData) ngrams(filename bool) btreeIndex {
368 if filename {
369 return data.fileNameNgrams
370 }
371 return data.contentNgrams
372}
373
374type ngramIterationResults struct {
375 matchIterator
376
377 caseSensitive bool
378 fileName bool
379 substrBytes []byte
380 substrLowered []byte
381}
382
383func (r *ngramIterationResults) String() string {
384 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
385}
386
387func (r *ngramIterationResults) candidates() []*candidateMatch {
388 cs := r.matchIterator.candidates()
389 for _, c := range cs {
390 c.caseSensitive = r.caseSensitive
391 c.fileName = r.fileName
392 c.substrBytes = r.substrBytes
393 c.substrLowered = r.substrLowered
394 }
395 return cs
396}
397
398func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
399 str := query.Pattern
400
401 // Find the 2 least common ngrams from the string.
402 ngramOffs := splitNGrams([]byte(str))
403
404 // protect against accidental searching of empty strings
405 if len(ngramOffs) == 0 {
406 return nil, errors.New("iterateNgrams needs non empty string")
407 }
408
409 // PERF: Sort to increase the chances adjacent checks are in the same btree
410 // bucket (which can cause disk IO).
411 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
412 frequencies := make([]uint32, 0, len(ngramOffs))
413 indexMap := make([]int, len(ngramOffs))
414 ngramLookups := 0
415 ngrams := d.ngrams(query.FileName)
416 for i, o := range ngramOffs {
417 var freq uint32
418 if query.CaseSensitive {
419 freq = ngrams.Get(o.ngram).sz
420 ngramLookups++
421 } else {
422 for _, v := range generateCaseNgrams(o.ngram) {
423 freq += ngrams.Get(v).sz
424 ngramLookups++
425 }
426 }
427
428 if freq == 0 {
429 return &ngramIterationResults{
430 matchIterator: &noMatchTree{
431 Why: "freq=0",
432 Stats: Stats{
433 NgramLookups: ngramLookups,
434 },
435 },
436 }, nil
437 }
438
439 frequencies = append(frequencies, freq)
440 indexMap[o.index] = i
441 }
442
443 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
444
445 iter := &ngramDocIterator{
446 leftPad: uint32(first.index),
447 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
448 ngramLookups: ngramLookups,
449 }
450 if query.FileName {
451 iter.ends = d.fileNameEndRunes
452 } else {
453 iter.ends = d.fileEndRunes
454 }
455
456 if first != last {
457 runeDist := uint32(last.index - first.index)
458 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
459 if err != nil {
460 return nil, err
461 }
462
463 iter.iter = i
464 } else {
465 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
466 if err != nil {
467 return nil, err
468 }
469 iter.iter = hitIter
470 }
471
472 patBytes := []byte(query.Pattern)
473 lowerPatBytes := toLower(patBytes)
474
475 return &ngramIterationResults{
476 matchIterator: iter,
477 caseSensitive: query.CaseSensitive,
478 fileName: query.FileName,
479 substrBytes: patBytes,
480 substrLowered: lowerPatBytes,
481 }, nil
482}
483
484func (d *indexData) fileName(i uint32) []byte {
485 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
486}
487
488func (d *indexData) numDocs() uint32 {
489 return uint32(len(d.fileBranchMasks))
490}
491
492func (s *indexData) Close() {
493 s.file.Close()
494}
495
496const (
497 rawConfigYes = 1
498 rawConfigNo = 2
499)
500
501// encodeRawConfig encodes a rawConfig map into a uint8 mask.
502func encodeRawConfig(rawConfig map[string]string) uint8 {
503 var encoded uint8
504 for i, f := range []string{"public", "fork", "archived"} {
505 var e uint8
506 v, ok := rawConfig[f]
507 if ok && v == "1" {
508 e |= rawConfigYes
509 } else {
510 e |= rawConfigNo
511 }
512 encoded = encoded | e<<(2*i)
513 }
514 return encoded
515}