fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math/bits"
24 "os"
25 "slices"
26 "strconv"
27 "unicode/utf8"
28
29 "github.com/sourcegraph/zoekt/query"
30)
31
32// indexData holds the pattern-independent data that we have to have
33// in memory to search. Most of the memory is taken up by the ngram =>
34// offset index.
35type indexData struct {
36 symbols symbolData
37
38 file IndexFile
39
40 contentNgrams btreeIndex
41
42 newlinesStart uint32
43 newlinesIndex []uint32
44
45 docSectionsStart uint32
46 docSectionsIndex []uint32
47
48 runeDocSections []DocumentSection
49
50 // rune offset=>byte offset mapping, relative to the start of the content corpus
51 runeOffsets runeOffsetMap
52
53 // offsets of file contents; includes end of last file
54 boundariesStart uint32
55 boundaries []uint32
56
57 // rune offsets for the file content boundaries
58 fileEndRunes []uint32
59
60 fileNameContent []byte
61 fileNameIndex []uint32
62 fileNameNgrams btreeIndex
63
64 // fileEndSymbol[i] is the index of the first symbol for document i.
65 fileEndSymbol []uint32
66
67 // rune offset=>byte offset mapping, relative to the start of the filename corpus
68 fileNameRuneOffsets runeOffsetMap
69
70 // rune offsets for the file name boundaries
71 fileNameEndRunes []uint32
72
73 fileBranchMasks []uint64
74
75 // mask (power of 2) => name
76 branchNames []map[uint]string
77
78 // name => mask (power of 2)
79 branchIDs []map[string]uint
80
81 metaData IndexMetadata
82 repoMetaData []Repository
83
84 subRepos []uint32
85 subRepoPaths [][]string
86
87 // Checksums for all the files, at 8-byte intervals
88 checksums []byte
89
90 // languages for all the files.
91 languages []byte
92
93 // inverse of LanguageMap in metaData
94 languageMap map[uint16]string
95
96 repoListEntry []RepoListEntry
97
98 // repository indexes for all the files
99 repos []uint16
100
101 // Experimental: docID => rank vec
102 ranks [][]float64
103
104 // rawConfigMasks contains the encoded RawConfig for each repository
105 rawConfigMasks []uint8
106}
107
108type symbolData struct {
109 // symContent stores Symbol.Sym and Symbol.Parent.
110 // TODO we don't need to store Symbol.Sym.
111 symContent []byte
112 symIndex []byte
113 // symKindContent is an enum of sym.Kind and sym.ParentKind
114 symKindContent []byte
115 symKindIndex []uint32
116 // symMetadata is [4]uint32 0 Kind Parent ParentKind
117 symMetaData []byte
118}
119
120func uint32SliceAt(a []byte, n uint32) uint32 {
121 return binary.BigEndian.Uint32(a[n*4:])
122}
123
124func uint32SliceLen(a []byte) uint32 {
125 return uint32(len(a) / 4)
126}
127
128// parent returns index i of the parent enum
129func (d *symbolData) parent(i uint32) []byte {
130 delta := uint32SliceAt(d.symIndex, 0)
131 start := uint32SliceAt(d.symIndex, i) - delta
132 var end uint32
133 if i+1 == uint32SliceLen(d.symIndex) {
134 end = uint32(len(d.symContent))
135 } else {
136 end = uint32SliceAt(d.symIndex, i+1) - delta
137 }
138 return d.symContent[start:end]
139}
140
141// kind returns index i of the kind enum
142func (d *symbolData) kind(i uint32) []byte {
143 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
144}
145
146// data returns the symbol at index i
147func (d *symbolData) data(i uint32) *Symbol {
148 size := uint32(4 * 4) // 4 uint32s
149 offset := i * size
150 if offset >= uint32(len(d.symMetaData)) {
151 return nil
152 }
153
154 metadata := d.symMetaData[offset : offset+size]
155 sym := &Symbol{}
156 key := uint32SliceAt(metadata, 1)
157 sym.Kind = string(d.kind(key))
158 key = uint32SliceAt(metadata, 2)
159 sym.Parent = string(d.parent(key))
160 key = uint32SliceAt(metadata, 3)
161 sym.ParentKind = string(d.kind(key))
162 return sym
163}
164
165func (d *indexData) getChecksum(idx uint32) []byte {
166 start := crc64.Size * idx
167 return d.checksums[start : start+crc64.Size]
168}
169
170func (d *indexData) getLanguage(idx uint32) uint16 {
171 if d.metaData.IndexFeatureVersion < 12 {
172 // older zoekt files had 8-bit language entries
173 return uint16(d.languages[idx])
174 }
175 // newer zoekt files have 16-bit language entries
176 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
177}
178
179// calculates stats for files in the range [start, end).
180func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
181 if start >= end {
182 // An empty shard for an empty repository.
183 return RepoStats{
184 Shards: 1,
185 }
186 }
187
188 bytesContent := d.boundaries[end] - d.boundaries[start]
189 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
190 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
191
192 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
193 // that we don't change anything, except we have
194 // []Repository. Alternatively, things we can divide up we do (like
195 // here). Right now I don't like that these numbers are not true, especially
196 // after aggregation. For now I will move forward with this until we can
197 // chat more.
198 return RepoStats{
199 ContentBytes: int64(bytesContent) + int64(bytesFN),
200 Documents: int(end - start),
201 // CR keegan for stefan: our shard count is going to go out of whack,
202 // since we will aggregate these. So we will report more shards than are
203 // present on disk. What should we do?
204 Shards: 1,
205
206 // Sourcegraph specific
207 NewLinesCount: count,
208 DefaultBranchNewLinesCount: defaultCount,
209 OtherBranchesNewLinesCount: otherCount,
210 }
211}
212
213func (d *indexData) calculateStats() error {
214 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
215 var start, end uint32
216
217 for repoID, md := range d.repoMetaData {
218 // determine the file range for repo i
219 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
220 end++
221 }
222 if start < end && d.repos[start] != uint16(repoID) {
223 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
224 }
225
226 d.repoListEntry = append(d.repoListEntry, RepoListEntry{
227 Repository: md,
228 IndexMetadata: d.metaData,
229 Stats: d.calculateStatsForFileRange(start, end),
230 })
231 start = end
232 }
233
234 // All repos in a compound shard share memoryUse. So we average out the
235 // memoryUse per shard in our reporting. This has the benefit that when you
236 // aggregate the IndexBytes you get back the actual memoryUse.
237 //
238 // TODO take into account tombstones for aggregation. Even better, adjust
239 // API to be shard centric not repo centric.
240 if len(d.repoListEntry) > 0 {
241 indexBytes := d.memoryUse()
242 indexBytesChunk := indexBytes / len(d.repoListEntry)
243 for i := range d.repoListEntry {
244 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
245 indexBytes -= indexBytesChunk
246 }
247 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
248 }
249
250 return nil
251}
252
253// calculateNewLinesStats computes some Sourcegraph specific statistics for files
254// in the range [start, end). These are not as efficient to calculate as the
255// normal statistics. We experimentally measured about a 10% slower shard load
256// time. However, we find these values very useful to track and computing them
257// outside of load time introduces a lot of complexity.
258func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
259 for i := start; i < end; i++ {
260 // branchMask is a bitmask of the branches for a document. Zoekt by
261 // convention represents the default branch as the lowest bit.
262 branchMask := d.fileBranchMasks[i]
263 isDefault := (branchMask & 1) == 1
264 others := uint64(bits.OnesCount64(branchMask >> 1))
265
266 // this is readNewlines but only reading the size of each section which
267 // corresponds to the number of newlines.
268 sec := simpleSection{
269 off: d.newlinesStart + d.newlinesIndex[i],
270 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
271 }
272 // We are only reading the first varint which is the size. So we don't
273 // need to read more than MaxVarintLen64 bytes.
274 if sec.sz > binary.MaxVarintLen64 {
275 sec.sz = binary.MaxVarintLen64
276 }
277 blob, err := d.readSectionBlob(sec)
278 if err != nil {
279 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
280 continue
281 }
282 sz, _ := binary.Uvarint(blob)
283
284 count += sz
285 if isDefault {
286 defaultCount += sz
287 }
288 otherCount += (others * sz)
289 }
290
291 return
292}
293
294func (d *indexData) String() string {
295 return fmt.Sprintf("shard(%s)", d.file.Name())
296}
297
298// calculates an approximate size of indexData in memory in bytes.
299func (d *indexData) memoryUse() int {
300 sz := 0
301 for _, a := range [][]uint32{
302 d.newlinesIndex, d.docSectionsIndex,
303 d.boundaries, d.fileNameIndex,
304 d.fileEndRunes, d.fileNameEndRunes,
305 d.fileEndSymbol, d.symbols.symKindIndex,
306 d.subRepos,
307 } {
308 sz += 4 * len(a)
309 }
310 sz += d.runeOffsets.sizeBytes()
311 sz += d.fileNameRuneOffsets.sizeBytes()
312 sz += len(d.languages)
313 sz += len(d.checksums)
314 sz += 2 * len(d.repos)
315 if len(d.ranks) > 0 {
316 sz += 8 * len(d.ranks) * len(d.ranks[0])
317 }
318 sz += 8 * len(d.runeDocSections)
319 sz += 8 * len(d.fileBranchMasks)
320 sz += d.contentNgrams.SizeBytes()
321 sz += d.fileNameNgrams.SizeBytes()
322 return sz
323}
324
325// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
326// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
327// overlapping trigrams to keep their intersection as small as possible.
328//
329// Invariant: first will always have a smaller index than last.
330func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
331 first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
332
333 // If the trigrams are overlapping, then try to shift one to reduce overlap.
334 // This is guaranteed to produce a smaller intersection.
335 if last.index-first.index < ngramSize {
336 newFirstIndex := max(last.index-ngramSize, 0)
337 if newFirstIndex != first.index {
338 first = ngramOffs[indexMap[newFirstIndex]]
339 }
340
341 newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
342 if newLastIndex != last.index {
343 last = ngramOffs[indexMap[newLastIndex]]
344 }
345 }
346 return
347}
348
349const maxUInt32 = 0xffffffff
350
351func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
352 // Find the two lowest frequency ngrams.
353 idx0, idx1 := 0, 0
354 min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
355 for i, x := range frequencies {
356 if x <= min0 {
357 idx0, idx1 = i, idx0
358 min0, min1 = x, min0
359 } else if x <= min1 {
360 idx1 = i
361 min1 = x
362 }
363 }
364
365 first = ngramOffs[idx0]
366 last = ngramOffs[idx1]
367
368 // Ensure first appears before last as a helpful invariant.
369 if first.index > last.index {
370 last, first = first, last
371 }
372 return
373}
374
375func (data *indexData) ngrams(filename bool) btreeIndex {
376 if filename {
377 return data.fileNameNgrams
378 }
379 return data.contentNgrams
380}
381
382type ngramIterationResults struct {
383 matchIterator
384
385 caseSensitive bool
386 fileName bool
387 substrBytes []byte
388 substrLowered []byte
389}
390
391func (r *ngramIterationResults) String() string {
392 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
393}
394
395func (r *ngramIterationResults) candidates() []*candidateMatch {
396 cs := r.matchIterator.candidates()
397 for _, c := range cs {
398 c.caseSensitive = r.caseSensitive
399 c.fileName = r.fileName
400 c.substrBytes = r.substrBytes
401 c.substrLowered = r.substrLowered
402 }
403 return cs
404}
405
406// experimentIterateNgramLookupLimit when non-zero will only lookup this many
407// ngrams from a query string. Note: that if case-insensitive, this only
408// limits the input. So we will still lookup the case folding.
409//
410// This experiment is targetting looking up large snippets. If it is
411// successful, we will likely hardcode the value we use in production.
412//
413// Future note: if we find cases where this works badly, we can consider only
414// searching a random subset of the query string to avoid bad strings.
415var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
416
417func getEnvInt(k string) int {
418 v, _ := strconv.Atoi(os.Getenv(k))
419 if v != 0 {
420 log.Printf("%s = %d\n", k, v)
421 }
422 return v
423}
424
425func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
426 str := query.Pattern
427
428 // Find the 2 least common ngrams from the string.
429 var ngramOffs []runeNgramOff
430 if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
431 // Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
432 // length is asked later on for other optimizations.
433 ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
434 } else {
435 ngramOffs = splitNGrams([]byte(str))
436 }
437
438 // protect against accidental searching of empty strings
439 if len(ngramOffs) == 0 {
440 return nil, errors.New("iterateNgrams needs non empty string")
441 }
442
443 // PERF: Sort to increase the chances adjacent checks are in the same btree
444 // bucket (which can cause disk IO).
445 slices.SortFunc(ngramOffs, runeNgramOff.Compare)
446 frequencies := make([]uint32, 0, len(ngramOffs))
447 indexMap := make([]int, len(ngramOffs))
448 ngramLookups := 0
449 ngrams := d.ngrams(query.FileName)
450 for i, o := range ngramOffs {
451 var freq uint32
452 if query.CaseSensitive {
453 freq = ngrams.Get(o.ngram).sz
454 ngramLookups++
455 } else {
456 for _, v := range generateCaseNgrams(o.ngram) {
457 freq += ngrams.Get(v).sz
458 ngramLookups++
459 }
460 }
461
462 if freq == 0 {
463 return &ngramIterationResults{
464 matchIterator: &noMatchTree{
465 Why: "freq=0",
466 Stats: Stats{
467 NgramLookups: ngramLookups,
468 },
469 },
470 }, nil
471 }
472
473 frequencies = append(frequencies, freq)
474 indexMap[o.index] = i
475 }
476
477 first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
478
479 iter := &ngramDocIterator{
480 leftPad: uint32(first.index),
481 rightPad: uint32(utf8.RuneCountInString(str) - first.index),
482 ngramLookups: ngramLookups,
483 }
484 if query.FileName {
485 iter.ends = d.fileNameEndRunes
486 } else {
487 iter.ends = d.fileEndRunes
488 }
489
490 if first != last {
491 runeDist := uint32(last.index - first.index)
492 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
493 if err != nil {
494 return nil, err
495 }
496
497 iter.iter = i
498 } else {
499 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
500 if err != nil {
501 return nil, err
502 }
503 iter.iter = hitIter
504 }
505
506 patBytes := []byte(query.Pattern)
507 lowerPatBytes := toLower(patBytes)
508
509 return &ngramIterationResults{
510 matchIterator: iter,
511 caseSensitive: query.CaseSensitive,
512 fileName: query.FileName,
513 substrBytes: patBytes,
514 substrLowered: lowerPatBytes,
515 }, nil
516}
517
518func (d *indexData) fileName(i uint32) []byte {
519 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
520}
521
522func (d *indexData) numDocs() uint32 {
523 return uint32(len(d.fileBranchMasks))
524}
525
526func (s *indexData) Close() {
527 s.file.Close()
528}
529
530const (
531 rawConfigYes = 1
532 rawConfigNo = 2
533)
534
535// encodeRawConfig encodes a rawConfig map into a uint8 mask.
536func encodeRawConfig(rawConfig map[string]string) uint8 {
537 var encoded uint8
538 for i, f := range []string{"public", "fork", "archived"} {
539 var e uint8
540 v, ok := rawConfig[f]
541 if ok && v == "1" {
542 e |= rawConfigYes
543 } else {
544 e |= rawConfigNo
545 }
546 encoded = encoded | e<<(2*i)
547 }
548 return encoded
549}