fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "errors"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "math/bits"
24 "unicode/utf8"
25
26 "github.com/sourcegraph/zoekt/query"
27 "golang.org/x/exp/slices"
28)
29
30// indexData holds the pattern-independent data that we have to have
31// in memory to search. Most of the memory is taken up by the ngram =>
32// offset index.
33type indexData struct {
34 symbols symbolData
35
36 file IndexFile
37
38 ngrams btreeIndex
39
40 newlinesStart uint32
41 newlinesIndex []uint32
42
43 docSectionsStart uint32
44 docSectionsIndex []uint32
45
46 runeDocSections []DocumentSection
47 runeDocSectionsRaw []byte
48
49 // rune offset=>byte offset mapping, relative to the start of the content corpus
50 runeOffsets runeOffsetMap
51
52 // offsets of file contents; includes end of last file
53 boundariesStart uint32
54 boundaries []uint32
55
56 // rune offsets for the file content boundaries
57 fileEndRunes []uint32
58
59 fileNameContent []byte
60 fileNameIndex []uint32
61 fileNameNgrams btreeIndex
62
63 // fileEndSymbol[i] is the index of the first symbol for document i.
64 fileEndSymbol []uint32
65
66 // rune offset=>byte offset mapping, relative to the start of the filename corpus
67 fileNameRuneOffsets runeOffsetMap
68
69 // rune offsets for the file name boundaries
70 fileNameEndRunes []uint32
71
72 fileBranchMasks []uint64
73
74 // mask (power of 2) => name
75 branchNames []map[uint]string
76
77 // name => mask (power of 2)
78 branchIDs []map[string]uint
79
80 metaData IndexMetadata
81 repoMetaData []Repository
82
83 subRepos []uint32
84 subRepoPaths [][]string
85
86 // Checksums for all the files, at 8-byte intervals
87 checksums []byte
88
89 // languages for all the files.
90 languages []byte
91
92 // inverse of LanguageMap in metaData
93 languageMap map[uint16]string
94
95 repoListEntry []RepoListEntry
96
97 // repository indexes for all the files
98 repos []uint16
99
100 // Experimental: docID => rank vec
101 ranks [][]float64
102
103 // rawConfigMasks contains the encoded RawConfig for each repository
104 rawConfigMasks []uint8
105}
106
107type symbolData struct {
108 // symContent stores Symbol.Sym and Symbol.Parent.
109 // TODO we don't need to store Symbol.Sym.
110 symContent []byte
111 symIndex []byte
112 // symKindContent is an enum of sym.Kind and sym.ParentKind
113 symKindContent []byte
114 symKindIndex []uint32
115 // symMetadata is [4]uint32 0 Kind Parent ParentKind
116 symMetaData []byte
117}
118
119func uint32SliceAt(a []byte, n uint32) uint32 {
120 return binary.BigEndian.Uint32(a[n*4:])
121}
122
123func uint32SliceLen(a []byte) uint32 {
124 return uint32(len(a) / 4)
125}
126
127// parent returns index i of the parent enum
128func (d *symbolData) parent(i uint32) []byte {
129 delta := uint32SliceAt(d.symIndex, 0)
130 start := uint32SliceAt(d.symIndex, i) - delta
131 var end uint32
132 if i+1 == uint32SliceLen(d.symIndex) {
133 end = uint32(len(d.symContent))
134 } else {
135 end = uint32SliceAt(d.symIndex, i+1) - delta
136 }
137 return d.symContent[start:end]
138}
139
140// kind returns index i of the kind enum
141func (d *symbolData) kind(i uint32) []byte {
142 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
143}
144
145// data returns the symbol at index i
146func (d *symbolData) data(i uint32) *Symbol {
147 size := uint32(4 * 4) // 4 uint32s
148 offset := i * size
149 if offset >= uint32(len(d.symMetaData)) {
150 return nil
151 }
152
153 metadata := d.symMetaData[offset : offset+size]
154 sym := &Symbol{}
155 key := uint32SliceAt(metadata, 1)
156 sym.Kind = string(d.kind(key))
157 key = uint32SliceAt(metadata, 2)
158 sym.Parent = string(d.parent(key))
159 key = uint32SliceAt(metadata, 3)
160 sym.ParentKind = string(d.kind(key))
161 return sym
162}
163
164func (d *indexData) getChecksum(idx uint32) []byte {
165 start := crc64.Size * idx
166 return d.checksums[start : start+crc64.Size]
167}
168
169func (d *indexData) getLanguage(idx uint32) uint16 {
170 if d.metaData.IndexFeatureVersion < 12 {
171 // older zoekt files had 8-bit language entries
172 return uint16(d.languages[idx])
173 }
174 // newer zoekt files have 16-bit language entries
175 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
176}
177
178// calculates stats for files in the range [start, end).
179func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
180 if start >= end {
181 // An empty shard for an empty repository.
182 return RepoStats{
183 Shards: 1,
184 }
185 }
186
187 bytesContent := d.boundaries[end] - d.boundaries[start]
188 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
189 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
190
191 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
192 // that we don't change anything, except we have
193 // []Repository. Alternatively, things we can divide up we do (like
194 // here). Right now I don't like that these numbers are not true, especially
195 // after aggregation. For now I will move forward with this until we can
196 // chat more.
197 return RepoStats{
198 ContentBytes: int64(bytesContent) + int64(bytesFN),
199 Documents: int(end - start),
200 // CR keegan for stefan: our shard count is going to go out of whack,
201 // since we will aggregate these. So we will report more shards than are
202 // present on disk. What should we do?
203 Shards: 1,
204
205 // Sourcegraph specific
206 NewLinesCount: count,
207 DefaultBranchNewLinesCount: defaultCount,
208 OtherBranchesNewLinesCount: otherCount,
209 }
210}
211
212func (d *indexData) calculateStats() error {
213 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
214 var start, end uint32
215
216 for repoID, md := range d.repoMetaData {
217 // determine the file range for repo i
218 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
219 end++
220 }
221 if start < end && d.repos[start] != uint16(repoID) {
222 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
223 }
224
225 d.repoListEntry = append(d.repoListEntry, RepoListEntry{
226 Repository: md,
227 IndexMetadata: d.metaData,
228 Stats: d.calculateStatsForFileRange(start, end),
229 })
230 start = end
231 }
232
233 // All repos in a compound shard share memoryUse. So we average out the
234 // memoryUse per shard in our reporting. This has the benefit that when you
235 // aggregate the IndexBytes you get back the actual memoryUse.
236 //
237 // TODO take into account tombstones for aggregation. Even better, adjust
238 // API to be shard centric not repo centric.
239 if len(d.repoListEntry) > 0 {
240 indexBytes := d.memoryUse()
241 indexBytesChunk := indexBytes / len(d.repoListEntry)
242 for i := range d.repoListEntry {
243 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
244 indexBytes -= indexBytesChunk
245 }
246 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
247 }
248
249 return nil
250}
251
252// calculateNewLinesStats computes some Sourcegraph specific statistics for files
253// in the range [start, end). These are not as efficient to calculate as the
254// normal statistics. We experimentally measured about a 10% slower shard load
255// time. However, we find these values very useful to track and computing them
256// outside of load time introduces a lot of complexity.
257func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
258 for i := start; i < end; i++ {
259 // branchMask is a bitmask of the branches for a document. Zoekt by
260 // convention represents the default branch as the lowest bit.
261 branchMask := d.fileBranchMasks[i]
262 isDefault := (branchMask & 1) == 1
263 others := uint64(bits.OnesCount64(branchMask >> 1))
264
265 // this is readNewlines but only reading the size of each section which
266 // corresponds to the number of newlines.
267 sec := simpleSection{
268 off: d.newlinesStart + d.newlinesIndex[i],
269 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
270 }
271 // We are only reading the first varint which is the size. So we don't
272 // need to read more than MaxVarintLen64 bytes.
273 if sec.sz > binary.MaxVarintLen64 {
274 sec.sz = binary.MaxVarintLen64
275 }
276 blob, err := d.readSectionBlob(sec)
277 if err != nil {
278 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
279 continue
280 }
281 sz, _ := binary.Uvarint(blob)
282
283 count += sz
284 if isDefault {
285 defaultCount += sz
286 }
287 otherCount += (others * sz)
288 }
289
290 return
291}
292
293func (d *indexData) String() string {
294 return fmt.Sprintf("shard(%s)", d.file.Name())
295}
296
297// calculates an approximate size of indexData in memory in bytes.
298func (d *indexData) memoryUse() int {
299 sz := 0
300 for _, a := range [][]uint32{
301 d.newlinesIndex, d.docSectionsIndex,
302 d.boundaries, d.fileNameIndex,
303 d.fileEndRunes, d.fileNameEndRunes,
304 d.fileEndSymbol, d.symbols.symKindIndex,
305 d.subRepos,
306 } {
307 sz += 4 * len(a)
308 }
309 sz += d.runeOffsets.sizeBytes()
310 sz += d.fileNameRuneOffsets.sizeBytes()
311 sz += len(d.languages)
312 sz += len(d.checksums)
313 sz += 2 * len(d.repos)
314 if len(d.ranks) > 0 {
315 sz += 8 * len(d.ranks) * len(d.ranks[0])
316 }
317 sz += 8 * len(d.runeDocSections)
318 sz += 8 * len(d.fileBranchMasks)
319 sz += d.ngrams.SizeBytes()
320 sz += d.fileNameNgrams.SizeBytes()
321 return sz
322}
323
324const maxUInt32 = 0xffffffff
325
326func min2Index(xs []uint32) (idx0, idx1 int) {
327 min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
328 for i, x := range xs {
329 if x <= min0 {
330 idx0, idx1 = i, idx0
331 min0, min1 = x, min0
332 } else if x <= min1 {
333 idx1 = i
334 min1 = x
335 }
336 }
337 return
338}
339
340// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
341// the distance iterator. If they have the same frequency, we maximise the
342// distance between them. first will always have a smaller index than last.
343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
344 firstI, lastI := min2Index(frequencies)
345 // If the frequencies are equal lets maximise distance in the query
346 // string. This optimization normally triggers for long repeated trigrams
347 // in a string, eg a query like "AAAAA..."
348 if frequencies[firstI] == frequencies[lastI] {
349 for i, freq := range frequencies {
350 if freq != frequencies[firstI] {
351 continue
352 }
353 if ngramOffs[i].index < ngramOffs[firstI].index {
354 firstI = i
355 }
356 if ngramOffs[i].index > ngramOffs[lastI].index {
357 lastI = i
358 }
359 }
360 }
361 first = ngramOffs[firstI]
362 last = ngramOffs[lastI]
363 // Ensure first appears before last to make distance logic below clean.
364 if first.index > last.index {
365 last, first = first, last
366 }
367 return first, last
368}
369
370func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
371 if filename {
372 return data.fileNameNgrams.Get(ng).sz
373 }
374 return data.ngrams.Get(ng).sz
375}
376
377type ngramIterationResults struct {
378 matchIterator
379
380 caseSensitive bool
381 fileName bool
382 substrBytes []byte
383 substrLowered []byte
384}
385
386func (r *ngramIterationResults) String() string {
387 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
388}
389
390func (r *ngramIterationResults) candidates() []*candidateMatch {
391 cs := r.matchIterator.candidates()
392 for _, c := range cs {
393 c.caseSensitive = r.caseSensitive
394 c.fileName = r.fileName
395 c.substrBytes = r.substrBytes
396 c.substrLowered = r.substrLowered
397 }
398 return cs
399}
400
401func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
402 str := query.Pattern
403
404 // Find the 2 least common ngrams from the string.
405 ngramOffs := splitNGrams([]byte(query.Pattern))
406
407 // protect against accidental searching of empty strings
408 if len(ngramOffs) == 0 {
409 return nil, errors.New("iterateNgrams needs non empty string")
410 }
411
412 // PERF: Sort to increase the chances adjacent checks are in the same btree
413 // bucket (which can cause disk IO).
414 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
415 return a.ngram < b.ngram
416 })
417 frequencies := make([]uint32, 0, len(ngramOffs))
418 ngramLookups := 0
419 for _, o := range ngramOffs {
420 var freq uint32
421 if query.CaseSensitive {
422 freq = d.ngramFrequency(o.ngram, query.FileName)
423 ngramLookups++
424 } else {
425 for _, v := range generateCaseNgrams(o.ngram) {
426 freq += d.ngramFrequency(v, query.FileName)
427 ngramLookups++
428 }
429 }
430
431 if freq == 0 {
432 return &ngramIterationResults{
433 matchIterator: &noMatchTree{
434 Why: "freq=0",
435 Stats: Stats{
436 NgramLookups: ngramLookups,
437 },
438 },
439 }, nil
440 }
441
442 frequencies = append(frequencies, freq)
443 }
444
445 // first and last are now the smallest trigram posting lists to iterate
446 // through.
447 first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
448
449 iter := &ngramDocIterator{
450 leftPad: first.index,
451 rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
452 ngramLookups: ngramLookups,
453 }
454 if query.FileName {
455 iter.ends = d.fileNameEndRunes
456 } else {
457 iter.ends = d.fileEndRunes
458 }
459
460 if first != last {
461 runeDist := last.index - first.index
462 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
463 if err != nil {
464 return nil, err
465 }
466
467 iter.iter = i
468 } else {
469 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
470 if err != nil {
471 return nil, err
472 }
473 iter.iter = hitIter
474 }
475
476 patBytes := []byte(query.Pattern)
477 lowerPatBytes := toLower(patBytes)
478
479 return &ngramIterationResults{
480 matchIterator: iter,
481 caseSensitive: query.CaseSensitive,
482 fileName: query.FileName,
483 substrBytes: patBytes,
484 substrLowered: lowerPatBytes,
485 }, nil
486}
487
488func (d *indexData) fileName(i uint32) []byte {
489 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
490}
491
492func (d *indexData) numDocs() uint32 {
493 return uint32(len(d.fileBranchMasks))
494}
495
496func (s *indexData) Close() {
497 s.file.Close()
498}
499
500const (
501 rawConfigYes = 1
502 rawConfigNo = 2
503)
504
505// encodeRawConfig encodes a rawConfig map into a uint8 mask.
506func encodeRawConfig(rawConfig map[string]string) uint8 {
507 var encoded uint8
508 for i, f := range []string{"public", "fork", "archived"} {
509 var e uint8
510 v, ok := rawConfig[f]
511 if ok && v == "1" {
512 e |= rawConfigYes
513 } else {
514 e |= rawConfigNo
515 }
516 encoded = encoded | e<<(2*i)
517 }
518 return encoded
519}