fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "fmt"
20 "hash/crc64"
21 "log"
22 "math/bits"
23 "unicode/utf8"
24
25 "github.com/sourcegraph/zoekt/query"
26 "golang.org/x/exp/slices"
27)
28
29// indexData holds the pattern-independent data that we have to have
30// in memory to search. Most of the memory is taken up by the ngram =>
31// offset index.
32type indexData struct {
33 symbols symbolData
34
35 file IndexFile
36
37 ngrams btreeIndex
38
39 newlinesStart uint32
40 newlinesIndex []uint32
41
42 docSectionsStart uint32
43 docSectionsIndex []uint32
44
45 runeDocSections []DocumentSection
46 runeDocSectionsRaw []byte
47
48 // rune offset=>byte offset mapping, relative to the start of the content corpus
49 runeOffsets runeOffsetMap
50
51 // offsets of file contents; includes end of last file
52 boundariesStart uint32
53 boundaries []uint32
54
55 // rune offsets for the file content boundaries
56 fileEndRunes []uint32
57
58 fileNameContent []byte
59 fileNameIndex []uint32
60 fileNameNgrams btreeIndex
61
62 // fileEndSymbol[i] is the index of the first symbol for document i.
63 fileEndSymbol []uint32
64
65 // rune offset=>byte offset mapping, relative to the start of the filename corpus
66 fileNameRuneOffsets runeOffsetMap
67
68 // rune offsets for the file name boundaries
69 fileNameEndRunes []uint32
70
71 fileBranchMasks []uint64
72
73 // mask (power of 2) => name
74 branchNames []map[uint]string
75
76 // name => mask (power of 2)
77 branchIDs []map[string]uint
78
79 metaData IndexMetadata
80 repoMetaData []Repository
81
82 subRepos []uint32
83 subRepoPaths [][]string
84
85 // Checksums for all the files, at 8-byte intervals
86 checksums []byte
87
88 // languages for all the files.
89 languages []byte
90
91 // inverse of LanguageMap in metaData
92 languageMap map[uint16]string
93
94 repoListEntry []RepoListEntry
95
96 // repository indexes for all the files
97 repos []uint16
98
99 // Experimental: docID => rank vec
100 ranks [][]float64
101
102 // rawConfigMasks contains the encoded RawConfig for each repository
103 rawConfigMasks []uint8
104}
105
106type symbolData struct {
107 // symContent stores Symbol.Sym and Symbol.Parent.
108 // TODO we don't need to store Symbol.Sym.
109 symContent []byte
110 symIndex []byte
111 // symKindContent is an enum of sym.Kind and sym.ParentKind
112 symKindContent []byte
113 symKindIndex []uint32
114 // symMetadata is [4]uint32 0 Kind Parent ParentKind
115 symMetaData []byte
116}
117
118func uint32SliceAt(a []byte, n uint32) uint32 {
119 return binary.BigEndian.Uint32(a[n*4:])
120}
121
122func uint32SliceLen(a []byte) uint32 {
123 return uint32(len(a) / 4)
124}
125
126// parent returns index i of the parent enum
127func (d *symbolData) parent(i uint32) []byte {
128 delta := uint32SliceAt(d.symIndex, 0)
129 start := uint32SliceAt(d.symIndex, i) - delta
130 var end uint32
131 if i+1 == uint32SliceLen(d.symIndex) {
132 end = uint32(len(d.symContent))
133 } else {
134 end = uint32SliceAt(d.symIndex, i+1) - delta
135 }
136 return d.symContent[start:end]
137}
138
139// kind returns index i of the kind enum
140func (d *symbolData) kind(i uint32) []byte {
141 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
142}
143
144// data returns the symbol at index i
145func (d *symbolData) data(i uint32) *Symbol {
146 size := uint32(4 * 4) // 4 uint32s
147 offset := i * size
148 if offset >= uint32(len(d.symMetaData)) {
149 return nil
150 }
151
152 metadata := d.symMetaData[offset : offset+size]
153 sym := &Symbol{}
154 key := uint32SliceAt(metadata, 1)
155 sym.Kind = string(d.kind(key))
156 key = uint32SliceAt(metadata, 2)
157 sym.Parent = string(d.parent(key))
158 key = uint32SliceAt(metadata, 3)
159 sym.ParentKind = string(d.kind(key))
160 return sym
161}
162
163func (d *indexData) getChecksum(idx uint32) []byte {
164 start := crc64.Size * idx
165 return d.checksums[start : start+crc64.Size]
166}
167
168func (d *indexData) getLanguage(idx uint32) uint16 {
169 if d.metaData.IndexFeatureVersion < 12 {
170 // older zoekt files had 8-bit language entries
171 return uint16(d.languages[idx])
172 }
173 // newer zoekt files have 16-bit language entries
174 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
175}
176
177// calculates stats for files in the range [start, end).
178func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
179 if start >= end {
180 // An empty shard for an empty repository.
181 return RepoStats{
182 Shards: 1,
183 }
184 }
185
186 bytesContent := d.boundaries[end] - d.boundaries[start]
187 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
188 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
189
190 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
191 // that we don't change anything, except we have
192 // []Repository. Alternatively, things we can divide up we do (like
193 // here). Right now I don't like that these numbers are not true, especially
194 // after aggregation. For now I will move forward with this until we can
195 // chat more.
196 return RepoStats{
197 ContentBytes: int64(bytesContent) + int64(bytesFN),
198 Documents: int(end - start),
199 // CR keegan for stefan: our shard count is going to go out of whack,
200 // since we will aggregate these. So we will report more shards than are
201 // present on disk. What should we do?
202 Shards: 1,
203
204 // Sourcegraph specific
205 NewLinesCount: count,
206 DefaultBranchNewLinesCount: defaultCount,
207 OtherBranchesNewLinesCount: otherCount,
208 }
209}
210
211func (d *indexData) calculateStats() error {
212 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
213 var start, end uint32
214
215 for repoID, md := range d.repoMetaData {
216 // determine the file range for repo i
217 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
218 end++
219 }
220 if start < end && d.repos[start] != uint16(repoID) {
221 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
222 }
223
224 d.repoListEntry = append(d.repoListEntry, RepoListEntry{
225 Repository: md,
226 IndexMetadata: d.metaData,
227 Stats: d.calculateStatsForFileRange(start, end),
228 })
229 start = end
230 }
231
232 // All repos in a compound shard share memoryUse. So we average out the
233 // memoryUse per shard in our reporting. This has the benefit that when you
234 // aggregate the IndexBytes you get back the actual memoryUse.
235 //
236 // TODO take into account tombstones for aggregation. Even better, adjust
237 // API to be shard centric not repo centric.
238 if len(d.repoListEntry) > 0 {
239 indexBytes := d.memoryUse()
240 indexBytesChunk := indexBytes / len(d.repoListEntry)
241 for i := range d.repoListEntry {
242 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
243 indexBytes -= indexBytesChunk
244 }
245 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
246 }
247
248 return nil
249}
250
251// calculateNewLinesStats computes some Sourcegraph specific statistics for files
252// in the range [start, end). These are not as efficient to calculate as the
253// normal statistics. We experimentally measured about a 10% slower shard load
254// time. However, we find these values very useful to track and computing them
255// outside of load time introduces a lot of complexity.
256func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
257 for i := start; i < end; i++ {
258 // branchMask is a bitmask of the branches for a document. Zoekt by
259 // convention represents the default branch as the lowest bit.
260 branchMask := d.fileBranchMasks[i]
261 isDefault := (branchMask & 1) == 1
262 others := uint64(bits.OnesCount64(branchMask >> 1))
263
264 // this is readNewlines but only reading the size of each section which
265 // corresponds to the number of newlines.
266 sec := simpleSection{
267 off: d.newlinesStart + d.newlinesIndex[i],
268 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
269 }
270 // We are only reading the first varint which is the size. So we don't
271 // need to read more than MaxVarintLen64 bytes.
272 if sec.sz > binary.MaxVarintLen64 {
273 sec.sz = binary.MaxVarintLen64
274 }
275 blob, err := d.readSectionBlob(sec)
276 if err != nil {
277 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
278 continue
279 }
280 sz, _ := binary.Uvarint(blob)
281
282 count += sz
283 if isDefault {
284 defaultCount += sz
285 }
286 otherCount += (others * sz)
287 }
288
289 return
290}
291
292func (d *indexData) String() string {
293 return fmt.Sprintf("shard(%s)", d.file.Name())
294}
295
296// calculates an approximate size of indexData in memory in bytes.
297func (d *indexData) memoryUse() int {
298 sz := 0
299 for _, a := range [][]uint32{
300 d.newlinesIndex, d.docSectionsIndex,
301 d.boundaries, d.fileNameIndex,
302 d.fileEndRunes, d.fileNameEndRunes,
303 d.fileEndSymbol, d.symbols.symKindIndex,
304 d.subRepos,
305 } {
306 sz += 4 * len(a)
307 }
308 sz += d.runeOffsets.sizeBytes()
309 sz += d.fileNameRuneOffsets.sizeBytes()
310 sz += len(d.languages)
311 sz += len(d.checksums)
312 sz += 2 * len(d.repos)
313 if len(d.ranks) > 0 {
314 sz += 8 * len(d.ranks) * len(d.ranks[0])
315 }
316 sz += 8 * len(d.runeDocSections)
317 sz += 8 * len(d.fileBranchMasks)
318 sz += d.ngrams.SizeBytes()
319 sz += d.fileNameNgrams.SizeBytes()
320 return sz
321}
322
323const maxUInt32 = 0xffffffff
324
325func firstMinarg(xs []uint32) uint32 {
326 m := uint32(maxUInt32)
327 j := len(xs)
328 for i, x := range xs {
329 if x < m {
330 m = x
331 j = i
332 }
333 }
334 return uint32(j)
335}
336
337func lastMinarg(xs []uint32) uint32 {
338 m := uint32(maxUInt32)
339 j := len(xs)
340 for i, x := range xs {
341 if x <= m {
342 m = x
343 j = i
344 }
345 }
346 return uint32(j)
347}
348
349func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
350 if filename {
351 return data.fileNameNgrams.Get(ng).sz
352 }
353 return data.ngrams.Get(ng).sz
354}
355
356type ngramIterationResults struct {
357 matchIterator
358
359 caseSensitive bool
360 fileName bool
361 substrBytes []byte
362 substrLowered []byte
363}
364
365func (r *ngramIterationResults) String() string {
366 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
367}
368
369func (r *ngramIterationResults) candidates() []*candidateMatch {
370 cs := r.matchIterator.candidates()
371 for _, c := range cs {
372 c.caseSensitive = r.caseSensitive
373 c.fileName = r.fileName
374 c.substrBytes = r.substrBytes
375 c.substrLowered = r.substrLowered
376 }
377 return cs
378}
379
380func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
381 str := query.Pattern
382
383 // Find the 2 least common ngrams from the string.
384 ngramOffs := splitNGrams([]byte(query.Pattern))
385 // PERF: Sort to increase the chances adjacent checks are in the same btree
386 // bucket (which can cause disk IO).
387 slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
388 return a.ngram < b.ngram
389 })
390 frequencies := make([]uint32, 0, len(ngramOffs))
391 ngramLookups := 0
392 for _, o := range ngramOffs {
393 var freq uint32
394 if query.CaseSensitive {
395 freq = d.ngramFrequency(o.ngram, query.FileName)
396 ngramLookups++
397 } else {
398 for _, v := range generateCaseNgrams(o.ngram) {
399 freq += d.ngramFrequency(v, query.FileName)
400 ngramLookups++
401 }
402 }
403
404 if freq == 0 {
405 return &ngramIterationResults{
406 matchIterator: &noMatchTree{
407 Why: "freq=0",
408 Stats: Stats{
409 NgramLookups: ngramLookups,
410 },
411 },
412 }, nil
413 }
414
415 frequencies = append(frequencies, freq)
416 }
417
418 var first, last runeNgramOff
419 {
420 firstI := firstMinarg(frequencies)
421 frequencies[firstI] = maxUInt32
422 lastI := lastMinarg(frequencies)
423 first = ngramOffs[firstI]
424 last = ngramOffs[lastI]
425 if first.index > last.index {
426 last, first = first, last
427 }
428 }
429
430 iter := &ngramDocIterator{
431 leftPad: first.index,
432 rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
433 ngramLookups: ngramLookups,
434 }
435 if query.FileName {
436 iter.ends = d.fileNameEndRunes
437 } else {
438 iter.ends = d.fileEndRunes
439 }
440
441 if first != last {
442 runeDist := last.index - first.index
443 i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
444 if err != nil {
445 return nil, err
446 }
447
448 iter.iter = i
449 } else {
450 hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
451 if err != nil {
452 return nil, err
453 }
454 iter.iter = hitIter
455 }
456
457 patBytes := []byte(query.Pattern)
458 lowerPatBytes := toLower(patBytes)
459
460 return &ngramIterationResults{
461 matchIterator: iter,
462 caseSensitive: query.CaseSensitive,
463 fileName: query.FileName,
464 substrBytes: patBytes,
465 substrLowered: lowerPatBytes,
466 }, nil
467}
468
469func (d *indexData) fileName(i uint32) []byte {
470 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
471}
472
473func (d *indexData) numDocs() uint32 {
474 return uint32(len(d.fileBranchMasks))
475}
476
477func (s *indexData) Close() {
478 s.file.Close()
479}
480
481const (
482 rawConfigYes = 1
483 rawConfigNo = 2
484)
485
486// encodeRawConfig encodes a rawConfig map into a uint8 mask.
487func encodeRawConfig(rawConfig map[string]string) uint8 {
488 var encoded uint8
489 for i, f := range []string{"public", "fork", "archived"} {
490 var e uint8
491 v, ok := rawConfig[f]
492 if ok && v == "1" {
493 e |= rawConfigYes
494 } else {
495 e |= rawConfigNo
496 }
497 encoded = encoded | e<<(2*i)
498 }
499 return encoded
500}