fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "fmt"
20 "hash/crc64"
21 "log"
22 "math/bits"
23 "unicode/utf8"
24
25 "github.com/sourcegraph/zoekt/query"
26)
27
28// indexData holds the pattern-independent data that we have to have
29// in memory to search. Most of the memory is taken up by the ngram =>
30// offset index.
31type indexData struct {
32 symbols symbolData
33
34 file IndexFile
35
36 ngrams ngramIndex
37
38 newlinesStart uint32
39 newlinesIndex []uint32
40
41 docSectionsStart uint32
42 docSectionsIndex []uint32
43
44 runeDocSections []DocumentSection
45 runeDocSectionsRaw []byte
46
47 // rune offset=>byte offset mapping, relative to the start of the content corpus
48 runeOffsets runeOffsetMap
49
50 // offsets of file contents; includes end of last file
51 boundariesStart uint32
52 boundaries []uint32
53
54 // rune offsets for the file content boundaries
55 fileEndRunes []uint32
56
57 fileNameContent []byte
58 fileNameIndex []uint32
59 fileNameNgrams fileNameNgrams
60
61 // fileEndSymbol[i] is the index of the first symbol for document i.
62 fileEndSymbol []uint32
63
64 // rune offset=>byte offset mapping, relative to the start of the filename corpus
65 fileNameRuneOffsets runeOffsetMap
66
67 // rune offsets for the file name boundaries
68 fileNameEndRunes []uint32
69
70 fileBranchMasks []uint64
71
72 // mask (power of 2) => name
73 branchNames []map[uint]string
74
75 // name => mask (power of 2)
76 branchIDs []map[string]uint
77
78 metaData IndexMetadata
79 repoMetaData []Repository
80
81 subRepos []uint32
82 subRepoPaths [][]string
83
84 // Checksums for all the files, at 8-byte intervals
85 checksums []byte
86
87 // languages for all the files.
88 languages []byte
89
90 // inverse of LanguageMap in metaData
91 languageMap map[uint16]string
92
93 repoListEntry []RepoListEntry
94
95 // repository indexes for all the files
96 repos []uint16
97
98 // Experimental: docID => rank vec
99 ranks [][]float64
100
101 // rawConfigMasks contains the encoded RawConfig for each repository
102 rawConfigMasks []uint8
103}
104
105type symbolData struct {
106 // symContent stores Symbol.Sym and Symbol.Parent.
107 // TODO we don't need to store Symbol.Sym.
108 symContent []byte
109 symIndex []byte
110 // symKindContent is an enum of sym.Kind and sym.ParentKind
111 symKindContent []byte
112 symKindIndex []uint32
113 // symMetadata is [4]uint32 0 Kind Parent ParentKind
114 symMetaData []byte
115}
116
117func uint32SliceAt(a []byte, n uint32) uint32 {
118 return binary.BigEndian.Uint32(a[n*4:])
119}
120
121func uint32SliceLen(a []byte) uint32 {
122 return uint32(len(a) / 4)
123}
124
125// parent returns index i of the parent enum
126func (d *symbolData) parent(i uint32) []byte {
127 delta := uint32SliceAt(d.symIndex, 0)
128 start := uint32SliceAt(d.symIndex, i) - delta
129 var end uint32
130 if i+1 == uint32SliceLen(d.symIndex) {
131 end = uint32(len(d.symContent))
132 } else {
133 end = uint32SliceAt(d.symIndex, i+1) - delta
134 }
135 return d.symContent[start:end]
136}
137
138// kind returns index i of the kind enum
139func (d *symbolData) kind(i uint32) []byte {
140 return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
141}
142
143// data returns the symbol at index i
144func (d *symbolData) data(i uint32) *Symbol {
145 size := uint32(4 * 4) // 4 uint32s
146 offset := i * size
147 if offset >= uint32(len(d.symMetaData)) {
148 return nil
149 }
150
151 metadata := d.symMetaData[offset : offset+size]
152 sym := &Symbol{}
153 key := uint32SliceAt(metadata, 1)
154 sym.Kind = string(d.kind(key))
155 key = uint32SliceAt(metadata, 2)
156 sym.Parent = string(d.parent(key))
157 key = uint32SliceAt(metadata, 3)
158 sym.ParentKind = string(d.kind(key))
159 return sym
160}
161
162func (d *indexData) getChecksum(idx uint32) []byte {
163 start := crc64.Size * idx
164 return d.checksums[start : start+crc64.Size]
165}
166
167func (d *indexData) getLanguage(idx uint32) uint16 {
168 if d.metaData.IndexFeatureVersion < 12 {
169 // older zoekt files had 8-bit language entries
170 return uint16(d.languages[idx])
171 }
172 // newer zoekt files have 16-bit language entries
173 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
174}
175
176// calculates stats for files in the range [start, end).
177func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
178 if start >= end {
179 // An empty shard for an empty repository.
180 return RepoStats{
181 Shards: 1,
182 }
183 }
184
185 bytesContent := d.boundaries[end] - d.boundaries[start]
186 bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
187 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
188
189 // CR keegan for stefan: I think we may want to restructure RepoListEntry so
190 // that we don't change anything, except we have
191 // []Repository. Alternatively, things we can divide up we do (like
192 // here). Right now I don't like that these numbers are not true, especially
193 // after aggregation. For now I will move forward with this until we can
194 // chat more.
195 return RepoStats{
196 ContentBytes: int64(bytesContent) + int64(bytesFN),
197 Documents: int(end - start),
198 // CR keegan for stefan: our shard count is going to go out of whack,
199 // since we will aggregate these. So we will report more shards than are
200 // present on disk. What should we do?
201 Shards: 1,
202
203 // Sourcegraph specific
204 NewLinesCount: count,
205 DefaultBranchNewLinesCount: defaultCount,
206 OtherBranchesNewLinesCount: otherCount,
207 }
208}
209
210func (d *indexData) calculateStats() error {
211 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
212 var start, end uint32
213
214 for repoID, md := range d.repoMetaData {
215 // determine the file range for repo i
216 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
217 end++
218 }
219 if start < end && d.repos[start] != uint16(repoID) {
220 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
221 }
222
223 d.repoListEntry = append(d.repoListEntry, RepoListEntry{
224 Repository: md,
225 IndexMetadata: d.metaData,
226 Stats: d.calculateStatsForFileRange(start, end),
227 })
228 start = end
229 }
230
231 // All repos in a compound shard share memoryUse. So we average out the
232 // memoryUse per shard in our reporting. This has the benefit that when you
233 // aggregate the IndexBytes you get back the actual memoryUse.
234 //
235 // TODO take into account tombstones for aggregation. Even better, adjust
236 // API to be shard centric not repo centric.
237 if len(d.repoListEntry) > 0 {
238 indexBytes := d.memoryUse()
239 indexBytesChunk := indexBytes / len(d.repoListEntry)
240 for i := range d.repoListEntry {
241 d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
242 indexBytes -= indexBytesChunk
243 }
244 d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
245 }
246
247 return nil
248}
249
250// calculateNewLinesStats computes some Sourcegraph specific statistics for files
251// in the range [start, end). These are not as efficient to calculate as the
252// normal statistics. We experimentally measured about a 10% slower shard load
253// time. However, we find these values very useful to track and computing them
254// outside of load time introduces a lot of complexity.
255func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
256 for i := start; i < end; i++ {
257 // branchMask is a bitmask of the branches for a document. Zoekt by
258 // convention represents the default branch as the lowest bit.
259 branchMask := d.fileBranchMasks[i]
260 isDefault := (branchMask & 1) == 1
261 others := uint64(bits.OnesCount64(branchMask >> 1))
262
263 // this is readNewlines but only reading the size of each section which
264 // corresponds to the number of newlines.
265 sec := simpleSection{
266 off: d.newlinesStart + d.newlinesIndex[i],
267 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
268 }
269 // We are only reading the first varint which is the size. So we don't
270 // need to read more than MaxVarintLen64 bytes.
271 if sec.sz > binary.MaxVarintLen64 {
272 sec.sz = binary.MaxVarintLen64
273 }
274 blob, err := d.readSectionBlob(sec)
275 if err != nil {
276 log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
277 continue
278 }
279 sz, _ := binary.Uvarint(blob)
280
281 count += sz
282 if isDefault {
283 defaultCount += sz
284 }
285 otherCount += (others * sz)
286 }
287
288 return
289}
290
291func (d *indexData) String() string {
292 return fmt.Sprintf("shard(%s)", d.file.Name())
293}
294
295// calculates an approximate size of indexData in memory in bytes.
296func (d *indexData) memoryUse() int {
297 sz := 0
298 for _, a := range [][]uint32{
299 d.newlinesIndex, d.docSectionsIndex,
300 d.boundaries, d.fileNameIndex,
301 d.fileEndRunes, d.fileNameEndRunes,
302 d.fileEndSymbol, d.symbols.symKindIndex,
303 d.subRepos,
304 } {
305 sz += 4 * len(a)
306 }
307 sz += d.runeOffsets.sizeBytes()
308 sz += d.fileNameRuneOffsets.sizeBytes()
309 sz += len(d.languages)
310 sz += len(d.checksums)
311 sz += 2 * len(d.repos)
312 if len(d.ranks) > 0 {
313 sz += 8 * len(d.ranks) * len(d.ranks[0])
314 }
315 sz += 8 * len(d.runeDocSections)
316 sz += 8 * len(d.fileBranchMasks)
317 if d.ngrams != nil {
318 sz += d.ngrams.SizeBytes()
319 }
320 sz += d.fileNameNgrams.SizeBytes()
321 return sz
322}
323
324const maxUInt32 = 0xffffffff
325
326func firstMinarg(xs []uint32) uint32 {
327 m := uint32(maxUInt32)
328 j := len(xs)
329 for i, x := range xs {
330 if x < m {
331 m = x
332 j = i
333 }
334 }
335 return uint32(j)
336}
337
338func lastMinarg(xs []uint32) uint32 {
339 m := uint32(maxUInt32)
340 j := len(xs)
341 for i, x := range xs {
342 if x <= m {
343 m = x
344 j = i
345 }
346 }
347 return uint32(j)
348}
349
350func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
351 if filename {
352 return data.fileNameNgrams.Frequency(ng)
353 }
354
355 if data.ngrams == nil {
356 return 0
357 }
358
359 return data.ngrams.Get(ng).sz
360}
361
362type ngramIterationResults struct {
363 matchIterator
364
365 caseSensitive bool
366 fileName bool
367 substrBytes []byte
368 substrLowered []byte
369}
370
371func (r *ngramIterationResults) String() string {
372 return fmt.Sprintf("wrapper(%v)", r.matchIterator)
373}
374
375func (r *ngramIterationResults) candidates() []*candidateMatch {
376 cs := r.matchIterator.candidates()
377 for _, c := range cs {
378 c.caseSensitive = r.caseSensitive
379 c.fileName = r.fileName
380 c.substrBytes = r.substrBytes
381 c.substrLowered = r.substrLowered
382 }
383 return cs
384}
385
386func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
387 str := query.Pattern
388
389 // Find the 2 least common ngrams from the string.
390 ngramOffs := splitNGrams([]byte(query.Pattern))
391 frequencies := make([]uint32, 0, len(ngramOffs))
392 ngramLookups := 0
393 for _, o := range ngramOffs {
394 var freq uint32
395 if query.CaseSensitive {
396 freq = d.ngramFrequency(o.ngram, query.FileName)
397 ngramLookups++
398 } else {
399 for _, v := range generateCaseNgrams(o.ngram) {
400 freq += d.ngramFrequency(v, query.FileName)
401 ngramLookups++
402 }
403 }
404
405 if freq == 0 {
406 return &ngramIterationResults{
407 matchIterator: &noMatchTree{
408 Why: "freq=0",
409 Stats: Stats{
410 NgramLookups: ngramLookups,
411 },
412 },
413 }, nil
414 }
415
416 frequencies = append(frequencies, freq)
417 }
418 firstI := firstMinarg(frequencies)
419 frequencies[firstI] = maxUInt32
420 lastI := lastMinarg(frequencies)
421 if firstI > lastI {
422 lastI, firstI = firstI, lastI
423 }
424
425 firstNG := ngramOffs[firstI].ngram
426 lastNG := ngramOffs[lastI].ngram
427 iter := &ngramDocIterator{
428 leftPad: firstI,
429 rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
430 ngramLookups: ngramLookups,
431 }
432 if query.FileName {
433 iter.ends = d.fileNameEndRunes
434 } else {
435 iter.ends = d.fileEndRunes
436 }
437
438 if firstI != lastI {
439 i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)
440 if err != nil {
441 return nil, err
442 }
443
444 iter.iter = i
445 } else {
446 hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
447 if err != nil {
448 return nil, err
449 }
450 iter.iter = hitIter
451 }
452
453 patBytes := []byte(query.Pattern)
454 lowerPatBytes := toLower(patBytes)
455
456 return &ngramIterationResults{
457 matchIterator: iter,
458 caseSensitive: query.CaseSensitive,
459 fileName: query.FileName,
460 substrBytes: patBytes,
461 substrLowered: lowerPatBytes,
462 }, nil
463}
464
465func (d *indexData) fileName(i uint32) []byte {
466 return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
467}
468
469func (d *indexData) numDocs() uint32 {
470 return uint32(len(d.fileBranchMasks))
471}
472
473func (s *indexData) Close() {
474 s.file.Close()
475}
476
477const (
478 rawConfigYes = 1
479 rawConfigNo = 2
480)
481
482// encodeRawConfig encodes a rawConfig map into a uint8 mask.
483func encodeRawConfig(rawConfig map[string]string) uint8 {
484 var encoded uint8
485 for i, f := range []string{"public", "fork", "archived"} {
486 var e uint8
487 v, ok := rawConfig[f]
488 if ok && v == "1" {
489 e |= rawConfigYes
490 } else {
491 e |= rawConfigNo
492 }
493 encoded = encoded | e<<(2*i)
494 }
495 return encoded
496}