fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "html/template"
23 "log"
24 "path/filepath"
25 "sort"
26 "time"
27 "unicode/utf8"
28
29 "github.com/go-enry/go-enry/v2"
30)
31
32var _ = log.Println
33
34const ngramSize = 3
35
36type searchableString struct {
37 data []byte
38}
39
40// Filled by the linker
41var Version string
42
43// Store character (unicode codepoint) offset (in bytes) this often.
44const runeOffsetFrequency = 100
45
46type postingsBuilder struct {
47 postings map[ngram][]byte
48 lastOffsets map[ngram]uint32
49
50 // To support UTF-8 searching, we must map back runes to byte
51 // offsets. As a first attempt, we sample regularly. The
52 // precise offset can be found by walking from the recorded
53 // offset to the desired rune.
54 runeOffsets []uint32
55 runeCount uint32
56
57 isPlainASCII bool
58
59 endRunes []uint32
60 endByte uint32
61}
62
63func newPostingsBuilder() *postingsBuilder {
64 return &postingsBuilder{
65 postings: map[ngram][]byte{},
66 lastOffsets: map[ngram]uint32{},
67 isPlainASCII: true,
68 }
69}
70
71// Store trigram offsets for the given UTF-8 data. The
72// DocumentSections must correspond to rune boundaries in the UTF-8
73// data.
74func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
75 dest := searchableString{
76 data: data,
77 }
78 var buf [8]byte
79 var runeGram [3]rune
80
81 var runeIndex uint32
82 byteCount := 0
83 dataSz := uint32(len(data))
84
85 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
86 for _, s := range byteSections {
87 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
88 }
89 var runeSectionBoundaries []uint32
90
91 endRune := s.runeCount
92 for ; len(data) > 0; runeIndex++ {
93 c, sz := utf8.DecodeRune(data)
94 if sz > 1 {
95 s.isPlainASCII = false
96 }
97 data = data[sz:]
98
99 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
100
101 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
102 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
103 }
104 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
105 runeSectionBoundaries = append(runeSectionBoundaries,
106 endRune+uint32(runeIndex))
107 byteSectionBoundaries = byteSectionBoundaries[1:]
108 }
109
110 byteCount += sz
111
112 if runeIndex < 2 {
113 continue
114 }
115
116 ng := runesToNGram(runeGram)
117 lastOff := s.lastOffsets[ng]
118 newOff := endRune + uint32(runeIndex) - 2
119
120 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
121 s.postings[ng] = append(s.postings[ng], buf[:m]...)
122 s.lastOffsets[ng] = newOff
123 }
124 s.runeCount += runeIndex
125
126 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
127 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
128 }
129
130 // Handle symbol definition that ends at file end. This can
131 // happen for labels at the end of .bat files.
132
133 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
134 runeSectionBoundaries = append(runeSectionBoundaries,
135 endRune+runeIndex)
136 byteSectionBoundaries = byteSectionBoundaries[1:]
137 }
138 runeSecs := make([]DocumentSection, 0, len(byteSections))
139 for i := 0; i < len(runeSectionBoundaries); i += 2 {
140 runeSecs = append(runeSecs, DocumentSection{
141 Start: runeSectionBoundaries[i],
142 End: runeSectionBoundaries[i+1],
143 })
144 }
145
146 s.endRunes = append(s.endRunes, s.runeCount)
147 s.endByte += dataSz
148 return &dest, runeSecs, nil
149}
150
151// IndexBuilder builds a single index shard.
152type IndexBuilder struct {
153 // The version we will write to disk. Sourcegraph Specific. This is to
154 // enable feature flagging new format versions.
155 indexFormatVersion int
156 featureVersion int
157
158 contentStrings []*searchableString
159 nameStrings []*searchableString
160 docSections [][]DocumentSection
161 runeDocSections []DocumentSection
162
163 symID uint32
164 symIndex map[string]uint32
165 symKindID uint32
166 symKindIndex map[string]uint32
167 symMetaData []uint32
168
169 fileEndSymbol []uint32
170
171 checksums []byte
172
173 branchMasks []uint64
174 subRepos []uint32
175
176 // docID => repoID
177 repos []uint16
178
179 // Experimental: docID => rank vec
180 ranks [][]float64
181
182 contentPostings *postingsBuilder
183 namePostings *postingsBuilder
184
185 // root repositories
186 repoList []Repository
187
188 // name to index.
189 subRepoIndices []map[string]uint32
190
191 // language => language code
192 languageMap map[string]uint16
193
194 // language codes, uint16 encoded as little-endian
195 languages []uint8
196
197 // IndexTime will be used as the time if non-zero. Otherwise
198 // time.Now(). This is useful for doing reproducible builds in tests.
199 IndexTime time.Time
200
201 // a sortable 20 chars long id.
202 ID string
203}
204
205func (d *Repository) verify() error {
206 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
207 if _, err := template.New("").Parse(t); err != nil {
208 return err
209 }
210 }
211 return nil
212}
213
214// ContentSize returns the number of content bytes so far ingested.
215func (b *IndexBuilder) ContentSize() uint32 {
216 // Add the name too so we don't skip building index if we have
217 // lots of empty files.
218 return b.contentPostings.endByte + b.namePostings.endByte
219}
220
221// NewIndexBuilder creates a fresh IndexBuilder. The passed in
222// Repository contains repo metadata, and may be set to nil.
223func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
224 b := newIndexBuilder()
225
226 if r == nil {
227 r = &Repository{}
228 }
229 if err := b.setRepository(r); err != nil {
230 return nil, err
231 }
232 return b, nil
233}
234
235func newIndexBuilder() *IndexBuilder {
236 return &IndexBuilder{
237 indexFormatVersion: IndexFormatVersion,
238 featureVersion: FeatureVersion,
239
240 contentPostings: newPostingsBuilder(),
241 namePostings: newPostingsBuilder(),
242 fileEndSymbol: []uint32{0},
243 symIndex: make(map[string]uint32),
244 symKindIndex: make(map[string]uint32),
245 languageMap: make(map[string]uint16),
246 }
247}
248
249func (b *IndexBuilder) setRepository(desc *Repository) error {
250 if err := desc.verify(); err != nil {
251 return err
252 }
253
254 if len(desc.Branches) > 64 {
255 return fmt.Errorf("too many branches")
256 }
257
258 repo := *desc
259
260 // copy subrepomap without root
261 repo.SubRepoMap = map[string]*Repository{}
262 for k, v := range desc.SubRepoMap {
263 if k != "" {
264 repo.SubRepoMap[k] = v
265 }
266 }
267
268 b.repoList = append(b.repoList, repo)
269
270 return b.populateSubRepoIndices()
271}
272
273type DocumentSection struct {
274 Start, End uint32
275}
276
277// Document holds a document (file) to index.
278type Document struct {
279 Name string
280 Content []byte
281 Branches []string
282 SubRepositoryPath string
283 Language string
284
285 // If set, something is wrong with the file contents, and this
286 // is the reason it wasn't indexed.
287 SkipReason string
288
289 // Document sections for symbols. Offsets should use bytes.
290 Symbols []DocumentSection
291 SymbolsMetaData []*Symbol
292
293 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
294 // file in the git repo.
295 //
296 // Two documents can be ordered by comparing the components of their rank
297 // vectors. Bigger entries are better, as are longer vectors.
298 //
299 // This field is experimental and may change at any time without warning.
300 Ranks []float64
301}
302
303type symbolSlice struct {
304 symbols []DocumentSection
305 metaData []*Symbol
306}
307
308func (s symbolSlice) Len() int { return len(s.symbols) }
309
310func (s symbolSlice) Swap(i, j int) {
311 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
312 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
313}
314
315func (s symbolSlice) Less(i, j int) bool {
316 return s.symbols[i].Start < s.symbols[j].Start
317}
318
319// AddFile is a convenience wrapper for Add
320func (b *IndexBuilder) AddFile(name string, content []byte) error {
321 return b.Add(Document{Name: name, Content: content})
322}
323
324// CheckText returns a reason why the given contents are probably not source texts.
325func CheckText(content []byte, maxTrigramCount int) error {
326 if len(content) == 0 {
327 return nil
328 }
329
330 if len(content) < ngramSize {
331 return fmt.Errorf("file size smaller than %d", ngramSize)
332 }
333
334 // PERF: we only need to do the trigram check if the upperbound on content
335 // is greater than our threshold.
336 var trigrams map[ngram]struct{}
337 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
338 trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
339 }
340
341 var cur [3]rune
342 byteCount := 0
343 for len(content) > 0 {
344 if content[0] == 0 {
345 return fmt.Errorf("binary data at byte offset %d", byteCount)
346 }
347
348 r, sz := utf8.DecodeRune(content)
349 content = content[sz:]
350 byteCount += sz
351
352 cur[0], cur[1], cur[2] = cur[1], cur[2], r
353 if cur[0] == 0 {
354 // start of file.
355 continue
356 }
357
358 if trigrams != nil {
359 trigrams[runesToNGram(cur)] = struct{}{}
360 if len(trigrams) > maxTrigramCount {
361 // probably not text.
362 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
363 }
364 }
365 }
366 return nil
367}
368
369func (b *IndexBuilder) populateSubRepoIndices() error {
370 if len(b.subRepoIndices) == len(b.repoList) {
371 return nil
372 }
373 if len(b.subRepoIndices) != len(b.repoList)-1 {
374 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
375 }
376 repo := b.repoList[len(b.repoList)-1]
377 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
378 return nil
379}
380
381func mkSubRepoIndices(repo Repository) map[string]uint32 {
382 paths := []string{""}
383 for k := range repo.SubRepoMap {
384 paths = append(paths, k)
385 }
386 sort.Strings(paths)
387 subRepoIndices := make(map[string]uint32, len(paths))
388 for i, p := range paths {
389 subRepoIndices[p] = uint32(i)
390 }
391 return subRepoIndices
392}
393
394const notIndexedMarker = "NOT-INDEXED: "
395
396func (b *IndexBuilder) symbolID(sym string) uint32 {
397 if _, ok := b.symIndex[sym]; !ok {
398 b.symIndex[sym] = b.symID
399 b.symID++
400 }
401 return b.symIndex[sym]
402}
403
404func (b *IndexBuilder) symbolKindID(t string) uint32 {
405 if _, ok := b.symKindIndex[t]; !ok {
406 b.symKindIndex[t] = b.symKindID
407 b.symKindID++
408 }
409 return b.symKindIndex[t]
410}
411
412func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
413 for _, sym := range symbols {
414 b.symMetaData = append(b.symMetaData,
415 // This field was removed due to redundancy. To avoid
416 // needing to reindex, it is set to zero for now. In the
417 // future, this field will be completely removed. It
418 // will require incrementing the feature version.
419 0,
420 b.symbolKindID(sym.Kind),
421 b.symbolID(sym.Parent),
422 b.symbolKindID(sym.ParentKind))
423 }
424}
425
426func DetermineLanguageIfUnknown(doc *Document) {
427 if doc.Language == "" {
428 c := doc.Content
429 // classifier is faster on small files without losing much accuracy
430 if len(c) > 2048 {
431 c = c[:2048]
432 }
433 doc.Language = enry.GetLanguage(doc.Name, c)
434 }
435}
436
437// Add a file which only occurs in certain branches.
438func (b *IndexBuilder) Add(doc Document) error {
439 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
440
441 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
442 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
443 doc.Language = "binary"
444 }
445
446 if doc.SkipReason != "" {
447 doc.Content = []byte(notIndexedMarker + doc.SkipReason)
448 doc.Symbols = nil
449 doc.SymbolsMetaData = nil
450 if doc.Language == "" {
451 doc.Language = "skipped"
452 }
453 }
454
455 DetermineLanguageIfUnknown(&doc)
456
457 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
458 var last DocumentSection
459 for i, s := range doc.Symbols {
460 if i > 0 {
461 if last.End > s.Start {
462 return fmt.Errorf("sections overlap")
463 }
464 }
465 last = s
466 }
467 if last.End > uint32(len(doc.Content)) {
468 return fmt.Errorf("section goes past end of content")
469 }
470
471 if doc.SubRepositoryPath != "" {
472 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
473 if err != nil || rel == doc.Name {
474 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
475 }
476 }
477 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
478 if err != nil {
479 return err
480 }
481 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
482 if err != nil {
483 return err
484 }
485 b.addSymbols(doc.SymbolsMetaData)
486
487 repoIdx := len(b.repoList) - 1
488 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
489 if !ok {
490 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
491 }
492
493 var mask uint64
494 for _, br := range doc.Branches {
495 m := b.branchMask(br)
496 if m == 0 {
497 return fmt.Errorf("no branch found for %s", br)
498 }
499 mask |= m
500 }
501
502 if repoIdx > 1<<16 {
503 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
504 }
505
506 b.subRepos = append(b.subRepos, subRepoIdx)
507 b.repos = append(b.repos, uint16(repoIdx))
508
509 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
510 // always nil.
511 b.ranks = append(b.ranks, doc.Ranks)
512
513 hasher.Write(doc.Content)
514
515 b.contentStrings = append(b.contentStrings, docStr)
516 b.runeDocSections = append(b.runeDocSections, runeSecs...)
517
518 b.nameStrings = append(b.nameStrings, nameStr)
519 b.docSections = append(b.docSections, doc.Symbols)
520 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
521 b.branchMasks = append(b.branchMasks, mask)
522 b.checksums = append(b.checksums, hasher.Sum(nil)...)
523
524 langCode, ok := b.languageMap[doc.Language]
525 if !ok {
526 if len(b.languageMap) >= 65535 {
527 return fmt.Errorf("too many languages")
528 }
529 langCode = uint16(len(b.languageMap))
530 b.languageMap[doc.Language] = langCode
531 }
532 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
533
534 return nil
535}
536
537func (b *IndexBuilder) branchMask(br string) uint64 {
538 for i, b := range b.repoList[len(b.repoList)-1].Branches {
539 if b.Name == br {
540 return uint64(1) << uint(i)
541 }
542 }
543 return 0
544}