fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "html/template"
23 "log"
24 "os"
25 "path/filepath"
26 "sort"
27 "time"
28 "unicode/utf8"
29
30 "github.com/go-enry/go-enry/v2"
31)
32
33var _ = log.Println
34
35const ngramSize = 3
36
37type searchableString struct {
38 data []byte
39}
40
41// Filled by the linker
42var Version string
43
44func HostnameBestEffort() string {
45 if h := os.Getenv("NODE_NAME"); h != "" {
46 return h
47 }
48 if h := os.Getenv("HOSTNAME"); h != "" {
49 return h
50 }
51 hostname, _ := os.Hostname()
52 return hostname
53}
54
55// Store character (unicode codepoint) offset (in bytes) this often.
56const runeOffsetFrequency = 100
57
58type postingsBuilder struct {
59 postings map[ngram][]byte
60 lastOffsets map[ngram]uint32
61
62 // To support UTF-8 searching, we must map back runes to byte
63 // offsets. As a first attempt, we sample regularly. The
64 // precise offset can be found by walking from the recorded
65 // offset to the desired rune.
66 runeOffsets []uint32
67 runeCount uint32
68
69 isPlainASCII bool
70
71 endRunes []uint32
72 endByte uint32
73}
74
75func newPostingsBuilder() *postingsBuilder {
76 return &postingsBuilder{
77 postings: map[ngram][]byte{},
78 lastOffsets: map[ngram]uint32{},
79 isPlainASCII: true,
80 }
81}
82
83// Store trigram offsets for the given UTF-8 data. The
84// DocumentSections must correspond to rune boundaries in the UTF-8
85// data.
86func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
87 dest := searchableString{
88 data: data,
89 }
90 var buf [8]byte
91 var runeGram [3]rune
92
93 var runeIndex uint32
94 byteCount := 0
95 dataSz := uint32(len(data))
96
97 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
98 for _, s := range byteSections {
99 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
100 }
101 var runeSectionBoundaries []uint32
102
103 endRune := s.runeCount
104 for ; len(data) > 0; runeIndex++ {
105 c, sz := utf8.DecodeRune(data)
106 if sz > 1 {
107 s.isPlainASCII = false
108 }
109 data = data[sz:]
110
111 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
112
113 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
114 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
115 }
116 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
117 runeSectionBoundaries = append(runeSectionBoundaries,
118 endRune+uint32(runeIndex))
119 byteSectionBoundaries = byteSectionBoundaries[1:]
120 }
121
122 byteCount += sz
123
124 if runeIndex < 2 {
125 continue
126 }
127
128 ng := runesToNGram(runeGram)
129 lastOff := s.lastOffsets[ng]
130 newOff := endRune + uint32(runeIndex) - 2
131
132 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
133 s.postings[ng] = append(s.postings[ng], buf[:m]...)
134 s.lastOffsets[ng] = newOff
135 }
136 s.runeCount += runeIndex
137
138 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
139 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
140 }
141
142 // Handle symbol definition that ends at file end. This can
143 // happen for labels at the end of .bat files.
144
145 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
146 runeSectionBoundaries = append(runeSectionBoundaries,
147 endRune+runeIndex)
148 byteSectionBoundaries = byteSectionBoundaries[1:]
149 }
150 runeSecs := make([]DocumentSection, 0, len(byteSections))
151 for i := 0; i < len(runeSectionBoundaries); i += 2 {
152 runeSecs = append(runeSecs, DocumentSection{
153 Start: runeSectionBoundaries[i],
154 End: runeSectionBoundaries[i+1],
155 })
156 }
157
158 s.endRunes = append(s.endRunes, s.runeCount)
159 s.endByte += dataSz
160 return &dest, runeSecs, nil
161}
162
163// IndexBuilder builds a single index shard.
164type IndexBuilder struct {
165 // The version we will write to disk. Sourcegraph Specific. This is to
166 // enable feature flagging new format versions.
167 indexFormatVersion int
168 featureVersion int
169
170 contentStrings []*searchableString
171 nameStrings []*searchableString
172 docSections [][]DocumentSection
173 runeDocSections []DocumentSection
174
175 symID uint32
176 symIndex map[string]uint32
177 symKindID uint32
178 symKindIndex map[string]uint32
179 symMetaData []uint32
180
181 fileEndSymbol []uint32
182
183 checksums []byte
184
185 branchMasks []uint64
186 subRepos []uint32
187
188 // docID => repoID
189 repos []uint16
190
191 // Experimental: docID => rank vec
192 ranks [][]float64
193
194 contentPostings *postingsBuilder
195 namePostings *postingsBuilder
196
197 // root repositories
198 repoList []Repository
199
200 // name to index.
201 subRepoIndices []map[string]uint32
202
203 // language => language code
204 languageMap map[string]uint16
205
206 // language codes, uint16 encoded as little-endian
207 languages []uint8
208
209 // IndexTime will be used as the time if non-zero. Otherwise
210 // time.Now(). This is useful for doing reproducible builds in tests.
211 IndexTime time.Time
212
213 // a sortable 20 chars long id.
214 ID string
215}
216
217func (d *Repository) verify() error {
218 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
219 if _, err := template.New("").Parse(t); err != nil {
220 return err
221 }
222 }
223 return nil
224}
225
226// ContentSize returns the number of content bytes so far ingested.
227func (b *IndexBuilder) ContentSize() uint32 {
228 // Add the name too so we don't skip building index if we have
229 // lots of empty files.
230 return b.contentPostings.endByte + b.namePostings.endByte
231}
232
233// NumFiles returns the number of files added to this builder
234func (b *IndexBuilder) NumFiles() int {
235 return len(b.contentStrings)
236}
237
238// NewIndexBuilder creates a fresh IndexBuilder. The passed in
239// Repository contains repo metadata, and may be set to nil.
240func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
241 b := newIndexBuilder()
242
243 if r == nil {
244 r = &Repository{}
245 }
246 if err := b.setRepository(r); err != nil {
247 return nil, err
248 }
249 return b, nil
250}
251
252func newIndexBuilder() *IndexBuilder {
253 return &IndexBuilder{
254 indexFormatVersion: IndexFormatVersion,
255 featureVersion: FeatureVersion,
256
257 contentPostings: newPostingsBuilder(),
258 namePostings: newPostingsBuilder(),
259 fileEndSymbol: []uint32{0},
260 symIndex: make(map[string]uint32),
261 symKindIndex: make(map[string]uint32),
262 languageMap: make(map[string]uint16),
263 }
264}
265
266func (b *IndexBuilder) setRepository(desc *Repository) error {
267 if err := desc.verify(); err != nil {
268 return err
269 }
270
271 if len(desc.Branches) > 64 {
272 return fmt.Errorf("too many branches")
273 }
274
275 repo := *desc
276
277 // copy subrepomap without root
278 repo.SubRepoMap = map[string]*Repository{}
279 for k, v := range desc.SubRepoMap {
280 if k != "" {
281 repo.SubRepoMap[k] = v
282 }
283 }
284
285 b.repoList = append(b.repoList, repo)
286
287 return b.populateSubRepoIndices()
288}
289
290type DocumentSection struct {
291 Start, End uint32
292}
293
294// Document holds a document (file) to index.
295type Document struct {
296 Name string
297 Content []byte
298 Branches []string
299 SubRepositoryPath string
300 Language string
301
302 // If set, something is wrong with the file contents, and this
303 // is the reason it wasn't indexed.
304 SkipReason string
305
306 // Document sections for symbols. Offsets should use bytes.
307 Symbols []DocumentSection
308 SymbolsMetaData []*Symbol
309
310 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
311 // file in the git repo.
312 //
313 // Two documents can be ordered by comparing the components of their rank
314 // vectors. Bigger entries are better, as are longer vectors.
315 //
316 // This field is experimental and may change at any time without warning.
317 Ranks []float64
318}
319
320type symbolSlice struct {
321 symbols []DocumentSection
322 metaData []*Symbol
323}
324
325func (s symbolSlice) Len() int { return len(s.symbols) }
326
327func (s symbolSlice) Swap(i, j int) {
328 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
329 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
330}
331
332func (s symbolSlice) Less(i, j int) bool {
333 return s.symbols[i].Start < s.symbols[j].Start
334}
335
336// AddFile is a convenience wrapper for Add
337func (b *IndexBuilder) AddFile(name string, content []byte) error {
338 return b.Add(Document{Name: name, Content: content})
339}
340
341// CheckText returns a reason why the given contents are probably not source texts.
342func CheckText(content []byte, maxTrigramCount int) error {
343 if len(content) == 0 {
344 return nil
345 }
346
347 if len(content) < ngramSize {
348 return fmt.Errorf("file size smaller than %d", ngramSize)
349 }
350
351 // PERF: we only need to do the trigram check if the upperbound on content
352 // is greater than our threshold.
353 var trigrams map[ngram]struct{}
354 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
355 trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
356 }
357
358 var cur [3]rune
359 byteCount := 0
360 for len(content) > 0 {
361 if content[0] == 0 {
362 return fmt.Errorf("binary data at byte offset %d", byteCount)
363 }
364
365 r, sz := utf8.DecodeRune(content)
366 content = content[sz:]
367 byteCount += sz
368
369 cur[0], cur[1], cur[2] = cur[1], cur[2], r
370 if cur[0] == 0 {
371 // start of file.
372 continue
373 }
374
375 if trigrams != nil {
376 trigrams[runesToNGram(cur)] = struct{}{}
377 if len(trigrams) > maxTrigramCount {
378 // probably not text.
379 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
380 }
381 }
382 }
383 return nil
384}
385
386func (b *IndexBuilder) populateSubRepoIndices() error {
387 if len(b.subRepoIndices) == len(b.repoList) {
388 return nil
389 }
390 if len(b.subRepoIndices) != len(b.repoList)-1 {
391 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
392 }
393 repo := b.repoList[len(b.repoList)-1]
394 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
395 return nil
396}
397
398func mkSubRepoIndices(repo Repository) map[string]uint32 {
399 paths := []string{""}
400 for k := range repo.SubRepoMap {
401 paths = append(paths, k)
402 }
403 sort.Strings(paths)
404 subRepoIndices := make(map[string]uint32, len(paths))
405 for i, p := range paths {
406 subRepoIndices[p] = uint32(i)
407 }
408 return subRepoIndices
409}
410
411const notIndexedMarker = "NOT-INDEXED: "
412
413func (b *IndexBuilder) symbolID(sym string) uint32 {
414 if _, ok := b.symIndex[sym]; !ok {
415 b.symIndex[sym] = b.symID
416 b.symID++
417 }
418 return b.symIndex[sym]
419}
420
421func (b *IndexBuilder) symbolKindID(t string) uint32 {
422 if _, ok := b.symKindIndex[t]; !ok {
423 b.symKindIndex[t] = b.symKindID
424 b.symKindID++
425 }
426 return b.symKindIndex[t]
427}
428
429func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
430 for _, sym := range symbols {
431 b.symMetaData = append(b.symMetaData,
432 // This field was removed due to redundancy. To avoid
433 // needing to reindex, it is set to zero for now. In the
434 // future, this field will be completely removed. It
435 // will require incrementing the feature version.
436 0,
437 b.symbolKindID(sym.Kind),
438 b.symbolID(sym.Parent),
439 b.symbolKindID(sym.ParentKind))
440 }
441}
442
443func DetermineLanguageIfUnknown(doc *Document) {
444 if doc.Language == "" {
445 c := doc.Content
446 // classifier is faster on small files without losing much accuracy
447 if len(c) > 2048 {
448 c = c[:2048]
449 }
450 doc.Language = enry.GetLanguage(doc.Name, c)
451 }
452}
453
454// Add a file which only occurs in certain branches.
455func (b *IndexBuilder) Add(doc Document) error {
456 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
457
458 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
459 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
460 doc.Language = "binary"
461 }
462
463 if doc.SkipReason != "" {
464 doc.Content = []byte(notIndexedMarker + doc.SkipReason)
465 doc.Symbols = nil
466 doc.SymbolsMetaData = nil
467 if doc.Language == "" {
468 doc.Language = "skipped"
469 }
470 }
471
472 DetermineLanguageIfUnknown(&doc)
473
474 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
475 var last DocumentSection
476 for i, s := range doc.Symbols {
477 if i > 0 {
478 if last.End > s.Start {
479 return fmt.Errorf("sections overlap")
480 }
481 }
482 last = s
483 }
484 if last.End > uint32(len(doc.Content)) {
485 return fmt.Errorf("section goes past end of content")
486 }
487
488 if doc.SubRepositoryPath != "" {
489 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
490 if err != nil || rel == doc.Name {
491 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
492 }
493 }
494 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
495 if err != nil {
496 return err
497 }
498 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
499 if err != nil {
500 return err
501 }
502 b.addSymbols(doc.SymbolsMetaData)
503
504 repoIdx := len(b.repoList) - 1
505 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
506 if !ok {
507 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
508 }
509
510 var mask uint64
511 for _, br := range doc.Branches {
512 m := b.branchMask(br)
513 if m == 0 {
514 return fmt.Errorf("no branch found for %s", br)
515 }
516 mask |= m
517 }
518
519 if repoIdx > 1<<16 {
520 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
521 }
522
523 b.subRepos = append(b.subRepos, subRepoIdx)
524 b.repos = append(b.repos, uint16(repoIdx))
525
526 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
527 // always nil.
528 b.ranks = append(b.ranks, doc.Ranks)
529
530 hasher.Write(doc.Content)
531
532 b.contentStrings = append(b.contentStrings, docStr)
533 b.runeDocSections = append(b.runeDocSections, runeSecs...)
534
535 b.nameStrings = append(b.nameStrings, nameStr)
536 b.docSections = append(b.docSections, doc.Symbols)
537 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
538 b.branchMasks = append(b.branchMasks, mask)
539 b.checksums = append(b.checksums, hasher.Sum(nil)...)
540
541 langCode, ok := b.languageMap[doc.Language]
542 if !ok {
543 if len(b.languageMap) >= 65535 {
544 return fmt.Errorf("too many languages")
545 }
546 langCode = uint16(len(b.languageMap))
547 b.languageMap[doc.Language] = langCode
548 }
549 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
550
551 return nil
552}
553
554func (b *IndexBuilder) branchMask(br string) uint64 {
555 for i, b := range b.repoList[len(b.repoList)-1].Branches {
556 if b.Name == br {
557 return uint64(1) << uint(i)
558 }
559 }
560 return 0
561}