fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "sort"
27 "strings"
28 "text/template"
29 "time"
30 "unicode/utf8"
31
32 "github.com/sourcegraph/zoekt/internal/languages"
33)
34
35var _ = log.Println
36
37const ngramSize = 3
38
39type searchableString struct {
40 data []byte
41}
42
43// Filled by the linker
44var Version string
45
46func HostnameBestEffort() string {
47 if h := os.Getenv("NODE_NAME"); h != "" {
48 return h
49 }
50 if h := os.Getenv("HOSTNAME"); h != "" {
51 return h
52 }
53 hostname, _ := os.Hostname()
54 return hostname
55}
56
57// Store character (unicode codepoint) offset (in bytes) this often.
58const runeOffsetFrequency = 100
59
60type postingsBuilder struct {
61 postings map[ngram][]byte
62 lastOffsets map[ngram]uint32
63
64 // To support UTF-8 searching, we must map back runes to byte
65 // offsets. As a first attempt, we sample regularly. The
66 // precise offset can be found by walking from the recorded
67 // offset to the desired rune.
68 runeOffsets []uint32
69 runeCount uint32
70
71 isPlainASCII bool
72
73 endRunes []uint32
74 endByte uint32
75}
76
77func newPostingsBuilder() *postingsBuilder {
78 return &postingsBuilder{
79 postings: map[ngram][]byte{},
80 lastOffsets: map[ngram]uint32{},
81 isPlainASCII: true,
82 }
83}
84
85// Store trigram offsets for the given UTF-8 data. The
86// DocumentSections must correspond to rune boundaries in the UTF-8
87// data.
88func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
89 dest := searchableString{
90 data: data,
91 }
92 var buf [8]byte
93 var runeGram [3]rune
94
95 var runeIndex uint32
96 byteCount := 0
97 dataSz := uint32(len(data))
98
99 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
100 for _, s := range byteSections {
101 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
102 }
103 var runeSectionBoundaries []uint32
104
105 endRune := s.runeCount
106 for ; len(data) > 0; runeIndex++ {
107 c, sz := utf8.DecodeRune(data)
108 if sz > 1 {
109 s.isPlainASCII = false
110 }
111 data = data[sz:]
112
113 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
114
115 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
116 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
117 }
118 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
119 runeSectionBoundaries = append(runeSectionBoundaries,
120 endRune+uint32(runeIndex))
121 byteSectionBoundaries = byteSectionBoundaries[1:]
122 }
123
124 byteCount += sz
125
126 if runeIndex < 2 {
127 continue
128 }
129
130 ng := runesToNGram(runeGram)
131 lastOff := s.lastOffsets[ng]
132 newOff := endRune + uint32(runeIndex) - 2
133
134 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
135 s.postings[ng] = append(s.postings[ng], buf[:m]...)
136 s.lastOffsets[ng] = newOff
137 }
138 s.runeCount += runeIndex
139
140 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
141 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
142 }
143
144 // Handle symbol definition that ends at file end. This can
145 // happen for labels at the end of .bat files.
146
147 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
148 runeSectionBoundaries = append(runeSectionBoundaries,
149 endRune+runeIndex)
150 byteSectionBoundaries = byteSectionBoundaries[1:]
151 }
152 runeSecs := make([]DocumentSection, 0, len(byteSections))
153 for i := 0; i < len(runeSectionBoundaries); i += 2 {
154 runeSecs = append(runeSecs, DocumentSection{
155 Start: runeSectionBoundaries[i],
156 End: runeSectionBoundaries[i+1],
157 })
158 }
159
160 s.endRunes = append(s.endRunes, s.runeCount)
161 s.endByte += dataSz
162 return &dest, runeSecs, nil
163}
164
165// IndexBuilder builds a single index shard.
166type IndexBuilder struct {
167 // The version we will write to disk. Sourcegraph Specific. This is to
168 // enable feature flagging new format versions.
169 indexFormatVersion int
170 featureVersion int
171
172 contentStrings []*searchableString
173 nameStrings []*searchableString
174 docSections [][]DocumentSection
175 runeDocSections []DocumentSection
176
177 symID uint32
178 symIndex map[string]uint32
179 symKindID uint32
180 symKindIndex map[string]uint32
181 symMetaData []uint32
182
183 fileEndSymbol []uint32
184
185 checksums []byte
186
187 branchMasks []uint64
188 subRepos []uint32
189
190 // docID => repoID
191 repos []uint16
192
193 contentPostings *postingsBuilder
194 namePostings *postingsBuilder
195
196 // root repositories
197 repoList []Repository
198
199 // name to index.
200 subRepoIndices []map[string]uint32
201
202 // language => language code
203 languageMap map[string]uint16
204
205 // language codes, uint16 encoded as little-endian
206 languages []uint8
207
208 // IndexTime will be used as the time if non-zero. Otherwise
209 // time.Now(). This is useful for doing reproducible builds in tests.
210 IndexTime time.Time
211
212 // a sortable 20 chars long id.
213 ID string
214}
215
216func (d *Repository) verify() error {
217 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
218 if _, err := ParseTemplate(t); err != nil {
219 return err
220 }
221 }
222 return nil
223}
224
225func urlJoinPath(base string, elem ...string) string {
226 // golangs html/template always escapes "+" appearing in an HTML attribute
227 // [1]. We may even want to treat more characters, differently but this
228 // atleast makes it possible to visit URLs like [2].
229 //
230 // We only do this to elem since base will normally be a hardcoded string.
231 //
232 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
233 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
234 elem = append([]string{}, elem...) // copy to mutate
235 for i := range elem {
236 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
237 }
238 u, err := url.JoinPath(base, elem...)
239 if err != nil {
240 return "#!error: " + err.Error()
241 }
242 return u
243}
244
245// ParseTemplate will parse the templates for FileURLTemplate,
246// LineFragmentTemplate and CommitURLTemplate.
247//
248// It makes available the extra function UrlJoinPath.
249func ParseTemplate(text string) (*template.Template, error) {
250 return template.New("").Funcs(template.FuncMap{
251 "URLJoinPath": urlJoinPath,
252 }).Parse(text)
253}
254
255// ContentSize returns the number of content bytes so far ingested.
256func (b *IndexBuilder) ContentSize() uint32 {
257 // Add the name too so we don't skip building index if we have
258 // lots of empty files.
259 return b.contentPostings.endByte + b.namePostings.endByte
260}
261
262// NumFiles returns the number of files added to this builder
263func (b *IndexBuilder) NumFiles() int {
264 return len(b.contentStrings)
265}
266
267// NewIndexBuilder creates a fresh IndexBuilder. The passed in
268// Repository contains repo metadata, and may be set to nil.
269func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
270 b := newIndexBuilder()
271
272 if r == nil {
273 r = &Repository{}
274 }
275 if err := b.setRepository(r); err != nil {
276 return nil, err
277 }
278 return b, nil
279}
280
281func newIndexBuilder() *IndexBuilder {
282 return &IndexBuilder{
283 indexFormatVersion: IndexFormatVersion,
284 featureVersion: FeatureVersion,
285
286 contentPostings: newPostingsBuilder(),
287 namePostings: newPostingsBuilder(),
288 fileEndSymbol: []uint32{0},
289 symIndex: make(map[string]uint32),
290 symKindIndex: make(map[string]uint32),
291 languageMap: make(map[string]uint16),
292 }
293}
294
295func (b *IndexBuilder) setRepository(desc *Repository) error {
296 if err := desc.verify(); err != nil {
297 return err
298 }
299
300 if len(desc.Branches) > 64 {
301 return fmt.Errorf("too many branches")
302 }
303
304 repo := *desc
305
306 // copy subrepomap without root
307 repo.SubRepoMap = map[string]*Repository{}
308 for k, v := range desc.SubRepoMap {
309 if k != "" {
310 repo.SubRepoMap[k] = v
311 }
312 }
313
314 b.repoList = append(b.repoList, repo)
315
316 return b.populateSubRepoIndices()
317}
318
319type DocumentSection struct {
320 Start, End uint32
321}
322
323// Document holds a document (file) to index.
324type Document struct {
325 Name string
326 Content []byte
327 Branches []string
328 SubRepositoryPath string
329 Language string
330
331 // If set, something is wrong with the file contents, and this
332 // is the reason it wasn't indexed.
333 SkipReason string
334
335 // Document sections for symbols. Offsets should use bytes.
336 Symbols []DocumentSection
337 SymbolsMetaData []*Symbol
338}
339
340type symbolSlice struct {
341 symbols []DocumentSection
342 metaData []*Symbol
343}
344
345func (s symbolSlice) Len() int { return len(s.symbols) }
346
347func (s symbolSlice) Swap(i, j int) {
348 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
349 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
350}
351
352func (s symbolSlice) Less(i, j int) bool {
353 return s.symbols[i].Start < s.symbols[j].Start
354}
355
356// AddFile is a convenience wrapper for Add
357func (b *IndexBuilder) AddFile(name string, content []byte) error {
358 return b.Add(Document{Name: name, Content: content})
359}
360
361func (b *IndexBuilder) populateSubRepoIndices() error {
362 if len(b.subRepoIndices) == len(b.repoList) {
363 return nil
364 }
365 if len(b.subRepoIndices) != len(b.repoList)-1 {
366 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
367 }
368 repo := b.repoList[len(b.repoList)-1]
369 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
370 return nil
371}
372
373func mkSubRepoIndices(repo Repository) map[string]uint32 {
374 paths := []string{""}
375 for k := range repo.SubRepoMap {
376 paths = append(paths, k)
377 }
378 sort.Strings(paths)
379 subRepoIndices := make(map[string]uint32, len(paths))
380 for i, p := range paths {
381 subRepoIndices[p] = uint32(i)
382 }
383 return subRepoIndices
384}
385
386const notIndexedMarker = "NOT-INDEXED: "
387
388func (b *IndexBuilder) symbolID(sym string) uint32 {
389 if _, ok := b.symIndex[sym]; !ok {
390 b.symIndex[sym] = b.symID
391 b.symID++
392 }
393 return b.symIndex[sym]
394}
395
396func (b *IndexBuilder) symbolKindID(t string) uint32 {
397 if _, ok := b.symKindIndex[t]; !ok {
398 b.symKindIndex[t] = b.symKindID
399 b.symKindID++
400 }
401 return b.symKindIndex[t]
402}
403
404func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
405 for _, sym := range symbols {
406 b.symMetaData = append(b.symMetaData,
407 // This field was removed due to redundancy. To avoid
408 // needing to reindex, it is set to zero for now. In the
409 // future, this field will be completely removed. It
410 // will require incrementing the feature version.
411 0,
412 b.symbolKindID(sym.Kind),
413 b.symbolID(sym.Parent),
414 b.symbolKindID(sym.ParentKind))
415 }
416}
417
418func DetermineLanguageIfUnknown(doc *Document) {
419 if doc.Language == "" {
420 doc.Language = languages.GetLanguage(doc.Name, doc.Content)
421 }
422}
423
424// Add a file which only occurs in certain branches.
425func (b *IndexBuilder) Add(doc Document) error {
426 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
427
428 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
429 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
430 doc.Language = "binary"
431 }
432
433 if doc.SkipReason != "" {
434 doc.Content = []byte(notIndexedMarker + doc.SkipReason)
435 doc.Symbols = nil
436 doc.SymbolsMetaData = nil
437 if doc.Language == "" {
438 doc.Language = "skipped"
439 }
440 }
441
442 DetermineLanguageIfUnknown(&doc)
443
444 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
445 var last DocumentSection
446 for i, s := range doc.Symbols {
447 if i > 0 {
448 if last.End > s.Start {
449 return fmt.Errorf("sections overlap")
450 }
451 }
452 last = s
453 }
454 if last.End > uint32(len(doc.Content)) {
455 return fmt.Errorf("section goes past end of content")
456 }
457
458 if doc.SubRepositoryPath != "" {
459 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
460 if err != nil || rel == doc.Name {
461 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
462 }
463 }
464 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
465 if err != nil {
466 return err
467 }
468 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
469 if err != nil {
470 return err
471 }
472 b.addSymbols(doc.SymbolsMetaData)
473
474 repoIdx := len(b.repoList) - 1
475 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
476 if !ok {
477 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
478 }
479
480 var mask uint64
481 for _, br := range doc.Branches {
482 m := b.branchMask(br)
483 if m == 0 {
484 return fmt.Errorf("no branch found for %s", br)
485 }
486 mask |= m
487 }
488
489 if repoIdx > 1<<16 {
490 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
491 }
492
493 b.subRepos = append(b.subRepos, subRepoIdx)
494 b.repos = append(b.repos, uint16(repoIdx))
495
496 hasher.Write(doc.Content)
497
498 b.contentStrings = append(b.contentStrings, docStr)
499 b.runeDocSections = append(b.runeDocSections, runeSecs...)
500
501 b.nameStrings = append(b.nameStrings, nameStr)
502 b.docSections = append(b.docSections, doc.Symbols)
503 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
504 b.branchMasks = append(b.branchMasks, mask)
505 b.checksums = append(b.checksums, hasher.Sum(nil)...)
506
507 langCode, ok := b.languageMap[doc.Language]
508 if !ok {
509 if len(b.languageMap) >= 65535 {
510 return fmt.Errorf("too many languages")
511 }
512 langCode = uint16(len(b.languageMap))
513 b.languageMap[doc.Language] = langCode
514 }
515 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
516
517 return nil
518}
519
520func (b *IndexBuilder) branchMask(br string) uint64 {
521 for i, b := range b.repoList[len(b.repoList)-1].Branches {
522 if b.Name == br {
523 return uint64(1) << uint(i)
524 }
525 }
526 return 0
527}
528
529type DocChecker struct {
530 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
531 trigrams map[ngram]struct{}
532}
533
534// Check returns a reason why the given contents are probably not source texts.
535func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
536 if len(content) == 0 {
537 return nil
538 }
539
540 if len(content) < ngramSize {
541 return fmt.Errorf("file size smaller than %d", ngramSize)
542 }
543
544 if index := bytes.IndexByte(content, 0); index > 0 {
545 return fmt.Errorf("binary data at byte offset %d", index)
546 }
547
548 // PERF: we only need to do the trigram check if the upperbound on content is greater than
549 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
550 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
551 return nil
552 }
553
554 var cur [3]rune
555 byteCount := 0
556 t.clearTrigrams(maxTrigramCount)
557
558 for len(content) > 0 {
559 r, sz := utf8.DecodeRune(content)
560 content = content[sz:]
561 byteCount += sz
562
563 cur[0], cur[1], cur[2] = cur[1], cur[2], r
564 if cur[0] == 0 {
565 // start of file.
566 continue
567 }
568
569 t.trigrams[runesToNGram(cur)] = struct{}{}
570 if len(t.trigrams) > maxTrigramCount {
571 // probably not text.
572 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
573 }
574 }
575 return nil
576}
577
578func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
579 if t.trigrams == nil {
580 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
581 }
582 for key := range t.trigrams {
583 delete(t.trigrams, key)
584 }
585}