fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "sort"
27 "strings"
28 "text/template"
29 "time"
30 "unicode/utf8"
31
32 "github.com/sourcegraph/zoekt/internal/languages"
33)
34
35var _ = log.Println
36
37const ngramSize = 3
38
39type searchableString struct {
40 data []byte
41}
42
43// Filled by the linker
44var Version string
45
46func HostnameBestEffort() string {
47 if h := os.Getenv("NODE_NAME"); h != "" {
48 return h
49 }
50 if h := os.Getenv("HOSTNAME"); h != "" {
51 return h
52 }
53 hostname, _ := os.Hostname()
54 return hostname
55}
56
57// Store character (unicode codepoint) offset (in bytes) this often.
58const runeOffsetFrequency = 100
59
60type postingsBuilder struct {
61 postings map[ngram][]byte
62 lastOffsets map[ngram]uint32
63
64 // To support UTF-8 searching, we must map back runes to byte
65 // offsets. As a first attempt, we sample regularly. The
66 // precise offset can be found by walking from the recorded
67 // offset to the desired rune.
68 runeOffsets []uint32
69 runeCount uint32
70
71 isPlainASCII bool
72
73 endRunes []uint32
74 endByte uint32
75}
76
77func newPostingsBuilder() *postingsBuilder {
78 return &postingsBuilder{
79 postings: map[ngram][]byte{},
80 lastOffsets: map[ngram]uint32{},
81 isPlainASCII: true,
82 }
83}
84
85// Store trigram offsets for the given UTF-8 data. The
86// DocumentSections must correspond to rune boundaries in the UTF-8
87// data.
88func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
89 dest := searchableString{
90 data: data,
91 }
92 var buf [8]byte
93 var runeGram [3]rune
94
95 var runeIndex uint32
96 byteCount := 0
97 dataSz := uint32(len(data))
98
99 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
100 for _, s := range byteSections {
101 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
102 }
103 var runeSectionBoundaries []uint32
104
105 endRune := s.runeCount
106 for ; len(data) > 0; runeIndex++ {
107 c, sz := utf8.DecodeRune(data)
108 if sz > 1 {
109 s.isPlainASCII = false
110 }
111 data = data[sz:]
112
113 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
114
115 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
116 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
117 }
118 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
119 runeSectionBoundaries = append(runeSectionBoundaries,
120 endRune+uint32(runeIndex))
121 byteSectionBoundaries = byteSectionBoundaries[1:]
122 }
123
124 byteCount += sz
125
126 if runeIndex < 2 {
127 continue
128 }
129
130 ng := runesToNGram(runeGram)
131 lastOff := s.lastOffsets[ng]
132 newOff := endRune + uint32(runeIndex) - 2
133
134 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
135 s.postings[ng] = append(s.postings[ng], buf[:m]...)
136 s.lastOffsets[ng] = newOff
137 }
138 s.runeCount += runeIndex
139
140 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
141 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
142 }
143
144 // Handle symbol definition that ends at file end. This can
145 // happen for labels at the end of .bat files.
146
147 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
148 runeSectionBoundaries = append(runeSectionBoundaries,
149 endRune+runeIndex)
150 byteSectionBoundaries = byteSectionBoundaries[1:]
151 }
152 runeSecs := make([]DocumentSection, 0, len(byteSections))
153 for i := 0; i < len(runeSectionBoundaries); i += 2 {
154 runeSecs = append(runeSecs, DocumentSection{
155 Start: runeSectionBoundaries[i],
156 End: runeSectionBoundaries[i+1],
157 })
158 }
159
160 s.endRunes = append(s.endRunes, s.runeCount)
161 s.endByte += dataSz
162 return &dest, runeSecs, nil
163}
164
165// IndexBuilder builds a single index shard.
166type IndexBuilder struct {
167 // The version we will write to disk. Sourcegraph Specific. This is to
168 // enable feature flagging new format versions.
169 indexFormatVersion int
170 featureVersion int
171
172 contentStrings []*searchableString
173 nameStrings []*searchableString
174 docSections [][]DocumentSection
175 runeDocSections []DocumentSection
176
177 symID uint32
178 symIndex map[string]uint32
179 symKindID uint32
180 symKindIndex map[string]uint32
181 symMetaData []uint32
182
183 fileEndSymbol []uint32
184
185 checksums []byte
186
187 branchMasks []uint64
188 subRepos []uint32
189
190 // docID => repoID
191 repos []uint16
192
193 // Experimental: docID => rank vec
194 ranks [][]float64
195
196 contentPostings *postingsBuilder
197 namePostings *postingsBuilder
198
199 // root repositories
200 repoList []Repository
201
202 // name to index.
203 subRepoIndices []map[string]uint32
204
205 // language => language code
206 languageMap map[string]uint16
207
208 // language codes, uint16 encoded as little-endian
209 languages []uint8
210
211 // IndexTime will be used as the time if non-zero. Otherwise
212 // time.Now(). This is useful for doing reproducible builds in tests.
213 IndexTime time.Time
214
215 // a sortable 20 chars long id.
216 ID string
217}
218
219func (d *Repository) verify() error {
220 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
221 if _, err := ParseTemplate(t); err != nil {
222 return err
223 }
224 }
225 return nil
226}
227
228func urlJoinPath(base string, elem ...string) string {
229 // golangs html/template always escapes "+" appearing in an HTML attribute
230 // [1]. We may even want to treat more characters, differently but this
231 // atleast makes it possible to visit URLs like [2].
232 //
233 // We only do this to elem since base will normally be a hardcoded string.
234 //
235 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
236 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
237 elem = append([]string{}, elem...) // copy to mutate
238 for i := range elem {
239 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
240 }
241 u, err := url.JoinPath(base, elem...)
242 if err != nil {
243 return "#!error: " + err.Error()
244 }
245 return u
246}
247
248// ParseTemplate will parse the templates for FileURLTemplate,
249// LineFragmentTemplate and CommitURLTemplate.
250//
251// It makes available the extra function UrlJoinPath.
252func ParseTemplate(text string) (*template.Template, error) {
253 return template.New("").Funcs(template.FuncMap{
254 "URLJoinPath": urlJoinPath,
255 }).Parse(text)
256}
257
258// ContentSize returns the number of content bytes so far ingested.
259func (b *IndexBuilder) ContentSize() uint32 {
260 // Add the name too so we don't skip building index if we have
261 // lots of empty files.
262 return b.contentPostings.endByte + b.namePostings.endByte
263}
264
265// NumFiles returns the number of files added to this builder
266func (b *IndexBuilder) NumFiles() int {
267 return len(b.contentStrings)
268}
269
270// NewIndexBuilder creates a fresh IndexBuilder. The passed in
271// Repository contains repo metadata, and may be set to nil.
272func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
273 b := newIndexBuilder()
274
275 if r == nil {
276 r = &Repository{}
277 }
278 if err := b.setRepository(r); err != nil {
279 return nil, err
280 }
281 return b, nil
282}
283
284func newIndexBuilder() *IndexBuilder {
285 return &IndexBuilder{
286 indexFormatVersion: IndexFormatVersion,
287 featureVersion: FeatureVersion,
288
289 contentPostings: newPostingsBuilder(),
290 namePostings: newPostingsBuilder(),
291 fileEndSymbol: []uint32{0},
292 symIndex: make(map[string]uint32),
293 symKindIndex: make(map[string]uint32),
294 languageMap: make(map[string]uint16),
295 }
296}
297
298func (b *IndexBuilder) setRepository(desc *Repository) error {
299 if err := desc.verify(); err != nil {
300 return err
301 }
302
303 if len(desc.Branches) > 64 {
304 return fmt.Errorf("too many branches")
305 }
306
307 repo := *desc
308
309 // copy subrepomap without root
310 repo.SubRepoMap = map[string]*Repository{}
311 for k, v := range desc.SubRepoMap {
312 if k != "" {
313 repo.SubRepoMap[k] = v
314 }
315 }
316
317 b.repoList = append(b.repoList, repo)
318
319 return b.populateSubRepoIndices()
320}
321
322type DocumentSection struct {
323 Start, End uint32
324}
325
326// Document holds a document (file) to index.
327type Document struct {
328 Name string
329 Content []byte
330 Branches []string
331 SubRepositoryPath string
332 Language string
333
334 // If set, something is wrong with the file contents, and this
335 // is the reason it wasn't indexed.
336 SkipReason string
337
338 // Document sections for symbols. Offsets should use bytes.
339 Symbols []DocumentSection
340 SymbolsMetaData []*Symbol
341
342 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
343 // file in the git repo.
344 //
345 // Two documents can be ordered by comparing the components of their rank
346 // vectors. Bigger entries are better, as are longer vectors.
347 //
348 // This field is experimental and may change at any time without warning.
349 Ranks []float64
350}
351
352type symbolSlice struct {
353 symbols []DocumentSection
354 metaData []*Symbol
355}
356
357func (s symbolSlice) Len() int { return len(s.symbols) }
358
359func (s symbolSlice) Swap(i, j int) {
360 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
361 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
362}
363
364func (s symbolSlice) Less(i, j int) bool {
365 return s.symbols[i].Start < s.symbols[j].Start
366}
367
368// AddFile is a convenience wrapper for Add
369func (b *IndexBuilder) AddFile(name string, content []byte) error {
370 return b.Add(Document{Name: name, Content: content})
371}
372
373func (b *IndexBuilder) populateSubRepoIndices() error {
374 if len(b.subRepoIndices) == len(b.repoList) {
375 return nil
376 }
377 if len(b.subRepoIndices) != len(b.repoList)-1 {
378 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
379 }
380 repo := b.repoList[len(b.repoList)-1]
381 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
382 return nil
383}
384
385func mkSubRepoIndices(repo Repository) map[string]uint32 {
386 paths := []string{""}
387 for k := range repo.SubRepoMap {
388 paths = append(paths, k)
389 }
390 sort.Strings(paths)
391 subRepoIndices := make(map[string]uint32, len(paths))
392 for i, p := range paths {
393 subRepoIndices[p] = uint32(i)
394 }
395 return subRepoIndices
396}
397
398const notIndexedMarker = "NOT-INDEXED: "
399
400func (b *IndexBuilder) symbolID(sym string) uint32 {
401 if _, ok := b.symIndex[sym]; !ok {
402 b.symIndex[sym] = b.symID
403 b.symID++
404 }
405 return b.symIndex[sym]
406}
407
408func (b *IndexBuilder) symbolKindID(t string) uint32 {
409 if _, ok := b.symKindIndex[t]; !ok {
410 b.symKindIndex[t] = b.symKindID
411 b.symKindID++
412 }
413 return b.symKindIndex[t]
414}
415
416func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
417 for _, sym := range symbols {
418 b.symMetaData = append(b.symMetaData,
419 // This field was removed due to redundancy. To avoid
420 // needing to reindex, it is set to zero for now. In the
421 // future, this field will be completely removed. It
422 // will require incrementing the feature version.
423 0,
424 b.symbolKindID(sym.Kind),
425 b.symbolID(sym.Parent),
426 b.symbolKindID(sym.ParentKind))
427 }
428}
429
430func DetermineLanguageIfUnknown(doc *Document) {
431 if doc.Language == "" {
432 doc.Language = languages.GetLanguage(doc.Name, doc.Content)
433 }
434}
435
436// Add a file which only occurs in certain branches.
437func (b *IndexBuilder) Add(doc Document) error {
438 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
439
440 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
441 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
442 doc.Language = "binary"
443 }
444
445 if doc.SkipReason != "" {
446 doc.Content = []byte(notIndexedMarker + doc.SkipReason)
447 doc.Symbols = nil
448 doc.SymbolsMetaData = nil
449 if doc.Language == "" {
450 doc.Language = "skipped"
451 }
452 }
453
454 DetermineLanguageIfUnknown(&doc)
455
456 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
457 var last DocumentSection
458 for i, s := range doc.Symbols {
459 if i > 0 {
460 if last.End > s.Start {
461 return fmt.Errorf("sections overlap")
462 }
463 }
464 last = s
465 }
466 if last.End > uint32(len(doc.Content)) {
467 return fmt.Errorf("section goes past end of content")
468 }
469
470 if doc.SubRepositoryPath != "" {
471 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
472 if err != nil || rel == doc.Name {
473 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
474 }
475 }
476 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
477 if err != nil {
478 return err
479 }
480 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
481 if err != nil {
482 return err
483 }
484 b.addSymbols(doc.SymbolsMetaData)
485
486 repoIdx := len(b.repoList) - 1
487 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
488 if !ok {
489 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
490 }
491
492 var mask uint64
493 for _, br := range doc.Branches {
494 m := b.branchMask(br)
495 if m == 0 {
496 return fmt.Errorf("no branch found for %s", br)
497 }
498 mask |= m
499 }
500
501 if repoIdx > 1<<16 {
502 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
503 }
504
505 b.subRepos = append(b.subRepos, subRepoIdx)
506 b.repos = append(b.repos, uint16(repoIdx))
507
508 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
509 // always nil.
510 b.ranks = append(b.ranks, doc.Ranks)
511
512 hasher.Write(doc.Content)
513
514 b.contentStrings = append(b.contentStrings, docStr)
515 b.runeDocSections = append(b.runeDocSections, runeSecs...)
516
517 b.nameStrings = append(b.nameStrings, nameStr)
518 b.docSections = append(b.docSections, doc.Symbols)
519 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
520 b.branchMasks = append(b.branchMasks, mask)
521 b.checksums = append(b.checksums, hasher.Sum(nil)...)
522
523 langCode, ok := b.languageMap[doc.Language]
524 if !ok {
525 if len(b.languageMap) >= 65535 {
526 return fmt.Errorf("too many languages")
527 }
528 langCode = uint16(len(b.languageMap))
529 b.languageMap[doc.Language] = langCode
530 }
531 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
532
533 return nil
534}
535
536func (b *IndexBuilder) branchMask(br string) uint64 {
537 for i, b := range b.repoList[len(b.repoList)-1].Branches {
538 if b.Name == br {
539 return uint64(1) << uint(i)
540 }
541 }
542 return 0
543}
544
545type DocChecker struct {
546 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
547 trigrams map[ngram]struct{}
548}
549
550// Check returns a reason why the given contents are probably not source texts.
551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
552 if len(content) == 0 {
553 return nil
554 }
555
556 if len(content) < ngramSize {
557 return fmt.Errorf("file size smaller than %d", ngramSize)
558 }
559
560 if index := bytes.IndexByte(content, 0); index > 0 {
561 return fmt.Errorf("binary data at byte offset %d", index)
562 }
563
564 // PERF: we only need to do the trigram check if the upperbound on content is greater than
565 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
566 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
567 return nil
568 }
569
570 var cur [3]rune
571 byteCount := 0
572 t.clearTrigrams(maxTrigramCount)
573
574 for len(content) > 0 {
575 r, sz := utf8.DecodeRune(content)
576 content = content[sz:]
577 byteCount += sz
578
579 cur[0], cur[1], cur[2] = cur[1], cur[2], r
580 if cur[0] == 0 {
581 // start of file.
582 continue
583 }
584
585 t.trigrams[runesToNGram(cur)] = struct{}{}
586 if len(t.trigrams) > maxTrigramCount {
587 // probably not text.
588 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
589 }
590 }
591 return nil
592}
593
594func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
595 if t.trigrams == nil {
596 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
597 }
598 for key := range t.trigrams {
599 delete(t.trigrams, key)
600 }
601}