fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "slices"
27 "sort"
28 "strings"
29 "text/template"
30 "time"
31 "unicode/utf8"
32
33 "github.com/sourcegraph/zoekt"
34 "github.com/sourcegraph/zoekt/languages"
35)
36
37var _ = log.Println
38
39const ngramSize = 3
40
41type searchableString struct {
42 data []byte
43}
44
45// Filled by the linker
46var Version string
47
48func HostnameBestEffort() string {
49 if h := os.Getenv("NODE_NAME"); h != "" {
50 return h
51 }
52 if h := os.Getenv("HOSTNAME"); h != "" {
53 return h
54 }
55 hostname, _ := os.Hostname()
56 return hostname
57}
58
59// Store character (unicode codepoint) offset (in bytes) this often.
60const runeOffsetFrequency = 100
61
62type postingsBuilder struct {
63 postings map[ngram][]byte
64 lastOffsets map[ngram]uint32
65
66 // To support UTF-8 searching, we must map back runes to byte
67 // offsets. As a first attempt, we sample regularly. The
68 // precise offset can be found by walking from the recorded
69 // offset to the desired rune.
70 runeOffsets []uint32
71 runeCount uint32
72
73 isPlainASCII bool
74
75 endRunes []uint32
76 endByte uint32
77}
78
79func newPostingsBuilder() *postingsBuilder {
80 return &postingsBuilder{
81 postings: map[ngram][]byte{},
82 lastOffsets: map[ngram]uint32{},
83 isPlainASCII: true,
84 }
85}
86
87// Store trigram offsets for the given UTF-8 data. The
88// DocumentSections must correspond to rune boundaries in the UTF-8
89// data.
90func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
91 dest := searchableString{
92 data: data,
93 }
94 var buf [8]byte
95 var runeGram [3]rune
96
97 var runeIndex uint32
98 byteCount := 0
99 dataSz := uint32(len(data))
100
101 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
102 for _, s := range byteSections {
103 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
104 }
105 var runeSectionBoundaries []uint32
106
107 endRune := s.runeCount
108 for ; len(data) > 0; runeIndex++ {
109 c, sz := utf8.DecodeRune(data)
110 if sz > 1 {
111 s.isPlainASCII = false
112 }
113 data = data[sz:]
114
115 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
116
117 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
118 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
119 }
120 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
121 runeSectionBoundaries = append(runeSectionBoundaries,
122 endRune+uint32(runeIndex))
123 byteSectionBoundaries = byteSectionBoundaries[1:]
124 }
125
126 byteCount += sz
127
128 if runeIndex < 2 {
129 continue
130 }
131
132 ng := runesToNGram(runeGram)
133 lastOff := s.lastOffsets[ng]
134 newOff := endRune + uint32(runeIndex) - 2
135
136 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
137 s.postings[ng] = append(s.postings[ng], buf[:m]...)
138 s.lastOffsets[ng] = newOff
139 }
140 s.runeCount += runeIndex
141
142 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
143 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
144 }
145
146 // Handle symbol definition that ends at file end. This can
147 // happen for labels at the end of .bat files.
148
149 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
150 runeSectionBoundaries = append(runeSectionBoundaries,
151 endRune+runeIndex)
152 byteSectionBoundaries = byteSectionBoundaries[1:]
153 }
154 runeSecs := make([]DocumentSection, 0, len(byteSections))
155 for i := 0; i < len(runeSectionBoundaries); i += 2 {
156 runeSecs = append(runeSecs, DocumentSection{
157 Start: runeSectionBoundaries[i],
158 End: runeSectionBoundaries[i+1],
159 })
160 }
161
162 s.endRunes = append(s.endRunes, s.runeCount)
163 s.endByte += dataSz
164 return &dest, runeSecs, nil
165}
166
167// ShardBuilder builds a single index shard.
168type ShardBuilder struct {
169 // The version we will write to disk. Sourcegraph Specific. This is to
170 // enable feature flagging new format versions.
171 indexFormatVersion int
172 featureVersion int
173
174 contentStrings []*searchableString
175 nameStrings []*searchableString
176 docSections [][]DocumentSection
177 runeDocSections []DocumentSection
178
179 symID uint32
180 symIndex map[string]uint32
181 symKindID uint32
182 symKindIndex map[string]uint32
183 symMetaData []uint32
184
185 fileEndSymbol []uint32
186
187 checksums []byte
188
189 branchMasks []uint64
190 subRepos []uint32
191
192 // docID => repoID
193 repos []uint16
194
195 contentPostings *postingsBuilder
196 namePostings *postingsBuilder
197
198 // root repositories
199 repoList []zoekt.Repository
200
201 // name to index.
202 subRepoIndices []map[string]uint32
203
204 // language => language code
205 languageMap map[string]uint16
206
207 // language codes, uint16 encoded as little-endian
208 languages []uint8
209
210 categories []byte
211
212 // IndexTime will be used as the time if non-zero. Otherwise
213 // time.Now(). This is useful for doing reproducible builds in tests.
214 IndexTime time.Time
215
216 // a sortable 20 chars long id.
217 ID string
218}
219
220func verify(repo *zoekt.Repository) error {
221 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
222 if _, err := ParseTemplate(t); err != nil {
223 return err
224 }
225 }
226 return nil
227}
228
229func urlJoinPath(base string, elem ...string) string {
230 // golangs html/template always escapes "+" appearing in an HTML attribute
231 // [1]. We may even want to treat more characters, differently but this
232 // atleast makes it possible to visit URLs like [2].
233 //
234 // We only do this to elem since base will normally be a hardcoded string.
235 //
236 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
237 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
238 elem = slices.Clone(elem) // copy to mutate
239 for i := range elem {
240 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
241 }
242 u, err := url.JoinPath(base, elem...)
243 if err != nil {
244 return "#!error: " + err.Error()
245 }
246 return u
247}
248
249// ParseTemplate will parse the templates for FileURLTemplate,
250// LineFragmentTemplate and CommitURLTemplate.
251//
252// It makes available the extra function UrlJoinPath.
253func ParseTemplate(text string) (*template.Template, error) {
254 return template.New("").Funcs(template.FuncMap{
255 "URLJoinPath": urlJoinPath,
256 }).Parse(text)
257}
258
259// ContentSize returns the number of content bytes so far ingested.
260func (b *ShardBuilder) ContentSize() uint32 {
261 // Add the name too so we don't skip building index if we have
262 // lots of empty files.
263 return b.contentPostings.endByte + b.namePostings.endByte
264}
265
266// NumFiles returns the number of files added to this builder
267func (b *ShardBuilder) NumFiles() int {
268 return len(b.contentStrings)
269}
270
271// NewShardBuilder creates a fresh ShardBuilder. The passed in
272// Repository contains repo metadata, and may be set to nil.
273func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
274 b := newShardBuilder()
275
276 if r == nil {
277 r = &zoekt.Repository{}
278 }
279 if err := b.setRepository(r); err != nil {
280 return nil, err
281 }
282 return b, nil
283}
284
285func newShardBuilder() *ShardBuilder {
286 return &ShardBuilder{
287 indexFormatVersion: IndexFormatVersion,
288 featureVersion: FeatureVersion,
289
290 contentPostings: newPostingsBuilder(),
291 namePostings: newPostingsBuilder(),
292 fileEndSymbol: []uint32{0},
293 symIndex: make(map[string]uint32),
294 symKindIndex: make(map[string]uint32),
295 languageMap: make(map[string]uint16),
296 }
297}
298
299func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
300 if err := verify(desc); err != nil {
301 return err
302 }
303
304 if len(desc.Branches) > 64 {
305 return fmt.Errorf("too many branches")
306 }
307
308 repo := *desc
309
310 // copy subrepomap without root
311 repo.SubRepoMap = map[string]*zoekt.Repository{}
312 for k, v := range desc.SubRepoMap {
313 if k != "" {
314 repo.SubRepoMap[k] = v
315 }
316 }
317
318 b.repoList = append(b.repoList, repo)
319
320 return b.populateSubRepoIndices()
321}
322
323type symbolSlice struct {
324 symbols []DocumentSection
325 metaData []*zoekt.Symbol
326}
327
328func (s symbolSlice) Len() int { return len(s.symbols) }
329
330func (s symbolSlice) Swap(i, j int) {
331 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
332 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
333}
334
335func (s symbolSlice) Less(i, j int) bool {
336 return s.symbols[i].Start < s.symbols[j].Start
337}
338
339// AddFile is a convenience wrapper for Add
340func (b *ShardBuilder) AddFile(name string, content []byte) error {
341 return b.Add(Document{Name: name, Content: content})
342}
343
344func (b *ShardBuilder) populateSubRepoIndices() error {
345 if len(b.subRepoIndices) == len(b.repoList) {
346 return nil
347 }
348 if len(b.subRepoIndices) != len(b.repoList)-1 {
349 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
350 }
351 repo := b.repoList[len(b.repoList)-1]
352 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
353 return nil
354}
355
356func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
357 paths := []string{""}
358 for k := range repo.SubRepoMap {
359 paths = append(paths, k)
360 }
361 sort.Strings(paths)
362 subRepoIndices := make(map[string]uint32, len(paths))
363 for i, p := range paths {
364 subRepoIndices[p] = uint32(i)
365 }
366 return subRepoIndices
367}
368
369const notIndexedMarker = "NOT-INDEXED: "
370
371func (b *ShardBuilder) symbolID(sym string) uint32 {
372 if _, ok := b.symIndex[sym]; !ok {
373 b.symIndex[sym] = b.symID
374 b.symID++
375 }
376 return b.symIndex[sym]
377}
378
379func (b *ShardBuilder) symbolKindID(t string) uint32 {
380 if _, ok := b.symKindIndex[t]; !ok {
381 b.symKindIndex[t] = b.symKindID
382 b.symKindID++
383 }
384 return b.symKindIndex[t]
385}
386
387func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
388 for _, sym := range symbols {
389 b.symMetaData = append(b.symMetaData,
390 // This field was removed due to redundancy. To avoid
391 // needing to reindex, it is set to zero for now. In the
392 // future, this field will be completely removed. It
393 // will require incrementing the feature version.
394 0,
395 b.symbolKindID(sym.Kind),
396 b.symbolID(sym.Parent),
397 b.symbolKindID(sym.ParentKind))
398 }
399}
400
401func DetermineLanguageIfUnknown(doc *Document) {
402 if doc.Language != "" {
403 return
404 }
405
406 // If this document has been skipped (doc.SkipReason != SkipReasonNone), it's
407 // likely very large, or it's a non-code file like binary. In this case, we just
408 // guess the language based on the file name to avoid examining the contents.
409 // Note: passing nil content is allowed by the go-enry contract (the underlying
410 // library we use here).
411 var content []byte
412 if doc.SkipReason == SkipReasonNone {
413 content = doc.Content
414 }
415 langs := languages.GetLanguagesFromContent(doc.Name, content)
416 if len(langs) > 0 {
417 doc.Language = langs[0]
418 }
419}
420
421// Add a file which only occurs in certain branches.
422func (b *ShardBuilder) Add(doc Document) error {
423 if index := bytes.IndexByte(doc.Content, 0); index > 0 {
424 doc.SkipReason = SkipReasonBinary
425 }
426
427 if doc.SkipReason != SkipReasonNone {
428 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
429 doc.Symbols = nil
430 doc.SymbolsMetaData = nil
431 }
432
433 DetermineLanguageIfUnknown(&doc)
434 DetermineFileCategory(&doc)
435
436 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
437 var last DocumentSection
438 for i, s := range doc.Symbols {
439 if i > 0 {
440 if last.End > s.Start {
441 return fmt.Errorf("sections overlap")
442 }
443 }
444 last = s
445 }
446 if last.End > uint32(len(doc.Content)) {
447 return fmt.Errorf("section goes past end of content")
448 }
449
450 if doc.SubRepositoryPath != "" {
451 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
452 if err != nil || rel == doc.Name {
453 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
454 }
455 }
456 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
457 if err != nil {
458 return err
459 }
460 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
461 if err != nil {
462 return err
463 }
464 b.addSymbols(doc.SymbolsMetaData)
465
466 repoIdx := len(b.repoList) - 1
467 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
468 if !ok {
469 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
470 }
471
472 var mask uint64
473 for _, br := range doc.Branches {
474 m := b.branchMask(br)
475 if m == 0 {
476 return fmt.Errorf("no branch found for %s", br)
477 }
478 mask |= m
479 }
480
481 if repoIdx > 1<<16 {
482 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
483 }
484
485 b.subRepos = append(b.subRepos, subRepoIdx)
486 b.repos = append(b.repos, uint16(repoIdx))
487
488 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
489 hasher.Write(doc.Content)
490
491 b.contentStrings = append(b.contentStrings, docStr)
492 b.runeDocSections = append(b.runeDocSections, runeSecs...)
493
494 b.nameStrings = append(b.nameStrings, nameStr)
495 b.docSections = append(b.docSections, doc.Symbols)
496 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
497 b.branchMasks = append(b.branchMasks, mask)
498 b.checksums = append(b.checksums, hasher.Sum(nil)...)
499
500 langCode, ok := b.languageMap[doc.Language]
501 if !ok {
502 if len(b.languageMap) >= 65535 {
503 return fmt.Errorf("too many languages")
504 }
505 langCode = uint16(len(b.languageMap))
506 b.languageMap[doc.Language] = langCode
507 }
508 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
509
510 category, err := doc.Category.encode()
511 if err != nil {
512 return err
513 }
514 b.categories = append(b.categories, category)
515
516 return nil
517}
518
519func (b *ShardBuilder) branchMask(br string) uint64 {
520 for i, b := range b.repoList[len(b.repoList)-1].Branches {
521 if b.Name == br {
522 return uint64(1) << uint(i)
523 }
524 }
525 return 0
526}
527
528// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
529// is missing or there are no repos, this returns false.
530func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
531 if len(b.repoList) == 0 {
532 return nil, false
533 }
534
535 ids := make([]uint32, 0, len(b.repoList))
536 for _, repo := range b.repoList {
537 if repo.ID == 0 {
538 return nil, false
539 }
540 ids = append(ids, repo.ID)
541 }
542 return ids, true
543}
544
545type DocChecker struct {
546 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
547 trigrams map[ngram]struct{}
548}
549
550// Check returns a reason why the given contents are probably not source texts.
551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
552 if len(content) == 0 {
553 return SkipReasonNone
554 }
555
556 if len(content) < ngramSize {
557 return SkipReasonTooSmall
558 }
559
560 if index := bytes.IndexByte(content, 0); index > 0 {
561 return SkipReasonBinary
562 }
563
564 // PERF: we only need to do the trigram check if the upperbound on content is greater than
565 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
566 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
567 return SkipReasonNone
568 }
569
570 var cur [3]rune
571 byteCount := 0
572 t.clearTrigrams(maxTrigramCount)
573
574 for len(content) > 0 {
575 r, sz := utf8.DecodeRune(content)
576 content = content[sz:]
577 byteCount += sz
578
579 cur[0], cur[1], cur[2] = cur[1], cur[2], r
580 if cur[0] == 0 {
581 // start of file.
582 continue
583 }
584
585 t.trigrams[runesToNGram(cur)] = struct{}{}
586 if len(t.trigrams) > maxTrigramCount {
587 // probably not text.
588 return SkipReasonTooManyTrigrams
589 }
590 }
591 return SkipReasonNone
592}
593
594func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
595 if t.trigrams == nil {
596 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
597 }
598 for key := range t.trigrams {
599 delete(t.trigrams, key)
600 }
601}
602
603// shardName returns the name of the shard for the given prefix, version, and
604// shard number.
605func shardName(indexDir string, prefix string, version, n int) string {
606 prefix = url.QueryEscape(prefix)
607 if len(prefix) > 200 {
608 prefix = prefix[:200] + hashString(prefix)[:8]
609 }
610 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
611}