fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "sort"
27 "strings"
28 "text/template"
29 "time"
30 "unicode/utf8"
31
32 "slices"
33
34 "github.com/sourcegraph/zoekt"
35 "github.com/sourcegraph/zoekt/internal/languages"
36)
37
38var _ = log.Println
39
40const ngramSize = 3
41
42type searchableString struct {
43 data []byte
44}
45
46// Filled by the linker
47var Version string
48
49func HostnameBestEffort() string {
50 if h := os.Getenv("NODE_NAME"); h != "" {
51 return h
52 }
53 if h := os.Getenv("HOSTNAME"); h != "" {
54 return h
55 }
56 hostname, _ := os.Hostname()
57 return hostname
58}
59
60// Store character (unicode codepoint) offset (in bytes) this often.
61const runeOffsetFrequency = 100
62
63type postingsBuilder struct {
64 postings map[ngram][]byte
65 lastOffsets map[ngram]uint32
66
67 // To support UTF-8 searching, we must map back runes to byte
68 // offsets. As a first attempt, we sample regularly. The
69 // precise offset can be found by walking from the recorded
70 // offset to the desired rune.
71 runeOffsets []uint32
72 runeCount uint32
73
74 isPlainASCII bool
75
76 endRunes []uint32
77 endByte uint32
78}
79
80func newPostingsBuilder() *postingsBuilder {
81 return &postingsBuilder{
82 postings: map[ngram][]byte{},
83 lastOffsets: map[ngram]uint32{},
84 isPlainASCII: true,
85 }
86}
87
88// Store trigram offsets for the given UTF-8 data. The
89// DocumentSections must correspond to rune boundaries in the UTF-8
90// data.
91func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
92 dest := searchableString{
93 data: data,
94 }
95 var buf [8]byte
96 var runeGram [3]rune
97
98 var runeIndex uint32
99 byteCount := 0
100 dataSz := uint32(len(data))
101
102 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
103 for _, s := range byteSections {
104 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
105 }
106 var runeSectionBoundaries []uint32
107
108 endRune := s.runeCount
109 for ; len(data) > 0; runeIndex++ {
110 c, sz := utf8.DecodeRune(data)
111 if sz > 1 {
112 s.isPlainASCII = false
113 }
114 data = data[sz:]
115
116 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
117
118 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
119 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
120 }
121 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
122 runeSectionBoundaries = append(runeSectionBoundaries,
123 endRune+uint32(runeIndex))
124 byteSectionBoundaries = byteSectionBoundaries[1:]
125 }
126
127 byteCount += sz
128
129 if runeIndex < 2 {
130 continue
131 }
132
133 ng := runesToNGram(runeGram)
134 lastOff := s.lastOffsets[ng]
135 newOff := endRune + uint32(runeIndex) - 2
136
137 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
138 s.postings[ng] = append(s.postings[ng], buf[:m]...)
139 s.lastOffsets[ng] = newOff
140 }
141 s.runeCount += runeIndex
142
143 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
144 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
145 }
146
147 // Handle symbol definition that ends at file end. This can
148 // happen for labels at the end of .bat files.
149
150 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
151 runeSectionBoundaries = append(runeSectionBoundaries,
152 endRune+runeIndex)
153 byteSectionBoundaries = byteSectionBoundaries[1:]
154 }
155 runeSecs := make([]DocumentSection, 0, len(byteSections))
156 for i := 0; i < len(runeSectionBoundaries); i += 2 {
157 runeSecs = append(runeSecs, DocumentSection{
158 Start: runeSectionBoundaries[i],
159 End: runeSectionBoundaries[i+1],
160 })
161 }
162
163 s.endRunes = append(s.endRunes, s.runeCount)
164 s.endByte += dataSz
165 return &dest, runeSecs, nil
166}
167
168// ShardBuilder builds a single index shard.
169type ShardBuilder struct {
170 // The version we will write to disk. Sourcegraph Specific. This is to
171 // enable feature flagging new format versions.
172 indexFormatVersion int
173 featureVersion int
174
175 contentStrings []*searchableString
176 nameStrings []*searchableString
177 docSections [][]DocumentSection
178 runeDocSections []DocumentSection
179
180 symID uint32
181 symIndex map[string]uint32
182 symKindID uint32
183 symKindIndex map[string]uint32
184 symMetaData []uint32
185
186 fileEndSymbol []uint32
187
188 checksums []byte
189
190 branchMasks []uint64
191 subRepos []uint32
192
193 // docID => repoID
194 repos []uint16
195
196 contentPostings *postingsBuilder
197 namePostings *postingsBuilder
198
199 // root repositories
200 repoList []zoekt.Repository
201
202 // name to index.
203 subRepoIndices []map[string]uint32
204
205 // language => language code
206 languageMap map[string]uint16
207
208 // language codes, uint16 encoded as little-endian
209 languages []uint8
210
211 categories []byte
212
213 // IndexTime will be used as the time if non-zero. Otherwise
214 // time.Now(). This is useful for doing reproducible builds in tests.
215 IndexTime time.Time
216
217 // a sortable 20 chars long id.
218 ID string
219}
220
221func verify(repo *zoekt.Repository) error {
222 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
223 if _, err := ParseTemplate(t); err != nil {
224 return err
225 }
226 }
227 return nil
228}
229
230func urlJoinPath(base string, elem ...string) string {
231 // golangs html/template always escapes "+" appearing in an HTML attribute
232 // [1]. We may even want to treat more characters, differently but this
233 // atleast makes it possible to visit URLs like [2].
234 //
235 // We only do this to elem since base will normally be a hardcoded string.
236 //
237 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
238 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
239 elem = slices.Clone(elem) // copy to mutate
240 for i := range elem {
241 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
242 }
243 u, err := url.JoinPath(base, elem...)
244 if err != nil {
245 return "#!error: " + err.Error()
246 }
247 return u
248}
249
250// ParseTemplate will parse the templates for FileURLTemplate,
251// LineFragmentTemplate and CommitURLTemplate.
252//
253// It makes available the extra function UrlJoinPath.
254func ParseTemplate(text string) (*template.Template, error) {
255 return template.New("").Funcs(template.FuncMap{
256 "URLJoinPath": urlJoinPath,
257 }).Parse(text)
258}
259
260// ContentSize returns the number of content bytes so far ingested.
261func (b *ShardBuilder) ContentSize() uint32 {
262 // Add the name too so we don't skip building index if we have
263 // lots of empty files.
264 return b.contentPostings.endByte + b.namePostings.endByte
265}
266
267// NumFiles returns the number of files added to this builder
268func (b *ShardBuilder) NumFiles() int {
269 return len(b.contentStrings)
270}
271
272// NewShardBuilder creates a fresh ShardBuilder. The passed in
273// Repository contains repo metadata, and may be set to nil.
274func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
275 b := newShardBuilder()
276
277 if r == nil {
278 r = &zoekt.Repository{}
279 }
280 if err := b.setRepository(r); err != nil {
281 return nil, err
282 }
283 return b, nil
284}
285
286func newShardBuilder() *ShardBuilder {
287 return &ShardBuilder{
288 indexFormatVersion: IndexFormatVersion,
289 featureVersion: FeatureVersion,
290
291 contentPostings: newPostingsBuilder(),
292 namePostings: newPostingsBuilder(),
293 fileEndSymbol: []uint32{0},
294 symIndex: make(map[string]uint32),
295 symKindIndex: make(map[string]uint32),
296 languageMap: make(map[string]uint16),
297 }
298}
299
300func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
301 if err := verify(desc); err != nil {
302 return err
303 }
304
305 if len(desc.Branches) > 64 {
306 return fmt.Errorf("too many branches")
307 }
308
309 repo := *desc
310
311 // copy subrepomap without root
312 repo.SubRepoMap = map[string]*zoekt.Repository{}
313 for k, v := range desc.SubRepoMap {
314 if k != "" {
315 repo.SubRepoMap[k] = v
316 }
317 }
318
319 b.repoList = append(b.repoList, repo)
320
321 return b.populateSubRepoIndices()
322}
323
324type symbolSlice struct {
325 symbols []DocumentSection
326 metaData []*zoekt.Symbol
327}
328
329func (s symbolSlice) Len() int { return len(s.symbols) }
330
331func (s symbolSlice) Swap(i, j int) {
332 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
333 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
334}
335
336func (s symbolSlice) Less(i, j int) bool {
337 return s.symbols[i].Start < s.symbols[j].Start
338}
339
340// AddFile is a convenience wrapper for Add
341func (b *ShardBuilder) AddFile(name string, content []byte) error {
342 return b.Add(Document{Name: name, Content: content})
343}
344
345func (b *ShardBuilder) populateSubRepoIndices() error {
346 if len(b.subRepoIndices) == len(b.repoList) {
347 return nil
348 }
349 if len(b.subRepoIndices) != len(b.repoList)-1 {
350 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
351 }
352 repo := b.repoList[len(b.repoList)-1]
353 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
354 return nil
355}
356
357func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
358 paths := []string{""}
359 for k := range repo.SubRepoMap {
360 paths = append(paths, k)
361 }
362 sort.Strings(paths)
363 subRepoIndices := make(map[string]uint32, len(paths))
364 for i, p := range paths {
365 subRepoIndices[p] = uint32(i)
366 }
367 return subRepoIndices
368}
369
370const notIndexedMarker = "NOT-INDEXED: "
371
372func (b *ShardBuilder) symbolID(sym string) uint32 {
373 if _, ok := b.symIndex[sym]; !ok {
374 b.symIndex[sym] = b.symID
375 b.symID++
376 }
377 return b.symIndex[sym]
378}
379
380func (b *ShardBuilder) symbolKindID(t string) uint32 {
381 if _, ok := b.symKindIndex[t]; !ok {
382 b.symKindIndex[t] = b.symKindID
383 b.symKindID++
384 }
385 return b.symKindIndex[t]
386}
387
388func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
389 for _, sym := range symbols {
390 b.symMetaData = append(b.symMetaData,
391 // This field was removed due to redundancy. To avoid
392 // needing to reindex, it is set to zero for now. In the
393 // future, this field will be completely removed. It
394 // will require incrementing the feature version.
395 0,
396 b.symbolKindID(sym.Kind),
397 b.symbolID(sym.Parent),
398 b.symbolKindID(sym.ParentKind))
399 }
400}
401
402func DetermineLanguageIfUnknown(doc *Document) {
403 if doc.Language != "" {
404 return
405 }
406
407 if doc.SkipReason != SkipReasonNone {
408 // If this document has been skipped, it's likely very large, or it's a non-code file like binary.
409 // In this case, we just guess the language based on file name to avoid examining the contents.
410 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
411 doc.Language = languages.GetLanguage(doc.Name, nil)
412 } else {
413 doc.Language = languages.GetLanguage(doc.Name, doc.Content)
414 }
415}
416
417// Add a file which only occurs in certain branches.
418func (b *ShardBuilder) Add(doc Document) error {
419 if index := bytes.IndexByte(doc.Content, 0); index > 0 {
420 doc.SkipReason = SkipReasonBinary
421 }
422
423 if doc.SkipReason != SkipReasonNone {
424 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
425 doc.Symbols = nil
426 doc.SymbolsMetaData = nil
427 }
428
429 DetermineLanguageIfUnknown(&doc)
430 DetermineFileCategory(&doc)
431
432 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
433 var last DocumentSection
434 for i, s := range doc.Symbols {
435 if i > 0 {
436 if last.End > s.Start {
437 return fmt.Errorf("sections overlap")
438 }
439 }
440 last = s
441 }
442 if last.End > uint32(len(doc.Content)) {
443 return fmt.Errorf("section goes past end of content")
444 }
445
446 if doc.SubRepositoryPath != "" {
447 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
448 if err != nil || rel == doc.Name {
449 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
450 }
451 }
452 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
453 if err != nil {
454 return err
455 }
456 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
457 if err != nil {
458 return err
459 }
460 b.addSymbols(doc.SymbolsMetaData)
461
462 repoIdx := len(b.repoList) - 1
463 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
464 if !ok {
465 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
466 }
467
468 var mask uint64
469 for _, br := range doc.Branches {
470 m := b.branchMask(br)
471 if m == 0 {
472 return fmt.Errorf("no branch found for %s", br)
473 }
474 mask |= m
475 }
476
477 if repoIdx > 1<<16 {
478 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
479 }
480
481 b.subRepos = append(b.subRepos, subRepoIdx)
482 b.repos = append(b.repos, uint16(repoIdx))
483
484 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
485 hasher.Write(doc.Content)
486
487 b.contentStrings = append(b.contentStrings, docStr)
488 b.runeDocSections = append(b.runeDocSections, runeSecs...)
489
490 b.nameStrings = append(b.nameStrings, nameStr)
491 b.docSections = append(b.docSections, doc.Symbols)
492 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
493 b.branchMasks = append(b.branchMasks, mask)
494 b.checksums = append(b.checksums, hasher.Sum(nil)...)
495
496 langCode, ok := b.languageMap[doc.Language]
497 if !ok {
498 if len(b.languageMap) >= 65535 {
499 return fmt.Errorf("too many languages")
500 }
501 langCode = uint16(len(b.languageMap))
502 b.languageMap[doc.Language] = langCode
503 }
504 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
505
506 category, err := doc.Category.encode()
507 if err != nil {
508 return err
509 }
510 b.categories = append(b.categories, category)
511
512 return nil
513}
514
515func (b *ShardBuilder) branchMask(br string) uint64 {
516 for i, b := range b.repoList[len(b.repoList)-1].Branches {
517 if b.Name == br {
518 return uint64(1) << uint(i)
519 }
520 }
521 return 0
522}
523
524// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
525// is missing or there are no repos, this returns false.
526func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
527 if len(b.repoList) == 0 {
528 return nil, false
529 }
530
531 ids := make([]uint32, 0, len(b.repoList))
532 for _, repo := range b.repoList {
533 if repo.ID == 0 {
534 return nil, false
535 }
536 ids = append(ids, repo.ID)
537 }
538 return ids, true
539}
540
541type DocChecker struct {
542 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
543 trigrams map[ngram]struct{}
544}
545
546// Check returns a reason why the given contents are probably not source texts.
547func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
548 if len(content) == 0 {
549 return SkipReasonNone
550 }
551
552 if len(content) < ngramSize {
553 return SkipReasonTooSmall
554 }
555
556 if index := bytes.IndexByte(content, 0); index > 0 {
557 return SkipReasonBinary
558 }
559
560 // PERF: we only need to do the trigram check if the upperbound on content is greater than
561 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
562 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
563 return SkipReasonNone
564 }
565
566 var cur [3]rune
567 byteCount := 0
568 t.clearTrigrams(maxTrigramCount)
569
570 for len(content) > 0 {
571 r, sz := utf8.DecodeRune(content)
572 content = content[sz:]
573 byteCount += sz
574
575 cur[0], cur[1], cur[2] = cur[1], cur[2], r
576 if cur[0] == 0 {
577 // start of file.
578 continue
579 }
580
581 t.trigrams[runesToNGram(cur)] = struct{}{}
582 if len(t.trigrams) > maxTrigramCount {
583 // probably not text.
584 return SkipReasonTooManyTrigrams
585 }
586 }
587 return SkipReasonNone
588}
589
590func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
591 if t.trigrams == nil {
592 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
593 }
594 for key := range t.trigrams {
595 delete(t.trigrams, key)
596 }
597}
598
599// shardName returns the name of the shard for the given prefix, version, and
600// shard number.
601func shardName(indexDir string, prefix string, version, n int) string {
602 prefix = url.QueryEscape(prefix)
603 if len(prefix) > 200 {
604 prefix = prefix[:200] + hashString(prefix)[:8]
605 }
606 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
607}