fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "sort"
27 "strings"
28 "text/template"
29 "time"
30 "unicode/utf8"
31
32 "github.com/sourcegraph/zoekt"
33 "github.com/sourcegraph/zoekt/internal/languages"
34)
35
36var _ = log.Println
37
38const ngramSize = 3
39
40type searchableString struct {
41 data []byte
42}
43
44// Filled by the linker
45var Version string
46
47func HostnameBestEffort() string {
48 if h := os.Getenv("NODE_NAME"); h != "" {
49 return h
50 }
51 if h := os.Getenv("HOSTNAME"); h != "" {
52 return h
53 }
54 hostname, _ := os.Hostname()
55 return hostname
56}
57
58// Store character (unicode codepoint) offset (in bytes) this often.
59const runeOffsetFrequency = 100
60
61type postingsBuilder struct {
62 postings map[ngram][]byte
63 lastOffsets map[ngram]uint32
64
65 // To support UTF-8 searching, we must map back runes to byte
66 // offsets. As a first attempt, we sample regularly. The
67 // precise offset can be found by walking from the recorded
68 // offset to the desired rune.
69 runeOffsets []uint32
70 runeCount uint32
71
72 isPlainASCII bool
73
74 endRunes []uint32
75 endByte uint32
76}
77
78func newPostingsBuilder() *postingsBuilder {
79 return &postingsBuilder{
80 postings: map[ngram][]byte{},
81 lastOffsets: map[ngram]uint32{},
82 isPlainASCII: true,
83 }
84}
85
86// Store trigram offsets for the given UTF-8 data. The
87// DocumentSections must correspond to rune boundaries in the UTF-8
88// data.
89func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
90 dest := searchableString{
91 data: data,
92 }
93 var buf [8]byte
94 var runeGram [3]rune
95
96 var runeIndex uint32
97 byteCount := 0
98 dataSz := uint32(len(data))
99
100 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
101 for _, s := range byteSections {
102 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
103 }
104 var runeSectionBoundaries []uint32
105
106 endRune := s.runeCount
107 for ; len(data) > 0; runeIndex++ {
108 c, sz := utf8.DecodeRune(data)
109 if sz > 1 {
110 s.isPlainASCII = false
111 }
112 data = data[sz:]
113
114 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
115
116 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
117 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
118 }
119 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
120 runeSectionBoundaries = append(runeSectionBoundaries,
121 endRune+uint32(runeIndex))
122 byteSectionBoundaries = byteSectionBoundaries[1:]
123 }
124
125 byteCount += sz
126
127 if runeIndex < 2 {
128 continue
129 }
130
131 ng := runesToNGram(runeGram)
132 lastOff := s.lastOffsets[ng]
133 newOff := endRune + uint32(runeIndex) - 2
134
135 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
136 s.postings[ng] = append(s.postings[ng], buf[:m]...)
137 s.lastOffsets[ng] = newOff
138 }
139 s.runeCount += runeIndex
140
141 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
142 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
143 }
144
145 // Handle symbol definition that ends at file end. This can
146 // happen for labels at the end of .bat files.
147
148 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
149 runeSectionBoundaries = append(runeSectionBoundaries,
150 endRune+runeIndex)
151 byteSectionBoundaries = byteSectionBoundaries[1:]
152 }
153 runeSecs := make([]DocumentSection, 0, len(byteSections))
154 for i := 0; i < len(runeSectionBoundaries); i += 2 {
155 runeSecs = append(runeSecs, DocumentSection{
156 Start: runeSectionBoundaries[i],
157 End: runeSectionBoundaries[i+1],
158 })
159 }
160
161 s.endRunes = append(s.endRunes, s.runeCount)
162 s.endByte += dataSz
163 return &dest, runeSecs, nil
164}
165
166// ShardBuilder builds a single index shard.
167type ShardBuilder struct {
168 // The version we will write to disk. Sourcegraph Specific. This is to
169 // enable feature flagging new format versions.
170 indexFormatVersion int
171 featureVersion int
172
173 contentStrings []*searchableString
174 nameStrings []*searchableString
175 docSections [][]DocumentSection
176 runeDocSections []DocumentSection
177
178 symID uint32
179 symIndex map[string]uint32
180 symKindID uint32
181 symKindIndex map[string]uint32
182 symMetaData []uint32
183
184 fileEndSymbol []uint32
185
186 checksums []byte
187
188 branchMasks []uint64
189 subRepos []uint32
190
191 // docID => repoID
192 repos []uint16
193
194 contentPostings *postingsBuilder
195 namePostings *postingsBuilder
196
197 // root repositories
198 repoList []zoekt.Repository
199
200 // name to index.
201 subRepoIndices []map[string]uint32
202
203 // language => language code
204 languageMap map[string]uint16
205
206 // language codes, uint16 encoded as little-endian
207 languages []uint8
208
209 categories []byte
210
211 // IndexTime will be used as the time if non-zero. Otherwise
212 // time.Now(). This is useful for doing reproducible builds in tests.
213 IndexTime time.Time
214
215 // a sortable 20 chars long id.
216 ID string
217}
218
219func verify(repo *zoekt.Repository) error {
220 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
221 if _, err := ParseTemplate(t); err != nil {
222 return err
223 }
224 }
225 return nil
226}
227
228func urlJoinPath(base string, elem ...string) string {
229 // golangs html/template always escapes "+" appearing in an HTML attribute
230 // [1]. We may even want to treat more characters, differently but this
231 // atleast makes it possible to visit URLs like [2].
232 //
233 // We only do this to elem since base will normally be a hardcoded string.
234 //
235 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
236 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
237 elem = append([]string{}, elem...) // copy to mutate
238 for i := range elem {
239 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
240 }
241 u, err := url.JoinPath(base, elem...)
242 if err != nil {
243 return "#!error: " + err.Error()
244 }
245 return u
246}
247
248// ParseTemplate will parse the templates for FileURLTemplate,
249// LineFragmentTemplate and CommitURLTemplate.
250//
251// It makes available the extra function UrlJoinPath.
252func ParseTemplate(text string) (*template.Template, error) {
253 return template.New("").Funcs(template.FuncMap{
254 "URLJoinPath": urlJoinPath,
255 }).Parse(text)
256}
257
258// ContentSize returns the number of content bytes so far ingested.
259func (b *ShardBuilder) ContentSize() uint32 {
260 // Add the name too so we don't skip building index if we have
261 // lots of empty files.
262 return b.contentPostings.endByte + b.namePostings.endByte
263}
264
265// NumFiles returns the number of files added to this builder
266func (b *ShardBuilder) NumFiles() int {
267 return len(b.contentStrings)
268}
269
270// NewShardBuilder creates a fresh ShardBuilder. The passed in
271// Repository contains repo metadata, and may be set to nil.
272func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
273 b := newShardBuilder()
274
275 if r == nil {
276 r = &zoekt.Repository{}
277 }
278 if err := b.setRepository(r); err != nil {
279 return nil, err
280 }
281 return b, nil
282}
283
284func newShardBuilder() *ShardBuilder {
285 return &ShardBuilder{
286 indexFormatVersion: IndexFormatVersion,
287 featureVersion: FeatureVersion,
288
289 contentPostings: newPostingsBuilder(),
290 namePostings: newPostingsBuilder(),
291 fileEndSymbol: []uint32{0},
292 symIndex: make(map[string]uint32),
293 symKindIndex: make(map[string]uint32),
294 languageMap: make(map[string]uint16),
295 }
296}
297
298func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
299 if err := verify(desc); err != nil {
300 return err
301 }
302
303 if len(desc.Branches) > 64 {
304 return fmt.Errorf("too many branches")
305 }
306
307 repo := *desc
308
309 // copy subrepomap without root
310 repo.SubRepoMap = map[string]*zoekt.Repository{}
311 for k, v := range desc.SubRepoMap {
312 if k != "" {
313 repo.SubRepoMap[k] = v
314 }
315 }
316
317 b.repoList = append(b.repoList, repo)
318
319 return b.populateSubRepoIndices()
320}
321
322type symbolSlice struct {
323 symbols []DocumentSection
324 metaData []*zoekt.Symbol
325}
326
327func (s symbolSlice) Len() int { return len(s.symbols) }
328
329func (s symbolSlice) Swap(i, j int) {
330 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
331 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
332}
333
334func (s symbolSlice) Less(i, j int) bool {
335 return s.symbols[i].Start < s.symbols[j].Start
336}
337
338// AddFile is a convenience wrapper for Add
339func (b *ShardBuilder) AddFile(name string, content []byte) error {
340 return b.Add(Document{Name: name, Content: content})
341}
342
343func (b *ShardBuilder) populateSubRepoIndices() error {
344 if len(b.subRepoIndices) == len(b.repoList) {
345 return nil
346 }
347 if len(b.subRepoIndices) != len(b.repoList)-1 {
348 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
349 }
350 repo := b.repoList[len(b.repoList)-1]
351 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
352 return nil
353}
354
355func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
356 paths := []string{""}
357 for k := range repo.SubRepoMap {
358 paths = append(paths, k)
359 }
360 sort.Strings(paths)
361 subRepoIndices := make(map[string]uint32, len(paths))
362 for i, p := range paths {
363 subRepoIndices[p] = uint32(i)
364 }
365 return subRepoIndices
366}
367
368const notIndexedMarker = "NOT-INDEXED: "
369
370func (b *ShardBuilder) symbolID(sym string) uint32 {
371 if _, ok := b.symIndex[sym]; !ok {
372 b.symIndex[sym] = b.symID
373 b.symID++
374 }
375 return b.symIndex[sym]
376}
377
378func (b *ShardBuilder) symbolKindID(t string) uint32 {
379 if _, ok := b.symKindIndex[t]; !ok {
380 b.symKindIndex[t] = b.symKindID
381 b.symKindID++
382 }
383 return b.symKindIndex[t]
384}
385
386func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
387 for _, sym := range symbols {
388 b.symMetaData = append(b.symMetaData,
389 // This field was removed due to redundancy. To avoid
390 // needing to reindex, it is set to zero for now. In the
391 // future, this field will be completely removed. It
392 // will require incrementing the feature version.
393 0,
394 b.symbolKindID(sym.Kind),
395 b.symbolID(sym.Parent),
396 b.symbolKindID(sym.ParentKind))
397 }
398}
399
400func DetermineLanguageIfUnknown(doc *Document) {
401 if doc.Language != "" {
402 return
403 }
404
405 if doc.SkipReason != SkipReasonNone {
406 // If this document has been skipped, it's likely very large, or it's a non-code file like binary.
407 // In this case, we just guess the language based on file name to avoid examining the contents.
408 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
409 doc.Language = languages.GetLanguage(doc.Name, nil)
410 } else {
411 doc.Language = languages.GetLanguage(doc.Name, doc.Content)
412 }
413}
414
415// Add a file which only occurs in certain branches.
416func (b *ShardBuilder) Add(doc Document) error {
417 if index := bytes.IndexByte(doc.Content, 0); index > 0 {
418 doc.SkipReason = SkipReasonBinary
419 }
420
421 if doc.SkipReason != SkipReasonNone {
422 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
423 doc.Symbols = nil
424 doc.SymbolsMetaData = nil
425 }
426
427 DetermineLanguageIfUnknown(&doc)
428 DetermineFileCategory(&doc)
429
430 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
431 var last DocumentSection
432 for i, s := range doc.Symbols {
433 if i > 0 {
434 if last.End > s.Start {
435 return fmt.Errorf("sections overlap")
436 }
437 }
438 last = s
439 }
440 if last.End > uint32(len(doc.Content)) {
441 return fmt.Errorf("section goes past end of content")
442 }
443
444 if doc.SubRepositoryPath != "" {
445 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
446 if err != nil || rel == doc.Name {
447 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
448 }
449 }
450 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
451 if err != nil {
452 return err
453 }
454 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
455 if err != nil {
456 return err
457 }
458 b.addSymbols(doc.SymbolsMetaData)
459
460 repoIdx := len(b.repoList) - 1
461 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
462 if !ok {
463 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
464 }
465
466 var mask uint64
467 for _, br := range doc.Branches {
468 m := b.branchMask(br)
469 if m == 0 {
470 return fmt.Errorf("no branch found for %s", br)
471 }
472 mask |= m
473 }
474
475 if repoIdx > 1<<16 {
476 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
477 }
478
479 b.subRepos = append(b.subRepos, subRepoIdx)
480 b.repos = append(b.repos, uint16(repoIdx))
481
482 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
483 hasher.Write(doc.Content)
484
485 b.contentStrings = append(b.contentStrings, docStr)
486 b.runeDocSections = append(b.runeDocSections, runeSecs...)
487
488 b.nameStrings = append(b.nameStrings, nameStr)
489 b.docSections = append(b.docSections, doc.Symbols)
490 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
491 b.branchMasks = append(b.branchMasks, mask)
492 b.checksums = append(b.checksums, hasher.Sum(nil)...)
493
494 langCode, ok := b.languageMap[doc.Language]
495 if !ok {
496 if len(b.languageMap) >= 65535 {
497 return fmt.Errorf("too many languages")
498 }
499 langCode = uint16(len(b.languageMap))
500 b.languageMap[doc.Language] = langCode
501 }
502 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
503
504 category, err := doc.Category.encode()
505 if err != nil {
506 return err
507 }
508 b.categories = append(b.categories, category)
509
510 return nil
511}
512
513func (b *ShardBuilder) branchMask(br string) uint64 {
514 for i, b := range b.repoList[len(b.repoList)-1].Branches {
515 if b.Name == br {
516 return uint64(1) << uint(i)
517 }
518 }
519 return 0
520}
521
522// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
523// is missing or there are no repos, this returns false.
524func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
525 if len(b.repoList) == 0 {
526 return nil, false
527 }
528
529 ids := make([]uint32, 0, len(b.repoList))
530 for _, repo := range b.repoList {
531 if repo.ID == 0 {
532 return nil, false
533 }
534 ids = append(ids, repo.ID)
535 }
536 return ids, true
537}
538
539type DocChecker struct {
540 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
541 trigrams map[ngram]struct{}
542}
543
544// Check returns a reason why the given contents are probably not source texts.
545func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
546 if len(content) == 0 {
547 return SkipReasonNone
548 }
549
550 if len(content) < ngramSize {
551 return SkipReasonTooSmall
552 }
553
554 if index := bytes.IndexByte(content, 0); index > 0 {
555 return SkipReasonBinary
556 }
557
558 // PERF: we only need to do the trigram check if the upperbound on content is greater than
559 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
560 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
561 return SkipReasonNone
562 }
563
564 var cur [3]rune
565 byteCount := 0
566 t.clearTrigrams(maxTrigramCount)
567
568 for len(content) > 0 {
569 r, sz := utf8.DecodeRune(content)
570 content = content[sz:]
571 byteCount += sz
572
573 cur[0], cur[1], cur[2] = cur[1], cur[2], r
574 if cur[0] == 0 {
575 // start of file.
576 continue
577 }
578
579 t.trigrams[runesToNGram(cur)] = struct{}{}
580 if len(t.trigrams) > maxTrigramCount {
581 // probably not text.
582 return SkipReasonTooManyTrigrams
583 }
584 }
585 return SkipReasonNone
586}
587
588func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
589 if t.trigrams == nil {
590 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
591 }
592 for key := range t.trigrams {
593 delete(t.trigrams, key)
594 }
595}
596
597// ShardName returns the name of the shard for the given prefix, version, and
598// shard number.
599func ShardName(indexDir string, prefix string, version, n int) string {
600 prefix = url.QueryEscape(prefix)
601 if len(prefix) > 200 {
602 prefix = prefix[:200] + hashString(prefix)[:8]
603 }
604 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
605}