index/shard_builder.go at 971fcf5e9d821ede22fd7f7675215e350dfa4021 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / shard_builder.go
at 971fcf5e9d821ede22fd7f7675215e350dfa4021 16 kB View raw
Stefan Hengl copy languages package from Sourcegraph to Zoekt (#979) 10mo ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"net/url"
 24	"os"
 25	"path/filepath"
 26	"slices"
 27	"sort"
 28	"strings"
 29	"text/template"
 30	"time"
 31	"unicode/utf8"
 32
 33	"github.com/sourcegraph/zoekt"
 34	"github.com/sourcegraph/zoekt/languages"
 35)
 36
 37var _ = log.Println
 38
 39const ngramSize = 3
 40
 41type searchableString struct {
 42	data []byte
 43}
 44
 45// Filled by the linker
 46var Version string
 47
 48func HostnameBestEffort() string {
 49	if h := os.Getenv("NODE_NAME"); h != "" {
 50		return h
 51	}
 52	if h := os.Getenv("HOSTNAME"); h != "" {
 53		return h
 54	}
 55	hostname, _ := os.Hostname()
 56	return hostname
 57}
 58
 59// Store character (unicode codepoint) offset (in bytes) this often.
 60const runeOffsetFrequency = 100
 61
 62type postingsBuilder struct {
 63	postings    map[ngram][]byte
 64	lastOffsets map[ngram]uint32
 65
 66	// To support UTF-8 searching, we must map back runes to byte
 67	// offsets. As a first attempt, we sample regularly. The
 68	// precise offset can be found by walking from the recorded
 69	// offset to the desired rune.
 70	runeOffsets []uint32
 71	runeCount   uint32
 72
 73	isPlainASCII bool
 74
 75	endRunes []uint32
 76	endByte  uint32
 77}
 78
 79func newPostingsBuilder() *postingsBuilder {
 80	return &postingsBuilder{
 81		postings:     map[ngram][]byte{},
 82		lastOffsets:  map[ngram]uint32{},
 83		isPlainASCII: true,
 84	}
 85}
 86
 87// Store trigram offsets for the given UTF-8 data. The
 88// DocumentSections must correspond to rune boundaries in the UTF-8
 89// data.
 90func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 91	dest := searchableString{
 92		data: data,
 93	}
 94	var buf [8]byte
 95	var runeGram [3]rune
 96
 97	var runeIndex uint32
 98	byteCount := 0
 99	dataSz := uint32(len(data))
100
101	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
102	for _, s := range byteSections {
103		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
104	}
105	var runeSectionBoundaries []uint32
106
107	endRune := s.runeCount
108	for ; len(data) > 0; runeIndex++ {
109		c, sz := utf8.DecodeRune(data)
110		if sz > 1 {
111			s.isPlainASCII = false
112		}
113		data = data[sz:]
114
115		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
116
117		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
118			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
119		}
120		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
121			runeSectionBoundaries = append(runeSectionBoundaries,
122				endRune+uint32(runeIndex))
123			byteSectionBoundaries = byteSectionBoundaries[1:]
124		}
125
126		byteCount += sz
127
128		if runeIndex < 2 {
129			continue
130		}
131
132		ng := runesToNGram(runeGram)
133		lastOff := s.lastOffsets[ng]
134		newOff := endRune + uint32(runeIndex) - 2
135
136		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
137		s.postings[ng] = append(s.postings[ng], buf[:m]...)
138		s.lastOffsets[ng] = newOff
139	}
140	s.runeCount += runeIndex
141
142	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
143		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
144	}
145
146	// Handle symbol definition that ends at file end. This can
147	// happen for labels at the end of .bat files.
148
149	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
150		runeSectionBoundaries = append(runeSectionBoundaries,
151			endRune+runeIndex)
152		byteSectionBoundaries = byteSectionBoundaries[1:]
153	}
154	runeSecs := make([]DocumentSection, 0, len(byteSections))
155	for i := 0; i < len(runeSectionBoundaries); i += 2 {
156		runeSecs = append(runeSecs, DocumentSection{
157			Start: runeSectionBoundaries[i],
158			End:   runeSectionBoundaries[i+1],
159		})
160	}
161
162	s.endRunes = append(s.endRunes, s.runeCount)
163	s.endByte += dataSz
164	return &dest, runeSecs, nil
165}
166
167// ShardBuilder builds a single index shard.
168type ShardBuilder struct {
169	// The version we will write to disk. Sourcegraph Specific. This is to
170	// enable feature flagging new format versions.
171	indexFormatVersion int
172	featureVersion     int
173
174	contentStrings  []*searchableString
175	nameStrings     []*searchableString
176	docSections     [][]DocumentSection
177	runeDocSections []DocumentSection
178
179	symID        uint32
180	symIndex     map[string]uint32
181	symKindID    uint32
182	symKindIndex map[string]uint32
183	symMetaData  []uint32
184
185	fileEndSymbol []uint32
186
187	checksums []byte
188
189	branchMasks []uint64
190	subRepos    []uint32
191
192	// docID => repoID
193	repos []uint16
194
195	contentPostings *postingsBuilder
196	namePostings    *postingsBuilder
197
198	// root repositories
199	repoList []zoekt.Repository
200
201	// name to index.
202	subRepoIndices []map[string]uint32
203
204	// language => language code
205	languageMap map[string]uint16
206
207	// language codes, uint16 encoded as little-endian
208	languages []uint8
209
210	categories []byte
211
212	// IndexTime will be used as the time if non-zero. Otherwise
213	// time.Now(). This is useful for doing reproducible builds in tests.
214	IndexTime time.Time
215
216	// a sortable 20 chars long id.
217	ID string
218}
219
220func verify(repo *zoekt.Repository) error {
221	for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
222		if _, err := ParseTemplate(t); err != nil {
223			return err
224		}
225	}
226	return nil
227}
228
229func urlJoinPath(base string, elem ...string) string {
230	// golangs html/template always escapes "+" appearing in an HTML attribute
231	// [1]. We may even want to treat more characters, differently but this
232	// atleast makes it possible to visit URLs like [2].
233	//
234	// We only do this to elem since base will normally be a hardcoded string.
235	//
236	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
237	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
238	elem = slices.Clone(elem) // copy to mutate
239	for i := range elem {
240		elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
241	}
242	u, err := url.JoinPath(base, elem...)
243	if err != nil {
244		return "#!error: " + err.Error()
245	}
246	return u
247}
248
249// ParseTemplate will parse the templates for FileURLTemplate,
250// LineFragmentTemplate and CommitURLTemplate.
251//
252// It makes available the extra function UrlJoinPath.
253func ParseTemplate(text string) (*template.Template, error) {
254	return template.New("").Funcs(template.FuncMap{
255		"URLJoinPath": urlJoinPath,
256	}).Parse(text)
257}
258
259// ContentSize returns the number of content bytes so far ingested.
260func (b *ShardBuilder) ContentSize() uint32 {
261	// Add the name too so we don't skip building index if we have
262	// lots of empty files.
263	return b.contentPostings.endByte + b.namePostings.endByte
264}
265
266// NumFiles returns the number of files added to this builder
267func (b *ShardBuilder) NumFiles() int {
268	return len(b.contentStrings)
269}
270
271// NewShardBuilder creates a fresh ShardBuilder. The passed in
272// Repository contains repo metadata, and may be set to nil.
273func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
274	b := newShardBuilder()
275
276	if r == nil {
277		r = &zoekt.Repository{}
278	}
279	if err := b.setRepository(r); err != nil {
280		return nil, err
281	}
282	return b, nil
283}
284
285func newShardBuilder() *ShardBuilder {
286	return &ShardBuilder{
287		indexFormatVersion: IndexFormatVersion,
288		featureVersion:     FeatureVersion,
289
290		contentPostings: newPostingsBuilder(),
291		namePostings:    newPostingsBuilder(),
292		fileEndSymbol:   []uint32{0},
293		symIndex:        make(map[string]uint32),
294		symKindIndex:    make(map[string]uint32),
295		languageMap:     make(map[string]uint16),
296	}
297}
298
299func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
300	if err := verify(desc); err != nil {
301		return err
302	}
303
304	if len(desc.Branches) > 64 {
305		return fmt.Errorf("too many branches")
306	}
307
308	repo := *desc
309
310	// copy subrepomap without root
311	repo.SubRepoMap = map[string]*zoekt.Repository{}
312	for k, v := range desc.SubRepoMap {
313		if k != "" {
314			repo.SubRepoMap[k] = v
315		}
316	}
317
318	b.repoList = append(b.repoList, repo)
319
320	return b.populateSubRepoIndices()
321}
322
323type symbolSlice struct {
324	symbols  []DocumentSection
325	metaData []*zoekt.Symbol
326}
327
328func (s symbolSlice) Len() int { return len(s.symbols) }
329
330func (s symbolSlice) Swap(i, j int) {
331	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
332	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
333}
334
335func (s symbolSlice) Less(i, j int) bool {
336	return s.symbols[i].Start < s.symbols[j].Start
337}
338
339// AddFile is a convenience wrapper for Add
340func (b *ShardBuilder) AddFile(name string, content []byte) error {
341	return b.Add(Document{Name: name, Content: content})
342}
343
344func (b *ShardBuilder) populateSubRepoIndices() error {
345	if len(b.subRepoIndices) == len(b.repoList) {
346		return nil
347	}
348	if len(b.subRepoIndices) != len(b.repoList)-1 {
349		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
350	}
351	repo := b.repoList[len(b.repoList)-1]
352	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
353	return nil
354}
355
356func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
357	paths := []string{""}
358	for k := range repo.SubRepoMap {
359		paths = append(paths, k)
360	}
361	sort.Strings(paths)
362	subRepoIndices := make(map[string]uint32, len(paths))
363	for i, p := range paths {
364		subRepoIndices[p] = uint32(i)
365	}
366	return subRepoIndices
367}
368
369const notIndexedMarker = "NOT-INDEXED: "
370
371func (b *ShardBuilder) symbolID(sym string) uint32 {
372	if _, ok := b.symIndex[sym]; !ok {
373		b.symIndex[sym] = b.symID
374		b.symID++
375	}
376	return b.symIndex[sym]
377}
378
379func (b *ShardBuilder) symbolKindID(t string) uint32 {
380	if _, ok := b.symKindIndex[t]; !ok {
381		b.symKindIndex[t] = b.symKindID
382		b.symKindID++
383	}
384	return b.symKindIndex[t]
385}
386
387func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
388	for _, sym := range symbols {
389		b.symMetaData = append(b.symMetaData,
390			// This field was removed due to redundancy. To avoid
391			// needing to reindex, it is set to zero for now. In the
392			// future, this field will be completely removed. It
393			// will require incrementing the feature version.
394			0,
395			b.symbolKindID(sym.Kind),
396			b.symbolID(sym.Parent),
397			b.symbolKindID(sym.ParentKind))
398	}
399}
400
401func DetermineLanguageIfUnknown(doc *Document) {
402	if doc.Language != "" {
403		return
404	}
405
406	// If this document has been skipped (doc.SkipReason != SkipReasonNone), it's
407	// likely very large, or it's a non-code file like binary. In this case, we just
408	// guess the language based on the file name to avoid examining the contents.
409	// Note: passing nil content is allowed by the go-enry contract (the underlying
410	// library we use here).
411	var content []byte
412	if doc.SkipReason == SkipReasonNone {
413		content = doc.Content
414	}
415	langs := languages.GetLanguagesFromContent(doc.Name, content)
416	if len(langs) > 0 {
417		doc.Language = langs[0]
418	}
419}
420
421// Add a file which only occurs in certain branches.
422func (b *ShardBuilder) Add(doc Document) error {
423	if index := bytes.IndexByte(doc.Content, 0); index > 0 {
424		doc.SkipReason = SkipReasonBinary
425	}
426
427	if doc.SkipReason != SkipReasonNone {
428		doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
429		doc.Symbols = nil
430		doc.SymbolsMetaData = nil
431	}
432
433	DetermineLanguageIfUnknown(&doc)
434	DetermineFileCategory(&doc)
435
436	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
437	var last DocumentSection
438	for i, s := range doc.Symbols {
439		if i > 0 {
440			if last.End > s.Start {
441				return fmt.Errorf("sections overlap")
442			}
443		}
444		last = s
445	}
446	if last.End > uint32(len(doc.Content)) {
447		return fmt.Errorf("section goes past end of content")
448	}
449
450	if doc.SubRepositoryPath != "" {
451		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
452		if err != nil || rel == doc.Name {
453			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
454		}
455	}
456	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
457	if err != nil {
458		return err
459	}
460	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
461	if err != nil {
462		return err
463	}
464	b.addSymbols(doc.SymbolsMetaData)
465
466	repoIdx := len(b.repoList) - 1
467	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
468	if !ok {
469		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
470	}
471
472	var mask uint64
473	for _, br := range doc.Branches {
474		m := b.branchMask(br)
475		if m == 0 {
476			return fmt.Errorf("no branch found for %s", br)
477		}
478		mask |= m
479	}
480
481	if repoIdx > 1<<16 {
482		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
483	}
484
485	b.subRepos = append(b.subRepos, subRepoIdx)
486	b.repos = append(b.repos, uint16(repoIdx))
487
488	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
489	hasher.Write(doc.Content)
490
491	b.contentStrings = append(b.contentStrings, docStr)
492	b.runeDocSections = append(b.runeDocSections, runeSecs...)
493
494	b.nameStrings = append(b.nameStrings, nameStr)
495	b.docSections = append(b.docSections, doc.Symbols)
496	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
497	b.branchMasks = append(b.branchMasks, mask)
498	b.checksums = append(b.checksums, hasher.Sum(nil)...)
499
500	langCode, ok := b.languageMap[doc.Language]
501	if !ok {
502		if len(b.languageMap) >= 65535 {
503			return fmt.Errorf("too many languages")
504		}
505		langCode = uint16(len(b.languageMap))
506		b.languageMap[doc.Language] = langCode
507	}
508	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
509
510	category, err := doc.Category.encode()
511	if err != nil {
512		return err
513	}
514	b.categories = append(b.categories, category)
515
516	return nil
517}
518
519func (b *ShardBuilder) branchMask(br string) uint64 {
520	for i, b := range b.repoList[len(b.repoList)-1].Branches {
521		if b.Name == br {
522			return uint64(1) << uint(i)
523		}
524	}
525	return 0
526}
527
528// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
529// is missing or there are no repos, this returns false.
530func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
531	if len(b.repoList) == 0 {
532		return nil, false
533	}
534
535	ids := make([]uint32, 0, len(b.repoList))
536	for _, repo := range b.repoList {
537		if repo.ID == 0 {
538			return nil, false
539		}
540		ids = append(ids, repo.ID)
541	}
542	return ids, true
543}
544
545type DocChecker struct {
546	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
547	trigrams map[ngram]struct{}
548}
549
550// Check returns a reason why the given contents are probably not source texts.
551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
552	if len(content) == 0 {
553		return SkipReasonNone
554	}
555
556	if len(content) < ngramSize {
557		return SkipReasonTooSmall
558	}
559
560	if index := bytes.IndexByte(content, 0); index > 0 {
561		return SkipReasonBinary
562	}
563
564	// PERF: we only need to do the trigram check if the upperbound on content is greater than
565	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
566	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
567		return SkipReasonNone
568	}
569
570	var cur [3]rune
571	byteCount := 0
572	t.clearTrigrams(maxTrigramCount)
573
574	for len(content) > 0 {
575		r, sz := utf8.DecodeRune(content)
576		content = content[sz:]
577		byteCount += sz
578
579		cur[0], cur[1], cur[2] = cur[1], cur[2], r
580		if cur[0] == 0 {
581			// start of file.
582			continue
583		}
584
585		t.trigrams[runesToNGram(cur)] = struct{}{}
586		if len(t.trigrams) > maxTrigramCount {
587			// probably not text.
588			return SkipReasonTooManyTrigrams
589		}
590	}
591	return SkipReasonNone
592}
593
594func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
595	if t.trigrams == nil {
596		t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
597	}
598	for key := range t.trigrams {
599		delete(t.trigrams, key)
600	}
601}
602
603// shardName returns the name of the shard for the given prefix, version, and
604// shard number.
605func shardName(indexDir string, prefix string, version, n int) string {
606	prefix = url.QueryEscape(prefix)
607	if len(prefix) > 200 {
608		prefix = prefix[:200] + hashString(prefix)[:8]
609	}
610	return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
611}
Configure Feed

Configure Feed