index/shard_builder.go at 8da4bd9f30f5ca41230ab083e39c028d06353a19 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / shard_builder.go
at 8da4bd9f30f5ca41230ab083e39c028d06353a19 16 kB View raw
Keegan Carruthers-Smith index: decide between tenant and non-tenant shard name in one place (#953) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"net/url"
 24	"os"
 25	"path/filepath"
 26	"sort"
 27	"strings"
 28	"text/template"
 29	"time"
 30	"unicode/utf8"
 31
 32	"slices"
 33
 34	"github.com/sourcegraph/zoekt"
 35	"github.com/sourcegraph/zoekt/internal/languages"
 36)
 37
 38var _ = log.Println
 39
 40const ngramSize = 3
 41
 42type searchableString struct {
 43	data []byte
 44}
 45
 46// Filled by the linker
 47var Version string
 48
 49func HostnameBestEffort() string {
 50	if h := os.Getenv("NODE_NAME"); h != "" {
 51		return h
 52	}
 53	if h := os.Getenv("HOSTNAME"); h != "" {
 54		return h
 55	}
 56	hostname, _ := os.Hostname()
 57	return hostname
 58}
 59
 60// Store character (unicode codepoint) offset (in bytes) this often.
 61const runeOffsetFrequency = 100
 62
 63type postingsBuilder struct {
 64	postings    map[ngram][]byte
 65	lastOffsets map[ngram]uint32
 66
 67	// To support UTF-8 searching, we must map back runes to byte
 68	// offsets. As a first attempt, we sample regularly. The
 69	// precise offset can be found by walking from the recorded
 70	// offset to the desired rune.
 71	runeOffsets []uint32
 72	runeCount   uint32
 73
 74	isPlainASCII bool
 75
 76	endRunes []uint32
 77	endByte  uint32
 78}
 79
 80func newPostingsBuilder() *postingsBuilder {
 81	return &postingsBuilder{
 82		postings:     map[ngram][]byte{},
 83		lastOffsets:  map[ngram]uint32{},
 84		isPlainASCII: true,
 85	}
 86}
 87
 88// Store trigram offsets for the given UTF-8 data. The
 89// DocumentSections must correspond to rune boundaries in the UTF-8
 90// data.
 91func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 92	dest := searchableString{
 93		data: data,
 94	}
 95	var buf [8]byte
 96	var runeGram [3]rune
 97
 98	var runeIndex uint32
 99	byteCount := 0
100	dataSz := uint32(len(data))
101
102	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
103	for _, s := range byteSections {
104		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
105	}
106	var runeSectionBoundaries []uint32
107
108	endRune := s.runeCount
109	for ; len(data) > 0; runeIndex++ {
110		c, sz := utf8.DecodeRune(data)
111		if sz > 1 {
112			s.isPlainASCII = false
113		}
114		data = data[sz:]
115
116		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
117
118		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
119			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
120		}
121		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
122			runeSectionBoundaries = append(runeSectionBoundaries,
123				endRune+uint32(runeIndex))
124			byteSectionBoundaries = byteSectionBoundaries[1:]
125		}
126
127		byteCount += sz
128
129		if runeIndex < 2 {
130			continue
131		}
132
133		ng := runesToNGram(runeGram)
134		lastOff := s.lastOffsets[ng]
135		newOff := endRune + uint32(runeIndex) - 2
136
137		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
138		s.postings[ng] = append(s.postings[ng], buf[:m]...)
139		s.lastOffsets[ng] = newOff
140	}
141	s.runeCount += runeIndex
142
143	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
144		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
145	}
146
147	// Handle symbol definition that ends at file end. This can
148	// happen for labels at the end of .bat files.
149
150	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
151		runeSectionBoundaries = append(runeSectionBoundaries,
152			endRune+runeIndex)
153		byteSectionBoundaries = byteSectionBoundaries[1:]
154	}
155	runeSecs := make([]DocumentSection, 0, len(byteSections))
156	for i := 0; i < len(runeSectionBoundaries); i += 2 {
157		runeSecs = append(runeSecs, DocumentSection{
158			Start: runeSectionBoundaries[i],
159			End:   runeSectionBoundaries[i+1],
160		})
161	}
162
163	s.endRunes = append(s.endRunes, s.runeCount)
164	s.endByte += dataSz
165	return &dest, runeSecs, nil
166}
167
168// ShardBuilder builds a single index shard.
169type ShardBuilder struct {
170	// The version we will write to disk. Sourcegraph Specific. This is to
171	// enable feature flagging new format versions.
172	indexFormatVersion int
173	featureVersion     int
174
175	contentStrings  []*searchableString
176	nameStrings     []*searchableString
177	docSections     [][]DocumentSection
178	runeDocSections []DocumentSection
179
180	symID        uint32
181	symIndex     map[string]uint32
182	symKindID    uint32
183	symKindIndex map[string]uint32
184	symMetaData  []uint32
185
186	fileEndSymbol []uint32
187
188	checksums []byte
189
190	branchMasks []uint64
191	subRepos    []uint32
192
193	// docID => repoID
194	repos []uint16
195
196	contentPostings *postingsBuilder
197	namePostings    *postingsBuilder
198
199	// root repositories
200	repoList []zoekt.Repository
201
202	// name to index.
203	subRepoIndices []map[string]uint32
204
205	// language => language code
206	languageMap map[string]uint16
207
208	// language codes, uint16 encoded as little-endian
209	languages []uint8
210
211	categories []byte
212
213	// IndexTime will be used as the time if non-zero. Otherwise
214	// time.Now(). This is useful for doing reproducible builds in tests.
215	IndexTime time.Time
216
217	// a sortable 20 chars long id.
218	ID string
219}
220
221func verify(repo *zoekt.Repository) error {
222	for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
223		if _, err := ParseTemplate(t); err != nil {
224			return err
225		}
226	}
227	return nil
228}
229
230func urlJoinPath(base string, elem ...string) string {
231	// golangs html/template always escapes "+" appearing in an HTML attribute
232	// [1]. We may even want to treat more characters, differently but this
233	// atleast makes it possible to visit URLs like [2].
234	//
235	// We only do this to elem since base will normally be a hardcoded string.
236	//
237	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
238	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
239	elem = slices.Clone(elem) // copy to mutate
240	for i := range elem {
241		elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
242	}
243	u, err := url.JoinPath(base, elem...)
244	if err != nil {
245		return "#!error: " + err.Error()
246	}
247	return u
248}
249
250// ParseTemplate will parse the templates for FileURLTemplate,
251// LineFragmentTemplate and CommitURLTemplate.
252//
253// It makes available the extra function UrlJoinPath.
254func ParseTemplate(text string) (*template.Template, error) {
255	return template.New("").Funcs(template.FuncMap{
256		"URLJoinPath": urlJoinPath,
257	}).Parse(text)
258}
259
260// ContentSize returns the number of content bytes so far ingested.
261func (b *ShardBuilder) ContentSize() uint32 {
262	// Add the name too so we don't skip building index if we have
263	// lots of empty files.
264	return b.contentPostings.endByte + b.namePostings.endByte
265}
266
267// NumFiles returns the number of files added to this builder
268func (b *ShardBuilder) NumFiles() int {
269	return len(b.contentStrings)
270}
271
272// NewShardBuilder creates a fresh ShardBuilder. The passed in
273// Repository contains repo metadata, and may be set to nil.
274func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
275	b := newShardBuilder()
276
277	if r == nil {
278		r = &zoekt.Repository{}
279	}
280	if err := b.setRepository(r); err != nil {
281		return nil, err
282	}
283	return b, nil
284}
285
286func newShardBuilder() *ShardBuilder {
287	return &ShardBuilder{
288		indexFormatVersion: IndexFormatVersion,
289		featureVersion:     FeatureVersion,
290
291		contentPostings: newPostingsBuilder(),
292		namePostings:    newPostingsBuilder(),
293		fileEndSymbol:   []uint32{0},
294		symIndex:        make(map[string]uint32),
295		symKindIndex:    make(map[string]uint32),
296		languageMap:     make(map[string]uint16),
297	}
298}
299
300func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
301	if err := verify(desc); err != nil {
302		return err
303	}
304
305	if len(desc.Branches) > 64 {
306		return fmt.Errorf("too many branches")
307	}
308
309	repo := *desc
310
311	// copy subrepomap without root
312	repo.SubRepoMap = map[string]*zoekt.Repository{}
313	for k, v := range desc.SubRepoMap {
314		if k != "" {
315			repo.SubRepoMap[k] = v
316		}
317	}
318
319	b.repoList = append(b.repoList, repo)
320
321	return b.populateSubRepoIndices()
322}
323
324type symbolSlice struct {
325	symbols  []DocumentSection
326	metaData []*zoekt.Symbol
327}
328
329func (s symbolSlice) Len() int { return len(s.symbols) }
330
331func (s symbolSlice) Swap(i, j int) {
332	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
333	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
334}
335
336func (s symbolSlice) Less(i, j int) bool {
337	return s.symbols[i].Start < s.symbols[j].Start
338}
339
340// AddFile is a convenience wrapper for Add
341func (b *ShardBuilder) AddFile(name string, content []byte) error {
342	return b.Add(Document{Name: name, Content: content})
343}
344
345func (b *ShardBuilder) populateSubRepoIndices() error {
346	if len(b.subRepoIndices) == len(b.repoList) {
347		return nil
348	}
349	if len(b.subRepoIndices) != len(b.repoList)-1 {
350		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
351	}
352	repo := b.repoList[len(b.repoList)-1]
353	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
354	return nil
355}
356
357func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
358	paths := []string{""}
359	for k := range repo.SubRepoMap {
360		paths = append(paths, k)
361	}
362	sort.Strings(paths)
363	subRepoIndices := make(map[string]uint32, len(paths))
364	for i, p := range paths {
365		subRepoIndices[p] = uint32(i)
366	}
367	return subRepoIndices
368}
369
370const notIndexedMarker = "NOT-INDEXED: "
371
372func (b *ShardBuilder) symbolID(sym string) uint32 {
373	if _, ok := b.symIndex[sym]; !ok {
374		b.symIndex[sym] = b.symID
375		b.symID++
376	}
377	return b.symIndex[sym]
378}
379
380func (b *ShardBuilder) symbolKindID(t string) uint32 {
381	if _, ok := b.symKindIndex[t]; !ok {
382		b.symKindIndex[t] = b.symKindID
383		b.symKindID++
384	}
385	return b.symKindIndex[t]
386}
387
388func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
389	for _, sym := range symbols {
390		b.symMetaData = append(b.symMetaData,
391			// This field was removed due to redundancy. To avoid
392			// needing to reindex, it is set to zero for now. In the
393			// future, this field will be completely removed. It
394			// will require incrementing the feature version.
395			0,
396			b.symbolKindID(sym.Kind),
397			b.symbolID(sym.Parent),
398			b.symbolKindID(sym.ParentKind))
399	}
400}
401
402func DetermineLanguageIfUnknown(doc *Document) {
403	if doc.Language != "" {
404		return
405	}
406
407	if doc.SkipReason != SkipReasonNone {
408		// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
409		// In this case, we just guess the language based on file name to avoid examining the contents.
410		// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
411		doc.Language = languages.GetLanguage(doc.Name, nil)
412	} else {
413		doc.Language = languages.GetLanguage(doc.Name, doc.Content)
414	}
415}
416
417// Add a file which only occurs in certain branches.
418func (b *ShardBuilder) Add(doc Document) error {
419	if index := bytes.IndexByte(doc.Content, 0); index > 0 {
420		doc.SkipReason = SkipReasonBinary
421	}
422
423	if doc.SkipReason != SkipReasonNone {
424		doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
425		doc.Symbols = nil
426		doc.SymbolsMetaData = nil
427	}
428
429	DetermineLanguageIfUnknown(&doc)
430	DetermineFileCategory(&doc)
431
432	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
433	var last DocumentSection
434	for i, s := range doc.Symbols {
435		if i > 0 {
436			if last.End > s.Start {
437				return fmt.Errorf("sections overlap")
438			}
439		}
440		last = s
441	}
442	if last.End > uint32(len(doc.Content)) {
443		return fmt.Errorf("section goes past end of content")
444	}
445
446	if doc.SubRepositoryPath != "" {
447		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
448		if err != nil || rel == doc.Name {
449			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
450		}
451	}
452	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
453	if err != nil {
454		return err
455	}
456	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
457	if err != nil {
458		return err
459	}
460	b.addSymbols(doc.SymbolsMetaData)
461
462	repoIdx := len(b.repoList) - 1
463	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
464	if !ok {
465		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
466	}
467
468	var mask uint64
469	for _, br := range doc.Branches {
470		m := b.branchMask(br)
471		if m == 0 {
472			return fmt.Errorf("no branch found for %s", br)
473		}
474		mask |= m
475	}
476
477	if repoIdx > 1<<16 {
478		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
479	}
480
481	b.subRepos = append(b.subRepos, subRepoIdx)
482	b.repos = append(b.repos, uint16(repoIdx))
483
484	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
485	hasher.Write(doc.Content)
486
487	b.contentStrings = append(b.contentStrings, docStr)
488	b.runeDocSections = append(b.runeDocSections, runeSecs...)
489
490	b.nameStrings = append(b.nameStrings, nameStr)
491	b.docSections = append(b.docSections, doc.Symbols)
492	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
493	b.branchMasks = append(b.branchMasks, mask)
494	b.checksums = append(b.checksums, hasher.Sum(nil)...)
495
496	langCode, ok := b.languageMap[doc.Language]
497	if !ok {
498		if len(b.languageMap) >= 65535 {
499			return fmt.Errorf("too many languages")
500		}
501		langCode = uint16(len(b.languageMap))
502		b.languageMap[doc.Language] = langCode
503	}
504	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
505
506	category, err := doc.Category.encode()
507	if err != nil {
508		return err
509	}
510	b.categories = append(b.categories, category)
511
512	return nil
513}
514
515func (b *ShardBuilder) branchMask(br string) uint64 {
516	for i, b := range b.repoList[len(b.repoList)-1].Branches {
517		if b.Name == br {
518			return uint64(1) << uint(i)
519		}
520	}
521	return 0
522}
523
524// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
525// is missing or there are no repos, this returns false.
526func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
527	if len(b.repoList) == 0 {
528		return nil, false
529	}
530
531	ids := make([]uint32, 0, len(b.repoList))
532	for _, repo := range b.repoList {
533		if repo.ID == 0 {
534			return nil, false
535		}
536		ids = append(ids, repo.ID)
537	}
538	return ids, true
539}
540
541type DocChecker struct {
542	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
543	trigrams map[ngram]struct{}
544}
545
546// Check returns a reason why the given contents are probably not source texts.
547func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
548	if len(content) == 0 {
549		return SkipReasonNone
550	}
551
552	if len(content) < ngramSize {
553		return SkipReasonTooSmall
554	}
555
556	if index := bytes.IndexByte(content, 0); index > 0 {
557		return SkipReasonBinary
558	}
559
560	// PERF: we only need to do the trigram check if the upperbound on content is greater than
561	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
562	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
563		return SkipReasonNone
564	}
565
566	var cur [3]rune
567	byteCount := 0
568	t.clearTrigrams(maxTrigramCount)
569
570	for len(content) > 0 {
571		r, sz := utf8.DecodeRune(content)
572		content = content[sz:]
573		byteCount += sz
574
575		cur[0], cur[1], cur[2] = cur[1], cur[2], r
576		if cur[0] == 0 {
577			// start of file.
578			continue
579		}
580
581		t.trigrams[runesToNGram(cur)] = struct{}{}
582		if len(t.trigrams) > maxTrigramCount {
583			// probably not text.
584			return SkipReasonTooManyTrigrams
585		}
586	}
587	return SkipReasonNone
588}
589
590func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
591	if t.trigrams == nil {
592		t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
593	}
594	for key := range t.trigrams {
595		delete(t.trigrams, key)
596	}
597}
598
599// shardName returns the name of the shard for the given prefix, version, and
600// shard number.
601func shardName(indexDir string, prefix string, version, n int) string {
602	prefix = url.QueryEscape(prefix)
603	if len(prefix) > 200 {
604		prefix = prefix[:200] + hashString(prefix)[:8]
605	}
606	return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
607}
Configure Feed

Configure Feed