index/shard_builder.go at 05b2b92c2ebd0927249db50155dfe7375e0ed525 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / shard_builder.go
at 05b2b92c2ebd0927249db50155dfe7375e0ed525 16 kB View raw
Julie Tibshirani ranking: downweight binary files (#924) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"net/url"
 24	"os"
 25	"path/filepath"
 26	"sort"
 27	"strings"
 28	"text/template"
 29	"time"
 30	"unicode/utf8"
 31
 32	"github.com/sourcegraph/zoekt"
 33	"github.com/sourcegraph/zoekt/internal/languages"
 34)
 35
 36var _ = log.Println
 37
 38const ngramSize = 3
 39
 40type searchableString struct {
 41	data []byte
 42}
 43
 44// Filled by the linker
 45var Version string
 46
 47func HostnameBestEffort() string {
 48	if h := os.Getenv("NODE_NAME"); h != "" {
 49		return h
 50	}
 51	if h := os.Getenv("HOSTNAME"); h != "" {
 52		return h
 53	}
 54	hostname, _ := os.Hostname()
 55	return hostname
 56}
 57
 58// Store character (unicode codepoint) offset (in bytes) this often.
 59const runeOffsetFrequency = 100
 60
 61type postingsBuilder struct {
 62	postings    map[ngram][]byte
 63	lastOffsets map[ngram]uint32
 64
 65	// To support UTF-8 searching, we must map back runes to byte
 66	// offsets. As a first attempt, we sample regularly. The
 67	// precise offset can be found by walking from the recorded
 68	// offset to the desired rune.
 69	runeOffsets []uint32
 70	runeCount   uint32
 71
 72	isPlainASCII bool
 73
 74	endRunes []uint32
 75	endByte  uint32
 76}
 77
 78func newPostingsBuilder() *postingsBuilder {
 79	return &postingsBuilder{
 80		postings:     map[ngram][]byte{},
 81		lastOffsets:  map[ngram]uint32{},
 82		isPlainASCII: true,
 83	}
 84}
 85
 86// Store trigram offsets for the given UTF-8 data. The
 87// DocumentSections must correspond to rune boundaries in the UTF-8
 88// data.
 89func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 90	dest := searchableString{
 91		data: data,
 92	}
 93	var buf [8]byte
 94	var runeGram [3]rune
 95
 96	var runeIndex uint32
 97	byteCount := 0
 98	dataSz := uint32(len(data))
 99
100	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
101	for _, s := range byteSections {
102		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
103	}
104	var runeSectionBoundaries []uint32
105
106	endRune := s.runeCount
107	for ; len(data) > 0; runeIndex++ {
108		c, sz := utf8.DecodeRune(data)
109		if sz > 1 {
110			s.isPlainASCII = false
111		}
112		data = data[sz:]
113
114		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
115
116		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
117			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
118		}
119		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
120			runeSectionBoundaries = append(runeSectionBoundaries,
121				endRune+uint32(runeIndex))
122			byteSectionBoundaries = byteSectionBoundaries[1:]
123		}
124
125		byteCount += sz
126
127		if runeIndex < 2 {
128			continue
129		}
130
131		ng := runesToNGram(runeGram)
132		lastOff := s.lastOffsets[ng]
133		newOff := endRune + uint32(runeIndex) - 2
134
135		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
136		s.postings[ng] = append(s.postings[ng], buf[:m]...)
137		s.lastOffsets[ng] = newOff
138	}
139	s.runeCount += runeIndex
140
141	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
142		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
143	}
144
145	// Handle symbol definition that ends at file end. This can
146	// happen for labels at the end of .bat files.
147
148	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
149		runeSectionBoundaries = append(runeSectionBoundaries,
150			endRune+runeIndex)
151		byteSectionBoundaries = byteSectionBoundaries[1:]
152	}
153	runeSecs := make([]DocumentSection, 0, len(byteSections))
154	for i := 0; i < len(runeSectionBoundaries); i += 2 {
155		runeSecs = append(runeSecs, DocumentSection{
156			Start: runeSectionBoundaries[i],
157			End:   runeSectionBoundaries[i+1],
158		})
159	}
160
161	s.endRunes = append(s.endRunes, s.runeCount)
162	s.endByte += dataSz
163	return &dest, runeSecs, nil
164}
165
166// ShardBuilder builds a single index shard.
167type ShardBuilder struct {
168	// The version we will write to disk. Sourcegraph Specific. This is to
169	// enable feature flagging new format versions.
170	indexFormatVersion int
171	featureVersion     int
172
173	contentStrings  []*searchableString
174	nameStrings     []*searchableString
175	docSections     [][]DocumentSection
176	runeDocSections []DocumentSection
177
178	symID        uint32
179	symIndex     map[string]uint32
180	symKindID    uint32
181	symKindIndex map[string]uint32
182	symMetaData  []uint32
183
184	fileEndSymbol []uint32
185
186	checksums []byte
187
188	branchMasks []uint64
189	subRepos    []uint32
190
191	// docID => repoID
192	repos []uint16
193
194	contentPostings *postingsBuilder
195	namePostings    *postingsBuilder
196
197	// root repositories
198	repoList []zoekt.Repository
199
200	// name to index.
201	subRepoIndices []map[string]uint32
202
203	// language => language code
204	languageMap map[string]uint16
205
206	// language codes, uint16 encoded as little-endian
207	languages []uint8
208
209	categories []byte
210
211	// IndexTime will be used as the time if non-zero. Otherwise
212	// time.Now(). This is useful for doing reproducible builds in tests.
213	IndexTime time.Time
214
215	// a sortable 20 chars long id.
216	ID string
217}
218
219func verify(repo *zoekt.Repository) error {
220	for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
221		if _, err := ParseTemplate(t); err != nil {
222			return err
223		}
224	}
225	return nil
226}
227
228func urlJoinPath(base string, elem ...string) string {
229	// golangs html/template always escapes "+" appearing in an HTML attribute
230	// [1]. We may even want to treat more characters, differently but this
231	// atleast makes it possible to visit URLs like [2].
232	//
233	// We only do this to elem since base will normally be a hardcoded string.
234	//
235	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
236	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
237	elem = append([]string{}, elem...) // copy to mutate
238	for i := range elem {
239		elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
240	}
241	u, err := url.JoinPath(base, elem...)
242	if err != nil {
243		return "#!error: " + err.Error()
244	}
245	return u
246}
247
248// ParseTemplate will parse the templates for FileURLTemplate,
249// LineFragmentTemplate and CommitURLTemplate.
250//
251// It makes available the extra function UrlJoinPath.
252func ParseTemplate(text string) (*template.Template, error) {
253	return template.New("").Funcs(template.FuncMap{
254		"URLJoinPath": urlJoinPath,
255	}).Parse(text)
256}
257
258// ContentSize returns the number of content bytes so far ingested.
259func (b *ShardBuilder) ContentSize() uint32 {
260	// Add the name too so we don't skip building index if we have
261	// lots of empty files.
262	return b.contentPostings.endByte + b.namePostings.endByte
263}
264
265// NumFiles returns the number of files added to this builder
266func (b *ShardBuilder) NumFiles() int {
267	return len(b.contentStrings)
268}
269
270// NewShardBuilder creates a fresh ShardBuilder. The passed in
271// Repository contains repo metadata, and may be set to nil.
272func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
273	b := newShardBuilder()
274
275	if r == nil {
276		r = &zoekt.Repository{}
277	}
278	if err := b.setRepository(r); err != nil {
279		return nil, err
280	}
281	return b, nil
282}
283
284func newShardBuilder() *ShardBuilder {
285	return &ShardBuilder{
286		indexFormatVersion: IndexFormatVersion,
287		featureVersion:     FeatureVersion,
288
289		contentPostings: newPostingsBuilder(),
290		namePostings:    newPostingsBuilder(),
291		fileEndSymbol:   []uint32{0},
292		symIndex:        make(map[string]uint32),
293		symKindIndex:    make(map[string]uint32),
294		languageMap:     make(map[string]uint16),
295	}
296}
297
298func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
299	if err := verify(desc); err != nil {
300		return err
301	}
302
303	if len(desc.Branches) > 64 {
304		return fmt.Errorf("too many branches")
305	}
306
307	repo := *desc
308
309	// copy subrepomap without root
310	repo.SubRepoMap = map[string]*zoekt.Repository{}
311	for k, v := range desc.SubRepoMap {
312		if k != "" {
313			repo.SubRepoMap[k] = v
314		}
315	}
316
317	b.repoList = append(b.repoList, repo)
318
319	return b.populateSubRepoIndices()
320}
321
322type symbolSlice struct {
323	symbols  []DocumentSection
324	metaData []*zoekt.Symbol
325}
326
327func (s symbolSlice) Len() int { return len(s.symbols) }
328
329func (s symbolSlice) Swap(i, j int) {
330	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
331	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
332}
333
334func (s symbolSlice) Less(i, j int) bool {
335	return s.symbols[i].Start < s.symbols[j].Start
336}
337
338// AddFile is a convenience wrapper for Add
339func (b *ShardBuilder) AddFile(name string, content []byte) error {
340	return b.Add(Document{Name: name, Content: content})
341}
342
343func (b *ShardBuilder) populateSubRepoIndices() error {
344	if len(b.subRepoIndices) == len(b.repoList) {
345		return nil
346	}
347	if len(b.subRepoIndices) != len(b.repoList)-1 {
348		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
349	}
350	repo := b.repoList[len(b.repoList)-1]
351	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
352	return nil
353}
354
355func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
356	paths := []string{""}
357	for k := range repo.SubRepoMap {
358		paths = append(paths, k)
359	}
360	sort.Strings(paths)
361	subRepoIndices := make(map[string]uint32, len(paths))
362	for i, p := range paths {
363		subRepoIndices[p] = uint32(i)
364	}
365	return subRepoIndices
366}
367
368const notIndexedMarker = "NOT-INDEXED: "
369
370func (b *ShardBuilder) symbolID(sym string) uint32 {
371	if _, ok := b.symIndex[sym]; !ok {
372		b.symIndex[sym] = b.symID
373		b.symID++
374	}
375	return b.symIndex[sym]
376}
377
378func (b *ShardBuilder) symbolKindID(t string) uint32 {
379	if _, ok := b.symKindIndex[t]; !ok {
380		b.symKindIndex[t] = b.symKindID
381		b.symKindID++
382	}
383	return b.symKindIndex[t]
384}
385
386func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
387	for _, sym := range symbols {
388		b.symMetaData = append(b.symMetaData,
389			// This field was removed due to redundancy. To avoid
390			// needing to reindex, it is set to zero for now. In the
391			// future, this field will be completely removed. It
392			// will require incrementing the feature version.
393			0,
394			b.symbolKindID(sym.Kind),
395			b.symbolID(sym.Parent),
396			b.symbolKindID(sym.ParentKind))
397	}
398}
399
400func DetermineLanguageIfUnknown(doc *Document) {
401	if doc.Language != "" {
402		return
403	}
404
405	if doc.SkipReason != SkipReasonNone {
406		// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
407		// In this case, we just guess the language based on file name to avoid examining the contents.
408		// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
409		doc.Language = languages.GetLanguage(doc.Name, nil)
410	} else {
411		doc.Language = languages.GetLanguage(doc.Name, doc.Content)
412	}
413}
414
415// Add a file which only occurs in certain branches.
416func (b *ShardBuilder) Add(doc Document) error {
417	if index := bytes.IndexByte(doc.Content, 0); index > 0 {
418		doc.SkipReason = SkipReasonBinary
419	}
420
421	if doc.SkipReason != SkipReasonNone {
422		doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
423		doc.Symbols = nil
424		doc.SymbolsMetaData = nil
425	}
426
427	DetermineLanguageIfUnknown(&doc)
428	DetermineFileCategory(&doc)
429
430	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
431	var last DocumentSection
432	for i, s := range doc.Symbols {
433		if i > 0 {
434			if last.End > s.Start {
435				return fmt.Errorf("sections overlap")
436			}
437		}
438		last = s
439	}
440	if last.End > uint32(len(doc.Content)) {
441		return fmt.Errorf("section goes past end of content")
442	}
443
444	if doc.SubRepositoryPath != "" {
445		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
446		if err != nil || rel == doc.Name {
447			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
448		}
449	}
450	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
451	if err != nil {
452		return err
453	}
454	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
455	if err != nil {
456		return err
457	}
458	b.addSymbols(doc.SymbolsMetaData)
459
460	repoIdx := len(b.repoList) - 1
461	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
462	if !ok {
463		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
464	}
465
466	var mask uint64
467	for _, br := range doc.Branches {
468		m := b.branchMask(br)
469		if m == 0 {
470			return fmt.Errorf("no branch found for %s", br)
471		}
472		mask |= m
473	}
474
475	if repoIdx > 1<<16 {
476		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
477	}
478
479	b.subRepos = append(b.subRepos, subRepoIdx)
480	b.repos = append(b.repos, uint16(repoIdx))
481
482	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
483	hasher.Write(doc.Content)
484
485	b.contentStrings = append(b.contentStrings, docStr)
486	b.runeDocSections = append(b.runeDocSections, runeSecs...)
487
488	b.nameStrings = append(b.nameStrings, nameStr)
489	b.docSections = append(b.docSections, doc.Symbols)
490	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
491	b.branchMasks = append(b.branchMasks, mask)
492	b.checksums = append(b.checksums, hasher.Sum(nil)...)
493
494	langCode, ok := b.languageMap[doc.Language]
495	if !ok {
496		if len(b.languageMap) >= 65535 {
497			return fmt.Errorf("too many languages")
498		}
499		langCode = uint16(len(b.languageMap))
500		b.languageMap[doc.Language] = langCode
501	}
502	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
503
504	category, err := doc.Category.encode()
505	if err != nil {
506		return err
507	}
508	b.categories = append(b.categories, category)
509
510	return nil
511}
512
513func (b *ShardBuilder) branchMask(br string) uint64 {
514	for i, b := range b.repoList[len(b.repoList)-1].Branches {
515		if b.Name == br {
516			return uint64(1) << uint(i)
517		}
518	}
519	return 0
520}
521
522// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
523// is missing or there are no repos, this returns false.
524func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
525	if len(b.repoList) == 0 {
526		return nil, false
527	}
528
529	ids := make([]uint32, 0, len(b.repoList))
530	for _, repo := range b.repoList {
531		if repo.ID == 0 {
532			return nil, false
533		}
534		ids = append(ids, repo.ID)
535	}
536	return ids, true
537}
538
539type DocChecker struct {
540	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
541	trigrams map[ngram]struct{}
542}
543
544// Check returns a reason why the given contents are probably not source texts.
545func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
546	if len(content) == 0 {
547		return SkipReasonNone
548	}
549
550	if len(content) < ngramSize {
551		return SkipReasonTooSmall
552	}
553
554	if index := bytes.IndexByte(content, 0); index > 0 {
555		return SkipReasonBinary
556	}
557
558	// PERF: we only need to do the trigram check if the upperbound on content is greater than
559	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
560	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
561		return SkipReasonNone
562	}
563
564	var cur [3]rune
565	byteCount := 0
566	t.clearTrigrams(maxTrigramCount)
567
568	for len(content) > 0 {
569		r, sz := utf8.DecodeRune(content)
570		content = content[sz:]
571		byteCount += sz
572
573		cur[0], cur[1], cur[2] = cur[1], cur[2], r
574		if cur[0] == 0 {
575			// start of file.
576			continue
577		}
578
579		t.trigrams[runesToNGram(cur)] = struct{}{}
580		if len(t.trigrams) > maxTrigramCount {
581			// probably not text.
582			return SkipReasonTooManyTrigrams
583		}
584	}
585	return SkipReasonNone
586}
587
588func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
589	if t.trigrams == nil {
590		t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
591	}
592	for key := range t.trigrams {
593		delete(t.trigrams, key)
594	}
595}
596
597// ShardName returns the name of the shard for the given prefix, version, and
598// shard number.
599func ShardName(indexDir string, prefix string, version, n int) string {
600	prefix = url.QueryEscape(prefix)
601	if len(prefix) > 200 {
602		prefix = prefix[:200] + hashString(prefix)[:8]
603	}
604	return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
605}
Configure Feed

Configure Feed