indexbuilder.go at cc190492c8d1e3ece352fc56ff6a6d0ba951f855 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexbuilder.go
at cc190492c8d1e3ece352fc56ff6a6d0ba951f855 16 kB View raw
Keegan Carruthers-Smith web: escape + as %2B in file path templates (#843) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"net/url"
 24	"os"
 25	"path/filepath"
 26	"sort"
 27	"strings"
 28	"text/template"
 29	"time"
 30	"unicode/utf8"
 31
 32	"github.com/sourcegraph/zoekt/internal/languages"
 33)
 34
 35var _ = log.Println
 36
 37const ngramSize = 3
 38
 39type searchableString struct {
 40	data []byte
 41}
 42
 43// Filled by the linker
 44var Version string
 45
 46func HostnameBestEffort() string {
 47	if h := os.Getenv("NODE_NAME"); h != "" {
 48		return h
 49	}
 50	if h := os.Getenv("HOSTNAME"); h != "" {
 51		return h
 52	}
 53	hostname, _ := os.Hostname()
 54	return hostname
 55}
 56
 57// Store character (unicode codepoint) offset (in bytes) this often.
 58const runeOffsetFrequency = 100
 59
 60type postingsBuilder struct {
 61	postings    map[ngram][]byte
 62	lastOffsets map[ngram]uint32
 63
 64	// To support UTF-8 searching, we must map back runes to byte
 65	// offsets. As a first attempt, we sample regularly. The
 66	// precise offset can be found by walking from the recorded
 67	// offset to the desired rune.
 68	runeOffsets []uint32
 69	runeCount   uint32
 70
 71	isPlainASCII bool
 72
 73	endRunes []uint32
 74	endByte  uint32
 75}
 76
 77func newPostingsBuilder() *postingsBuilder {
 78	return &postingsBuilder{
 79		postings:     map[ngram][]byte{},
 80		lastOffsets:  map[ngram]uint32{},
 81		isPlainASCII: true,
 82	}
 83}
 84
 85// Store trigram offsets for the given UTF-8 data. The
 86// DocumentSections must correspond to rune boundaries in the UTF-8
 87// data.
 88func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 89	dest := searchableString{
 90		data: data,
 91	}
 92	var buf [8]byte
 93	var runeGram [3]rune
 94
 95	var runeIndex uint32
 96	byteCount := 0
 97	dataSz := uint32(len(data))
 98
 99	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
100	for _, s := range byteSections {
101		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
102	}
103	var runeSectionBoundaries []uint32
104
105	endRune := s.runeCount
106	for ; len(data) > 0; runeIndex++ {
107		c, sz := utf8.DecodeRune(data)
108		if sz > 1 {
109			s.isPlainASCII = false
110		}
111		data = data[sz:]
112
113		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
114
115		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
116			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
117		}
118		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
119			runeSectionBoundaries = append(runeSectionBoundaries,
120				endRune+uint32(runeIndex))
121			byteSectionBoundaries = byteSectionBoundaries[1:]
122		}
123
124		byteCount += sz
125
126		if runeIndex < 2 {
127			continue
128		}
129
130		ng := runesToNGram(runeGram)
131		lastOff := s.lastOffsets[ng]
132		newOff := endRune + uint32(runeIndex) - 2
133
134		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
135		s.postings[ng] = append(s.postings[ng], buf[:m]...)
136		s.lastOffsets[ng] = newOff
137	}
138	s.runeCount += runeIndex
139
140	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
141		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
142	}
143
144	// Handle symbol definition that ends at file end. This can
145	// happen for labels at the end of .bat files.
146
147	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
148		runeSectionBoundaries = append(runeSectionBoundaries,
149			endRune+runeIndex)
150		byteSectionBoundaries = byteSectionBoundaries[1:]
151	}
152	runeSecs := make([]DocumentSection, 0, len(byteSections))
153	for i := 0; i < len(runeSectionBoundaries); i += 2 {
154		runeSecs = append(runeSecs, DocumentSection{
155			Start: runeSectionBoundaries[i],
156			End:   runeSectionBoundaries[i+1],
157		})
158	}
159
160	s.endRunes = append(s.endRunes, s.runeCount)
161	s.endByte += dataSz
162	return &dest, runeSecs, nil
163}
164
165// IndexBuilder builds a single index shard.
166type IndexBuilder struct {
167	// The version we will write to disk. Sourcegraph Specific. This is to
168	// enable feature flagging new format versions.
169	indexFormatVersion int
170	featureVersion     int
171
172	contentStrings  []*searchableString
173	nameStrings     []*searchableString
174	docSections     [][]DocumentSection
175	runeDocSections []DocumentSection
176
177	symID        uint32
178	symIndex     map[string]uint32
179	symKindID    uint32
180	symKindIndex map[string]uint32
181	symMetaData  []uint32
182
183	fileEndSymbol []uint32
184
185	checksums []byte
186
187	branchMasks []uint64
188	subRepos    []uint32
189
190	// docID => repoID
191	repos []uint16
192
193	// Experimental: docID => rank vec
194	ranks [][]float64
195
196	contentPostings *postingsBuilder
197	namePostings    *postingsBuilder
198
199	// root repositories
200	repoList []Repository
201
202	// name to index.
203	subRepoIndices []map[string]uint32
204
205	// language => language code
206	languageMap map[string]uint16
207
208	// language codes, uint16 encoded as little-endian
209	languages []uint8
210
211	// IndexTime will be used as the time if non-zero. Otherwise
212	// time.Now(). This is useful for doing reproducible builds in tests.
213	IndexTime time.Time
214
215	// a sortable 20 chars long id.
216	ID string
217}
218
219func (d *Repository) verify() error {
220	for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
221		if _, err := ParseTemplate(t); err != nil {
222			return err
223		}
224	}
225	return nil
226}
227
228func urlJoinPath(base string, elem ...string) string {
229	// golangs html/template always escapes "+" appearing in an HTML attribute
230	// [1]. We may even want to treat more characters, differently but this
231	// atleast makes it possible to visit URLs like [2].
232	//
233	// We only do this to elem since base will normally be a hardcoded string.
234	//
235	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
236	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
237	elem = append([]string{}, elem...) // copy to mutate
238	for i := range elem {
239		elem[i] = strings.ReplaceAll(elem[i], "+", "%2B")
240	}
241	u, err := url.JoinPath(base, elem...)
242	if err != nil {
243		return "#!error: " + err.Error()
244	}
245	return u
246}
247
248// ParseTemplate will parse the templates for FileURLTemplate,
249// LineFragmentTemplate and CommitURLTemplate.
250//
251// It makes available the extra function UrlJoinPath.
252func ParseTemplate(text string) (*template.Template, error) {
253	return template.New("").Funcs(template.FuncMap{
254		"URLJoinPath": urlJoinPath,
255	}).Parse(text)
256}
257
258// ContentSize returns the number of content bytes so far ingested.
259func (b *IndexBuilder) ContentSize() uint32 {
260	// Add the name too so we don't skip building index if we have
261	// lots of empty files.
262	return b.contentPostings.endByte + b.namePostings.endByte
263}
264
265// NumFiles returns the number of files added to this builder
266func (b *IndexBuilder) NumFiles() int {
267	return len(b.contentStrings)
268}
269
270// NewIndexBuilder creates a fresh IndexBuilder. The passed in
271// Repository contains repo metadata, and may be set to nil.
272func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
273	b := newIndexBuilder()
274
275	if r == nil {
276		r = &Repository{}
277	}
278	if err := b.setRepository(r); err != nil {
279		return nil, err
280	}
281	return b, nil
282}
283
284func newIndexBuilder() *IndexBuilder {
285	return &IndexBuilder{
286		indexFormatVersion: IndexFormatVersion,
287		featureVersion:     FeatureVersion,
288
289		contentPostings: newPostingsBuilder(),
290		namePostings:    newPostingsBuilder(),
291		fileEndSymbol:   []uint32{0},
292		symIndex:        make(map[string]uint32),
293		symKindIndex:    make(map[string]uint32),
294		languageMap:     make(map[string]uint16),
295	}
296}
297
298func (b *IndexBuilder) setRepository(desc *Repository) error {
299	if err := desc.verify(); err != nil {
300		return err
301	}
302
303	if len(desc.Branches) > 64 {
304		return fmt.Errorf("too many branches")
305	}
306
307	repo := *desc
308
309	// copy subrepomap without root
310	repo.SubRepoMap = map[string]*Repository{}
311	for k, v := range desc.SubRepoMap {
312		if k != "" {
313			repo.SubRepoMap[k] = v
314		}
315	}
316
317	b.repoList = append(b.repoList, repo)
318
319	return b.populateSubRepoIndices()
320}
321
322type DocumentSection struct {
323	Start, End uint32
324}
325
326// Document holds a document (file) to index.
327type Document struct {
328	Name              string
329	Content           []byte
330	Branches          []string
331	SubRepositoryPath string
332	Language          string
333
334	// If set, something is wrong with the file contents, and this
335	// is the reason it wasn't indexed.
336	SkipReason string
337
338	// Document sections for symbols. Offsets should use bytes.
339	Symbols         []DocumentSection
340	SymbolsMetaData []*Symbol
341
342	// Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
343	// file in the git repo.
344	//
345	// Two documents can be ordered by comparing the components of their rank
346	// vectors. Bigger entries are better, as are longer vectors.
347	//
348	// This field is experimental and may change at any time without warning.
349	Ranks []float64
350}
351
352type symbolSlice struct {
353	symbols  []DocumentSection
354	metaData []*Symbol
355}
356
357func (s symbolSlice) Len() int { return len(s.symbols) }
358
359func (s symbolSlice) Swap(i, j int) {
360	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
361	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
362}
363
364func (s symbolSlice) Less(i, j int) bool {
365	return s.symbols[i].Start < s.symbols[j].Start
366}
367
368// AddFile is a convenience wrapper for Add
369func (b *IndexBuilder) AddFile(name string, content []byte) error {
370	return b.Add(Document{Name: name, Content: content})
371}
372
373func (b *IndexBuilder) populateSubRepoIndices() error {
374	if len(b.subRepoIndices) == len(b.repoList) {
375		return nil
376	}
377	if len(b.subRepoIndices) != len(b.repoList)-1 {
378		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
379	}
380	repo := b.repoList[len(b.repoList)-1]
381	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
382	return nil
383}
384
385func mkSubRepoIndices(repo Repository) map[string]uint32 {
386	paths := []string{""}
387	for k := range repo.SubRepoMap {
388		paths = append(paths, k)
389	}
390	sort.Strings(paths)
391	subRepoIndices := make(map[string]uint32, len(paths))
392	for i, p := range paths {
393		subRepoIndices[p] = uint32(i)
394	}
395	return subRepoIndices
396}
397
398const notIndexedMarker = "NOT-INDEXED: "
399
400func (b *IndexBuilder) symbolID(sym string) uint32 {
401	if _, ok := b.symIndex[sym]; !ok {
402		b.symIndex[sym] = b.symID
403		b.symID++
404	}
405	return b.symIndex[sym]
406}
407
408func (b *IndexBuilder) symbolKindID(t string) uint32 {
409	if _, ok := b.symKindIndex[t]; !ok {
410		b.symKindIndex[t] = b.symKindID
411		b.symKindID++
412	}
413	return b.symKindIndex[t]
414}
415
416func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
417	for _, sym := range symbols {
418		b.symMetaData = append(b.symMetaData,
419			// This field was removed due to redundancy. To avoid
420			// needing to reindex, it is set to zero for now. In the
421			// future, this field will be completely removed. It
422			// will require incrementing the feature version.
423			0,
424			b.symbolKindID(sym.Kind),
425			b.symbolID(sym.Parent),
426			b.symbolKindID(sym.ParentKind))
427	}
428}
429
430func DetermineLanguageIfUnknown(doc *Document) {
431	if doc.Language == "" {
432		doc.Language = languages.GetLanguage(doc.Name, doc.Content)
433	}
434}
435
436// Add a file which only occurs in certain branches.
437func (b *IndexBuilder) Add(doc Document) error {
438	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
439
440	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
441		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
442		doc.Language = "binary"
443	}
444
445	if doc.SkipReason != "" {
446		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
447		doc.Symbols = nil
448		doc.SymbolsMetaData = nil
449		if doc.Language == "" {
450			doc.Language = "skipped"
451		}
452	}
453
454	DetermineLanguageIfUnknown(&doc)
455
456	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
457	var last DocumentSection
458	for i, s := range doc.Symbols {
459		if i > 0 {
460			if last.End > s.Start {
461				return fmt.Errorf("sections overlap")
462			}
463		}
464		last = s
465	}
466	if last.End > uint32(len(doc.Content)) {
467		return fmt.Errorf("section goes past end of content")
468	}
469
470	if doc.SubRepositoryPath != "" {
471		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
472		if err != nil || rel == doc.Name {
473			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
474		}
475	}
476	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
477	if err != nil {
478		return err
479	}
480	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
481	if err != nil {
482		return err
483	}
484	b.addSymbols(doc.SymbolsMetaData)
485
486	repoIdx := len(b.repoList) - 1
487	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
488	if !ok {
489		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
490	}
491
492	var mask uint64
493	for _, br := range doc.Branches {
494		m := b.branchMask(br)
495		if m == 0 {
496			return fmt.Errorf("no branch found for %s", br)
497		}
498		mask |= m
499	}
500
501	if repoIdx > 1<<16 {
502		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
503	}
504
505	b.subRepos = append(b.subRepos, subRepoIdx)
506	b.repos = append(b.repos, uint16(repoIdx))
507
508	// doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
509	// always nil.
510	b.ranks = append(b.ranks, doc.Ranks)
511
512	hasher.Write(doc.Content)
513
514	b.contentStrings = append(b.contentStrings, docStr)
515	b.runeDocSections = append(b.runeDocSections, runeSecs...)
516
517	b.nameStrings = append(b.nameStrings, nameStr)
518	b.docSections = append(b.docSections, doc.Symbols)
519	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
520	b.branchMasks = append(b.branchMasks, mask)
521	b.checksums = append(b.checksums, hasher.Sum(nil)...)
522
523	langCode, ok := b.languageMap[doc.Language]
524	if !ok {
525		if len(b.languageMap) >= 65535 {
526			return fmt.Errorf("too many languages")
527		}
528		langCode = uint16(len(b.languageMap))
529		b.languageMap[doc.Language] = langCode
530	}
531	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
532
533	return nil
534}
535
536func (b *IndexBuilder) branchMask(br string) uint64 {
537	for i, b := range b.repoList[len(b.repoList)-1].Branches {
538		if b.Name == br {
539			return uint64(1) << uint(i)
540		}
541	}
542	return 0
543}
544
545type DocChecker struct {
546	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
547	trigrams map[ngram]struct{}
548}
549
550// Check returns a reason why the given contents are probably not source texts.
551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
552	if len(content) == 0 {
553		return nil
554	}
555
556	if len(content) < ngramSize {
557		return fmt.Errorf("file size smaller than %d", ngramSize)
558	}
559
560	if index := bytes.IndexByte(content, 0); index > 0 {
561		return fmt.Errorf("binary data at byte offset %d", index)
562	}
563
564	// PERF: we only need to do the trigram check if the upperbound on content is greater than
565	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
566	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
567		return nil
568	}
569
570	var cur [3]rune
571	byteCount := 0
572	t.clearTrigrams(maxTrigramCount)
573
574	for len(content) > 0 {
575		r, sz := utf8.DecodeRune(content)
576		content = content[sz:]
577		byteCount += sz
578
579		cur[0], cur[1], cur[2] = cur[1], cur[2], r
580		if cur[0] == 0 {
581			// start of file.
582			continue
583		}
584
585		t.trigrams[runesToNGram(cur)] = struct{}{}
586		if len(t.trigrams) > maxTrigramCount {
587			// probably not text.
588			return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
589		}
590	}
591	return nil
592}
593
594func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
595	if t.trigrams == nil {
596		t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
597	}
598	for key := range t.trigrams {
599		delete(t.trigrams, key)
600	}
601}
Configure Feed

Configure Feed