indexbuilder.go at c7e066e961513670d75ff47181853c77a4c728ff · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexbuilder.go
at c7e066e961513670d75ff47181853c77a4c728ff 14 kB View raw
Julie Tibshirani Debug: make indexing timeout configurable (#676) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"html/template"
 23	"log"
 24	"os"
 25	"path/filepath"
 26	"sort"
 27	"time"
 28	"unicode/utf8"
 29
 30	"github.com/go-enry/go-enry/v2"
 31)
 32
 33var _ = log.Println
 34
 35const ngramSize = 3
 36
 37type searchableString struct {
 38	data []byte
 39}
 40
 41// Filled by the linker
 42var Version string
 43
 44func HostnameBestEffort() string {
 45	if h := os.Getenv("NODE_NAME"); h != "" {
 46		return h
 47	}
 48	if h := os.Getenv("HOSTNAME"); h != "" {
 49		return h
 50	}
 51	hostname, _ := os.Hostname()
 52	return hostname
 53}
 54
 55// Store character (unicode codepoint) offset (in bytes) this often.
 56const runeOffsetFrequency = 100
 57
 58type postingsBuilder struct {
 59	postings    map[ngram][]byte
 60	lastOffsets map[ngram]uint32
 61
 62	// To support UTF-8 searching, we must map back runes to byte
 63	// offsets. As a first attempt, we sample regularly. The
 64	// precise offset can be found by walking from the recorded
 65	// offset to the desired rune.
 66	runeOffsets []uint32
 67	runeCount   uint32
 68
 69	isPlainASCII bool
 70
 71	endRunes []uint32
 72	endByte  uint32
 73}
 74
 75func newPostingsBuilder() *postingsBuilder {
 76	return &postingsBuilder{
 77		postings:     map[ngram][]byte{},
 78		lastOffsets:  map[ngram]uint32{},
 79		isPlainASCII: true,
 80	}
 81}
 82
 83// Store trigram offsets for the given UTF-8 data. The
 84// DocumentSections must correspond to rune boundaries in the UTF-8
 85// data.
 86func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 87	dest := searchableString{
 88		data: data,
 89	}
 90	var buf [8]byte
 91	var runeGram [3]rune
 92
 93	var runeIndex uint32
 94	byteCount := 0
 95	dataSz := uint32(len(data))
 96
 97	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
 98	for _, s := range byteSections {
 99		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
100	}
101	var runeSectionBoundaries []uint32
102
103	endRune := s.runeCount
104	for ; len(data) > 0; runeIndex++ {
105		c, sz := utf8.DecodeRune(data)
106		if sz > 1 {
107			s.isPlainASCII = false
108		}
109		data = data[sz:]
110
111		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
112
113		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
114			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
115		}
116		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
117			runeSectionBoundaries = append(runeSectionBoundaries,
118				endRune+uint32(runeIndex))
119			byteSectionBoundaries = byteSectionBoundaries[1:]
120		}
121
122		byteCount += sz
123
124		if runeIndex < 2 {
125			continue
126		}
127
128		ng := runesToNGram(runeGram)
129		lastOff := s.lastOffsets[ng]
130		newOff := endRune + uint32(runeIndex) - 2
131
132		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
133		s.postings[ng] = append(s.postings[ng], buf[:m]...)
134		s.lastOffsets[ng] = newOff
135	}
136	s.runeCount += runeIndex
137
138	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
139		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
140	}
141
142	// Handle symbol definition that ends at file end. This can
143	// happen for labels at the end of .bat files.
144
145	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
146		runeSectionBoundaries = append(runeSectionBoundaries,
147			endRune+runeIndex)
148		byteSectionBoundaries = byteSectionBoundaries[1:]
149	}
150	runeSecs := make([]DocumentSection, 0, len(byteSections))
151	for i := 0; i < len(runeSectionBoundaries); i += 2 {
152		runeSecs = append(runeSecs, DocumentSection{
153			Start: runeSectionBoundaries[i],
154			End:   runeSectionBoundaries[i+1],
155		})
156	}
157
158	s.endRunes = append(s.endRunes, s.runeCount)
159	s.endByte += dataSz
160	return &dest, runeSecs, nil
161}
162
163// IndexBuilder builds a single index shard.
164type IndexBuilder struct {
165	// The version we will write to disk. Sourcegraph Specific. This is to
166	// enable feature flagging new format versions.
167	indexFormatVersion int
168	featureVersion     int
169
170	contentStrings  []*searchableString
171	nameStrings     []*searchableString
172	docSections     [][]DocumentSection
173	runeDocSections []DocumentSection
174
175	symID        uint32
176	symIndex     map[string]uint32
177	symKindID    uint32
178	symKindIndex map[string]uint32
179	symMetaData  []uint32
180
181	fileEndSymbol []uint32
182
183	checksums []byte
184
185	branchMasks []uint64
186	subRepos    []uint32
187
188	// docID => repoID
189	repos []uint16
190
191	// Experimental: docID => rank vec
192	ranks [][]float64
193
194	contentPostings *postingsBuilder
195	namePostings    *postingsBuilder
196
197	// root repositories
198	repoList []Repository
199
200	// name to index.
201	subRepoIndices []map[string]uint32
202
203	// language => language code
204	languageMap map[string]uint16
205
206	// language codes, uint16 encoded as little-endian
207	languages []uint8
208
209	// IndexTime will be used as the time if non-zero. Otherwise
210	// time.Now(). This is useful for doing reproducible builds in tests.
211	IndexTime time.Time
212
213	// a sortable 20 chars long id.
214	ID string
215}
216
217func (d *Repository) verify() error {
218	for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
219		if _, err := template.New("").Parse(t); err != nil {
220			return err
221		}
222	}
223	return nil
224}
225
226// ContentSize returns the number of content bytes so far ingested.
227func (b *IndexBuilder) ContentSize() uint32 {
228	// Add the name too so we don't skip building index if we have
229	// lots of empty files.
230	return b.contentPostings.endByte + b.namePostings.endByte
231}
232
233// NumFiles returns the number of files added to this builder
234func (b *IndexBuilder) NumFiles() int {
235	return len(b.contentStrings)
236}
237
238// NewIndexBuilder creates a fresh IndexBuilder. The passed in
239// Repository contains repo metadata, and may be set to nil.
240func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
241	b := newIndexBuilder()
242
243	if r == nil {
244		r = &Repository{}
245	}
246	if err := b.setRepository(r); err != nil {
247		return nil, err
248	}
249	return b, nil
250}
251
252func newIndexBuilder() *IndexBuilder {
253	return &IndexBuilder{
254		indexFormatVersion: IndexFormatVersion,
255		featureVersion:     FeatureVersion,
256
257		contentPostings: newPostingsBuilder(),
258		namePostings:    newPostingsBuilder(),
259		fileEndSymbol:   []uint32{0},
260		symIndex:        make(map[string]uint32),
261		symKindIndex:    make(map[string]uint32),
262		languageMap:     make(map[string]uint16),
263	}
264}
265
266func (b *IndexBuilder) setRepository(desc *Repository) error {
267	if err := desc.verify(); err != nil {
268		return err
269	}
270
271	if len(desc.Branches) > 64 {
272		return fmt.Errorf("too many branches")
273	}
274
275	repo := *desc
276
277	// copy subrepomap without root
278	repo.SubRepoMap = map[string]*Repository{}
279	for k, v := range desc.SubRepoMap {
280		if k != "" {
281			repo.SubRepoMap[k] = v
282		}
283	}
284
285	b.repoList = append(b.repoList, repo)
286
287	return b.populateSubRepoIndices()
288}
289
290type DocumentSection struct {
291	Start, End uint32
292}
293
294// Document holds a document (file) to index.
295type Document struct {
296	Name              string
297	Content           []byte
298	Branches          []string
299	SubRepositoryPath string
300	Language          string
301
302	// If set, something is wrong with the file contents, and this
303	// is the reason it wasn't indexed.
304	SkipReason string
305
306	// Document sections for symbols. Offsets should use bytes.
307	Symbols         []DocumentSection
308	SymbolsMetaData []*Symbol
309
310	// Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
311	// file in the git repo.
312	//
313	// Two documents can be ordered by comparing the components of their rank
314	// vectors. Bigger entries are better, as are longer vectors.
315	//
316	// This field is experimental and may change at any time without warning.
317	Ranks []float64
318}
319
320type symbolSlice struct {
321	symbols  []DocumentSection
322	metaData []*Symbol
323}
324
325func (s symbolSlice) Len() int { return len(s.symbols) }
326
327func (s symbolSlice) Swap(i, j int) {
328	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
329	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
330}
331
332func (s symbolSlice) Less(i, j int) bool {
333	return s.symbols[i].Start < s.symbols[j].Start
334}
335
336// AddFile is a convenience wrapper for Add
337func (b *IndexBuilder) AddFile(name string, content []byte) error {
338	return b.Add(Document{Name: name, Content: content})
339}
340
341// CheckText returns a reason why the given contents are probably not source texts.
342func CheckText(content []byte, maxTrigramCount int) error {
343	if len(content) == 0 {
344		return nil
345	}
346
347	if len(content) < ngramSize {
348		return fmt.Errorf("file size smaller than %d", ngramSize)
349	}
350
351	// PERF: we only need to do the trigram check if the upperbound on content
352	// is greater than our threshold.
353	var trigrams map[ngram]struct{}
354	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
355		trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
356	}
357
358	var cur [3]rune
359	byteCount := 0
360	for len(content) > 0 {
361		if content[0] == 0 {
362			return fmt.Errorf("binary data at byte offset %d", byteCount)
363		}
364
365		r, sz := utf8.DecodeRune(content)
366		content = content[sz:]
367		byteCount += sz
368
369		cur[0], cur[1], cur[2] = cur[1], cur[2], r
370		if cur[0] == 0 {
371			// start of file.
372			continue
373		}
374
375		if trigrams != nil {
376			trigrams[runesToNGram(cur)] = struct{}{}
377			if len(trigrams) > maxTrigramCount {
378				// probably not text.
379				return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
380			}
381		}
382	}
383	return nil
384}
385
386func (b *IndexBuilder) populateSubRepoIndices() error {
387	if len(b.subRepoIndices) == len(b.repoList) {
388		return nil
389	}
390	if len(b.subRepoIndices) != len(b.repoList)-1 {
391		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
392	}
393	repo := b.repoList[len(b.repoList)-1]
394	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
395	return nil
396}
397
398func mkSubRepoIndices(repo Repository) map[string]uint32 {
399	paths := []string{""}
400	for k := range repo.SubRepoMap {
401		paths = append(paths, k)
402	}
403	sort.Strings(paths)
404	subRepoIndices := make(map[string]uint32, len(paths))
405	for i, p := range paths {
406		subRepoIndices[p] = uint32(i)
407	}
408	return subRepoIndices
409}
410
411const notIndexedMarker = "NOT-INDEXED: "
412
413func (b *IndexBuilder) symbolID(sym string) uint32 {
414	if _, ok := b.symIndex[sym]; !ok {
415		b.symIndex[sym] = b.symID
416		b.symID++
417	}
418	return b.symIndex[sym]
419}
420
421func (b *IndexBuilder) symbolKindID(t string) uint32 {
422	if _, ok := b.symKindIndex[t]; !ok {
423		b.symKindIndex[t] = b.symKindID
424		b.symKindID++
425	}
426	return b.symKindIndex[t]
427}
428
429func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
430	for _, sym := range symbols {
431		b.symMetaData = append(b.symMetaData,
432			// This field was removed due to redundancy. To avoid
433			// needing to reindex, it is set to zero for now. In the
434			// future, this field will be completely removed. It
435			// will require incrementing the feature version.
436			0,
437			b.symbolKindID(sym.Kind),
438			b.symbolID(sym.Parent),
439			b.symbolKindID(sym.ParentKind))
440	}
441}
442
443func DetermineLanguageIfUnknown(doc *Document) {
444	if doc.Language == "" {
445		c := doc.Content
446		// classifier is faster on small files without losing much accuracy
447		if len(c) > 2048 {
448			c = c[:2048]
449		}
450		doc.Language = enry.GetLanguage(doc.Name, c)
451	}
452}
453
454// Add a file which only occurs in certain branches.
455func (b *IndexBuilder) Add(doc Document) error {
456	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
457
458	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
459		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
460		doc.Language = "binary"
461	}
462
463	if doc.SkipReason != "" {
464		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
465		doc.Symbols = nil
466		doc.SymbolsMetaData = nil
467		if doc.Language == "" {
468			doc.Language = "skipped"
469		}
470	}
471
472	DetermineLanguageIfUnknown(&doc)
473
474	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
475	var last DocumentSection
476	for i, s := range doc.Symbols {
477		if i > 0 {
478			if last.End > s.Start {
479				return fmt.Errorf("sections overlap")
480			}
481		}
482		last = s
483	}
484	if last.End > uint32(len(doc.Content)) {
485		return fmt.Errorf("section goes past end of content")
486	}
487
488	if doc.SubRepositoryPath != "" {
489		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
490		if err != nil || rel == doc.Name {
491			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
492		}
493	}
494	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
495	if err != nil {
496		return err
497	}
498	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
499	if err != nil {
500		return err
501	}
502	b.addSymbols(doc.SymbolsMetaData)
503
504	repoIdx := len(b.repoList) - 1
505	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
506	if !ok {
507		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
508	}
509
510	var mask uint64
511	for _, br := range doc.Branches {
512		m := b.branchMask(br)
513		if m == 0 {
514			return fmt.Errorf("no branch found for %s", br)
515		}
516		mask |= m
517	}
518
519	if repoIdx > 1<<16 {
520		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
521	}
522
523	b.subRepos = append(b.subRepos, subRepoIdx)
524	b.repos = append(b.repos, uint16(repoIdx))
525
526	// doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
527	// always nil.
528	b.ranks = append(b.ranks, doc.Ranks)
529
530	hasher.Write(doc.Content)
531
532	b.contentStrings = append(b.contentStrings, docStr)
533	b.runeDocSections = append(b.runeDocSections, runeSecs...)
534
535	b.nameStrings = append(b.nameStrings, nameStr)
536	b.docSections = append(b.docSections, doc.Symbols)
537	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
538	b.branchMasks = append(b.branchMasks, mask)
539	b.checksums = append(b.checksums, hasher.Sum(nil)...)
540
541	langCode, ok := b.languageMap[doc.Language]
542	if !ok {
543		if len(b.languageMap) >= 65535 {
544			return fmt.Errorf("too many languages")
545		}
546		langCode = uint16(len(b.languageMap))
547		b.languageMap[doc.Language] = langCode
548	}
549	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
550
551	return nil
552}
553
554func (b *IndexBuilder) branchMask(br string) uint64 {
555	for i, b := range b.repoList[len(b.repoList)-1].Branches {
556		if b.Name == br {
557			return uint64(1) << uint(i)
558		}
559	}
560	return 0
561}
Configure Feed

Configure Feed