indexbuilder.go at 56aa3cd957c0fa3051a9d6a915a02f309bd05bc0 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexbuilder.go
at 56aa3cd957c0fa3051a9d6a915a02f309bd05bc0 14 kB View raw
Stefan Hengl ranking: add document ranks to shards (#449) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"html/template"
 23	"log"
 24	"path/filepath"
 25	"sort"
 26	"time"
 27	"unicode/utf8"
 28
 29	"github.com/go-enry/go-enry/v2"
 30)
 31
 32var _ = log.Println
 33
 34const ngramSize = 3
 35
 36type searchableString struct {
 37	data []byte
 38}
 39
 40// Filled by the linker
 41var Version string
 42
 43// Store character (unicode codepoint) offset (in bytes) this often.
 44const runeOffsetFrequency = 100
 45
 46type postingsBuilder struct {
 47	postings    map[ngram][]byte
 48	lastOffsets map[ngram]uint32
 49
 50	// To support UTF-8 searching, we must map back runes to byte
 51	// offsets. As a first attempt, we sample regularly. The
 52	// precise offset can be found by walking from the recorded
 53	// offset to the desired rune.
 54	runeOffsets []uint32
 55	runeCount   uint32
 56
 57	isPlainASCII bool
 58
 59	endRunes []uint32
 60	endByte  uint32
 61}
 62
 63func newPostingsBuilder() *postingsBuilder {
 64	return &postingsBuilder{
 65		postings:     map[ngram][]byte{},
 66		lastOffsets:  map[ngram]uint32{},
 67		isPlainASCII: true,
 68	}
 69}
 70
 71// Store trigram offsets for the given UTF-8 data. The
 72// DocumentSections must correspond to rune boundaries in the UTF-8
 73// data.
 74func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 75	dest := searchableString{
 76		data: data,
 77	}
 78	var buf [8]byte
 79	var runeGram [3]rune
 80
 81	var runeIndex uint32
 82	byteCount := 0
 83	dataSz := uint32(len(data))
 84
 85	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
 86	for _, s := range byteSections {
 87		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
 88	}
 89	var runeSectionBoundaries []uint32
 90
 91	endRune := s.runeCount
 92	for ; len(data) > 0; runeIndex++ {
 93		c, sz := utf8.DecodeRune(data)
 94		if sz > 1 {
 95			s.isPlainASCII = false
 96		}
 97		data = data[sz:]
 98
 99		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
100
101		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
102			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
103		}
104		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
105			runeSectionBoundaries = append(runeSectionBoundaries,
106				endRune+uint32(runeIndex))
107			byteSectionBoundaries = byteSectionBoundaries[1:]
108		}
109
110		byteCount += sz
111
112		if runeIndex < 2 {
113			continue
114		}
115
116		ng := runesToNGram(runeGram)
117		lastOff := s.lastOffsets[ng]
118		newOff := endRune + uint32(runeIndex) - 2
119
120		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
121		s.postings[ng] = append(s.postings[ng], buf[:m]...)
122		s.lastOffsets[ng] = newOff
123	}
124	s.runeCount += runeIndex
125
126	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
127		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
128	}
129
130	// Handle symbol definition that ends at file end. This can
131	// happen for labels at the end of .bat files.
132
133	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
134		runeSectionBoundaries = append(runeSectionBoundaries,
135			endRune+runeIndex)
136		byteSectionBoundaries = byteSectionBoundaries[1:]
137	}
138	runeSecs := make([]DocumentSection, 0, len(byteSections))
139	for i := 0; i < len(runeSectionBoundaries); i += 2 {
140		runeSecs = append(runeSecs, DocumentSection{
141			Start: runeSectionBoundaries[i],
142			End:   runeSectionBoundaries[i+1],
143		})
144	}
145
146	s.endRunes = append(s.endRunes, s.runeCount)
147	s.endByte += dataSz
148	return &dest, runeSecs, nil
149}
150
151// IndexBuilder builds a single index shard.
152type IndexBuilder struct {
153	// The version we will write to disk. Sourcegraph Specific. This is to
154	// enable feature flagging new format versions.
155	indexFormatVersion int
156	featureVersion     int
157
158	contentStrings  []*searchableString
159	nameStrings     []*searchableString
160	docSections     [][]DocumentSection
161	runeDocSections []DocumentSection
162
163	symID        uint32
164	symIndex     map[string]uint32
165	symKindID    uint32
166	symKindIndex map[string]uint32
167	symMetaData  []uint32
168
169	fileEndSymbol []uint32
170
171	checksums []byte
172
173	branchMasks []uint64
174	subRepos    []uint32
175
176	// docID => repoID
177	repos []uint16
178
179	// Experimental: docID => rank vec
180	ranks [][]float64
181
182	contentPostings *postingsBuilder
183	namePostings    *postingsBuilder
184
185	// root repositories
186	repoList []Repository
187
188	// name to index.
189	subRepoIndices []map[string]uint32
190
191	// language => language code
192	languageMap map[string]uint16
193
194	// language codes, uint16 encoded as little-endian
195	languages []uint8
196
197	// IndexTime will be used as the time if non-zero. Otherwise
198	// time.Now(). This is useful for doing reproducible builds in tests.
199	IndexTime time.Time
200
201	// a sortable 20 chars long id.
202	ID string
203}
204
205func (d *Repository) verify() error {
206	for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
207		if _, err := template.New("").Parse(t); err != nil {
208			return err
209		}
210	}
211	return nil
212}
213
214// ContentSize returns the number of content bytes so far ingested.
215func (b *IndexBuilder) ContentSize() uint32 {
216	// Add the name too so we don't skip building index if we have
217	// lots of empty files.
218	return b.contentPostings.endByte + b.namePostings.endByte
219}
220
221// NewIndexBuilder creates a fresh IndexBuilder. The passed in
222// Repository contains repo metadata, and may be set to nil.
223func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
224	b := newIndexBuilder()
225
226	if r == nil {
227		r = &Repository{}
228	}
229	if err := b.setRepository(r); err != nil {
230		return nil, err
231	}
232	return b, nil
233}
234
235func newIndexBuilder() *IndexBuilder {
236	return &IndexBuilder{
237		indexFormatVersion: IndexFormatVersion,
238		featureVersion:     FeatureVersion,
239
240		contentPostings: newPostingsBuilder(),
241		namePostings:    newPostingsBuilder(),
242		fileEndSymbol:   []uint32{0},
243		symIndex:        make(map[string]uint32),
244		symKindIndex:    make(map[string]uint32),
245		languageMap:     map[string]uint16{},
246	}
247}
248
249func (b *IndexBuilder) setRepository(desc *Repository) error {
250	if err := desc.verify(); err != nil {
251		return err
252	}
253
254	if len(desc.Branches) > 64 {
255		return fmt.Errorf("too many branches")
256	}
257
258	repo := *desc
259
260	// copy subrepomap without root
261	repo.SubRepoMap = map[string]*Repository{}
262	for k, v := range desc.SubRepoMap {
263		if k != "" {
264			repo.SubRepoMap[k] = v
265		}
266	}
267
268	b.repoList = append(b.repoList, repo)
269
270	return b.populateSubRepoIndices()
271}
272
273type DocumentSection struct {
274	Start, End uint32
275}
276
277// Document holds a document (file) to index.
278type Document struct {
279	Name              string
280	Content           []byte
281	Branches          []string
282	SubRepositoryPath string
283	Language          string
284
285	// If set, something is wrong with the file contents, and this
286	// is the reason it wasn't indexed.
287	SkipReason string
288
289	// Document sections for symbols. Offsets should use bytes.
290	Symbols         []DocumentSection
291	SymbolsMetaData []*Symbol
292
293	// Ranks is a vector of ranks for a document as provided by a DocumentRanksFile
294	// file in the git repo.
295	//
296	// Two documents can be ordered by comparing the components of their rank
297	// vectors. Bigger entries are better, as are longer vectors.
298	//
299	// This field is experimental and may change at any time without warning.
300	Ranks []float64
301}
302
303type symbolSlice struct {
304	symbols  []DocumentSection
305	metaData []*Symbol
306}
307
308func (s symbolSlice) Len() int { return len(s.symbols) }
309
310func (s symbolSlice) Swap(i, j int) {
311	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
312	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
313}
314
315func (s symbolSlice) Less(i, j int) bool {
316	return s.symbols[i].Start < s.symbols[j].Start
317}
318
319// AddFile is a convenience wrapper for Add
320func (b *IndexBuilder) AddFile(name string, content []byte) error {
321	return b.Add(Document{Name: name, Content: content})
322}
323
324// CheckText returns a reason why the given contents are probably not source texts.
325func CheckText(content []byte, maxTrigramCount int) error {
326	if len(content) == 0 {
327		return nil
328	}
329
330	if len(content) < ngramSize {
331		return fmt.Errorf("file size smaller than %d", ngramSize)
332	}
333
334	// PERF: we only need to do the trigram check if the upperbound on content
335	// is greater than our threshold.
336	var trigrams map[ngram]struct{}
337	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
338		trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
339	}
340
341	var cur [3]rune
342	byteCount := 0
343	for len(content) > 0 {
344		if content[0] == 0 {
345			return fmt.Errorf("binary data at byte offset %d", byteCount)
346		}
347
348		r, sz := utf8.DecodeRune(content)
349		content = content[sz:]
350		byteCount += sz
351
352		cur[0], cur[1], cur[2] = cur[1], cur[2], r
353		if cur[0] == 0 {
354			// start of file.
355			continue
356		}
357
358		if trigrams != nil {
359			trigrams[runesToNGram(cur)] = struct{}{}
360			if len(trigrams) > maxTrigramCount {
361				// probably not text.
362				return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
363			}
364		}
365	}
366	return nil
367}
368
369func (b *IndexBuilder) populateSubRepoIndices() error {
370	if len(b.subRepoIndices) == len(b.repoList) {
371		return nil
372	}
373	if len(b.subRepoIndices) != len(b.repoList)-1 {
374		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
375	}
376	repo := b.repoList[len(b.repoList)-1]
377	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
378	return nil
379}
380
381func mkSubRepoIndices(repo Repository) map[string]uint32 {
382	paths := []string{""}
383	for k := range repo.SubRepoMap {
384		paths = append(paths, k)
385	}
386	sort.Strings(paths)
387	subRepoIndices := make(map[string]uint32, len(paths))
388	for i, p := range paths {
389		subRepoIndices[p] = uint32(i)
390	}
391	return subRepoIndices
392}
393
394const notIndexedMarker = "NOT-INDEXED: "
395
396func (b *IndexBuilder) symbolID(sym string) uint32 {
397	if _, ok := b.symIndex[sym]; !ok {
398		b.symIndex[sym] = b.symID
399		b.symID++
400	}
401	return b.symIndex[sym]
402}
403
404func (b *IndexBuilder) symbolKindID(t string) uint32 {
405	if _, ok := b.symKindIndex[t]; !ok {
406		b.symKindIndex[t] = b.symKindID
407		b.symKindID++
408	}
409	return b.symKindIndex[t]
410}
411
412func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
413	for _, sym := range symbols {
414		b.symMetaData = append(b.symMetaData,
415			// This field was removed due to redundancy. To avoid
416			// needing to reindex, it is set to zero for now. In the
417			// future, this field will be completely removed. It
418			// will require incrementing the feature version.
419			0,
420			b.symbolKindID(sym.Kind),
421			b.symbolID(sym.Parent),
422			b.symbolKindID(sym.ParentKind))
423	}
424}
425
426// Add a file which only occurs in certain branches.
427func (b *IndexBuilder) Add(doc Document) error {
428	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
429
430	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
431		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
432		doc.Language = "binary"
433	}
434
435	if doc.SkipReason != "" {
436		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
437		doc.Symbols = nil
438		doc.SymbolsMetaData = nil
439		if doc.Language == "" {
440			doc.Language = "skipped"
441		}
442	}
443
444	if doc.Language == "" {
445		c := doc.Content
446		// classifier is faster on small files without losing much accuracy
447		if len(c) > 2048 {
448			c = c[:2048]
449		}
450		doc.Language = enry.GetLanguage(doc.Name, c)
451	}
452
453	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
454	var last DocumentSection
455	for i, s := range doc.Symbols {
456		if i > 0 {
457			if last.End > s.Start {
458				return fmt.Errorf("sections overlap")
459			}
460		}
461		last = s
462	}
463	if last.End > uint32(len(doc.Content)) {
464		return fmt.Errorf("section goes past end of content")
465	}
466
467	if doc.SubRepositoryPath != "" {
468		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
469		if err != nil || rel == doc.Name {
470			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
471		}
472	}
473	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
474	if err != nil {
475		return err
476	}
477	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
478	if err != nil {
479		return err
480	}
481	b.addSymbols(doc.SymbolsMetaData)
482
483	repoIdx := len(b.repoList) - 1
484	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
485	if !ok {
486		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
487	}
488
489	var mask uint64
490	for _, br := range doc.Branches {
491		m := b.branchMask(br)
492		if m == 0 {
493			return fmt.Errorf("no branch found for %s", br)
494		}
495		mask |= m
496	}
497
498	if repoIdx > 1<<16 {
499		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
500	}
501
502	b.subRepos = append(b.subRepos, subRepoIdx)
503	b.repos = append(b.repos, uint16(repoIdx))
504
505	// doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is
506	// always nil.
507	b.ranks = append(b.ranks, doc.Ranks)
508
509	hasher.Write(doc.Content)
510
511	b.contentStrings = append(b.contentStrings, docStr)
512	b.runeDocSections = append(b.runeDocSections, runeSecs...)
513
514	b.nameStrings = append(b.nameStrings, nameStr)
515	b.docSections = append(b.docSections, doc.Symbols)
516	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
517	b.branchMasks = append(b.branchMasks, mask)
518	b.checksums = append(b.checksums, hasher.Sum(nil)...)
519
520	langCode, ok := b.languageMap[doc.Language]
521	if !ok {
522		if len(b.languageMap) >= 65535 {
523			return fmt.Errorf("too many languages")
524		}
525		langCode = uint16(len(b.languageMap))
526		b.languageMap[doc.Language] = langCode
527	}
528	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
529
530	return nil
531}
532
533func (b *IndexBuilder) branchMask(br string) uint64 {
534	for i, b := range b.repoList[len(b.repoList)-1].Branches {
535		if b.Name == br {
536			return uint64(1) << uint(i)
537		}
538	}
539	return 0
540}
Configure Feed

Configure Feed