index/shard_builder.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / shard_builder.go
at main 21 kB View raw
Keegan Carruthers-Smith fix/index: preserve skipped-file category through shard paths (#1073) 8d ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"encoding/binary"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"net/url"
 24	"os"
 25	"path/filepath"
 26	"slices"
 27	"sort"
 28	"strings"
 29	"text/template"
 30	"time"
 31	"unicode/utf8"
 32
 33	"github.com/sourcegraph/zoekt"
 34	"github.com/sourcegraph/zoekt/languages"
 35)
 36
// Reference the log package so its import stays valid even when no
// logging call is compiled into this file.
var _ = log.Println

// ngramSize is the number of runes in each indexed ngram (a trigram).
const ngramSize = 3

// searchableString holds the raw bytes of one indexed string, as passed
// to postingsBuilder.newSearchableString.
type searchableString struct {
	data []byte
}

// Version is filled by the linker.
var Version string
 47
 48func HostnameBestEffort() string {
 49	if h := os.Getenv("NODE_NAME"); h != "" {
 50		return h
 51	}
 52	if h := os.Getenv("HOSTNAME"); h != "" {
 53		return h
 54	}
 55	hostname, _ := os.Hostname()
 56	return hostname
 57}
 58
// runeOffsetFrequency is the sampling interval, in runes, at which the
// byte offset of a rune (unicode codepoint) is recorded.
const runeOffsetFrequency = 100

// postingList holds the varint-encoded delta data and last offset for a
// single ngram. Stored by pointer in the asciiPostings array or the
// postings map so appending to data does not require rewriting the
// map entry or array slot.
type postingList struct {
	// data is the concatenation of uvarint-encoded differences between
	// consecutive offsets.
	data []byte
	// lastOff is the most recently appended offset; the next delta is
	// computed against it.
	lastOff uint32
}
 70
 71// asciiNgramBits is the number of bits needed to index all ASCII trigrams.
 72// ASCII runes are 0-127 (7 bits), so 3 runes = 21 bits = 2M entries.
 73const asciiNgramBits = 21
 74
 75// asciiNgramIndex packs three ASCII bytes into a 21-bit array index.
 76func asciiNgramIndex(a, b, c byte) uint32 {
 77	return uint32(a)<<14 | uint32(b)<<7 | uint32(c)
 78}
 79
 80// asciiIndexToNgram converts a 21-bit ASCII array index back to the
 81// canonical ngram encoding (rune[0]<<42 | rune[1]<<21 | rune[2]).
 82func asciiIndexToNgram(idx uint32) ngram {
 83	r0 := uint64(idx >> 14)
 84	r1 := uint64((idx >> 7) & 0x7f)
 85	r2 := uint64(idx & 0x7f)
 86	return ngram(r0<<42 | r1<<21 | r2)
 87}
 88
// postingsBuilder accumulates trigram posting lists and rune/byte
// bookkeeping for a sequence of strings added through
// newSearchableString.
type postingsBuilder struct {
	// ASCII trigrams use direct-indexed array (zero hash/probe cost).
	// Non-ASCII trigrams fall back to the map.
	asciiPostings [1 << asciiNgramBits]*postingList
	postings      map[ngram]*postingList

	// asciiPopulated tracks which indices in asciiPostings are non-nil,
	// so reset() and writePostings iterate only populated slots — O(n)
	// where n is unique ASCII trigrams (~275K) instead of O(2M).
	asciiPopulated []uint32

	// To support UTF-8 searching, we must map back runes to byte
	// offsets. As a first attempt, we sample regularly. The
	// precise offset can be found by walking from the recorded
	// offset to the desired rune.
	runeOffsets []uint32
	// runeCount is the total number of runes consumed so far.
	runeCount uint32

	// isPlainASCII stays true until a byte >= utf8.RuneSelf is seen.
	isPlainASCII bool

	// endRunes records the value of runeCount after each added string.
	endRunes []uint32
	// endByte is the total number of bytes of all added strings.
	endByte uint32
}
112
// initialPostingCap is the starting capacity of each posting list's byte
// slice. On the kubernetes corpus (282K unique trigrams), the median
// posting list is 10 bytes and 78% are under 64 bytes (power-law
// distribution). Pre-allocating 64 covers the majority without the
// 244 MB waste that a mean-based value (1024) would cause.
const initialPostingCap = 64

// estimateNgrams returns the size hint used for the non-ASCII postings
// map: one entry per 600 bytes of maximum shard content, but never less
// than 1024. The value intentionally over-estimates (the map only holds
// non-ASCII trigrams) so that the map does not need to rehash.
func estimateNgrams(shardMaxBytes int) int {
	const (
		bytesPerNgram = 600
		minEstimate   = 1024
	)
	return max(shardMaxBytes/bytesPerNgram, minEstimate)
}
130
131func newPostingsBuilder(shardMaxBytes int) *postingsBuilder {
132	return &postingsBuilder{
133		postings:     make(map[ngram]*postingList, estimateNgrams(shardMaxBytes)),
134		isPlainASCII: true,
135	}
136}
137
138// reset clears the builder for reuse. All postingList allocations
139// (backing arrays, map entries, ASCII array slots) are retained so the
140// next shard build avoids re-allocating them.
141// Uses asciiPopulated to reset only populated slots — O(populated)
142// instead of O(2M). Slots are kept non-nil with data truncated to
143// len 0; the hot path uses len(pl.data)==0 to re-record them in
144// asciiPopulated for the next shard.
145func (s *postingsBuilder) reset() {
146	for _, idx := range s.asciiPopulated {
147		pl := s.asciiPostings[idx]
148		pl.data = pl.data[:0]
149		pl.lastOff = 0
150	}
151	s.asciiPopulated = s.asciiPopulated[:0]
152	for _, pl := range s.postings {
153		pl.data = pl.data[:0]
154		pl.lastOff = 0
155	}
156	s.runeOffsets = s.runeOffsets[:0]
157	s.runeCount = 0
158	s.isPlainASCII = true
159	s.endRunes = s.endRunes[:0]
160	s.endByte = 0
161}
162
163// Store trigram offsets for the given UTF-8 data. The
164// DocumentSections must correspond to rune boundaries in the UTF-8
165// data.
166func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
167	dest := searchableString{
168		data: data,
169	}
170	var buf [8]byte
171	var runeGram [3]rune
172
173	var runeIndex uint32
174	byteCount := 0
175	dataSz := uint32(len(data))
176
177	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
178	for _, s := range byteSections {
179		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
180	}
181	var runeSectionBoundaries []uint32
182
183	endRune := s.runeCount
184	for ; len(data) > 0; runeIndex++ {
185		// ASCII fast path: avoid utf8.DecodeRune call overhead.
186		// For source code, 95-99% of bytes are ASCII.
187		var c rune
188		sz := 1
189		if data[0] < utf8.RuneSelf {
190			c = rune(data[0])
191		} else {
192			c, sz = utf8.DecodeRune(data)
193			s.isPlainASCII = false
194		}
195		data = data[sz:]
196
197		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
198
199		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
200			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
201		}
202		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
203			runeSectionBoundaries = append(runeSectionBoundaries,
204				endRune+uint32(runeIndex))
205			byteSectionBoundaries = byteSectionBoundaries[1:]
206		}
207
208		byteCount += sz
209
210		if runeIndex < 2 {
211			continue
212		}
213
214		newOff := endRune + uint32(runeIndex) - 2
215
216		// ASCII trigrams use direct-indexed array (no hash/probe).
217		var pl *postingList
218		if runeGram[0] < utf8.RuneSelf && runeGram[1] < utf8.RuneSelf && runeGram[2] < utf8.RuneSelf {
219			idx := asciiNgramIndex(byte(runeGram[0]), byte(runeGram[1]), byte(runeGram[2]))
220			pl = s.asciiPostings[idx]
221			if pl == nil {
222				pl = &postingList{data: make([]byte, 0, initialPostingCap)}
223				s.asciiPostings[idx] = pl
224				s.asciiPopulated = append(s.asciiPopulated, idx)
225			} else if len(pl.data) == 0 {
226				// Retained from a previous shard (pool reuse) — re-record
227				// in asciiPopulated for this shard's writePostings.
228				s.asciiPopulated = append(s.asciiPopulated, idx)
229			}
230		} else {
231			ng := runesToNGram(runeGram)
232			pl = s.postings[ng]
233			if pl == nil {
234				pl = &postingList{data: make([]byte, 0, initialPostingCap)}
235				s.postings[ng] = pl
236			}
237		}
238		delta := uint64(newOff - pl.lastOff)
239		if delta < 0x80 {
240			// Single-byte varint fast path: ~80% of deltas are < 128.
241			// append(slice, byte) is cheaper than append(slice, slice...).
242			pl.data = append(pl.data, byte(delta))
243		} else {
244			m := binary.PutUvarint(buf[:], delta)
245			pl.data = append(pl.data, buf[:m]...)
246		}
247		pl.lastOff = newOff
248	}
249	s.runeCount += runeIndex
250
251	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
252		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
253	}
254
255	// Handle symbol definition that ends at file end. This can
256	// happen for labels at the end of .bat files.
257
258	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
259		runeSectionBoundaries = append(runeSectionBoundaries,
260			endRune+runeIndex)
261		byteSectionBoundaries = byteSectionBoundaries[1:]
262	}
263	runeSecs := make([]DocumentSection, 0, len(byteSections))
264	for i := 0; i < len(runeSectionBoundaries); i += 2 {
265		runeSecs = append(runeSecs, DocumentSection{
266			Start: runeSectionBoundaries[i],
267			End:   runeSectionBoundaries[i+1],
268		})
269	}
270
271	s.endRunes = append(s.endRunes, s.runeCount)
272	s.endByte += dataSz
273	return &dest, runeSecs, nil
274}
275
// ShardBuilder builds a single index shard. Documents are appended with
// Add; most slice fields below grow by one entry (or a fixed number of
// entries) per added document.
type ShardBuilder struct {
	// The version we will write to disk. Sourcegraph Specific. This is to
	// enable feature flagging new format versions.
	indexFormatVersion int
	featureVersion     int

	// Per-document content and name strings, in insertion order.
	contentStrings []*searchableString
	nameStrings    []*searchableString
	// docSections holds each document's symbol sections in byte offsets;
	// runeDocSections holds all documents' sections in rune offsets.
	docSections     [][]DocumentSection
	runeDocSections []DocumentSection

	// Interning tables handing out sequential IDs for symbol names and
	// symbol kinds.
	symID        uint32
	symIndex     map[string]uint32
	symKindID    uint32
	symKindIndex map[string]uint32
	// symMetaData stores four uint32 values per symbol (see addSymbols).
	symMetaData []uint32

	// fileEndSymbol starts as {0}; after each document the current
	// length of runeDocSections is appended.
	fileEndSymbol []uint32

	// checksums is the concatenation of each document's CRC-64 (ISO)
	// content checksum.
	checksums []byte

	// Per-document branch bitmask and sub-repository index.
	branchMasks []uint64
	subRepos    []uint32

	// docID => repoID
	repos []uint16

	contentPostings *postingsBuilder
	namePostings    *postingsBuilder

	// root repositories
	repoList []zoekt.Repository

	// name to index.
	subRepoIndices []map[string]uint32

	// language => language code
	languageMap map[string]uint16

	// language codes, uint16 encoded as little-endian
	languages []uint8

	// categories holds one encoded file category byte per document.
	categories []byte

	// IndexTime will be used as the time if non-zero. Otherwise
	// time.Now(). This is useful for doing reproducible builds in tests.
	IndexTime time.Time

	// a sortable 20 chars long id.
	ID string
}
328
329func verify(repo *zoekt.Repository) error {
330	for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
331		if _, err := ParseTemplate(t); err != nil {
332			return err
333		}
334	}
335	return nil
336}
337
// urlJoinPath joins elem onto base with url.JoinPath, after replacing
// every "+" in the elements with "%2B". If joining fails, a string
// starting with "#!error: " and carrying the error text is returned
// instead.
func urlJoinPath(base string, elem ...string) string {
	// golangs html/template always escapes "+" appearing in an HTML
	// attribute [1]. We may even want to treat more characters
	// differently, but this at least makes it possible to visit URLs
	// like [2].
	//
	// Only elem is rewritten since base will normally be a hardcoded
	// string. The rewritten elements go into a fresh slice so the
	// caller's arguments are left untouched.
	//
	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
	escaped := make([]string, len(elem))
	for i, part := range elem {
		escaped[i] = strings.ReplaceAll(part, "+", "%2B")
	}

	joined, err := url.JoinPath(base, escaped...)
	if err != nil {
		return "#!error: " + err.Error()
	}
	return joined
}
357
358// ParseTemplate will parse the templates for FileURLTemplate,
359// LineFragmentTemplate and CommitURLTemplate.
360//
361// It makes available the extra function UrlJoinPath.
362func ParseTemplate(text string) (*template.Template, error) {
363	return template.New("").Funcs(template.FuncMap{
364		"URLJoinPath": urlJoinPath,
365	}).Parse(text)
366}
367
368// ContentSize returns the number of content bytes so far ingested.
369func (b *ShardBuilder) ContentSize() uint32 {
370	// Add the name too so we don't skip building index if we have
371	// lots of empty files.
372	return b.contentPostings.endByte + b.namePostings.endByte
373}
374
375// NumFiles returns the number of files added to this builder
376func (b *ShardBuilder) NumFiles() int {
377	return len(b.contentStrings)
378}
379
380// NewShardBuilder creates a fresh ShardBuilder. The passed in
381// Repository contains repo metadata, and may be set to nil.
382func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
383	b := newShardBuilder(0)
384
385	if r == nil {
386		r = &zoekt.Repository{}
387	}
388	if err := b.setRepository(r); err != nil {
389		return nil, err
390	}
391	return b, nil
392}
393
394const defaultShardMax = 100 << 20 // 100 MB, matches Options.ShardMax default
395
396// newShardBuilder creates a ShardBuilder with fresh postingsBuilders.
397// shardMax is the maximum shard content size in bytes (0 uses defaultShardMax).
398func newShardBuilder(shardMax int) *ShardBuilder {
399	if shardMax <= 0 {
400		shardMax = defaultShardMax
401	}
402	return newShardBuilderWithPostings(
403		newPostingsBuilder(shardMax),
404		newPostingsBuilder(shardMax),
405	)
406}
407
408func newShardBuilderWithPostings(content, name *postingsBuilder) *ShardBuilder {
409	return &ShardBuilder{
410		indexFormatVersion: IndexFormatVersion,
411		featureVersion:     FeatureVersion,
412
413		contentPostings: content,
414		namePostings:    name,
415		fileEndSymbol:   []uint32{0},
416		symIndex:        make(map[string]uint32),
417		symKindIndex:    make(map[string]uint32),
418		languageMap:     make(map[string]uint16),
419	}
420}
421
422func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
423	if err := verify(desc); err != nil {
424		return err
425	}
426
427	if len(desc.Branches) > 64 {
428		return fmt.Errorf("too many branches")
429	}
430
431	repo := *desc
432
433	// copy subrepomap without root
434	repo.SubRepoMap = map[string]*zoekt.Repository{}
435	for k, v := range desc.SubRepoMap {
436		if k != "" {
437			repo.SubRepoMap[k] = v
438		}
439	}
440
441	b.repoList = append(b.repoList, repo)
442
443	return b.populateSubRepoIndices()
444}
445
446type symbolSlice struct {
447	symbols  []DocumentSection
448	metaData []*zoekt.Symbol
449}
450
451func (s symbolSlice) Len() int { return len(s.symbols) }
452
453func (s symbolSlice) Swap(i, j int) {
454	s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
455	s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
456}
457
458func (s symbolSlice) Less(i, j int) bool {
459	return s.symbols[i].Start < s.symbols[j].Start
460}
461
462// AddFile is a convenience wrapper for Add
463func (b *ShardBuilder) AddFile(name string, content []byte) error {
464	return b.Add(Document{Name: name, Content: content})
465}
466
467func (b *ShardBuilder) populateSubRepoIndices() error {
468	if len(b.subRepoIndices) == len(b.repoList) {
469		return nil
470	}
471	if len(b.subRepoIndices) != len(b.repoList)-1 {
472		return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
473	}
474	repo := b.repoList[len(b.repoList)-1]
475	b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
476	return nil
477}
478
479func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
480	paths := []string{""}
481	for k := range repo.SubRepoMap {
482		paths = append(paths, k)
483	}
484	sort.Strings(paths)
485	subRepoIndices := make(map[string]uint32, len(paths))
486	for i, p := range paths {
487		subRepoIndices[p] = uint32(i)
488	}
489	return subRepoIndices
490}
491
// notIndexedMarker prefixes the placeholder content that Add stores for
// a skipped document; the skip reason's explanation follows it.
const notIndexedMarker = "NOT-INDEXED: "
493
494func (b *ShardBuilder) symbolID(sym string) uint32 {
495	if _, ok := b.symIndex[sym]; !ok {
496		b.symIndex[sym] = b.symID
497		b.symID++
498	}
499	return b.symIndex[sym]
500}
501
502func (b *ShardBuilder) symbolKindID(t string) uint32 {
503	if _, ok := b.symKindIndex[t]; !ok {
504		b.symKindIndex[t] = b.symKindID
505		b.symKindID++
506	}
507	return b.symKindIndex[t]
508}
509
510func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
511	for _, sym := range symbols {
512		b.symMetaData = append(b.symMetaData,
513			// This field was removed due to redundancy. To avoid
514			// needing to reindex, it is set to zero for now. In the
515			// future, this field will be completely removed. It
516			// will require incrementing the feature version.
517			0,
518			b.symbolKindID(sym.Kind),
519			b.symbolID(sym.Parent),
520			b.symbolKindID(sym.ParentKind))
521	}
522}
523
524func DetermineLanguageIfUnknown(doc *Document) {
525	if doc.Language != "" {
526		return
527	}
528
529	// If this document has been skipped (doc.SkipReason != SkipReasonNone), it's
530	// likely very large, or it's a non-code file like binary. In this case, we just
531	// guess the language based on the file name to avoid examining the contents.
532	// Note: passing nil content is allowed by the go-enry contract (the underlying
533	// library we use here).
534	var content []byte
535	if doc.SkipReason == SkipReasonNone {
536		content = doc.Content
537	}
538	langs := languages.GetLanguagesFromContent(doc.Name, content)
539	if len(langs) > 0 {
540		doc.Language = langs[0]
541	}
542}
543
544// Add a file which only occurs in certain branches.
545func (b *ShardBuilder) Add(doc Document) error {
546	// Skip binary check if already computed (e.g., by Builder.Add
547	// which calls DocChecker.Check before docs reach buildShard).
548	if doc.Category == FileCategoryMissing {
549		if index := bytes.IndexByte(doc.Content, 0); index >= 0 {
550			doc.SkipReason = SkipReasonBinary
551		}
552		// Preserve the original content for category detection in callers that
553		// bypass Builder.Add and pass skipped documents directly.
554		DetermineFileCategory(&doc)
555	}
556
557	if doc.SkipReason != SkipReasonNone {
558		doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
559		doc.Symbols = nil
560		doc.SymbolsMetaData = nil
561	}
562
563	DetermineLanguageIfUnknown(&doc)
564
565	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
566	var last DocumentSection
567	for i, s := range doc.Symbols {
568		if i > 0 {
569			if last.End > s.Start {
570				return fmt.Errorf("sections overlap")
571			}
572		}
573		last = s
574	}
575	if last.End > uint32(len(doc.Content)) {
576		return fmt.Errorf("section goes past end of content")
577	}
578
579	if doc.SubRepositoryPath != "" {
580		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
581		if err != nil || rel == doc.Name {
582			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
583		}
584	}
585	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
586	if err != nil {
587		return err
588	}
589	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
590	if err != nil {
591		return err
592	}
593	b.addSymbols(doc.SymbolsMetaData)
594
595	repoIdx := len(b.repoList) - 1
596	subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
597	if !ok {
598		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
599	}
600
601	var mask uint64
602	for _, br := range doc.Branches {
603		m := b.branchMask(br)
604		if m == 0 {
605			return fmt.Errorf("no branch found for %s", br)
606		}
607		mask |= m
608	}
609
610	if repoIdx > 1<<16 {
611		return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
612	}
613
614	b.subRepos = append(b.subRepos, subRepoIdx)
615	b.repos = append(b.repos, uint16(repoIdx))
616
617	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
618	hasher.Write(doc.Content)
619
620	b.contentStrings = append(b.contentStrings, docStr)
621	b.runeDocSections = append(b.runeDocSections, runeSecs...)
622
623	b.nameStrings = append(b.nameStrings, nameStr)
624	b.docSections = append(b.docSections, doc.Symbols)
625	b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
626	b.branchMasks = append(b.branchMasks, mask)
627	b.checksums = append(b.checksums, hasher.Sum(nil)...)
628
629	langCode, ok := b.languageMap[doc.Language]
630	if !ok {
631		if len(b.languageMap) >= 65535 {
632			return fmt.Errorf("too many languages")
633		}
634		langCode = uint16(len(b.languageMap))
635		b.languageMap[doc.Language] = langCode
636	}
637	b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
638
639	category, err := doc.Category.encode()
640	if err != nil {
641		return err
642	}
643	b.categories = append(b.categories, category)
644
645	return nil
646}
647
648func (b *ShardBuilder) branchMask(br string) uint64 {
649	for i, b := range b.repoList[len(b.repoList)-1].Branches {
650		if b.Name == br {
651			return uint64(1) << uint(i)
652		}
653	}
654	return 0
655}
656
657// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
658// is missing or there are no repos, this returns false.
659func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
660	if len(b.repoList) == 0 {
661		return nil, false
662	}
663
664	ids := make([]uint32, 0, len(b.repoList))
665	for _, repo := range b.repoList {
666		if repo.ID == 0 {
667			return nil, false
668		}
669		ids = append(ids, repo.ID)
670	}
671	return ids, true
672}
673
// DocChecker judges whether file contents look like source text (see
// Check). The zero value is ready to use.
type DocChecker struct {
	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
	trigrams map[ngram]struct{}
}
678
679// Check returns a reason why the given contents are probably not source texts.
680func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
681	if len(content) == 0 {
682		return SkipReasonNone
683	}
684
685	if len(content) < ngramSize {
686		return SkipReasonTooSmall
687	}
688
689	if index := bytes.IndexByte(content, 0); index >= 0 {
690		return SkipReasonBinary
691	}
692
693	// PERF: we only need to do the trigram check if the upperbound on content is greater than
694	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
695	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
696		return SkipReasonNone
697	}
698
699	var cur [3]rune
700	byteCount := 0
701	t.clearTrigrams(maxTrigramCount)
702
703	for len(content) > 0 {
704		r, sz := utf8.DecodeRune(content)
705		content = content[sz:]
706		byteCount += sz
707
708		cur[0], cur[1], cur[2] = cur[1], cur[2], r
709		if cur[0] == 0 {
710			// start of file.
711			continue
712		}
713
714		t.trigrams[runesToNGram(cur)] = struct{}{}
715		if len(t.trigrams) > maxTrigramCount {
716			// probably not text.
717			return SkipReasonTooManyTrigrams
718		}
719	}
720	return SkipReasonNone
721}
722
723func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
724	if t.trigrams == nil {
725		t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
726	}
727	for key := range t.trigrams {
728		delete(t.trigrams, key)
729	}
730}
731
732// shardName returns the name of the shard for the given prefix, version, and
733// shard number.
734func shardName(indexDir string, prefix string, version, n int) string {
735	prefix = url.QueryEscape(prefix)
736	if len(prefix) > 200 {
737		prefix = prefix[:200] + hashString(prefix)[:8]
738	}
739	return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
740}
Configure Feed

Configure Feed