// Fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "encoding/binary"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "net/url"
24 "os"
25 "path/filepath"
26 "slices"
27 "sort"
28 "strings"
29 "text/template"
30 "time"
31 "unicode/utf8"
32
33 "github.com/sourcegraph/zoekt"
34 "github.com/sourcegraph/zoekt/languages"
35)
36
// Keeps the "log" import referenced even when no other code in this file
// uses it.
var _ = log.Println

// ngramSize is the number of runes in each indexed ngram (a trigram).
const ngramSize = 3
40
// searchableString wraps the raw bytes of one string handed to a
// postingsBuilder, such as a document's content or its file name.
type searchableString struct {
	// data is the byte slice passed in by the caller; it is stored as-is,
	// not copied.
	data []byte
}
44
// Version is the build version string. It is filled in by the linker.
var Version string
47
// HostnameBestEffort returns a name for the current host. The NODE_NAME
// environment variable takes precedence, then HOSTNAME, and finally the
// value reported by os.Hostname.
func HostnameBestEffort() string {
	for _, key := range []string{"NODE_NAME", "HOSTNAME"} {
		if value := os.Getenv(key); value != "" {
			return value
		}
	}
	// Best effort: an error from os.Hostname is deliberately ignored and
	// whatever name it returned is passed on.
	name, _ := os.Hostname()
	return name
}
58
// runeOffsetFrequency is the sampling interval, in runes, for recording byte
// offsets: the byte offset of every runeOffsetFrequency-th rune (unicode
// codepoint) is stored.
const runeOffsetFrequency = 100
61
// postingList holds the varint-encoded delta data and last offset for a
// single ngram. Stored by pointer in the asciiPostings array or the
// postings map so appending to data does not require rewriting the
// map entry or array slot.
type postingList struct {
	// data is the concatenation of uvarint-encoded deltas between
	// successive offsets of the ngram.
	data []byte
	// lastOff is the most recently appended offset; the next delta is
	// computed relative to it.
	lastOff uint32
}
70
// asciiNgramBits is the number of bits needed to index all ASCII trigrams.
// ASCII runes are 0-127 (7 bits), so 3 runes = 21 bits = 2M (2^21) entries.
// It determines the length of postingsBuilder.asciiPostings.
const asciiNgramBits = 21
74
// asciiNgramIndex packs three ASCII bytes into a single array index, giving
// each byte a 7-bit field: a occupies the highest field and c the lowest.
func asciiNgramIndex(a, b, c byte) uint32 {
	idx := uint32(a)
	idx = idx<<7 | uint32(b)
	idx = idx<<7 | uint32(c)
	return idx
}
79
80// asciiIndexToNgram converts a 21-bit ASCII array index back to the
81// canonical ngram encoding (rune[0]<<42 | rune[1]<<21 | rune[2]).
82func asciiIndexToNgram(idx uint32) ngram {
83 r0 := uint64(idx >> 14)
84 r1 := uint64((idx >> 7) & 0x7f)
85 r2 := uint64(idx & 0x7f)
86 return ngram(r0<<42 | r1<<21 | r2)
87}
88
// postingsBuilder accumulates, for every trigram seen in the strings passed
// to newSearchableString, the delta-encoded list of rune offsets at which the
// trigram occurs, together with the bookkeeping needed to map rune offsets
// back to byte offsets.
type postingsBuilder struct {
	// ASCII trigrams use direct-indexed array (zero hash/probe cost).
	// Non-ASCII trigrams fall back to the map.
	asciiPostings [1 << asciiNgramBits]*postingList
	postings      map[ngram]*postingList

	// asciiPopulated tracks which indices in asciiPostings are non-nil,
	// so reset() and writePostings iterate only populated slots — O(n)
	// where n is unique ASCII trigrams (~275K) instead of O(2M).
	asciiPopulated []uint32

	// To support UTF-8 searching, we must map back runes to byte
	// offsets. As a first attempt, we sample regularly. The
	// precise offset can be found by walking from the recorded
	// offset to the desired rune.
	runeOffsets []uint32
	// runeCount is the total number of runes ingested so far.
	runeCount uint32

	// isPlainASCII stays true until a non-ASCII byte is ingested.
	isPlainASCII bool

	// endRunes holds the cumulative rune count after each ingested string.
	endRunes []uint32
	// endByte is the total number of bytes ingested so far.
	endByte uint32
}
112
// initialPostingCap is the initial capacity of each posting list's byte
// slice. On the kubernetes corpus (282K unique trigrams), the median posting
// list is 10 bytes and 78% are under 64 bytes (power-law distribution).
// Pre-allocating 64 covers the majority without the 244 MB waste that
// a mean-based value (1024) would cause.
const initialPostingCap = 64
119
// estimateNgrams returns the size hint used to pre-allocate the non-ASCII
// postings map. The hint is one entry per 600 bytes of maximum shard content,
// with a floor of 1024. It deliberately over-estimates (the map only holds
// non-ASCII trigrams) so the map does not need to rehash while filling.
func estimateNgrams(shardMaxBytes int) int {
	const (
		bytesPerEntry = 600
		minEntries    = 1024
	)
	if estimate := shardMaxBytes / bytesPerEntry; estimate >= minEntries {
		return estimate
	}
	return minEntries
}
130
131func newPostingsBuilder(shardMaxBytes int) *postingsBuilder {
132 return &postingsBuilder{
133 postings: make(map[ngram]*postingList, estimateNgrams(shardMaxBytes)),
134 isPlainASCII: true,
135 }
136}
137
138// reset clears the builder for reuse. All postingList allocations
139// (backing arrays, map entries, ASCII array slots) are retained so the
140// next shard build avoids re-allocating them.
141// Uses asciiPopulated to reset only populated slots — O(populated)
142// instead of O(2M). Slots are kept non-nil with data truncated to
143// len 0; the hot path uses len(pl.data)==0 to re-record them in
144// asciiPopulated for the next shard.
145func (s *postingsBuilder) reset() {
146 for _, idx := range s.asciiPopulated {
147 pl := s.asciiPostings[idx]
148 pl.data = pl.data[:0]
149 pl.lastOff = 0
150 }
151 s.asciiPopulated = s.asciiPopulated[:0]
152 for _, pl := range s.postings {
153 pl.data = pl.data[:0]
154 pl.lastOff = 0
155 }
156 s.runeOffsets = s.runeOffsets[:0]
157 s.runeCount = 0
158 s.isPlainASCII = true
159 s.endRunes = s.endRunes[:0]
160 s.endByte = 0
161}
162
163// Store trigram offsets for the given UTF-8 data. The
164// DocumentSections must correspond to rune boundaries in the UTF-8
165// data.
166func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
167 dest := searchableString{
168 data: data,
169 }
170 var buf [8]byte
171 var runeGram [3]rune
172
173 var runeIndex uint32
174 byteCount := 0
175 dataSz := uint32(len(data))
176
177 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
178 for _, s := range byteSections {
179 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
180 }
181 var runeSectionBoundaries []uint32
182
183 endRune := s.runeCount
184 for ; len(data) > 0; runeIndex++ {
185 // ASCII fast path: avoid utf8.DecodeRune call overhead.
186 // For source code, 95-99% of bytes are ASCII.
187 var c rune
188 sz := 1
189 if data[0] < utf8.RuneSelf {
190 c = rune(data[0])
191 } else {
192 c, sz = utf8.DecodeRune(data)
193 s.isPlainASCII = false
194 }
195 data = data[sz:]
196
197 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
198
199 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
200 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
201 }
202 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
203 runeSectionBoundaries = append(runeSectionBoundaries,
204 endRune+uint32(runeIndex))
205 byteSectionBoundaries = byteSectionBoundaries[1:]
206 }
207
208 byteCount += sz
209
210 if runeIndex < 2 {
211 continue
212 }
213
214 newOff := endRune + uint32(runeIndex) - 2
215
216 // ASCII trigrams use direct-indexed array (no hash/probe).
217 var pl *postingList
218 if runeGram[0] < utf8.RuneSelf && runeGram[1] < utf8.RuneSelf && runeGram[2] < utf8.RuneSelf {
219 idx := asciiNgramIndex(byte(runeGram[0]), byte(runeGram[1]), byte(runeGram[2]))
220 pl = s.asciiPostings[idx]
221 if pl == nil {
222 pl = &postingList{data: make([]byte, 0, initialPostingCap)}
223 s.asciiPostings[idx] = pl
224 s.asciiPopulated = append(s.asciiPopulated, idx)
225 } else if len(pl.data) == 0 {
226 // Retained from a previous shard (pool reuse) — re-record
227 // in asciiPopulated for this shard's writePostings.
228 s.asciiPopulated = append(s.asciiPopulated, idx)
229 }
230 } else {
231 ng := runesToNGram(runeGram)
232 pl = s.postings[ng]
233 if pl == nil {
234 pl = &postingList{data: make([]byte, 0, initialPostingCap)}
235 s.postings[ng] = pl
236 }
237 }
238 m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff))
239 pl.data = append(pl.data, buf[:m]...)
240 pl.lastOff = newOff
241 }
242 s.runeCount += runeIndex
243
244 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
245 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
246 }
247
248 // Handle symbol definition that ends at file end. This can
249 // happen for labels at the end of .bat files.
250
251 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
252 runeSectionBoundaries = append(runeSectionBoundaries,
253 endRune+runeIndex)
254 byteSectionBoundaries = byteSectionBoundaries[1:]
255 }
256 runeSecs := make([]DocumentSection, 0, len(byteSections))
257 for i := 0; i < len(runeSectionBoundaries); i += 2 {
258 runeSecs = append(runeSecs, DocumentSection{
259 Start: runeSectionBoundaries[i],
260 End: runeSectionBoundaries[i+1],
261 })
262 }
263
264 s.endRunes = append(s.endRunes, s.runeCount)
265 s.endByte += dataSz
266 return &dest, runeSecs, nil
267}
268
// ShardBuilder builds a single index shard. Documents are added with Add
// (or AddFile); most of the per-document slices below gain one entry per
// added document.
type ShardBuilder struct {
	// The version we will write to disk. Sourcegraph Specific. This is to
	// enable feature flagging new format versions.
	indexFormatVersion int
	featureVersion     int

	// contentStrings and nameStrings hold, per document, its content and
	// its file name.
	contentStrings []*searchableString
	nameStrings    []*searchableString
	// docSections holds, per document, the symbol sections as passed in.
	docSections [][]DocumentSection
	// runeDocSections is the concatenation, over all documents, of the
	// symbol sections converted to rune offsets.
	runeDocSections []DocumentSection

	// symID is the next ID to hand out for a symbol string; symIndex maps
	// already seen strings to their ID. symKindID and symKindIndex do the
	// same for symbol kinds.
	symID        uint32
	symIndex     map[string]uint32
	symKindID    uint32
	symKindIndex map[string]uint32
	// symMetaData holds four uint32 values per symbol, see addSymbols.
	symMetaData []uint32

	// fileEndSymbol starts as {0}; after each document the length of
	// runeDocSections at that point is appended.
	fileEndSymbol []uint32

	// checksums is the concatenation of the CRC-64 (ISO) checksum of each
	// document's content.
	checksums []byte

	// branchMasks holds, per document, the bitmask of branches it occurs in.
	branchMasks []uint64
	// subRepos holds, per document, the index of its sub-repository path.
	subRepos []uint32

	// docID => repoID
	repos []uint16

	contentPostings *postingsBuilder
	namePostings    *postingsBuilder

	// root repositories
	repoList []zoekt.Repository

	// name to index.
	subRepoIndices []map[string]uint32

	// language => language code
	languageMap map[string]uint16

	// language codes, uint16 encoded as little-endian
	languages []uint8

	// categories holds, per document, its encoded category.
	categories []byte

	// IndexTime will be used as the time if non-zero. Otherwise
	// time.Now(). This is useful for doing reproducible builds in tests.
	IndexTime time.Time

	// a sortable 20 chars long id.
	ID string
}
321
322func verify(repo *zoekt.Repository) error {
323 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} {
324 if _, err := ParseTemplate(t); err != nil {
325 return err
326 }
327 }
328 return nil
329}
330
// urlJoinPath joins elem onto base with url.JoinPath, after replacing every
// "+" in the elements with "%2B". If joining fails, a string starting with
// "#!error: " followed by the error text is returned instead. The caller's
// elem slice is not modified.
func urlJoinPath(base string, elem ...string) string {
	// Go's html/template always escapes "+" appearing in an HTML attribute
	// [1]. We may even want to treat more characters differently, but this
	// at least makes it possible to visit URLs like [2].
	//
	// Only elem is rewritten, since base will normally be a hardcoded string.
	//
	// [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80
	// [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift
	escaped := slices.Clone(elem) // work on a copy
	for i, part := range escaped {
		escaped[i] = strings.ReplaceAll(part, "+", "%2B")
	}

	joined, err := url.JoinPath(base, escaped...)
	if err == nil {
		return joined
	}
	return "#!error: " + err.Error()
}
350
351// ParseTemplate will parse the templates for FileURLTemplate,
352// LineFragmentTemplate and CommitURLTemplate.
353//
354// It makes available the extra function UrlJoinPath.
355func ParseTemplate(text string) (*template.Template, error) {
356 return template.New("").Funcs(template.FuncMap{
357 "URLJoinPath": urlJoinPath,
358 }).Parse(text)
359}
360
361// ContentSize returns the number of content bytes so far ingested.
362func (b *ShardBuilder) ContentSize() uint32 {
363 // Add the name too so we don't skip building index if we have
364 // lots of empty files.
365 return b.contentPostings.endByte + b.namePostings.endByte
366}
367
368// NumFiles returns the number of files added to this builder
369func (b *ShardBuilder) NumFiles() int {
370 return len(b.contentStrings)
371}
372
373// NewShardBuilder creates a fresh ShardBuilder. The passed in
374// Repository contains repo metadata, and may be set to nil.
375func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) {
376 b := newShardBuilder(0)
377
378 if r == nil {
379 r = &zoekt.Repository{}
380 }
381 if err := b.setRepository(r); err != nil {
382 return nil, err
383 }
384 return b, nil
385}
386
// defaultShardMax is the shard content size, in bytes, that newShardBuilder
// assumes when the caller passes a non-positive value.
const defaultShardMax = 100 << 20 // 100 MB, matches Options.ShardMax default
388
389// newShardBuilder creates a ShardBuilder with fresh postingsBuilders.
390// shardMax is the maximum shard content size in bytes (0 uses defaultShardMax).
391func newShardBuilder(shardMax int) *ShardBuilder {
392 if shardMax <= 0 {
393 shardMax = defaultShardMax
394 }
395 return newShardBuilderWithPostings(
396 newPostingsBuilder(shardMax),
397 newPostingsBuilder(shardMax),
398 )
399}
400
401func newShardBuilderWithPostings(content, name *postingsBuilder) *ShardBuilder {
402 return &ShardBuilder{
403 indexFormatVersion: IndexFormatVersion,
404 featureVersion: FeatureVersion,
405
406 contentPostings: content,
407 namePostings: name,
408 fileEndSymbol: []uint32{0},
409 symIndex: make(map[string]uint32),
410 symKindIndex: make(map[string]uint32),
411 languageMap: make(map[string]uint16),
412 }
413}
414
415func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error {
416 if err := verify(desc); err != nil {
417 return err
418 }
419
420 if len(desc.Branches) > 64 {
421 return fmt.Errorf("too many branches")
422 }
423
424 repo := *desc
425
426 // copy subrepomap without root
427 repo.SubRepoMap = map[string]*zoekt.Repository{}
428 for k, v := range desc.SubRepoMap {
429 if k != "" {
430 repo.SubRepoMap[k] = v
431 }
432 }
433
434 b.repoList = append(b.repoList, repo)
435
436 return b.populateSubRepoIndices()
437}
438
439type symbolSlice struct {
440 symbols []DocumentSection
441 metaData []*zoekt.Symbol
442}
443
444func (s symbolSlice) Len() int { return len(s.symbols) }
445
446func (s symbolSlice) Swap(i, j int) {
447 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i]
448 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i]
449}
450
451func (s symbolSlice) Less(i, j int) bool {
452 return s.symbols[i].Start < s.symbols[j].Start
453}
454
455// AddFile is a convenience wrapper for Add
456func (b *ShardBuilder) AddFile(name string, content []byte) error {
457 return b.Add(Document{Name: name, Content: content})
458}
459
460func (b *ShardBuilder) populateSubRepoIndices() error {
461 if len(b.subRepoIndices) == len(b.repoList) {
462 return nil
463 }
464 if len(b.subRepoIndices) != len(b.repoList)-1 {
465 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList))
466 }
467 repo := b.repoList[len(b.repoList)-1]
468 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo))
469 return nil
470}
471
472func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 {
473 paths := []string{""}
474 for k := range repo.SubRepoMap {
475 paths = append(paths, k)
476 }
477 sort.Strings(paths)
478 subRepoIndices := make(map[string]uint32, len(paths))
479 for i, p := range paths {
480 subRepoIndices[p] = uint32(i)
481 }
482 return subRepoIndices
483}
484
// notIndexedMarker is the prefix of the placeholder content that Add stores
// for a skipped document; the skip reason's explanation follows it.
const notIndexedMarker = "NOT-INDEXED: "
486
487func (b *ShardBuilder) symbolID(sym string) uint32 {
488 if _, ok := b.symIndex[sym]; !ok {
489 b.symIndex[sym] = b.symID
490 b.symID++
491 }
492 return b.symIndex[sym]
493}
494
495func (b *ShardBuilder) symbolKindID(t string) uint32 {
496 if _, ok := b.symKindIndex[t]; !ok {
497 b.symKindIndex[t] = b.symKindID
498 b.symKindID++
499 }
500 return b.symKindIndex[t]
501}
502
503func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
504 for _, sym := range symbols {
505 b.symMetaData = append(b.symMetaData,
506 // This field was removed due to redundancy. To avoid
507 // needing to reindex, it is set to zero for now. In the
508 // future, this field will be completely removed. It
509 // will require incrementing the feature version.
510 0,
511 b.symbolKindID(sym.Kind),
512 b.symbolID(sym.Parent),
513 b.symbolKindID(sym.ParentKind))
514 }
515}
516
517func DetermineLanguageIfUnknown(doc *Document) {
518 if doc.Language != "" {
519 return
520 }
521
522 // If this document has been skipped (doc.SkipReason != SkipReasonNone), it's
523 // likely very large, or it's a non-code file like binary. In this case, we just
524 // guess the language based on the file name to avoid examining the contents.
525 // Note: passing nil content is allowed by the go-enry contract (the underlying
526 // library we use here).
527 var content []byte
528 if doc.SkipReason == SkipReasonNone {
529 content = doc.Content
530 }
531 langs := languages.GetLanguagesFromContent(doc.Name, content)
532 if len(langs) > 0 {
533 doc.Language = langs[0]
534 }
535}
536
537// Add a file which only occurs in certain branches.
538func (b *ShardBuilder) Add(doc Document) error {
539 if index := bytes.IndexByte(doc.Content, 0); index > 0 {
540 doc.SkipReason = SkipReasonBinary
541 }
542
543 if doc.SkipReason != SkipReasonNone {
544 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
545 doc.Symbols = nil
546 doc.SymbolsMetaData = nil
547 }
548
549 DetermineLanguageIfUnknown(&doc)
550 DetermineFileCategory(&doc)
551
552 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
553 var last DocumentSection
554 for i, s := range doc.Symbols {
555 if i > 0 {
556 if last.End > s.Start {
557 return fmt.Errorf("sections overlap")
558 }
559 }
560 last = s
561 }
562 if last.End > uint32(len(doc.Content)) {
563 return fmt.Errorf("section goes past end of content")
564 }
565
566 if doc.SubRepositoryPath != "" {
567 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
568 if err != nil || rel == doc.Name {
569 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
570 }
571 }
572 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
573 if err != nil {
574 return err
575 }
576 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
577 if err != nil {
578 return err
579 }
580 b.addSymbols(doc.SymbolsMetaData)
581
582 repoIdx := len(b.repoList) - 1
583 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath]
584 if !ok {
585 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
586 }
587
588 var mask uint64
589 for _, br := range doc.Branches {
590 m := b.branchMask(br)
591 if m == 0 {
592 return fmt.Errorf("no branch found for %s", br)
593 }
594 mask |= m
595 }
596
597 if repoIdx > 1<<16 {
598 return fmt.Errorf("too many repos in shard: max is %d", 1<<16)
599 }
600
601 b.subRepos = append(b.subRepos, subRepoIdx)
602 b.repos = append(b.repos, uint16(repoIdx))
603
604 hasher := crc64.New(crc64.MakeTable(crc64.ISO))
605 hasher.Write(doc.Content)
606
607 b.contentStrings = append(b.contentStrings, docStr)
608 b.runeDocSections = append(b.runeDocSections, runeSecs...)
609
610 b.nameStrings = append(b.nameStrings, nameStr)
611 b.docSections = append(b.docSections, doc.Symbols)
612 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections)))
613 b.branchMasks = append(b.branchMasks, mask)
614 b.checksums = append(b.checksums, hasher.Sum(nil)...)
615
616 langCode, ok := b.languageMap[doc.Language]
617 if !ok {
618 if len(b.languageMap) >= 65535 {
619 return fmt.Errorf("too many languages")
620 }
621 langCode = uint16(len(b.languageMap))
622 b.languageMap[doc.Language] = langCode
623 }
624 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))
625
626 category, err := doc.Category.encode()
627 if err != nil {
628 return err
629 }
630 b.categories = append(b.categories, category)
631
632 return nil
633}
634
635func (b *ShardBuilder) branchMask(br string) uint64 {
636 for i, b := range b.repoList[len(b.repoList)-1].Branches {
637 if b.Name == br {
638 return uint64(1) << uint(i)
639 }
640 }
641 return 0
642}
643
644// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
645// is missing or there are no repos, this returns false.
646func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
647 if len(b.repoList) == 0 {
648 return nil, false
649 }
650
651 ids := make([]uint32, 0, len(b.repoList))
652 for _, repo := range b.repoList {
653 if repo.ID == 0 {
654 return nil, false
655 }
656 ids = append(ids, repo.ID)
657 }
658 return ids, true
659}
660
// DocChecker decides, via Check, whether file contents look like source
// text. Its zero value is ready to use; the trigram map is allocated on
// first need.
type DocChecker struct {
	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
	trigrams map[ngram]struct{}
}
665
666// Check returns a reason why the given contents are probably not source texts.
667func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
668 if len(content) == 0 {
669 return SkipReasonNone
670 }
671
672 if len(content) < ngramSize {
673 return SkipReasonTooSmall
674 }
675
676 if index := bytes.IndexByte(content, 0); index > 0 {
677 return SkipReasonBinary
678 }
679
680 // PERF: we only need to do the trigram check if the upperbound on content is greater than
681 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
682 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
683 return SkipReasonNone
684 }
685
686 var cur [3]rune
687 byteCount := 0
688 t.clearTrigrams(maxTrigramCount)
689
690 for len(content) > 0 {
691 r, sz := utf8.DecodeRune(content)
692 content = content[sz:]
693 byteCount += sz
694
695 cur[0], cur[1], cur[2] = cur[1], cur[2], r
696 if cur[0] == 0 {
697 // start of file.
698 continue
699 }
700
701 t.trigrams[runesToNGram(cur)] = struct{}{}
702 if len(t.trigrams) > maxTrigramCount {
703 // probably not text.
704 return SkipReasonTooManyTrigrams
705 }
706 }
707 return SkipReasonNone
708}
709
710func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
711 if t.trigrams == nil {
712 t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
713 }
714 for key := range t.trigrams {
715 delete(t.trigrams, key)
716 }
717}
718
719// shardName returns the name of the shard for the given prefix, version, and
720// shard number.
721func shardName(indexDir string, prefix string, version, n int) string {
722 prefix = url.QueryEscape(prefix)
723 if len(prefix) > 200 {
724 prefix = prefix[:200] + hashString(prefix)[:8]
725 }
726 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
727}