fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "slices" 27 "sort" 28 "strings" 29 "text/template" 30 "time" 31 "unicode/utf8" 32 33 "github.com/sourcegraph/zoekt" 34 "github.com/sourcegraph/zoekt/languages" 35) 36 37var _ = log.Println 38 39const ngramSize = 3 40 41type searchableString struct { 42 data []byte 43} 44 45// Filled by the linker 46var Version string 47 48func HostnameBestEffort() string { 49 if h := os.Getenv("NODE_NAME"); h != "" { 50 return h 51 } 52 if h := os.Getenv("HOSTNAME"); h != "" { 53 return h 54 } 55 hostname, _ := os.Hostname() 56 return hostname 57} 58 59// Store character (unicode codepoint) offset (in bytes) this often. 60const runeOffsetFrequency = 100 61 62// postingList holds the varint-encoded delta data and last offset for a 63// single ngram. Stored by pointer in the asciiPostings array or the 64// postings map so appending to data does not require rewriting the 65// map entry or array slot. 66type postingList struct { 67 data []byte 68 lastOff uint32 69} 70 71// asciiNgramBits is the number of bits needed to index all ASCII trigrams. 72// ASCII runes are 0-127 (7 bits), so 3 runes = 21 bits = 2M entries. 73const asciiNgramBits = 21 74 75// asciiNgramIndex packs three ASCII bytes into a 21-bit array index. 76func asciiNgramIndex(a, b, c byte) uint32 { 77 return uint32(a)<<14 | uint32(b)<<7 | uint32(c) 78} 79 80// asciiIndexToNgram converts a 21-bit ASCII array index back to the 81// canonical ngram encoding (rune[0]<<42 | rune[1]<<21 | rune[2]). 82func asciiIndexToNgram(idx uint32) ngram { 83 r0 := uint64(idx >> 14) 84 r1 := uint64((idx >> 7) & 0x7f) 85 r2 := uint64(idx & 0x7f) 86 return ngram(r0<<42 | r1<<21 | r2) 87} 88 89type postingsBuilder struct { 90 // ASCII trigrams use direct-indexed array (zero hash/probe cost). 91 // Non-ASCII trigrams fall back to the map. 92 asciiPostings [1 << asciiNgramBits]*postingList 93 postings map[ngram]*postingList 94 95 // asciiPopulated tracks which indices in asciiPostings are non-nil, 96 // so reset() and writePostings iterate only populated slots — O(n) 97 // where n is unique ASCII trigrams (~275K) instead of O(2M). 98 asciiPopulated []uint32 99 100 // To support UTF-8 searching, we must map back runes to byte 101 // offsets. As a first attempt, we sample regularly. The 102 // precise offset can be found by walking from the recorded 103 // offset to the desired rune. 104 runeOffsets []uint32 105 runeCount uint32 106 107 isPlainASCII bool 108 109 endRunes []uint32 110 endByte uint32 111} 112 113// Initial capacity for each posting list's byte slice. On the 114// kubernetes corpus (282K unique trigrams), the median posting list is 115// 10 bytes and 78% are under 64 bytes (power-law distribution). 116// Pre-allocating 64 covers the majority without the 244 MB waste that 117// a mean-based value (1024) would cause. 118const initialPostingCap = 64 119 120// estimateNgrams returns a pre-size hint for the non-ASCII postings map, 121// derived from the maximum shard content size. Intentionally over-estimates 122// (the map only holds non-ASCII trigrams) to avoid rehashing. 123func estimateNgrams(shardMaxBytes int) int { 124 n := shardMaxBytes / 600 125 if n < 1024 { 126 n = 1024 127 } 128 return n 129} 130 131func newPostingsBuilder(shardMaxBytes int) *postingsBuilder { 132 return &postingsBuilder{ 133 postings: make(map[ngram]*postingList, estimateNgrams(shardMaxBytes)), 134 isPlainASCII: true, 135 } 136} 137 138// reset clears the builder for reuse. All postingList allocations 139// (backing arrays, map entries, ASCII array slots) are retained so the 140// next shard build avoids re-allocating them. 141// Uses asciiPopulated to reset only populated slots — O(populated) 142// instead of O(2M). Slots are kept non-nil with data truncated to 143// len 0; the hot path uses len(pl.data)==0 to re-record them in 144// asciiPopulated for the next shard. 145func (s *postingsBuilder) reset() { 146 for _, idx := range s.asciiPopulated { 147 pl := s.asciiPostings[idx] 148 pl.data = pl.data[:0] 149 pl.lastOff = 0 150 } 151 s.asciiPopulated = s.asciiPopulated[:0] 152 for _, pl := range s.postings { 153 pl.data = pl.data[:0] 154 pl.lastOff = 0 155 } 156 s.runeOffsets = s.runeOffsets[:0] 157 s.runeCount = 0 158 s.isPlainASCII = true 159 s.endRunes = s.endRunes[:0] 160 s.endByte = 0 161} 162 163// Store trigram offsets for the given UTF-8 data. The 164// DocumentSections must correspond to rune boundaries in the UTF-8 165// data. 166func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 167 dest := searchableString{ 168 data: data, 169 } 170 var buf [8]byte 171 var runeGram [3]rune 172 173 var runeIndex uint32 174 byteCount := 0 175 dataSz := uint32(len(data)) 176 177 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 178 for _, s := range byteSections { 179 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 180 } 181 var runeSectionBoundaries []uint32 182 183 endRune := s.runeCount 184 for ; len(data) > 0; runeIndex++ { 185 // ASCII fast path: avoid utf8.DecodeRune call overhead. 186 // For source code, 95-99% of bytes are ASCII. 187 var c rune 188 sz := 1 189 if data[0] < utf8.RuneSelf { 190 c = rune(data[0]) 191 } else { 192 c, sz = utf8.DecodeRune(data) 193 s.isPlainASCII = false 194 } 195 data = data[sz:] 196 197 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 198 199 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 200 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 201 } 202 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 203 runeSectionBoundaries = append(runeSectionBoundaries, 204 endRune+uint32(runeIndex)) 205 byteSectionBoundaries = byteSectionBoundaries[1:] 206 } 207 208 byteCount += sz 209 210 if runeIndex < 2 { 211 continue 212 } 213 214 newOff := endRune + uint32(runeIndex) - 2 215 216 // ASCII trigrams use direct-indexed array (no hash/probe). 217 var pl *postingList 218 if runeGram[0] < utf8.RuneSelf && runeGram[1] < utf8.RuneSelf && runeGram[2] < utf8.RuneSelf { 219 idx := asciiNgramIndex(byte(runeGram[0]), byte(runeGram[1]), byte(runeGram[2])) 220 pl = s.asciiPostings[idx] 221 if pl == nil { 222 pl = &postingList{data: make([]byte, 0, initialPostingCap)} 223 s.asciiPostings[idx] = pl 224 s.asciiPopulated = append(s.asciiPopulated, idx) 225 } else if len(pl.data) == 0 { 226 // Retained from a previous shard (pool reuse) — re-record 227 // in asciiPopulated for this shard's writePostings. 228 s.asciiPopulated = append(s.asciiPopulated, idx) 229 } 230 } else { 231 ng := runesToNGram(runeGram) 232 pl = s.postings[ng] 233 if pl == nil { 234 pl = &postingList{data: make([]byte, 0, initialPostingCap)} 235 s.postings[ng] = pl 236 } 237 } 238 m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff)) 239 pl.data = append(pl.data, buf[:m]...) 240 pl.lastOff = newOff 241 } 242 s.runeCount += runeIndex 243 244 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 245 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 246 } 247 248 // Handle symbol definition that ends at file end. This can 249 // happen for labels at the end of .bat files. 250 251 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 252 runeSectionBoundaries = append(runeSectionBoundaries, 253 endRune+runeIndex) 254 byteSectionBoundaries = byteSectionBoundaries[1:] 255 } 256 runeSecs := make([]DocumentSection, 0, len(byteSections)) 257 for i := 0; i < len(runeSectionBoundaries); i += 2 { 258 runeSecs = append(runeSecs, DocumentSection{ 259 Start: runeSectionBoundaries[i], 260 End: runeSectionBoundaries[i+1], 261 }) 262 } 263 264 s.endRunes = append(s.endRunes, s.runeCount) 265 s.endByte += dataSz 266 return &dest, runeSecs, nil 267} 268 269// ShardBuilder builds a single index shard. 270type ShardBuilder struct { 271 // The version we will write to disk. Sourcegraph Specific. This is to 272 // enable feature flagging new format versions. 273 indexFormatVersion int 274 featureVersion int 275 276 contentStrings []*searchableString 277 nameStrings []*searchableString 278 docSections [][]DocumentSection 279 runeDocSections []DocumentSection 280 281 symID uint32 282 symIndex map[string]uint32 283 symKindID uint32 284 symKindIndex map[string]uint32 285 symMetaData []uint32 286 287 fileEndSymbol []uint32 288 289 checksums []byte 290 291 branchMasks []uint64 292 subRepos []uint32 293 294 // docID => repoID 295 repos []uint16 296 297 contentPostings *postingsBuilder 298 namePostings *postingsBuilder 299 300 // root repositories 301 repoList []zoekt.Repository 302 303 // name to index. 304 subRepoIndices []map[string]uint32 305 306 // language => language code 307 languageMap map[string]uint16 308 309 // language codes, uint16 encoded as little-endian 310 languages []uint8 311 312 categories []byte 313 314 // IndexTime will be used as the time if non-zero. Otherwise 315 // time.Now(). This is useful for doing reproducible builds in tests. 316 IndexTime time.Time 317 318 // a sortable 20 chars long id. 319 ID string 320} 321 322func verify(repo *zoekt.Repository) error { 323 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 324 if _, err := ParseTemplate(t); err != nil { 325 return err 326 } 327 } 328 return nil 329} 330 331func urlJoinPath(base string, elem ...string) string { 332 // golangs html/template always escapes "+" appearing in an HTML attribute 333 // [1]. We may even want to treat more characters, differently but this 334 // atleast makes it possible to visit URLs like [2]. 335 // 336 // We only do this to elem since base will normally be a hardcoded string. 337 // 338 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 339 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 340 elem = slices.Clone(elem) // copy to mutate 341 for i := range elem { 342 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 343 } 344 u, err := url.JoinPath(base, elem...) 345 if err != nil { 346 return "#!error: " + err.Error() 347 } 348 return u 349} 350 351// ParseTemplate will parse the templates for FileURLTemplate, 352// LineFragmentTemplate and CommitURLTemplate. 353// 354// It makes available the extra function UrlJoinPath. 355func ParseTemplate(text string) (*template.Template, error) { 356 return template.New("").Funcs(template.FuncMap{ 357 "URLJoinPath": urlJoinPath, 358 }).Parse(text) 359} 360 361// ContentSize returns the number of content bytes so far ingested. 362func (b *ShardBuilder) ContentSize() uint32 { 363 // Add the name too so we don't skip building index if we have 364 // lots of empty files. 365 return b.contentPostings.endByte + b.namePostings.endByte 366} 367 368// NumFiles returns the number of files added to this builder 369func (b *ShardBuilder) NumFiles() int { 370 return len(b.contentStrings) 371} 372 373// NewShardBuilder creates a fresh ShardBuilder. The passed in 374// Repository contains repo metadata, and may be set to nil. 375func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 376 b := newShardBuilder(0) 377 378 if r == nil { 379 r = &zoekt.Repository{} 380 } 381 if err := b.setRepository(r); err != nil { 382 return nil, err 383 } 384 return b, nil 385} 386 387const defaultShardMax = 100 << 20 // 100 MB, matches Options.ShardMax default 388 389// newShardBuilder creates a ShardBuilder with fresh postingsBuilders. 390// shardMax is the maximum shard content size in bytes (0 uses defaultShardMax). 391func newShardBuilder(shardMax int) *ShardBuilder { 392 if shardMax <= 0 { 393 shardMax = defaultShardMax 394 } 395 return newShardBuilderWithPostings( 396 newPostingsBuilder(shardMax), 397 newPostingsBuilder(shardMax), 398 ) 399} 400 401func newShardBuilderWithPostings(content, name *postingsBuilder) *ShardBuilder { 402 return &ShardBuilder{ 403 indexFormatVersion: IndexFormatVersion, 404 featureVersion: FeatureVersion, 405 406 contentPostings: content, 407 namePostings: name, 408 fileEndSymbol: []uint32{0}, 409 symIndex: make(map[string]uint32), 410 symKindIndex: make(map[string]uint32), 411 languageMap: make(map[string]uint16), 412 } 413} 414 415func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 416 if err := verify(desc); err != nil { 417 return err 418 } 419 420 if len(desc.Branches) > 64 { 421 return fmt.Errorf("too many branches") 422 } 423 424 repo := *desc 425 426 // copy subrepomap without root 427 repo.SubRepoMap = map[string]*zoekt.Repository{} 428 for k, v := range desc.SubRepoMap { 429 if k != "" { 430 repo.SubRepoMap[k] = v 431 } 432 } 433 434 b.repoList = append(b.repoList, repo) 435 436 return b.populateSubRepoIndices() 437} 438 439type symbolSlice struct { 440 symbols []DocumentSection 441 metaData []*zoekt.Symbol 442} 443 444func (s symbolSlice) Len() int { return len(s.symbols) } 445 446func (s symbolSlice) Swap(i, j int) { 447 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 448 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 449} 450 451func (s symbolSlice) Less(i, j int) bool { 452 return s.symbols[i].Start < s.symbols[j].Start 453} 454 455// AddFile is a convenience wrapper for Add 456func (b *ShardBuilder) AddFile(name string, content []byte) error { 457 return b.Add(Document{Name: name, Content: content}) 458} 459 460func (b *ShardBuilder) populateSubRepoIndices() error { 461 if len(b.subRepoIndices) == len(b.repoList) { 462 return nil 463 } 464 if len(b.subRepoIndices) != len(b.repoList)-1 { 465 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 466 } 467 repo := b.repoList[len(b.repoList)-1] 468 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 469 return nil 470} 471 472func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 473 paths := []string{""} 474 for k := range repo.SubRepoMap { 475 paths = append(paths, k) 476 } 477 sort.Strings(paths) 478 subRepoIndices := make(map[string]uint32, len(paths)) 479 for i, p := range paths { 480 subRepoIndices[p] = uint32(i) 481 } 482 return subRepoIndices 483} 484 485const notIndexedMarker = "NOT-INDEXED: " 486 487func (b *ShardBuilder) symbolID(sym string) uint32 { 488 if _, ok := b.symIndex[sym]; !ok { 489 b.symIndex[sym] = b.symID 490 b.symID++ 491 } 492 return b.symIndex[sym] 493} 494 495func (b *ShardBuilder) symbolKindID(t string) uint32 { 496 if _, ok := b.symKindIndex[t]; !ok { 497 b.symKindIndex[t] = b.symKindID 498 b.symKindID++ 499 } 500 return b.symKindIndex[t] 501} 502 503func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 504 for _, sym := range symbols { 505 b.symMetaData = append(b.symMetaData, 506 // This field was removed due to redundancy. To avoid 507 // needing to reindex, it is set to zero for now. In the 508 // future, this field will be completely removed. It 509 // will require incrementing the feature version. 510 0, 511 b.symbolKindID(sym.Kind), 512 b.symbolID(sym.Parent), 513 b.symbolKindID(sym.ParentKind)) 514 } 515} 516 517func DetermineLanguageIfUnknown(doc *Document) { 518 if doc.Language != "" { 519 return 520 } 521 522 // If this document has been skipped (doc.SkipReason != SkipReasonNone), it's 523 // likely very large, or it's a non-code file like binary. In this case, we just 524 // guess the language based on the file name to avoid examining the contents. 525 // Note: passing nil content is allowed by the go-enry contract (the underlying 526 // library we use here). 527 var content []byte 528 if doc.SkipReason == SkipReasonNone { 529 content = doc.Content 530 } 531 langs := languages.GetLanguagesFromContent(doc.Name, content) 532 if len(langs) > 0 { 533 doc.Language = langs[0] 534 } 535} 536 537// Add a file which only occurs in certain branches. 538func (b *ShardBuilder) Add(doc Document) error { 539 if index := bytes.IndexByte(doc.Content, 0); index > 0 { 540 doc.SkipReason = SkipReasonBinary 541 } 542 543 if doc.SkipReason != SkipReasonNone { 544 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 545 doc.Symbols = nil 546 doc.SymbolsMetaData = nil 547 } 548 549 DetermineLanguageIfUnknown(&doc) 550 DetermineFileCategory(&doc) 551 552 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 553 var last DocumentSection 554 for i, s := range doc.Symbols { 555 if i > 0 { 556 if last.End > s.Start { 557 return fmt.Errorf("sections overlap") 558 } 559 } 560 last = s 561 } 562 if last.End > uint32(len(doc.Content)) { 563 return fmt.Errorf("section goes past end of content") 564 } 565 566 if doc.SubRepositoryPath != "" { 567 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 568 if err != nil || rel == doc.Name { 569 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 570 } 571 } 572 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 573 if err != nil { 574 return err 575 } 576 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 577 if err != nil { 578 return err 579 } 580 b.addSymbols(doc.SymbolsMetaData) 581 582 repoIdx := len(b.repoList) - 1 583 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 584 if !ok { 585 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 586 } 587 588 var mask uint64 589 for _, br := range doc.Branches { 590 m := b.branchMask(br) 591 if m == 0 { 592 return fmt.Errorf("no branch found for %s", br) 593 } 594 mask |= m 595 } 596 597 if repoIdx > 1<<16 { 598 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 599 } 600 601 b.subRepos = append(b.subRepos, subRepoIdx) 602 b.repos = append(b.repos, uint16(repoIdx)) 603 604 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 605 hasher.Write(doc.Content) 606 607 b.contentStrings = append(b.contentStrings, docStr) 608 b.runeDocSections = append(b.runeDocSections, runeSecs...) 609 610 b.nameStrings = append(b.nameStrings, nameStr) 611 b.docSections = append(b.docSections, doc.Symbols) 612 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 613 b.branchMasks = append(b.branchMasks, mask) 614 b.checksums = append(b.checksums, hasher.Sum(nil)...) 615 616 langCode, ok := b.languageMap[doc.Language] 617 if !ok { 618 if len(b.languageMap) >= 65535 { 619 return fmt.Errorf("too many languages") 620 } 621 langCode = uint16(len(b.languageMap)) 622 b.languageMap[doc.Language] = langCode 623 } 624 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 625 626 category, err := doc.Category.encode() 627 if err != nil { 628 return err 629 } 630 b.categories = append(b.categories, category) 631 632 return nil 633} 634 635func (b *ShardBuilder) branchMask(br string) uint64 { 636 for i, b := range b.repoList[len(b.repoList)-1].Branches { 637 if b.Name == br { 638 return uint64(1) << uint(i) 639 } 640 } 641 return 0 642} 643 644// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 645// is missing or there are no repos, this returns false. 646func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 647 if len(b.repoList) == 0 { 648 return nil, false 649 } 650 651 ids := make([]uint32, 0, len(b.repoList)) 652 for _, repo := range b.repoList { 653 if repo.ID == 0 { 654 return nil, false 655 } 656 ids = append(ids, repo.ID) 657 } 658 return ids, true 659} 660 661type DocChecker struct { 662 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 663 trigrams map[ngram]struct{} 664} 665 666// Check returns a reason why the given contents are probably not source texts. 667func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 668 if len(content) == 0 { 669 return SkipReasonNone 670 } 671 672 if len(content) < ngramSize { 673 return SkipReasonTooSmall 674 } 675 676 if index := bytes.IndexByte(content, 0); index > 0 { 677 return SkipReasonBinary 678 } 679 680 // PERF: we only need to do the trigram check if the upperbound on content is greater than 681 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 682 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 683 return SkipReasonNone 684 } 685 686 var cur [3]rune 687 byteCount := 0 688 t.clearTrigrams(maxTrigramCount) 689 690 for len(content) > 0 { 691 r, sz := utf8.DecodeRune(content) 692 content = content[sz:] 693 byteCount += sz 694 695 cur[0], cur[1], cur[2] = cur[1], cur[2], r 696 if cur[0] == 0 { 697 // start of file. 698 continue 699 } 700 701 t.trigrams[runesToNGram(cur)] = struct{}{} 702 if len(t.trigrams) > maxTrigramCount { 703 // probably not text. 704 return SkipReasonTooManyTrigrams 705 } 706 } 707 return SkipReasonNone 708} 709 710func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 711 if t.trigrams == nil { 712 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 713 } 714 for key := range t.trigrams { 715 delete(t.trigrams, key) 716 } 717} 718 719// shardName returns the name of the shard for the given prefix, version, and 720// shard number. 721func shardName(indexDir string, prefix string, version, n int) string { 722 prefix = url.QueryEscape(prefix) 723 if len(prefix) > 200 { 724 prefix = prefix[:200] + hashString(prefix)[:8] 725 } 726 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 727}