fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 21 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "slices" 27 "sort" 28 "strings" 29 "text/template" 30 "time" 31 "unicode/utf8" 32 33 "github.com/sourcegraph/zoekt" 34 "github.com/sourcegraph/zoekt/languages" 35) 36 37var _ = log.Println 38 39const ngramSize = 3 40 41type searchableString struct { 42 data []byte 43} 44 45// Filled by the linker 46var Version string 47 48func HostnameBestEffort() string { 49 if h := os.Getenv("NODE_NAME"); h != "" { 50 return h 51 } 52 if h := os.Getenv("HOSTNAME"); h != "" { 53 return h 54 } 55 hostname, _ := os.Hostname() 56 return hostname 57} 58 59// Store character (unicode codepoint) offset (in bytes) this often. 60const runeOffsetFrequency = 100 61 62// postingList holds the varint-encoded delta data and last offset for a 63// single ngram. Stored by pointer in the asciiPostings array or the 64// postings map so appending to data does not require rewriting the 65// map entry or array slot. 66type postingList struct { 67 data []byte 68 lastOff uint32 69} 70 71// asciiNgramBits is the number of bits needed to index all ASCII trigrams. 72// ASCII runes are 0-127 (7 bits), so 3 runes = 21 bits = 2M entries. 73const asciiNgramBits = 21 74 75// asciiNgramIndex packs three ASCII bytes into a 21-bit array index. 76func asciiNgramIndex(a, b, c byte) uint32 { 77 return uint32(a)<<14 | uint32(b)<<7 | uint32(c) 78} 79 80// asciiIndexToNgram converts a 21-bit ASCII array index back to the 81// canonical ngram encoding (rune[0]<<42 | rune[1]<<21 | rune[2]). 82func asciiIndexToNgram(idx uint32) ngram { 83 r0 := uint64(idx >> 14) 84 r1 := uint64((idx >> 7) & 0x7f) 85 r2 := uint64(idx & 0x7f) 86 return ngram(r0<<42 | r1<<21 | r2) 87} 88 89type postingsBuilder struct { 90 // ASCII trigrams use direct-indexed array (zero hash/probe cost). 91 // Non-ASCII trigrams fall back to the map. 92 asciiPostings [1 << asciiNgramBits]*postingList 93 postings map[ngram]*postingList 94 95 // asciiPopulated tracks which indices in asciiPostings are non-nil, 96 // so reset() and writePostings iterate only populated slots — O(n) 97 // where n is unique ASCII trigrams (~275K) instead of O(2M). 98 asciiPopulated []uint32 99 100 // To support UTF-8 searching, we must map back runes to byte 101 // offsets. As a first attempt, we sample regularly. The 102 // precise offset can be found by walking from the recorded 103 // offset to the desired rune. 104 runeOffsets []uint32 105 runeCount uint32 106 107 isPlainASCII bool 108 109 endRunes []uint32 110 endByte uint32 111} 112 113// Initial capacity for each posting list's byte slice. On the 114// kubernetes corpus (282K unique trigrams), the median posting list is 115// 10 bytes and 78% are under 64 bytes (power-law distribution). 116// Pre-allocating 64 covers the majority without the 244 MB waste that 117// a mean-based value (1024) would cause. 118const initialPostingCap = 64 119 120// estimateNgrams returns a pre-size hint for the non-ASCII postings map, 121// derived from the maximum shard content size. Intentionally over-estimates 122// (the map only holds non-ASCII trigrams) to avoid rehashing. 123func estimateNgrams(shardMaxBytes int) int { 124 n := shardMaxBytes / 600 125 if n < 1024 { 126 n = 1024 127 } 128 return n 129} 130 131func newPostingsBuilder(shardMaxBytes int) *postingsBuilder { 132 return &postingsBuilder{ 133 postings: make(map[ngram]*postingList, estimateNgrams(shardMaxBytes)), 134 isPlainASCII: true, 135 } 136} 137 138// reset clears the builder for reuse. All postingList allocations 139// (backing arrays, map entries, ASCII array slots) are retained so the 140// next shard build avoids re-allocating them. 141// Uses asciiPopulated to reset only populated slots — O(populated) 142// instead of O(2M). Slots are kept non-nil with data truncated to 143// len 0; the hot path uses len(pl.data)==0 to re-record them in 144// asciiPopulated for the next shard. 145func (s *postingsBuilder) reset() { 146 for _, idx := range s.asciiPopulated { 147 pl := s.asciiPostings[idx] 148 pl.data = pl.data[:0] 149 pl.lastOff = 0 150 } 151 s.asciiPopulated = s.asciiPopulated[:0] 152 for _, pl := range s.postings { 153 pl.data = pl.data[:0] 154 pl.lastOff = 0 155 } 156 s.runeOffsets = s.runeOffsets[:0] 157 s.runeCount = 0 158 s.isPlainASCII = true 159 s.endRunes = s.endRunes[:0] 160 s.endByte = 0 161} 162 163// Store trigram offsets for the given UTF-8 data. The 164// DocumentSections must correspond to rune boundaries in the UTF-8 165// data. 166func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 167 dest := searchableString{ 168 data: data, 169 } 170 var buf [8]byte 171 var runeGram [3]rune 172 173 var runeIndex uint32 174 byteCount := 0 175 dataSz := uint32(len(data)) 176 177 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 178 for _, s := range byteSections { 179 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 180 } 181 var runeSectionBoundaries []uint32 182 183 endRune := s.runeCount 184 for ; len(data) > 0; runeIndex++ { 185 // ASCII fast path: avoid utf8.DecodeRune call overhead. 186 // For source code, 95-99% of bytes are ASCII. 187 var c rune 188 sz := 1 189 if data[0] < utf8.RuneSelf { 190 c = rune(data[0]) 191 } else { 192 c, sz = utf8.DecodeRune(data) 193 s.isPlainASCII = false 194 } 195 data = data[sz:] 196 197 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 198 199 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 200 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 201 } 202 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 203 runeSectionBoundaries = append(runeSectionBoundaries, 204 endRune+uint32(runeIndex)) 205 byteSectionBoundaries = byteSectionBoundaries[1:] 206 } 207 208 byteCount += sz 209 210 if runeIndex < 2 { 211 continue 212 } 213 214 newOff := endRune + uint32(runeIndex) - 2 215 216 // ASCII trigrams use direct-indexed array (no hash/probe). 217 var pl *postingList 218 if runeGram[0] < utf8.RuneSelf && runeGram[1] < utf8.RuneSelf && runeGram[2] < utf8.RuneSelf { 219 idx := asciiNgramIndex(byte(runeGram[0]), byte(runeGram[1]), byte(runeGram[2])) 220 pl = s.asciiPostings[idx] 221 if pl == nil { 222 pl = &postingList{data: make([]byte, 0, initialPostingCap)} 223 s.asciiPostings[idx] = pl 224 s.asciiPopulated = append(s.asciiPopulated, idx) 225 } else if len(pl.data) == 0 { 226 // Retained from a previous shard (pool reuse) — re-record 227 // in asciiPopulated for this shard's writePostings. 228 s.asciiPopulated = append(s.asciiPopulated, idx) 229 } 230 } else { 231 ng := runesToNGram(runeGram) 232 pl = s.postings[ng] 233 if pl == nil { 234 pl = &postingList{data: make([]byte, 0, initialPostingCap)} 235 s.postings[ng] = pl 236 } 237 } 238 delta := uint64(newOff - pl.lastOff) 239 if delta < 0x80 { 240 // Single-byte varint fast path: ~80% of deltas are < 128. 241 // append(slice, byte) is cheaper than append(slice, slice...). 242 pl.data = append(pl.data, byte(delta)) 243 } else { 244 m := binary.PutUvarint(buf[:], delta) 245 pl.data = append(pl.data, buf[:m]...) 246 } 247 pl.lastOff = newOff 248 } 249 s.runeCount += runeIndex 250 251 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 252 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 253 } 254 255 // Handle symbol definition that ends at file end. This can 256 // happen for labels at the end of .bat files. 257 258 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 259 runeSectionBoundaries = append(runeSectionBoundaries, 260 endRune+runeIndex) 261 byteSectionBoundaries = byteSectionBoundaries[1:] 262 } 263 runeSecs := make([]DocumentSection, 0, len(byteSections)) 264 for i := 0; i < len(runeSectionBoundaries); i += 2 { 265 runeSecs = append(runeSecs, DocumentSection{ 266 Start: runeSectionBoundaries[i], 267 End: runeSectionBoundaries[i+1], 268 }) 269 } 270 271 s.endRunes = append(s.endRunes, s.runeCount) 272 s.endByte += dataSz 273 return &dest, runeSecs, nil 274} 275 276// ShardBuilder builds a single index shard. 277type ShardBuilder struct { 278 // The version we will write to disk. Sourcegraph Specific. This is to 279 // enable feature flagging new format versions. 280 indexFormatVersion int 281 featureVersion int 282 283 contentStrings []*searchableString 284 nameStrings []*searchableString 285 docSections [][]DocumentSection 286 runeDocSections []DocumentSection 287 288 symID uint32 289 symIndex map[string]uint32 290 symKindID uint32 291 symKindIndex map[string]uint32 292 symMetaData []uint32 293 294 fileEndSymbol []uint32 295 296 checksums []byte 297 298 branchMasks []uint64 299 subRepos []uint32 300 301 // docID => repoID 302 repos []uint16 303 304 contentPostings *postingsBuilder 305 namePostings *postingsBuilder 306 307 // root repositories 308 repoList []zoekt.Repository 309 310 // name to index. 311 subRepoIndices []map[string]uint32 312 313 // language => language code 314 languageMap map[string]uint16 315 316 // language codes, uint16 encoded as little-endian 317 languages []uint8 318 319 categories []byte 320 321 // IndexTime will be used as the time if non-zero. Otherwise 322 // time.Now(). This is useful for doing reproducible builds in tests. 323 IndexTime time.Time 324 325 // a sortable 20 chars long id. 326 ID string 327} 328 329func verify(repo *zoekt.Repository) error { 330 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 331 if _, err := ParseTemplate(t); err != nil { 332 return err 333 } 334 } 335 return nil 336} 337 338func urlJoinPath(base string, elem ...string) string { 339 // golangs html/template always escapes "+" appearing in an HTML attribute 340 // [1]. We may even want to treat more characters, differently but this 341 // atleast makes it possible to visit URLs like [2]. 342 // 343 // We only do this to elem since base will normally be a hardcoded string. 344 // 345 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 346 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 347 elem = slices.Clone(elem) // copy to mutate 348 for i := range elem { 349 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 350 } 351 u, err := url.JoinPath(base, elem...) 352 if err != nil { 353 return "#!error: " + err.Error() 354 } 355 return u 356} 357 358// ParseTemplate will parse the templates for FileURLTemplate, 359// LineFragmentTemplate and CommitURLTemplate. 360// 361// It makes available the extra function UrlJoinPath. 362func ParseTemplate(text string) (*template.Template, error) { 363 return template.New("").Funcs(template.FuncMap{ 364 "URLJoinPath": urlJoinPath, 365 }).Parse(text) 366} 367 368// ContentSize returns the number of content bytes so far ingested. 369func (b *ShardBuilder) ContentSize() uint32 { 370 // Add the name too so we don't skip building index if we have 371 // lots of empty files. 372 return b.contentPostings.endByte + b.namePostings.endByte 373} 374 375// NumFiles returns the number of files added to this builder 376func (b *ShardBuilder) NumFiles() int { 377 return len(b.contentStrings) 378} 379 380// NewShardBuilder creates a fresh ShardBuilder. The passed in 381// Repository contains repo metadata, and may be set to nil. 382func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 383 b := newShardBuilder(0) 384 385 if r == nil { 386 r = &zoekt.Repository{} 387 } 388 if err := b.setRepository(r); err != nil { 389 return nil, err 390 } 391 return b, nil 392} 393 394const defaultShardMax = 100 << 20 // 100 MB, matches Options.ShardMax default 395 396// newShardBuilder creates a ShardBuilder with fresh postingsBuilders. 397// shardMax is the maximum shard content size in bytes (0 uses defaultShardMax). 398func newShardBuilder(shardMax int) *ShardBuilder { 399 if shardMax <= 0 { 400 shardMax = defaultShardMax 401 } 402 return newShardBuilderWithPostings( 403 newPostingsBuilder(shardMax), 404 newPostingsBuilder(shardMax), 405 ) 406} 407 408func newShardBuilderWithPostings(content, name *postingsBuilder) *ShardBuilder { 409 return &ShardBuilder{ 410 indexFormatVersion: IndexFormatVersion, 411 featureVersion: FeatureVersion, 412 413 contentPostings: content, 414 namePostings: name, 415 fileEndSymbol: []uint32{0}, 416 symIndex: make(map[string]uint32), 417 symKindIndex: make(map[string]uint32), 418 languageMap: make(map[string]uint16), 419 } 420} 421 422func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 423 if err := verify(desc); err != nil { 424 return err 425 } 426 427 if len(desc.Branches) > 64 { 428 return fmt.Errorf("too many branches") 429 } 430 431 repo := *desc 432 433 // copy subrepomap without root 434 repo.SubRepoMap = map[string]*zoekt.Repository{} 435 for k, v := range desc.SubRepoMap { 436 if k != "" { 437 repo.SubRepoMap[k] = v 438 } 439 } 440 441 b.repoList = append(b.repoList, repo) 442 443 return b.populateSubRepoIndices() 444} 445 446type symbolSlice struct { 447 symbols []DocumentSection 448 metaData []*zoekt.Symbol 449} 450 451func (s symbolSlice) Len() int { return len(s.symbols) } 452 453func (s symbolSlice) Swap(i, j int) { 454 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 455 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 456} 457 458func (s symbolSlice) Less(i, j int) bool { 459 return s.symbols[i].Start < s.symbols[j].Start 460} 461 462// AddFile is a convenience wrapper for Add 463func (b *ShardBuilder) AddFile(name string, content []byte) error { 464 return b.Add(Document{Name: name, Content: content}) 465} 466 467func (b *ShardBuilder) populateSubRepoIndices() error { 468 if len(b.subRepoIndices) == len(b.repoList) { 469 return nil 470 } 471 if len(b.subRepoIndices) != len(b.repoList)-1 { 472 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 473 } 474 repo := b.repoList[len(b.repoList)-1] 475 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 476 return nil 477} 478 479func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 480 paths := []string{""} 481 for k := range repo.SubRepoMap { 482 paths = append(paths, k) 483 } 484 sort.Strings(paths) 485 subRepoIndices := make(map[string]uint32, len(paths)) 486 for i, p := range paths { 487 subRepoIndices[p] = uint32(i) 488 } 489 return subRepoIndices 490} 491 492const notIndexedMarker = "NOT-INDEXED: " 493 494func (b *ShardBuilder) symbolID(sym string) uint32 { 495 if _, ok := b.symIndex[sym]; !ok { 496 b.symIndex[sym] = b.symID 497 b.symID++ 498 } 499 return b.symIndex[sym] 500} 501 502func (b *ShardBuilder) symbolKindID(t string) uint32 { 503 if _, ok := b.symKindIndex[t]; !ok { 504 b.symKindIndex[t] = b.symKindID 505 b.symKindID++ 506 } 507 return b.symKindIndex[t] 508} 509 510func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 511 for _, sym := range symbols { 512 b.symMetaData = append(b.symMetaData, 513 // This field was removed due to redundancy. To avoid 514 // needing to reindex, it is set to zero for now. In the 515 // future, this field will be completely removed. It 516 // will require incrementing the feature version. 517 0, 518 b.symbolKindID(sym.Kind), 519 b.symbolID(sym.Parent), 520 b.symbolKindID(sym.ParentKind)) 521 } 522} 523 524func DetermineLanguageIfUnknown(doc *Document) { 525 if doc.Language != "" { 526 return 527 } 528 529 // If this document has been skipped (doc.SkipReason != SkipReasonNone), it's 530 // likely very large, or it's a non-code file like binary. In this case, we just 531 // guess the language based on the file name to avoid examining the contents. 532 // Note: passing nil content is allowed by the go-enry contract (the underlying 533 // library we use here). 534 var content []byte 535 if doc.SkipReason == SkipReasonNone { 536 content = doc.Content 537 } 538 langs := languages.GetLanguagesFromContent(doc.Name, content) 539 if len(langs) > 0 { 540 doc.Language = langs[0] 541 } 542} 543 544// Add a file which only occurs in certain branches. 545func (b *ShardBuilder) Add(doc Document) error { 546 // Skip binary check if already computed (e.g., by Builder.Add 547 // which calls DocChecker.Check before docs reach buildShard). 548 if doc.Category == FileCategoryMissing { 549 if index := bytes.IndexByte(doc.Content, 0); index >= 0 { 550 doc.SkipReason = SkipReasonBinary 551 } 552 // Preserve the original content for category detection in callers that 553 // bypass Builder.Add and pass skipped documents directly. 554 DetermineFileCategory(&doc) 555 } 556 557 if doc.SkipReason != SkipReasonNone { 558 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 559 doc.Symbols = nil 560 doc.SymbolsMetaData = nil 561 } 562 563 DetermineLanguageIfUnknown(&doc) 564 565 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 566 var last DocumentSection 567 for i, s := range doc.Symbols { 568 if i > 0 { 569 if last.End > s.Start { 570 return fmt.Errorf("sections overlap") 571 } 572 } 573 last = s 574 } 575 if last.End > uint32(len(doc.Content)) { 576 return fmt.Errorf("section goes past end of content") 577 } 578 579 if doc.SubRepositoryPath != "" { 580 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 581 if err != nil || rel == doc.Name { 582 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 583 } 584 } 585 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 586 if err != nil { 587 return err 588 } 589 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 590 if err != nil { 591 return err 592 } 593 b.addSymbols(doc.SymbolsMetaData) 594 595 repoIdx := len(b.repoList) - 1 596 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 597 if !ok { 598 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 599 } 600 601 var mask uint64 602 for _, br := range doc.Branches { 603 m := b.branchMask(br) 604 if m == 0 { 605 return fmt.Errorf("no branch found for %s", br) 606 } 607 mask |= m 608 } 609 610 if repoIdx > 1<<16 { 611 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 612 } 613 614 b.subRepos = append(b.subRepos, subRepoIdx) 615 b.repos = append(b.repos, uint16(repoIdx)) 616 617 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 618 hasher.Write(doc.Content) 619 620 b.contentStrings = append(b.contentStrings, docStr) 621 b.runeDocSections = append(b.runeDocSections, runeSecs...) 622 623 b.nameStrings = append(b.nameStrings, nameStr) 624 b.docSections = append(b.docSections, doc.Symbols) 625 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 626 b.branchMasks = append(b.branchMasks, mask) 627 b.checksums = append(b.checksums, hasher.Sum(nil)...) 628 629 langCode, ok := b.languageMap[doc.Language] 630 if !ok { 631 if len(b.languageMap) >= 65535 { 632 return fmt.Errorf("too many languages") 633 } 634 langCode = uint16(len(b.languageMap)) 635 b.languageMap[doc.Language] = langCode 636 } 637 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 638 639 category, err := doc.Category.encode() 640 if err != nil { 641 return err 642 } 643 b.categories = append(b.categories, category) 644 645 return nil 646} 647 648func (b *ShardBuilder) branchMask(br string) uint64 { 649 for i, b := range b.repoList[len(b.repoList)-1].Branches { 650 if b.Name == br { 651 return uint64(1) << uint(i) 652 } 653 } 654 return 0 655} 656 657// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 658// is missing or there are no repos, this returns false. 659func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 660 if len(b.repoList) == 0 { 661 return nil, false 662 } 663 664 ids := make([]uint32, 0, len(b.repoList)) 665 for _, repo := range b.repoList { 666 if repo.ID == 0 { 667 return nil, false 668 } 669 ids = append(ids, repo.ID) 670 } 671 return ids, true 672} 673 674type DocChecker struct { 675 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 676 trigrams map[ngram]struct{} 677} 678 679// Check returns a reason why the given contents are probably not source texts. 680func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 681 if len(content) == 0 { 682 return SkipReasonNone 683 } 684 685 if len(content) < ngramSize { 686 return SkipReasonTooSmall 687 } 688 689 if index := bytes.IndexByte(content, 0); index >= 0 { 690 return SkipReasonBinary 691 } 692 693 // PERF: we only need to do the trigram check if the upperbound on content is greater than 694 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 695 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 696 return SkipReasonNone 697 } 698 699 var cur [3]rune 700 byteCount := 0 701 t.clearTrigrams(maxTrigramCount) 702 703 for len(content) > 0 { 704 r, sz := utf8.DecodeRune(content) 705 content = content[sz:] 706 byteCount += sz 707 708 cur[0], cur[1], cur[2] = cur[1], cur[2], r 709 if cur[0] == 0 { 710 // start of file. 711 continue 712 } 713 714 t.trigrams[runesToNGram(cur)] = struct{}{} 715 if len(t.trigrams) > maxTrigramCount { 716 // probably not text. 717 return SkipReasonTooManyTrigrams 718 } 719 } 720 return SkipReasonNone 721} 722 723func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 724 if t.trigrams == nil { 725 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 726 } 727 for key := range t.trigrams { 728 delete(t.trigrams, key) 729 } 730} 731 732// shardName returns the name of the shard for the given prefix, version, and 733// shard number. 734func shardName(indexDir string, prefix string, version, n int) string { 735 prefix = url.QueryEscape(prefix) 736 if len(prefix) > 200 { 737 prefix = prefix[:200] + hashString(prefix)[:8] 738 } 739 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 740}