fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Indexing: improve doc content checks (#688)

This change makes a couple improvements to the doc content checks during indexing. When a large file is explicitly marked as "allowed", we don't enforce the max trigram count. However, we still iterated through all its trigrams and collected them in a map. Now we short-circuit the check to avoid counting all the trigrams.

We now also reuse the trigram map across documents. This makes sense, as it's always presized with the same capacity hint. This doesn't have a significant effect on indexing speed, but significantly reduces allocations

+81 -61
+3 -11
build/builder.go
··· 22 22 "fmt" 23 23 "io" 24 24 "log" 25 - "math" 26 25 "net/url" 27 26 "os" 28 27 "os/exec" ··· 247 246 248 247 nextShardNum int 249 248 todo []*zoekt.Document 249 + docChecker zoekt.DocChecker 250 250 size int 251 251 252 252 parserMap ctags.ParserMap 253 - 254 - building sync.WaitGroup 253 + building sync.WaitGroup 255 254 256 255 errMu sync.Mutex 257 256 buildError error ··· 618 617 } 619 618 620 619 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 621 - 622 - // Adjust trigramMax for allowed large files so we don't exclude them. 623 - trigramMax := b.opts.TrigramMax 624 - if allowLargeFile { 625 - trigramMax = math.MaxInt64 626 - } 627 - 628 620 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 629 621 // We could pass the document on to the shardbuilder, but if 630 622 // we pass through a part of the source tree with binary/large 631 623 // files, the corresponding shard would be mostly empty, so 632 624 // insert a reason here too. 633 625 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 634 - } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil { 626 + } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 635 627 doc.SkipReason = err.Error() 636 628 doc.Language = "binary" 637 629 }
+20 -5
index_test.go
··· 3204 3204 } 3205 3205 } 3206 3206 3207 - func TestCheckText(t *testing.T) { 3207 + func TestDocChecker(t *testing.T) { 3208 + docChecker := DocChecker{} 3209 + 3210 + // Test valid and invalid text 3208 3211 for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { 3209 - if err := CheckText([]byte(text), 20000); err != nil { 3210 - t.Errorf("CheckText(%q): %v", text, err) 3212 + if err := docChecker.Check([]byte(text), 20000, false); err != nil { 3213 + t.Errorf("Check(%q): %v", text, err) 3211 3214 } 3212 3215 } 3213 3216 for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { 3214 - if err := CheckText([]byte(text), 15); err == nil { 3215 - t.Errorf("CheckText(%q) succeeded", text) 3217 + if err := docChecker.Check([]byte(text), 15, false); err == nil { 3218 + t.Errorf("Check(%q) succeeded", text) 3219 + } 3220 + } 3221 + 3222 + // Test valid and invalid text with an allowed large file 3223 + for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { 3224 + if err := docChecker.Check([]byte(text), 15, true); err != nil { 3225 + t.Errorf("Check(%q): %v", text, err) 3226 + } 3227 + } 3228 + for _, text := range []string{"zero\x00byte", "xx"} { 3229 + if err := docChecker.Check([]byte(text), 15, true); err == nil { 3230 + t.Errorf("Check(%q) succeeded", text) 3216 3231 } 3217 3232 } 3218 3233 }
+58 -45
indexbuilder.go
··· 338 338 return b.Add(Document{Name: name, Content: content}) 339 339 } 340 340 341 - // CheckText returns a reason why the given contents are probably not source texts. 342 - func CheckText(content []byte, maxTrigramCount int) error { 343 - if len(content) == 0 { 344 - return nil 345 - } 346 - 347 - if len(content) < ngramSize { 348 - return fmt.Errorf("file size smaller than %d", ngramSize) 349 - } 350 - 351 - // PERF: we only need to do the trigram check if the upperbound on content 352 - // is greater than our threshold. 353 - var trigrams map[ngram]struct{} 354 - if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount { 355 - trigrams = make(map[ngram]struct{}, maxTrigramCount+1) 356 - } 357 - 358 - var cur [3]rune 359 - byteCount := 0 360 - for len(content) > 0 { 361 - if content[0] == 0 { 362 - return fmt.Errorf("binary data at byte offset %d", byteCount) 363 - } 364 - 365 - r, sz := utf8.DecodeRune(content) 366 - content = content[sz:] 367 - byteCount += sz 368 - 369 - cur[0], cur[1], cur[2] = cur[1], cur[2], r 370 - if cur[0] == 0 { 371 - // start of file. 372 - continue 373 - } 374 - 375 - if trigrams != nil { 376 - trigrams[runesToNGram(cur)] = struct{}{} 377 - if len(trigrams) > maxTrigramCount { 378 - // probably not text. 379 - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 380 - } 381 - } 382 - } 383 - return nil 384 - } 385 - 386 341 func (b *IndexBuilder) populateSubRepoIndices() error { 387 342 if len(b.subRepoIndices) == len(b.repoList) { 388 343 return nil ··· 559 514 } 560 515 return 0 561 516 } 517 + 518 + type DocChecker struct { 519 + // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 520 + trigrams map[ngram]struct{} 521 + } 522 + 523 + // Check returns a reason why the given contents are probably not source texts. 524 + func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 525 + if len(content) == 0 { 526 + return nil 527 + } 528 + 529 + if len(content) < ngramSize { 530 + return fmt.Errorf("file size smaller than %d", ngramSize) 531 + } 532 + 533 + if index := bytes.IndexByte(content, 0); index > 0 { 534 + return fmt.Errorf("binary data at byte offset %d", index) 535 + } 536 + 537 + // PERF: we only need to do the trigram check if the upperbound on content is greater than 538 + // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 539 + if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 540 + return nil 541 + } 542 + 543 + var cur [3]rune 544 + byteCount := 0 545 + t.clearTrigrams(maxTrigramCount) 546 + 547 + for len(content) > 0 { 548 + r, sz := utf8.DecodeRune(content) 549 + content = content[sz:] 550 + byteCount += sz 551 + 552 + cur[0], cur[1], cur[2] = cur[1], cur[2], r 553 + if cur[0] == 0 { 554 + // start of file. 555 + continue 556 + } 557 + 558 + t.trigrams[runesToNGram(cur)] = struct{}{} 559 + if len(t.trigrams) > maxTrigramCount { 560 + // probably not text. 561 + return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 562 + } 563 + } 564 + return nil 565 + } 566 + 567 + func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 568 + if t.trigrams == nil { 569 + t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 570 + } 571 + for key := range t.trigrams { 572 + delete(t.trigrams, key) 573 + } 574 + }