Indexing: improve doc content checks (#688) · boltless.me/zoekt@2a4f4a4

+3 -11

build/builder.go

··· 22 22 "fmt" 23 23 "io" 24 24 "log" 25 - "math" 26 25 "net/url" 27 26 "os" 28 27 "os/exec" ··· 247 246 248 247 nextShardNum int 249 248 todo []*zoekt.Document 249 + docChecker zoekt.DocChecker 250 250 size int 251 251 252 252 parserMap ctags.ParserMap 253 - 254 - building sync.WaitGroup 253 + building sync.WaitGroup 255 254 256 255 errMu sync.Mutex 257 256 buildError error ··· 618 617 } 619 618 620 619 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 621 - 622 - // Adjust trigramMax for allowed large files so we don't exclude them. 623 - trigramMax := b.opts.TrigramMax 624 - if allowLargeFile { 625 - trigramMax = math.MaxInt64 626 - } 627 - 628 620 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 629 621 // We could pass the document on to the shardbuilder, but if 630 622 // we pass through a part of the source tree with binary/large 631 623 // files, the corresponding shard would be mostly empty, so 632 624 // insert a reason here too. 633 625 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 634 - } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil { 626 + } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 635 627 doc.SkipReason = err.Error() 636 628 doc.Language = "binary" 637 629 }

+20 -5

index_test.go

··· 3204 3204 } 3205 3205 } 3206 3206 3207 - func TestCheckText(t *testing.T) { 3207 + func TestDocChecker(t *testing.T) { 3208 + docChecker := DocChecker{} 3209 + 3210 + // Test valid and invalid text 3208 3211 for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { 3209 - if err := CheckText([]byte(text), 20000); err != nil { 3210 - t.Errorf("CheckText(%q): %v", text, err) 3212 + if err := docChecker.Check([]byte(text), 20000, false); err != nil { 3213 + t.Errorf("Check(%q): %v", text, err) 3211 3214 } 3212 3215 } 3213 3216 for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { 3214 - if err := CheckText([]byte(text), 15); err == nil { 3215 - t.Errorf("CheckText(%q) succeeded", text) 3217 + if err := docChecker.Check([]byte(text), 15, false); err == nil { 3218 + t.Errorf("Check(%q) succeeded", text) 3219 + } 3220 + } 3221 + 3222 + // Test valid and invalid text with an allowed large file 3223 + for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { 3224 + if err := docChecker.Check([]byte(text), 15, true); err != nil { 3225 + t.Errorf("Check(%q): %v", text, err) 3226 + } 3227 + } 3228 + for _, text := range []string{"zero\x00byte", "xx"} { 3229 + if err := docChecker.Check([]byte(text), 15, true); err == nil { 3230 + t.Errorf("Check(%q) succeeded", text) 3216 3231 } 3217 3232 } 3218 3233 }

+58 -45

indexbuilder.go

··· 338 338 return b.Add(Document{Name: name, Content: content}) 339 339 } 340 340 341 - // CheckText returns a reason why the given contents are probably not source texts. 342 - func CheckText(content []byte, maxTrigramCount int) error { 343 - if len(content) == 0 { 344 - return nil 345 - } 346 - 347 - if len(content) < ngramSize { 348 - return fmt.Errorf("file size smaller than %d", ngramSize) 349 - } 350 - 351 - // PERF: we only need to do the trigram check if the upperbound on content 352 - // is greater than our threshold. 353 - var trigrams map[ngram]struct{} 354 - if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount { 355 - trigrams = make(map[ngram]struct{}, maxTrigramCount+1) 356 - } 357 - 358 - var cur [3]rune 359 - byteCount := 0 360 - for len(content) > 0 { 361 - if content[0] == 0 { 362 - return fmt.Errorf("binary data at byte offset %d", byteCount) 363 - } 364 - 365 - r, sz := utf8.DecodeRune(content) 366 - content = content[sz:] 367 - byteCount += sz 368 - 369 - cur[0], cur[1], cur[2] = cur[1], cur[2], r 370 - if cur[0] == 0 { 371 - // start of file. 372 - continue 373 - } 374 - 375 - if trigrams != nil { 376 - trigrams[runesToNGram(cur)] = struct{}{} 377 - if len(trigrams) > maxTrigramCount { 378 - // probably not text. 379 - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 380 - } 381 - } 382 - } 383 - return nil 384 - } 385 - 386 341 func (b *IndexBuilder) populateSubRepoIndices() error { 387 342 if len(b.subRepoIndices) == len(b.repoList) { 388 343 return nil ··· 559 514 } 560 515 return 0 561 516 } 517 + 518 + type DocChecker struct { 519 + // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 520 + trigrams map[ngram]struct{} 521 + } 522 + 523 + // Check returns a reason why the given contents are probably not source texts. 524 + func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 525 + if len(content) == 0 { 526 + return nil 527 + } 528 + 529 + if len(content) < ngramSize { 530 + return fmt.Errorf("file size smaller than %d", ngramSize) 531 + } 532 + 533 + if index := bytes.IndexByte(content, 0); index > 0 { 534 + return fmt.Errorf("binary data at byte offset %d", index) 535 + } 536 + 537 + // PERF: we only need to do the trigram check if the upperbound on content is greater than 538 + // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 539 + if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 540 + return nil 541 + } 542 + 543 + var cur [3]rune 544 + byteCount := 0 545 + t.clearTrigrams(maxTrigramCount) 546 + 547 + for len(content) > 0 { 548 + r, sz := utf8.DecodeRune(content) 549 + content = content[sz:] 550 + byteCount += sz 551 + 552 + cur[0], cur[1], cur[2] = cur[1], cur[2], r 553 + if cur[0] == 0 { 554 + // start of file. 555 + continue 556 + } 557 + 558 + t.trigrams[runesToNGram(cur)] = struct{}{} 559 + if len(t.trigrams) > maxTrigramCount { 560 + // probably not text. 561 + return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 562 + } 563 + } 564 + return nil 565 + } 566 + 567 + func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 568 + if t.trigrams == nil { 569 + t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 570 + } 571 + for key := range t.trigrams { 572 + delete(t.trigrams, key) 573 + } 574 + }

Configure Feed

Configure Feed