ranking: downweight binary files (#924) · boltless.me/zoekt@05b2b92

+1 -1

cmd/zoekt-index/main.go

··· 137 137 if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) { 138 138 if err := builder.Add(index.Document{ 139 139 Name: displayName, 140 - SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax), 140 + SkipReason: index.SkipReasonTooLarge, 141 141 }); err != nil { 142 142 return err 143 143 }

+6 -28

index/builder.go

··· 601 601 // we pass through a part of the source tree with binary/large 602 602 // files, the corresponding shard would be mostly empty, so 603 603 // insert a reason here too. 604 - doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 605 - } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 606 - doc.SkipReason = err.Error() 604 + doc.SkipReason = SkipReasonTooLarge 605 + } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { 606 + doc.SkipReason = skip 607 607 } 608 608 609 609 b.todo = append(b.todo, &doc) 610 610 611 - if doc.SkipReason == "" { 611 + if doc.SkipReason == SkipReasonNone { 612 612 b.size += len(doc.Name) + len(doc.Content) 613 613 } else { 614 - b.size += len(doc.Name) + len(doc.SkipReason) 614 + b.size += len(doc.Name) 615 615 // Drop the content if we are skipping the document. Skipped content is not counted towards the 616 616 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 617 617 doc.Content = nil ··· 865 865 // have a higher chance of being searched before limits kick in. 866 866 func rank(d *Document, origIdx int) []float64 { 867 867 skipped := 0.0 868 - if d.SkipReason != "" { 868 + if d.SkipReason != SkipReasonNone { 869 869 skipped = 1.0 870 870 } 871 871 ··· 1072 1072 1073 1073 func (e *deltaIndexOptionsMismatchError) Error() string { 1074 1074 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1075 - } 1076 - 1077 - // Document holds a document (file) to index. 1078 - type Document struct { 1079 - Name string 1080 - Content []byte 1081 - Branches []string 1082 - SubRepositoryPath string 1083 - Language string 1084 - Category FileCategory 1085 - 1086 - // If set, something is wrong with the file contents, and this 1087 - // is the reason it wasn't indexed. 1088 - SkipReason string 1089 - 1090 - // Document sections for symbols. Offsets should use bytes. 1091 - Symbols []DocumentSection 1092 - SymbolsMetaData []*zoekt.Symbol 1093 - } 1094 - 1095 - type DocumentSection struct { 1096 - Start, End uint32 1097 1075 } 1098 1076 1099 1077 // umask holds the Umask of the current process

+3 -3

index/builder_test.go

··· 246 246 if err != nil { 247 247 t.Fatal(err) 248 248 } 249 - if len(b.todo) != 1 || b.todo[0].SkipReason == "" { 249 + if len(b.todo) != 1 || b.todo[0].SkipReason == SkipReasonNone { 250 250 t.Fatalf("document should have been skipped") 251 251 } 252 252 if b.todo[0].Content != nil { ··· 1143 1143 docs: []*Document{ 1144 1144 { 1145 1145 Name: "binary_file", 1146 - SkipReason: "binary file", 1146 + SkipReason: SkipReasonBinary, 1147 1147 }, 1148 1148 { 1149 1149 Name: "some_test.go", ··· 1151 1151 }, 1152 1152 { 1153 1153 Name: "large_file.go", 1154 - SkipReason: "too large", 1154 + SkipReason: SkipReasonTooLarge, 1155 1155 }, 1156 1156 { 1157 1157 Name: "file.go",

+50

index/document.go

··· 1 + package index 2 + 3 + import "github.com/sourcegraph/zoekt" 4 + 5 + // Document holds a document (file) to index. 6 + type Document struct { 7 + Name string 8 + Content []byte 9 + Branches []string 10 + SubRepositoryPath string 11 + Language string 12 + Category FileCategory 13 + 14 + SkipReason SkipReason 15 + 16 + // Document sections for symbols. Offsets should use bytes. 17 + Symbols []DocumentSection 18 + SymbolsMetaData []*zoekt.Symbol 19 + } 20 + 21 + type SkipReason int 22 + 23 + const ( 24 + SkipReasonNone SkipReason = iota 25 + SkipReasonTooLarge 26 + SkipReasonTooSmall 27 + SkipReasonBinary 28 + SkipReasonTooManyTrigrams 29 + ) 30 + 31 + func (s SkipReason) explanation() string { 32 + switch s { 33 + case SkipReasonNone: 34 + return "" 35 + case SkipReasonTooLarge: 36 + return "exceeds the maximum size limit" 37 + case SkipReasonTooSmall: 38 + return "contains too few trigrams" 39 + case SkipReasonBinary: 40 + return "contains binary content" 41 + case SkipReasonTooManyTrigrams: 42 + return "contains too many trigrams" 43 + default: 44 + return "unknown skip reason" 45 + } 46 + } 47 + 48 + type DocumentSection struct { 49 + Start, End uint32 50 + }

+14 -4

index/file_category.go

··· 22 22 FileCategoryGenerated 23 23 FileCategoryConfig 24 24 FileCategoryDotFile 25 + FileCategoryBinary 25 26 FileCategoryDocumentation 26 27 ) 27 28 28 29 func DetermineFileCategory(doc *Document) { 30 + if doc.SkipReason == SkipReasonBinary { 31 + doc.Category = FileCategoryBinary 32 + return 33 + } 34 + 29 35 name := doc.Name 30 36 content := doc.Content 31 37 32 - // If this document has been skipped, it's likely very large. In this case, we just guess the category based 33 - // on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract. 34 - if doc.SkipReason != "" { 38 + // If this document was skipped because it was too large, just guess the category based on the filename to avoid 39 + // examining the contents. Note: passing nil content is allowed by the go-enry contract. 40 + if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary { 35 41 content = nil 36 42 } 37 43 ··· 56 62 // lowPriority returns true if this file category is considered 'low priority'. This is used 57 63 // in search scoring to down-weight matches in these files. 58 64 func (c FileCategory) lowPriority() bool { 59 - return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated 65 + return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary 60 66 } 61 67 62 68 func (c FileCategory) encode() (byte, error) { ··· 77 83 return 6, nil 78 84 case FileCategoryDocumentation: 79 85 return 7, nil 86 + case FileCategoryBinary: 87 + return 8, nil 80 88 default: 81 89 return 0, errors.New("unrecognized file category") 82 90 } ··· 98 106 return FileCategoryDotFile, nil 99 107 case 7: 100 108 return FileCategoryDocumentation, nil 109 + case 8: 110 + return FileCategoryBinary, nil 101 111 default: 102 112 return FileCategoryMissing, errors.New("unrecognized file category") 103 113 }

+9 -9

index/index_test.go

··· 2403 2403 2404 2404 t.Run("LineMatches", func(t *testing.T) { 2405 2405 b := testShardBuilder(t, nil, 2406 - Document{Name: "f1", Content: []byte(corpus)}) 2406 + Document{Name: "f1", Content: corpus}) 2407 2407 2408 2408 res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}) 2409 2409 if len(res.Files) != 1 { ··· 2413 2413 2414 2414 t.Run("ChunkMatches", func(t *testing.T) { 2415 2415 b := testShardBuilder(t, nil, 2416 - Document{Name: "f1", Content: []byte(corpus)}) 2416 + Document{Name: "f1", Content: corpus}) 2417 2417 2418 2418 res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}, chunkOpts) 2419 2419 if len(res.Files) != 1 { ··· 3191 3191 } 3192 3192 3193 3193 for j, sec := range want.Symbols { 3194 - if sec.Start != uint32(got[j].Start.ByteOffset) { 3194 + if sec.Start != got[j].Start.ByteOffset { 3195 3195 t.Fatalf("got offset %d, want %d in doc %s", got[j].Start.ByteOffset, sec.Start, want.Name) 3196 3196 } 3197 3197 } ··· 3372 3372 3373 3373 // Test valid and invalid text 3374 3374 for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { 3375 - if err := docChecker.Check([]byte(text), 20000, false); err != nil { 3376 - t.Errorf("Check(%q): %v", text, err) 3375 + if skip := docChecker.Check([]byte(text), 20000, false); skip != SkipReasonNone { 3376 + t.Errorf("Check(%q): %v", text, skip) 3377 3377 } 3378 3378 } 3379 3379 for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { 3380 - if err := docChecker.Check([]byte(text), 15, false); err == nil { 3380 + if skip := docChecker.Check([]byte(text), 15, false); skip == SkipReasonNone { 3381 3381 t.Errorf("Check(%q) succeeded", text) 3382 3382 } 3383 3383 } 3384 3384 3385 3385 // Test valid and invalid text with an allowed large file 3386 3386 for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { 3387 - if err := docChecker.Check([]byte(text), 15, true); err != nil { 3388 - t.Errorf("Check(%q): %v", text, err) 3387 + if skip := docChecker.Check([]byte(text), 15, true); skip != SkipReasonNone { 3388 + t.Errorf("Check(%q): %v", text, skip) 3389 3389 } 3390 3390 } 3391 3391 for _, text := range []string{"zero\x00byte", "xx"} { 3392 - if err := docChecker.Check([]byte(text), 15, true); err == nil { 3392 + if skip := docChecker.Check([]byte(text), 15, true); skip == SkipReasonNone { 3393 3393 t.Errorf("Check(%q) succeeded", text) 3394 3394 } 3395 3395 }

+13 -14

index/shard_builder.go

··· 402 402 return 403 403 } 404 404 405 - if doc.SkipReason != "" { 405 + if doc.SkipReason != SkipReasonNone { 406 406 // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 407 407 // In this case, we just guess the language based on file name to avoid examining the contents. 408 408 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). ··· 414 414 415 415 // Add a file which only occurs in certain branches. 416 416 func (b *ShardBuilder) Add(doc Document) error { 417 - hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 418 - 419 - if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 420 - doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 417 + if index := bytes.IndexByte(doc.Content, 0); index > 0 { 418 + doc.SkipReason = SkipReasonBinary 421 419 } 422 420 423 - if doc.SkipReason != "" { 424 - doc.Content = []byte(notIndexedMarker + doc.SkipReason) 421 + if doc.SkipReason != SkipReasonNone { 422 + doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 425 423 doc.Symbols = nil 426 424 doc.SymbolsMetaData = nil 427 425 } ··· 481 479 b.subRepos = append(b.subRepos, subRepoIdx) 482 480 b.repos = append(b.repos, uint16(repoIdx)) 483 481 482 + hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 484 483 hasher.Write(doc.Content) 485 484 486 485 b.contentStrings = append(b.contentStrings, docStr) ··· 543 542 } 544 543 545 544 // Check returns a reason why the given contents are probably not source texts. 546 - func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 545 + func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 547 546 if len(content) == 0 { 548 - return nil 547 + return SkipReasonNone 549 548 } 550 549 551 550 if len(content) < ngramSize { 552 - return fmt.Errorf("file size smaller than %d", ngramSize) 551 + return SkipReasonTooSmall 553 552 } 554 553 555 554 if index := bytes.IndexByte(content, 0); index > 0 { 556 - return fmt.Errorf("binary data at byte offset %d", index) 555 + return SkipReasonBinary 557 556 } 558 557 559 558 // PERF: we only need to do the trigram check if the upperbound on content is greater than 560 559 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 561 560 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 562 - return nil 561 + return SkipReasonNone 563 562 } 564 563 565 564 var cur [3]rune ··· 580 579 t.trigrams[runesToNGram(cur)] = struct{}{} 581 580 if len(t.trigrams) > maxTrigramCount { 582 581 // probably not text. 583 - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 582 + return SkipReasonTooManyTrigrams 584 583 } 585 584 } 586 - return nil 585 + return SkipReasonNone 587 586 } 588 587 589 588 func (t *DocChecker) clearTrigrams(maxTrigramCount int) {

+2 -2

index/shard_builder_test.go

··· 68 68 name: "skipped file", 69 69 doc: Document{ 70 70 Name: "large.js", 71 - SkipReason: "too large", 71 + SkipReason: SkipReasonTooLarge, 72 72 Content: []byte(notIndexedMarker + "too large"), 73 73 }, 74 74 wantLang: "JavaScript", ··· 77 77 name: "skipped file with unknown extension", 78 78 doc: Document{ 79 79 Name: "deadb33f", 80 - SkipReason: "binary", 80 + SkipReason: SkipReasonBinary, 81 81 Content: []byte(notIndexedMarker + "binary"), 82 82 }, 83 83 wantLang: "",

internal/e2e/examples/example.bin

This is a binary file and will not be displayed.

+16 -2

internal/e2e/scoring_test.go

··· 72 72 t.Fatal(err) 73 73 } 74 74 75 + exampleBin, err := os.ReadFile("./examples/example.bin") 76 + if err != nil { 77 + t.Fatal(err) 78 + } 79 + 75 80 cases := []scoreCase{ 76 81 { 77 82 // Matches on both filename and content ··· 153 158 wantScore: 2.07, 154 159 }, 155 160 { 156 - // Match on test should be scored lower than regular files 161 + // Match on test should be downweighted 157 162 fileName: "test_example.py", 158 163 query: &query.Substring{Pattern: "example"}, 159 164 language: "Python", 160 - // sum-termFrequencyScore: 1.0, length-ratio: 0.00 165 + // sum-termFrequencyScore: 1.00, length-ratio: 0.00 161 166 wantScore: 1.69, 167 + }, 168 + { 169 + // Match on binary should be downweighted 170 + fileName: "example.bin", 171 + query: &query.Substring{Pattern: "example"}, 172 + language: "", 173 + content: exampleBin, 174 + // sum-termFrequencyScore: 1.00, length-ratio: 1.00 175 + wantScore: 1.00, 162 176 }, 163 177 } 164 178

+4 -4

internal/gitindex/index.go

··· 882 882 883 883 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 884 884 if errors.Is(err, plumbing.ErrObjectNotFound) { 885 - return skippedLargeDoc(key, branches, opts), nil 885 + return skippedLargeDoc(key, branches), nil 886 886 } 887 887 888 888 if err != nil { ··· 891 891 892 892 keyFullPath := key.FullPath() 893 893 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 894 - return skippedLargeDoc(key, branches, opts), nil 894 + return skippedLargeDoc(key, branches), nil 895 895 } 896 896 897 897 contents, err := blobContents(blob) ··· 907 907 }, nil 908 908 } 909 909 910 - func skippedLargeDoc(key fileKey, branches []string, opts index.Options) index.Document { 910 + func skippedLargeDoc(key fileKey, branches []string) index.Document { 911 911 return index.Document{ 912 - SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 912 + SkipReason: index.SkipReasonTooLarge, 913 913 Name: key.FullPath(), 914 914 Branches: branches, 915 915 SubRepositoryPath: key.SubRepoPath,

Configure Feed

Configure Feed