fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

ranking: downweight binary files (#924)

In testing, I noticed another problem with BM25: sometimes a binary file is ranked highly because of a match on its filename. In classic Zoekt scoring, these are ranked low because they are skipped, and we always sort skipped docs to the end of the index.

This PR ensures they're also ranked low for BM25 by adding a 'binary' category, and marking it low priority. Adding this category required updating `SkipReason` to track the reason a document was skipped. This is necessary because we set the content of skipped docs to `nil`, and `SkipReason` is the only lasting indication that it was binary.

+118 -67
+1 -1
cmd/zoekt-index/main.go
··· 137 137 if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) { 138 138 if err := builder.Add(index.Document{ 139 139 Name: displayName, 140 - SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax), 140 + SkipReason: index.SkipReasonTooLarge, 141 141 }); err != nil { 142 142 return err 143 143 }
+6 -28
index/builder.go
··· 601 601 // we pass through a part of the source tree with binary/large 602 602 // files, the corresponding shard would be mostly empty, so 603 603 // insert a reason here too. 604 - doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 605 - } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 606 - doc.SkipReason = err.Error() 604 + doc.SkipReason = SkipReasonTooLarge 605 + } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { 606 + doc.SkipReason = skip 607 607 } 608 608 609 609 b.todo = append(b.todo, &doc) 610 610 611 - if doc.SkipReason == "" { 611 + if doc.SkipReason == SkipReasonNone { 612 612 b.size += len(doc.Name) + len(doc.Content) 613 613 } else { 614 - b.size += len(doc.Name) + len(doc.SkipReason) 614 + b.size += len(doc.Name) 615 615 // Drop the content if we are skipping the document. Skipped content is not counted towards the 616 616 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 617 617 doc.Content = nil ··· 865 865 // have a higher chance of being searched before limits kick in. 866 866 func rank(d *Document, origIdx int) []float64 { 867 867 skipped := 0.0 868 - if d.SkipReason != "" { 868 + if d.SkipReason != SkipReasonNone { 869 869 skipped = 1.0 870 870 } 871 871 ··· 1072 1072 1073 1073 func (e *deltaIndexOptionsMismatchError) Error() string { 1074 1074 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1075 - } 1076 - 1077 - // Document holds a document (file) to index. 1078 - type Document struct { 1079 - Name string 1080 - Content []byte 1081 - Branches []string 1082 - SubRepositoryPath string 1083 - Language string 1084 - Category FileCategory 1085 - 1086 - // If set, something is wrong with the file contents, and this 1087 - // is the reason it wasn't indexed. 1088 - SkipReason string 1089 - 1090 - // Document sections for symbols. Offsets should use bytes. 1091 - Symbols []DocumentSection 1092 - SymbolsMetaData []*zoekt.Symbol 1093 - } 1094 - 1095 - type DocumentSection struct { 1096 - Start, End uint32 1097 1075 } 1098 1076 1099 1077 // umask holds the Umask of the current process
+3 -3
index/builder_test.go
··· 246 246 if err != nil { 247 247 t.Fatal(err) 248 248 } 249 - if len(b.todo) != 1 || b.todo[0].SkipReason == "" { 249 + if len(b.todo) != 1 || b.todo[0].SkipReason == SkipReasonNone { 250 250 t.Fatalf("document should have been skipped") 251 251 } 252 252 if b.todo[0].Content != nil { ··· 1143 1143 docs: []*Document{ 1144 1144 { 1145 1145 Name: "binary_file", 1146 - SkipReason: "binary file", 1146 + SkipReason: SkipReasonBinary, 1147 1147 }, 1148 1148 { 1149 1149 Name: "some_test.go", ··· 1151 1151 }, 1152 1152 { 1153 1153 Name: "large_file.go", 1154 - SkipReason: "too large", 1154 + SkipReason: SkipReasonTooLarge, 1155 1155 }, 1156 1156 { 1157 1157 Name: "file.go",
+50
index/document.go
··· 1 + package index 2 + 3 + import "github.com/sourcegraph/zoekt" 4 + 5 + // Document holds a document (file) to index. 6 + type Document struct { 7 + Name string 8 + Content []byte 9 + Branches []string 10 + SubRepositoryPath string 11 + Language string 12 + Category FileCategory 13 + 14 + SkipReason SkipReason 15 + 16 + // Document sections for symbols. Offsets should use bytes. 17 + Symbols []DocumentSection 18 + SymbolsMetaData []*zoekt.Symbol 19 + } 20 + 21 + type SkipReason int 22 + 23 + const ( 24 + SkipReasonNone SkipReason = iota 25 + SkipReasonTooLarge 26 + SkipReasonTooSmall 27 + SkipReasonBinary 28 + SkipReasonTooManyTrigrams 29 + ) 30 + 31 + func (s SkipReason) explanation() string { 32 + switch s { 33 + case SkipReasonNone: 34 + return "" 35 + case SkipReasonTooLarge: 36 + return "exceeds the maximum size limit" 37 + case SkipReasonTooSmall: 38 + return "contains too few trigrams" 39 + case SkipReasonBinary: 40 + return "contains binary content" 41 + case SkipReasonTooManyTrigrams: 42 + return "contains too many trigrams" 43 + default: 44 + return "unknown skip reason" 45 + } 46 + } 47 + 48 + type DocumentSection struct { 49 + Start, End uint32 50 + }
+14 -4
index/file_category.go
··· 22 22 FileCategoryGenerated 23 23 FileCategoryConfig 24 24 FileCategoryDotFile 25 + FileCategoryBinary 25 26 FileCategoryDocumentation 26 27 ) 27 28 28 29 func DetermineFileCategory(doc *Document) { 30 + if doc.SkipReason == SkipReasonBinary { 31 + doc.Category = FileCategoryBinary 32 + return 33 + } 34 + 29 35 name := doc.Name 30 36 content := doc.Content 31 37 32 - // If this document has been skipped, it's likely very large. In this case, we just guess the category based 33 - // on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract. 34 - if doc.SkipReason != "" { 38 + // If this document was skipped because it was too large, just guess the category based on the filename to avoid 39 + // examining the contents. Note: passing nil content is allowed by the go-enry contract. 40 + if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary { 35 41 content = nil 36 42 } 37 43 ··· 56 62 // lowPriority returns true if this file category is considered 'low priority'. This is used 57 63 // in search scoring to down-weight matches in these files. 58 64 func (c FileCategory) lowPriority() bool { 59 - return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated 65 + return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary 60 66 } 61 67 62 68 func (c FileCategory) encode() (byte, error) { ··· 77 83 return 6, nil 78 84 case FileCategoryDocumentation: 79 85 return 7, nil 86 + case FileCategoryBinary: 87 + return 8, nil 80 88 default: 81 89 return 0, errors.New("unrecognized file category") 82 90 } ··· 98 106 return FileCategoryDotFile, nil 99 107 case 7: 100 108 return FileCategoryDocumentation, nil 109 + case 8: 110 + return FileCategoryBinary, nil 101 111 default: 102 112 return FileCategoryMissing, errors.New("unrecognized file category") 103 113 }
+9 -9
index/index_test.go
··· 2403 2403 2404 2404 t.Run("LineMatches", func(t *testing.T) { 2405 2405 b := testShardBuilder(t, nil, 2406 - Document{Name: "f1", Content: []byte(corpus)}) 2406 + Document{Name: "f1", Content: corpus}) 2407 2407 2408 2408 res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}) 2409 2409 if len(res.Files) != 1 { ··· 2413 2413 2414 2414 t.Run("ChunkMatches", func(t *testing.T) { 2415 2415 b := testShardBuilder(t, nil, 2416 - Document{Name: "f1", Content: []byte(corpus)}) 2416 + Document{Name: "f1", Content: corpus}) 2417 2417 2418 2418 res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}, chunkOpts) 2419 2419 if len(res.Files) != 1 { ··· 3191 3191 } 3192 3192 3193 3193 for j, sec := range want.Symbols { 3194 - if sec.Start != uint32(got[j].Start.ByteOffset) { 3194 + if sec.Start != got[j].Start.ByteOffset { 3195 3195 t.Fatalf("got offset %d, want %d in doc %s", got[j].Start.ByteOffset, sec.Start, want.Name) 3196 3196 } 3197 3197 } ··· 3372 3372 3373 3373 // Test valid and invalid text 3374 3374 for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { 3375 - if err := docChecker.Check([]byte(text), 20000, false); err != nil { 3376 - t.Errorf("Check(%q): %v", text, err) 3375 + if skip := docChecker.Check([]byte(text), 20000, false); skip != SkipReasonNone { 3376 + t.Errorf("Check(%q): %v", text, skip) 3377 3377 } 3378 3378 } 3379 3379 for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { 3380 - if err := docChecker.Check([]byte(text), 15, false); err == nil { 3380 + if skip := docChecker.Check([]byte(text), 15, false); skip == SkipReasonNone { 3381 3381 t.Errorf("Check(%q) succeeded", text) 3382 3382 } 3383 3383 } 3384 3384 3385 3385 // Test valid and invalid text with an allowed large file 3386 3386 for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { 3387 - if err := docChecker.Check([]byte(text), 15, true); err != nil { 3388 - t.Errorf("Check(%q): %v", text, err) 3387 + if skip := docChecker.Check([]byte(text), 15, true); skip != SkipReasonNone { 3388 + t.Errorf("Check(%q): %v", text, skip) 3389 3389 } 3390 3390 } 3391 3391 for _, text := range []string{"zero\x00byte", "xx"} { 3392 - if err := docChecker.Check([]byte(text), 15, true); err == nil { 3392 + if skip := docChecker.Check([]byte(text), 15, true); skip == SkipReasonNone { 3393 3393 t.Errorf("Check(%q) succeeded", text) 3394 3394 } 3395 3395 }
+13 -14
index/shard_builder.go
··· 402 402 return 403 403 } 404 404 405 - if doc.SkipReason != "" { 405 + if doc.SkipReason != SkipReasonNone { 406 406 // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 407 407 // In this case, we just guess the language based on file name to avoid examining the contents. 408 408 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). ··· 414 414 415 415 // Add a file which only occurs in certain branches. 416 416 func (b *ShardBuilder) Add(doc Document) error { 417 - hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 418 - 419 - if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 420 - doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 417 + if index := bytes.IndexByte(doc.Content, 0); index > 0 { 418 + doc.SkipReason = SkipReasonBinary 421 419 } 422 420 423 - if doc.SkipReason != "" { 424 - doc.Content = []byte(notIndexedMarker + doc.SkipReason) 421 + if doc.SkipReason != SkipReasonNone { 422 + doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 425 423 doc.Symbols = nil 426 424 doc.SymbolsMetaData = nil 427 425 } ··· 481 479 b.subRepos = append(b.subRepos, subRepoIdx) 482 480 b.repos = append(b.repos, uint16(repoIdx)) 483 481 482 + hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 484 483 hasher.Write(doc.Content) 485 484 486 485 b.contentStrings = append(b.contentStrings, docStr) ··· 543 542 } 544 543 545 544 // Check returns a reason why the given contents are probably not source texts. 546 - func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 545 + func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 547 546 if len(content) == 0 { 548 - return nil 547 + return SkipReasonNone 549 548 } 550 549 551 550 if len(content) < ngramSize { 552 - return fmt.Errorf("file size smaller than %d", ngramSize) 551 + return SkipReasonTooSmall 553 552 } 554 553 555 554 if index := bytes.IndexByte(content, 0); index > 0 { 556 - return fmt.Errorf("binary data at byte offset %d", index) 555 + return SkipReasonBinary 557 556 } 558 557 559 558 // PERF: we only need to do the trigram check if the upperbound on content is greater than 560 559 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 561 560 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 562 - return nil 561 + return SkipReasonNone 563 562 } 564 563 565 564 var cur [3]rune ··· 580 579 t.trigrams[runesToNGram(cur)] = struct{}{} 581 580 if len(t.trigrams) > maxTrigramCount { 582 581 // probably not text. 583 - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 582 + return SkipReasonTooManyTrigrams 584 583 } 585 584 } 586 - return nil 585 + return SkipReasonNone 587 586 } 588 587 589 588 func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
+2 -2
index/shard_builder_test.go
··· 68 68 name: "skipped file", 69 69 doc: Document{ 70 70 Name: "large.js", 71 - SkipReason: "too large", 71 + SkipReason: SkipReasonTooLarge, 72 72 Content: []byte(notIndexedMarker + "too large"), 73 73 }, 74 74 wantLang: "JavaScript", ··· 77 77 name: "skipped file with unknown extension", 78 78 doc: Document{ 79 79 Name: "deadb33f", 80 - SkipReason: "binary", 80 + SkipReason: SkipReasonBinary, 81 81 Content: []byte(notIndexedMarker + "binary"), 82 82 }, 83 83 wantLang: "",
internal/e2e/examples/example.bin

This is a binary file and will not be displayed.

+16 -2
internal/e2e/scoring_test.go
··· 72 72 t.Fatal(err) 73 73 } 74 74 75 + exampleBin, err := os.ReadFile("./examples/example.bin") 76 + if err != nil { 77 + t.Fatal(err) 78 + } 79 + 75 80 cases := []scoreCase{ 76 81 { 77 82 // Matches on both filename and content ··· 153 158 wantScore: 2.07, 154 159 }, 155 160 { 156 - // Match on test should be scored lower than regular files 161 + // Match on test should be downweighted 157 162 fileName: "test_example.py", 158 163 query: &query.Substring{Pattern: "example"}, 159 164 language: "Python", 160 - // sum-termFrequencyScore: 1.0, length-ratio: 0.00 165 + // sum-termFrequencyScore: 1.00, length-ratio: 0.00 161 166 wantScore: 1.69, 167 + }, 168 + { 169 + // Match on binary should be downweighted 170 + fileName: "example.bin", 171 + query: &query.Substring{Pattern: "example"}, 172 + language: "", 173 + content: exampleBin, 174 + // sum-termFrequencyScore: 1.00, length-ratio: 1.00 175 + wantScore: 1.00, 162 176 }, 163 177 } 164 178
+4 -4
internal/gitindex/index.go
··· 882 882 883 883 // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 884 884 if errors.Is(err, plumbing.ErrObjectNotFound) { 885 - return skippedLargeDoc(key, branches, opts), nil 885 + return skippedLargeDoc(key, branches), nil 886 886 } 887 887 888 888 if err != nil { ··· 891 891 892 892 keyFullPath := key.FullPath() 893 893 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 894 - return skippedLargeDoc(key, branches, opts), nil 894 + return skippedLargeDoc(key, branches), nil 895 895 } 896 896 897 897 contents, err := blobContents(blob) ··· 907 907 }, nil 908 908 } 909 909 910 - func skippedLargeDoc(key fileKey, branches []string, opts index.Options) index.Document { 910 + func skippedLargeDoc(key fileKey, branches []string) index.Document { 911 911 return index.Document{ 912 - SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 912 + SkipReason: index.SkipReasonTooLarge, 913 913 Name: key.FullPath(), 914 914 Branches: branches, 915 915 SubRepositoryPath: key.SubRepoPath,