ranking: incorporate file signals into BM25F (#922) · boltless.me/zoekt@fbc5438

+1 -10

index/builder.go

··· 854 854 return x / (1 + x) 855 855 } 856 856 857 - // IsLowPriority takes a file name and makes an educated guess about its priority 858 - // in search results. A file is considered low priority if it looks like a test, 859 - // vendored, or generated file. 860 - // 861 - // These 'priority' criteria affects how documents are ordered within a shard. It's 862 - // also used to help guess a file's rank when we're missing ranking information. 863 - func IsLowPriority(path string, content []byte) bool { 864 - return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 865 - } 866 - 867 857 type rankedDoc struct { 868 858 *Document 869 859 rank []float64 ··· 1091 1081 Branches []string 1092 1082 SubRepositoryPath string 1093 1083 Language string 1084 + Category FileCategory 1094 1085 1095 1086 // If set, something is wrong with the file contents, and this 1096 1087 // is the reason it wasn't indexed.

-44

index/builder_test.go

··· 873 873 } 874 874 } 875 875 876 - func TestIsLowPriority(t *testing.T) { 877 - cases := []string{ 878 - "builder_test.go", 879 - "test/TestQuery.java", 880 - "search/vendor/thirdparty.cc", 881 - "search/node_modules/search/js", 882 - "search.min.js", 883 - "internal/search.js.map", 884 - } 885 - 886 - for _, tt := range cases { 887 - t.Run(tt, func(t *testing.T) { 888 - if !IsLowPriority(tt, nil) { 889 - t.Errorf("expected file '%s' to be low priority", tt) 890 - } 891 - }) 892 - } 893 - 894 - negativeCases := []string{ 895 - "builder.go", 896 - "RoutesTrigger.java", 897 - "search.js", 898 - } 899 - 900 - for _, tt := range negativeCases { 901 - t.Run(tt, func(t *testing.T) { 902 - if IsLowPriority(tt, nil) { 903 - t.Errorf("did not expect file '%s' to be low priority", tt) 904 - } 905 - }) 906 - } 907 - 908 - // Explicitly check that content is important by using the same filename but 909 - // different content. 910 - normal := "package mock\n\nvar Mock struct {}" 911 - generated := "// Code generated by mock\npackage mock\n\nvar Mock struct {}" 912 - if IsLowPriority("mock.go", []byte(normal)) { 913 - t.Error("expected non-generated content to not be low priority") 914 - } 915 - if !IsLowPriority("mock.go", []byte(generated)) { 916 - t.Error("expected generated content to be low priority") 917 - } 918 - } 919 - 920 876 func createTestShard(t testing.TB, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { 921 877 t.Helper() 922 878

+1 -1

index/eval.go

··· 316 316 } 317 317 318 318 if opts.UseBM25Scoring { 319 - d.scoreFilesUsingBM25(&fileMatch, nextDoc, finalCands, cp, opts) 319 + d.scoreFileBM25(&fileMatch, nextDoc, finalCands, cp, opts) 320 320 } else { 321 321 // Use the standard, non-experimental scoring method by default 322 322 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)

+104

index/file_category.go

··· 1 + package index 2 + 3 + import ( 4 + "errors" 5 + 6 + "github.com/go-enry/go-enry/v2" 7 + ) 8 + 9 + // FileCategory represents the category of a file, as determined by go-enry. It is non-exhaustive 10 + // but tries to the major cases like whether the file is a test, generated, etc. 11 + // 12 + // A file's category is used in search scoring to determine the weight of a file match. 13 + type FileCategory byte 14 + 15 + const ( 16 + // FileCategoryMissing is a sentinel value that indicates we never computed the file category during indexing 17 + // (which means we're reading from an old index version). This value can never be written to the index. 18 + FileCategoryMissing FileCategory = iota 19 + FileCategoryDefault 20 + FileCategoryTest 21 + FileCategoryVendored 22 + FileCategoryGenerated 23 + FileCategoryConfig 24 + FileCategoryDotFile 25 + FileCategoryDocumentation 26 + ) 27 + 28 + func DetermineFileCategory(doc *Document) { 29 + name := doc.Name 30 + content := doc.Content 31 + 32 + // If this document has been skipped, it's likely very large. In this case, we just guess the category based 33 + // on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract. 34 + if doc.SkipReason != "" { 35 + content = nil 36 + } 37 + 38 + category := FileCategoryDefault 39 + if enry.IsTest(name) { 40 + category = FileCategoryTest 41 + } else if enry.IsDotFile(name) { 42 + category = FileCategoryDotFile 43 + } else if enry.IsVendor(name) { 44 + category = FileCategoryVendored 45 + } else if enry.IsGenerated(name, content) { 46 + category = FileCategoryGenerated 47 + } else if enry.IsConfiguration(name) { 48 + category = FileCategoryConfig 49 + } else if enry.IsDocumentation(name) { 50 + category = FileCategoryDocumentation 51 + } 52 + 53 + doc.Category = category 54 + } 55 + 56 + // lowPriority returns true if this file category is considered 'low priority'. 
This is used 57 + // in search scoring to down-weight matches in these files. 58 + func (c FileCategory) lowPriority() bool { 59 + return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated 60 + } 61 + 62 + func (c FileCategory) encode() (byte, error) { 63 + switch c { 64 + case FileCategoryMissing: 65 + return 0, errors.New("cannot encode missing file category") 66 + case FileCategoryDefault: 67 + return 1, nil 68 + case FileCategoryTest: 69 + return 2, nil 70 + case FileCategoryVendored: 71 + return 3, nil 72 + case FileCategoryGenerated: 73 + return 4, nil 74 + case FileCategoryConfig: 75 + return 5, nil 76 + case FileCategoryDotFile: 77 + return 6, nil 78 + case FileCategoryDocumentation: 79 + return 7, nil 80 + default: 81 + return 0, errors.New("unrecognized file category") 82 + } 83 + } 84 + 85 + func decodeCategory(c byte) (FileCategory, error) { 86 + switch c { 87 + case 1: 88 + return FileCategoryDefault, nil 89 + case 2: 90 + return FileCategoryTest, nil 91 + case 3: 92 + return FileCategoryVendored, nil 93 + case 4: 94 + return FileCategoryGenerated, nil 95 + case 5: 96 + return FileCategoryConfig, nil 97 + case 6: 98 + return FileCategoryDotFile, nil 99 + case 7: 100 + return FileCategoryDocumentation, nil 101 + default: 102 + return FileCategoryMissing, errors.New("unrecognized file category") 103 + } 104 + }

+72

index/file_category_test.go

··· 1 + package index 2 + 3 + import ( 4 + "testing" 5 + ) 6 + 7 + func TestDetermineFileCategory(t *testing.T) { 8 + tests := []struct { 9 + name string 10 + filename string 11 + content []byte 12 + want FileCategory 13 + }{ 14 + { 15 + name: "test file", 16 + filename: "foo_test.go", 17 + content: []byte("package foo"), 18 + want: FileCategoryTest, 19 + }, 20 + { 21 + name: "vendor file", 22 + filename: "vendor/foo.go", 23 + content: []byte("package foo"), 24 + want: FileCategoryVendored, 25 + }, 26 + { 27 + name: "generated file", 28 + filename: "foo.go", 29 + content: []byte("// Code generated by protoc-gen-go. DO NOT EDIT.\n" + 30 + "... some generated code ..."), 31 + want: FileCategoryGenerated, 32 + }, 33 + { 34 + name: "config file", 35 + filename: "package.json", 36 + content: []byte("{}"), 37 + want: FileCategoryConfig, 38 + }, 39 + { 40 + name: "dot file", 41 + filename: ".gitignore", 42 + content: []byte("*.o"), 43 + want: FileCategoryDotFile, 44 + }, 45 + { 46 + name: "documentation file", 47 + filename: "README.md", 48 + content: []byte("# Documentation"), 49 + want: FileCategoryDocumentation, 50 + }, 51 + { 52 + name: "default file", 53 + filename: "main.go", 54 + content: []byte("package main"), 55 + want: FileCategoryDefault, 56 + }, 57 + } 58 + 59 + for _, tt := range tests { 60 + t.Run(tt.name, func(t *testing.T) { 61 + doc := &Document{ 62 + Name: tt.filename, 63 + Content: tt.content, 64 + } 65 + 66 + DetermineFileCategory(doc) 67 + if doc.Category != tt.want { 68 + t.Errorf("DetermineFileCategory() = %v, want %v", doc.Name, tt.want) 69 + } 70 + }) 71 + } 72 + }

+15

index/indexdata.go

··· 92 92 // inverse of LanguageMap in metaData 93 93 languageMap map[uint16]string 94 94 95 + // file categories for all the files. 96 + categories []byte 97 + 95 98 repoListEntry []zoekt.RepoListEntry 96 99 97 100 // repository indexes for all the files ··· 170 173 } 171 174 // newer zoekt files have 16-bit language entries 172 175 return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 176 + } 177 + 178 + func (d *indexData) getCategory(idx uint32) FileCategory { 179 + if len(d.categories) == 0 { 180 + // This means we're reading an older index, so return 'missing' 181 + return FileCategoryMissing 182 + } 183 + category, err := decodeCategory(d.categories[idx]) 184 + if err != nil { 185 + return FileCategoryMissing 186 + } 187 + return category 173 188 } 174 189 175 190 // calculates stats for files in the range [start, end).

+5

index/read.go

··· 316 316 return nil, err 317 317 } 318 318 319 + d.categories, err = d.readSectionBlob(toc.categories) 320 + if err != nil { 321 + return nil, err 322 + } 323 + 319 324 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) 320 325 if err != nil { 321 326 return nil, err

+35 -27

index/score.go

··· 20 20 "math" 21 21 "strings" 22 22 23 + "github.com/go-enry/go-enry/v2" 23 24 "github.com/sourcegraph/zoekt" 24 25 "github.com/sourcegraph/zoekt/internal/ctags" 25 26 ) 26 27 27 28 const ( 28 - ScoreOffset = 10_000_000 29 - ScoreOffsetBM25 = 1_000_000_000 29 + ScoreOffset = 10_000_000 30 30 ) 31 31 32 32 type chunkScore struct { ··· 218 218 L := float64(lineLength) / 100.0 219 219 220 220 score := 0.0 221 - tfs := p.calculateTermFrequency(ms) 221 + tfs := p.calculateTermFrequency(ms, false) // ignore file priority, since we're just scoring within a single file 222 222 for _, f := range tfs { 223 223 score += tfScore(k, b, L, f) 224 224 } ··· 245 245 return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 246 246 } 247 247 248 + const importantTermBoost = 5 249 + const lowPriorityFilePenalty = 5 250 + 248 251 // calculateTermFrequency computes the term frequency for the file match. 249 252 // Notes: 250 253 // - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 251 254 // - Symbol matches also count more than content matches, to reward matches on symbol definitions. 252 - func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int { 255 + // - "Low priority" files like tests, generated files, etc. have their term frequency down-weighted, to prioritize matches from 'regular' files 256 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, lowPriority bool) map[string]int { 253 257 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 254 258 // ignore whether the index is a word boundary. 
255 259 termFreqs := map[string]int{} 256 260 for _, m := range cands { 257 261 term := string(m.substrLowered) 258 262 if m.fileName || p.matchesSymbol(m) { 259 - termFreqs[term] += 5 263 + termFreqs[term] += importantTermBoost 260 264 } else { 261 265 termFreqs[term]++ 262 266 } 263 267 } 264 268 269 + // If a file is a test, generated, etc., then down-weight its term frequency. The BM25F interpretation 270 + // is that this data lives in a separate 'field' weighted at 1/lowPriorityFilePenalty of regular content (integer division, so frequencies below the penalty round down to 0). 271 + if lowPriority { 272 + for term := range termFreqs { 273 + termFreqs[term] = termFreqs[term] / lowPriorityFilePenalty 274 + } 275 + } 276 + 265 277 return termFreqs 266 278 } 267 279 ··· 339 351 fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore 340 352 } 341 353 342 - // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 354 + // scoreFileBM25 computes the score according to BM25, the most common scoring algorithm for text search: 343 355 // https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This 344 356 // is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some 345 357 // keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how ··· 347 359 // 348 360 // Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead 349 361 // calculating a score over all matches in the file. 
350 - func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) { 351 - tf := cp.calculateTermFrequency(cands) 362 + func (d *indexData) scoreFileBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) { 363 + lowPriority := d.isLowPriority(fileMatch, doc) 364 + tf := cp.calculateTermFrequency(cands, lowPriority) 352 365 353 366 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 354 367 k, b := 1.2, 0.75 ··· 374 387 375 388 score := boostScore(bm25Score, cands) 376 389 boosted := score != bm25Score 377 - 378 - // 2 digits of precision 379 - score = math.Trunc(score*100) / 100 380 - 381 - md := d.repoMetaData[d.repos[doc]] 382 - fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) 383 - 384 - // Offset score by 9 digits and add the tiebreaker. 
385 - // 386 - // Example: For a BM25 score of 1.23, a repo rank of 456789 and a file order score of 0.12, we have a final score of 387 - // 12345678901.2 388 - // ^^^ 389 - // bm25 390 - // ^^^^^^ 391 - // repo rank 392 - // ^^^^ 393 - // doc order 394 - fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore 390 + fileMatch.Score = score 395 391 396 392 if opts.DebugScore { 397 393 // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 398 - fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L) 394 + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (low-priority: %t) <- sum-termFrequencies: %d, length-ratio: %.2f", score, lowPriority, sumTF, L) 399 395 if boosted { 400 396 fileMatch.Debug += fmt.Sprintf(" (boosted)") 401 397 } 402 398 } 403 399 } 400 + 401 + func (d *indexData) isLowPriority(fileMatch *zoekt.FileMatch, doc uint32) bool { 402 + category := d.getCategory(doc) 403 + if category != FileCategoryMissing { 404 + return category.lowPriority() 405 + } else { 406 + // The category may be missing from older index versions. In this case, 407 + // perform a cheap, best-effort check against the filename. 408 + path := fileMatch.FileName 409 + return enry.IsTest(path) || enry.IsVendor(path) 410 + } 411 + }

+9

index/shard_builder.go

··· 206 206 // language codes, uint16 encoded as little-endian 207 207 languages []uint8 208 208 209 + categories []byte 210 + 209 211 // IndexTime will be used as the time if non-zero. Otherwise 210 212 // time.Now(). This is useful for doing reproducible builds in tests. 211 213 IndexTime time.Time ··· 425 427 } 426 428 427 429 DetermineLanguageIfUnknown(&doc) 430 + DetermineFileCategory(&doc) 428 431 429 432 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 430 433 var last DocumentSection ··· 498 501 b.languageMap[doc.Language] = langCode 499 502 } 500 503 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 504 + 505 + category, err := doc.Category.encode() 506 + if err != nil { 507 + return err 508 + } 509 + b.categories = append(b.categories, category) 501 510 502 511 return nil 503 512 }

+2

index/toc.go

··· 78 78 runeOffsets simpleSection 79 79 fileEndRunes simpleSection 80 80 languages simpleSection 81 + categories simpleSection 81 82 82 83 fileEndSymbol simpleSection 83 84 symbolMap lazyCompoundSection ··· 180 181 {"nameEndRunes", &t.nameEndRunes}, 181 182 {"contentChecksums", &t.contentChecksums}, 182 183 {"languages", &t.languages}, 184 + {"categories", &t.categories}, 183 185 {"runeDocSections", &t.runeDocSections}, 184 186 {"repos", &t.repos}, 185 187 {"reposIDsBitmap", &t.reposIDsBitmap},

+4

index/write.go

··· 169 169 w.Write(b.languages) 170 170 toc.languages.end(w) 171 171 172 + toc.categories.start(w) 173 + w.Write(b.categories) 174 + toc.categories.end(w) 175 + 172 176 toc.runeDocSections.start(w) 173 177 w.Write(marshalDocSections(b.runeDocSections)) 174 178 toc.runeDocSections.end(w)

+11

internal/e2e/examples/test_example.py

··· 1 + import unittest 2 + 3 + class TestSimpleOperations(unittest.TestCase): 4 + def test_addition(self): 5 + self.assertEqual(2 + 2, 4) 6 + 7 + def test_string_upper(self): 8 + self.assertEqual('hello'.upper(), 'HELLO') 9 + 10 + if __name__ == '__main__': 11 + unittest.main()

+17 -2

internal/e2e/scoring_test.go

··· 145 145 // sum-termFrequencyScore: 5.00, length-ratio: 0.00 146 146 wantScore: 2.07, 147 147 }, 148 + { 149 + fileName: "example.py", 150 + query: &query.Substring{Pattern: "example"}, 151 + language: "Python", 152 + // sum-termFrequencyScore: 5.00, length-ratio: 0.00 153 + wantScore: 2.07, 154 + }, 155 + { 156 + // Match on test should be scored lower than regular files 157 + fileName: "test_example.py", 158 + query: &query.Substring{Pattern: "example"}, 159 + language: "Python", 160 + // sum-termFrequencyScore: 1.0, length-ratio: 0.00 161 + wantScore: 1.69, 162 + }, 148 163 } 149 164 150 165 for _, c := range cases { ··· 721 736 // helper to remove the tiebreaker from the score for easier comparison 722 737 func withoutTiebreaker(fullScore float64, useBM25 bool) float64 { 723 738 if useBM25 { 724 - // Shift by ScoreOffsetBM25 and truncate to 2 decimal places 725 - return math.Trunc((fullScore/index.ScoreOffsetBM25)*100) / 100 739 + // BM25 doesn't use a tiebreaker 740 + return fullScore 726 741 } 727 742 return math.Trunc(fullScore / index.ScoreOffset) 728 743 }

testdata/shards/repo17_v17.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo2_v16.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo_v16.00000.zoekt

This is a binary file and will not be displayed.

Configure Feed

Configure Feed