fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: simplify score combination strategy (#523)

This PR proposes to simplify how match scores are combined with static ranking
signals like the file rank. Instead of using reciprocal rank fusion (RRF), we
stick with the same strategy we already use to combine ranking signals, a
weighted sum. In my experience, this is much easier to debug and tune compared
to RRF. We're in full control of the ranking signals, and can make sure they're
bounded and meaningful, so using a sum seems totally fine.

It also removes the index-time sorting based on file rank. In my testing, that
sorting didn't really improve result quality. Removing it also
opens up the possibility of storing the file ranks outside the immutable index
data. In the future, this would let us update the ranks more often, without
worrying about fully reindexing repositories.

+126 -307
+4 -15
api.go
··· 38 38 // Ranking; the higher, the better. 39 39 Score float64 // TODO - hide this field? 40 40 41 - // Experimental. Ranks is a vector containing floats in the interval [0, 1]. The 42 - // length of the vector depends on the output from the ranking function at index 43 - // time. 44 - // 45 - // This field is only set if the shard contains ranking information and 46 - // SearchOptions.UseDocumentRanks is true. 47 - Ranks []float64 48 - 49 41 // For debugging. Needs DebugScore set, but public so tests in 50 42 // other packages can print some diagnostics. 51 43 Debug string ··· 94 86 func (m *FileMatch) sizeBytes() (sz uint64) { 95 87 // Score 96 88 sz += 8 97 - 98 - // ranks 99 - sz += 8 * uint64(len(m.Ranks)) 100 89 101 90 for _, s := range []string{ 102 91 m.Debug, ··· 894 883 // sorting matches. 895 884 UseDocumentRanks bool 896 885 897 - // RanksDampingFactor determines the contribution of documents ranks to the 898 - // final ranking based on RRF. A value in (0,1] reduces the contribution, 899 - // while a value in (-inf,0) increases it. 900 - RanksDampingFactor float64 886 + // EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust 887 + // their weight in the file match score. If the value is <= 0.0, the default weight value 888 + // will be used. This option is temporary and is only exposed for testing/ tuning purposes. 889 + DocumentRanksWeight float64 901 890 902 891 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 903 892 // a command-line flag
+1 -35
build/builder.go
··· 978 978 } 979 979 } 980 980 981 - const epsilon = 0.00000001 982 - 983 - // sortDocuments2 sorts []*zoekt.Document according to their Ranks. In general, 984 - // documents can have a nil rank vector if the document to be indexed was added 985 - // after the ranking took place. A nil rank vector translates to the lowest 986 - // possible rank. Longer vectors are more important than shorter vectors, given 987 - // all other ranks are equal. 988 - // 989 - // Note: the logic here is inverted to sortDocuments because rank in 990 - // sortDocuments returns a vector of the form [1-rank, ...]. 991 - func sortDocuments2(rs []*zoekt.Document) { 992 - sort.Slice(rs, func(i, j int) bool { 993 - r1 := rs[i].Ranks 994 - r2 := rs[j].Ranks 995 - 996 - l := len(r1) 997 - if len(r2) < l { 998 - l = len(r2) 999 - } 1000 - for i := 0; i < l; i++ { 1001 - if math.Abs(r1[i]-r2[i]) > epsilon { 1002 - return r1[i] > r2[i] 1003 - } 1004 - } 1005 - // if r1 has more entries it is more important. ie imagine right padding shorter 1006 - // arrays with zeros, so they are the same length. 1007 - return len(r1) > len(r2) 1008 - }) 1009 - } 1010 - 1011 981 func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 1012 982 if !b.opts.DisableCTags && b.opts.CTagsPath != "" { 1013 983 err := ctagsAddSymbols(todo, b.parser, b.opts.CTagsPath) ··· 1026 996 return nil, err 1027 997 } 1028 998 1029 - if b.opts.DocumentRanksPath != "" { 1030 - sortDocuments2(todo) 1031 - } else { 1032 - sortDocuments(todo) 1033 - } 999 + sortDocuments(todo) 1034 1000 1035 1001 for _, t := range todo { 1036 1002 if err := shardBuilder.Add(*t); err != nil {
-103
build/builder_test.go
··· 858 858 } 859 859 } 860 860 861 - func Test_sortDocuments2(t *testing.T) { 862 - tests := []struct { 863 - name string 864 - in []*zoekt.Document 865 - want []string 866 - }{ 867 - { 868 - name: "same length", 869 - in: []*zoekt.Document{ 870 - { 871 - Name: "a", 872 - Ranks: []float64{0, 0, 0}, 873 - }, 874 - { 875 - Name: "b", 876 - Ranks: []float64{1, 1, 1}, 877 - }, 878 - { 879 - Name: "c", 880 - Ranks: []float64{1, 0, 1}, 881 - }, 882 - }, 883 - want: []string{"b", "c", "a"}, 884 - }, 885 - { 886 - name: "1 nil", 887 - in: []*zoekt.Document{ 888 - { 889 - Name: "a", 890 - Ranks: []float64{1, 1, 0}, 891 - }, 892 - { 893 - Name: "b", 894 - }, 895 - { 896 - Name: "c", 897 - Ranks: []float64{1, 1, 1}, 898 - }, 899 - }, 900 - want: []string{"c", "a", "b"}, 901 - }, 902 - { 903 - name: "different lengths", 904 - in: []*zoekt.Document{ 905 - { 906 - Name: "a", 907 - Ranks: []float64{0}, 908 - }, 909 - { 910 - Name: "b", 911 - Ranks: []float64{0, 0}, 912 - }, 913 - { 914 - Name: "c", 915 - Ranks: []float64{0, 0, 0}, 916 - }, 917 - }, 918 - want: []string{"c", "b", "a"}, 919 - }, 920 - { 921 - name: "different lengths and nil", 922 - in: []*zoekt.Document{ 923 - { 924 - Name: "a", 925 - Ranks: []float64{0}, 926 - }, 927 - { 928 - Name: "b", 929 - Ranks: []float64{0, 0}, 930 - }, 931 - { 932 - Name: "c", 933 - }, 934 - }, 935 - want: []string{"b", "a", "c"}, 936 - }, 937 - } 938 - 939 - for _, tt := range tests { 940 - t.Run(tt.name, func(t *testing.T) { 941 - sortDocuments2(tt.in) 942 - 943 - for i, name := range tt.want { 944 - if tt.in[i].Name != name { 945 - var got []string 946 - for _, d := range tt.in { 947 - got = append(got, d.Name) 948 - } 949 - t.Fatalf("want %+v, got %+v\n", tt.want, got) 950 - } 951 - } 952 - }) 953 - } 954 - 955 - t.Run("test for panics", func(t *testing.T) { 956 - // Special case: test for panics if all documents have nil rank vectors. 
957 - sortDocuments2([]*zoekt.Document{{}, {}}) 958 - sortDocuments2([]*zoekt.Document{{}}) 959 - sortDocuments2(nil) 960 - }) 961 - 962 - } 963 - 964 861 func TestIgnoreSizeMax(t *testing.T) { 965 862 966 863 for _, test := range []struct {
+89
build/e2e_test.go
··· 1115 1115 }) 1116 1116 } 1117 1117 } 1118 + 1119 + func TestScoringWithDocumentRanks(t *testing.T) { 1120 + if os.Getenv("CI") == "" && checkCTags() == "" { 1121 + t.Skip("ctags not available") 1122 + } 1123 + dir := t.TempDir() 1124 + 1125 + opts := Options{ 1126 + IndexDir: dir, 1127 + RepositoryDescription: zoekt.Repository{ 1128 + Name: "repo", 1129 + }, 1130 + DocumentRanksVersion: "ranking", 1131 + } 1132 + 1133 + searchQuery := &query.Substring{Content: true, Pattern: "Inner"} 1134 + exampleJava, err := os.ReadFile("./testdata/example.java") 1135 + if err != nil { 1136 + t.Fatal(err) 1137 + } 1138 + 1139 + cases := []struct { 1140 + name string 1141 + documentRanks []float64 1142 + documentRanksWeight float64 1143 + wantScore float64 1144 + }{ 1145 + { 1146 + name: "score with no document ranks", 1147 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 10 (file order) 1148 + wantScore: 7412.00, 1149 + }, 1150 + { 1151 + name: "score with document ranks", 1152 + documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1153 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 7200 (file rank) + 10 (file order) 1154 + wantScore: 14612.00, 1155 + }, 1156 + { 1157 + name: "score with custom document ranks weight", 1158 + documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1159 + documentRanksWeight: 1000.0, 1160 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 800 (file rank) + 10 (file order) 1161 + wantScore: 8212.00, 1162 + }, 1163 + } 1164 + 1165 + for _, c := range cases { 1166 + t.Run(c.name, func(t *testing.T) { 1167 + b, err := NewBuilder(opts) 1168 + if err != nil { 1169 + t.Fatalf("NewBuilder: %v", err) 1170 + } 1171 + 1172 + err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: c.documentRanks}) 1173 + if err != nil { 1174 + t.Fatal(err) 1175 + } 1176 + 1177 + if err := b.Finish(); err != nil { 1178 + 
t.Fatalf("Finish: %v", err) 1179 + } 1180 + 1181 + ss, err := shards.NewDirectorySearcher(dir) 1182 + if err != nil { 1183 + t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) 1184 + } 1185 + defer ss.Close() 1186 + 1187 + srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{ 1188 + UseDocumentRanks: true, 1189 + DocumentRanksWeight: c.documentRanksWeight, 1190 + DebugScore: true, 1191 + }) 1192 + 1193 + if err != nil { 1194 + t.Fatal(err) 1195 + } 1196 + 1197 + if got, want := len(srs.Files), 1; got != want { 1198 + t.Fatalf("file matches: want %d, got %d", want, got) 1199 + } 1200 + 1201 + if got := srs.Files[0].Score; got != c.wantScore { 1202 + t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) 1203 + } 1204 + }) 1205 + } 1206 + }
+16 -89
contentprovider.go
··· 18 18 "bytes" 19 19 "fmt" 20 20 "log" 21 - "math" 22 21 "sort" 23 22 "strings" 24 23 "unicode/utf8" ··· 453 452 } 454 453 455 454 const ( 456 - // TODO - how to scale this relative to rank? 455 + // Query-dependent scoring signals. All of these together are bounded at ~9000 456 + // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 457 457 scorePartialWordMatch = 50.0 458 458 scoreWordMatch = 500.0 459 459 scoreBase = 7000.0 ··· 461 461 scoreSymbol = 7000.0 462 462 scorePartialSymbol = 4000.0 463 463 scoreKindMatch = 100.0 464 + scoreRepetitionFactor = 1.0 464 465 scoreFactorAtomMatch = 400.0 465 - scoreShardRankFactor = 20.0 466 - scoreFileOrderFactor = 10.0 467 - scoreLineOrderFactor = 1.0 468 - scoreRepetitionFactor = 1.0 466 + 467 + // File-only scoring signals. For now these are also bounded ~9000 to give them 468 + // equal weight with the query-dependent signals. 469 + scoreFileRankFactor = 9000.0 470 + scoreFileOrderFactor = 10.0 471 + scoreShardRankFactor = 20.0 472 + 473 + // Used for ordering line and chunk matches within a file. 474 + scoreLineOrderFactor = 1.0 469 475 ) 470 476 471 477 // findSection checks whether a section defined by offset and size lies within ··· 873 879 sort.Sort(chunkMatchScoreSlice(ms)) 874 880 } 875 881 876 - // k = 60 is arbitrary but reportedly works well (RRF; Cormack et al., 2009). 877 - const k = 60 878 - 879 - // SortFiles sorts files matches. The order depends on the match score and, if 880 - // available, on the pre-computed document ranks. 881 - // 882 - // Rankings derived from match scores and rank vectors are combined based on 883 - // "Reciprocal Rank Fusion" (RRF). 884 - func SortFiles(ms []FileMatch, opts *SearchOptions) { 882 + // SortFiles sorts files matches. The order depends on the match score, which includes both 883 + // query-dependent signals like word overlap, and file-only signals like the file ranks (if 884 + // file ranks are enabled). 
885 + func SortFiles(ms []FileMatch) { 885 886 sort.Sort(fileMatchesByScore(ms)) 886 - 887 - if opts.UseDocumentRanks { 888 - rrfScore := make([]float64, len(ms)) 889 - 890 - for i := 0; i < len(ms); i++ { 891 - rrfScore[i] = 1 / (k + float64(i)) 892 - if opts.DebugScore { 893 - ms[i].Debug += fmt.Sprintf("(%d,", i) 894 - } 895 - } 896 - 897 - // We use stable sort in case we don't have ranks. Without stable sort the order 898 - // of file matches would be random which would sully the ranking induces by the 899 - // scores. 900 - sort.Stable(fileMatchesByRank{fileMatches: ms, rrfScore: rrfScore}) 901 - 902 - for i := range rrfScore { 903 - rrfScore[i] += (1 - opts.RanksDampingFactor) / (k + float64(i)) 904 - if opts.DebugScore { 905 - ms[i].Debug += fmt.Sprintf("%d), ", i) 906 - } 907 - } 908 - 909 - sort.Sort(fileMatchesByRRFScore{fileMatches: ms, rrfScore: rrfScore}) 910 - } 911 - } 912 - 913 - type fileMatchesByRank struct { 914 - fileMatches []FileMatch 915 - rrfScore []float64 916 - } 917 - 918 - func (m fileMatchesByRank) Len() int { return len(m.fileMatches) } 919 - 920 - func (m fileMatchesByRank) Swap(i, j int) { 921 - m.fileMatches[i], m.fileMatches[j] = m.fileMatches[j], m.fileMatches[i] 922 - m.rrfScore[i], m.rrfScore[j] = m.rrfScore[j], m.rrfScore[i] 923 - } 924 - 925 - const epsilon = 0.00000001 926 - 927 - func (m fileMatchesByRank) Less(i, j int) bool { 928 - r1 := m.fileMatches[i].Ranks 929 - r2 := m.fileMatches[j].Ranks 930 - 931 - l := len(r1) 932 - if len(r2) < l { 933 - l = len(r2) 934 - } 935 - for i := 0; i < l; i++ { 936 - if math.Abs(r1[i]-r2[i]) > epsilon { 937 - return r1[i] > r2[i] 938 - } 939 - } 940 - // if r1 has more entries it is more important. ie imagine right padding shorter 941 - // arrays with zeros, so they are the same length. 
942 - return len(r1) > len(r2) 943 - } 944 - 945 - type fileMatchesByRRFScore struct { 946 - fileMatches []FileMatch 947 - rrfScore []float64 948 - } 949 - 950 - func (m fileMatchesByRRFScore) Len() int { return len(m.fileMatches) } 951 - 952 - func (m fileMatchesByRRFScore) Swap(i, j int) { 953 - m.fileMatches[i], m.fileMatches[j] = m.fileMatches[j], m.fileMatches[i] 954 - m.rrfScore[i], m.rrfScore[j] = m.rrfScore[j], m.rrfScore[i] 955 - } 956 - 957 - func (m fileMatchesByRRFScore) Less(i, j int) bool { 958 - // Higher scores are better. 959 - return m.rrfScore[i] > m.rrfScore[j] 960 887 }
-53
contentprovider_test.go
··· 3 3 import ( 4 4 "bytes" 5 5 "fmt" 6 - "math" 7 6 "testing" 8 7 9 8 "github.com/google/go-cmp/cmp" ··· 327 326 }) 328 327 } 329 328 } 330 - 331 - func TestSortFiles(t *testing.T) { 332 - in := []FileMatch{ 333 - {FileName: "d1", Score: 2, Ranks: []float64{0.75}}, 334 - {FileName: "d2", Score: 4, Ranks: []float64{0.25}}, 335 - {FileName: "d3", Score: 3, Ranks: []float64{1.0}}, 336 - {FileName: "d4", Score: 1, Ranks: []float64{0.5}}, 337 - } 338 - 339 - cases := []struct { 340 - name string 341 - dampingFactor float64 342 - wantOrder []string 343 - }{ 344 - // Document RRF(Score) RRF(Ranks) SUM Rank 345 - // d3 1/(60+1) 1/(60+0) 0,0330601092896175 0 346 - // d2 1/(60+0) 1/(60+3) 0,0325396825396826 1 347 - // d1 1/(60+2) 1/(60+1) 0,0325224748810153 2 348 - // d4 1/(60+3) 1/(60+2) 0,0320020481310804 3 349 - { 350 - "equal", 351 - 0, 352 - []string{"d3", "d2", "d1", "d4"}, 353 - }, 354 - { 355 - "scores only", 356 - 1, 357 - []string{"d2", "d3", "d1", "d4"}, 358 - }, 359 - { 360 - "ranks only", 361 - math.Inf(-1), 362 - []string{"d3", "d1", "d4", "d2"}, 363 - }, 364 - } 365 - 366 - for _, tt := range cases { 367 - t.Run("", func(t *testing.T) { 368 - 369 - SortFiles(in, &SearchOptions{UseDocumentRanks: true, DebugScore: true, RanksDampingFactor: tt.dampingFactor}) 370 - 371 - var haveOrder = []string{} 372 - for _, f := range in { 373 - haveOrder = append(haveOrder, f.FileName) 374 - } 375 - 376 - if d := cmp.Diff(tt.wantOrder, haveOrder); d != "" { 377 - t.Fatalf("-want, +got\n%s\n", d) 378 - } 379 - }) 380 - } 381 - }
+13 -9
eval.go
··· 367 367 368 368 fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch, opts.DebugScore) 369 369 370 - // Prefer earlier docs. 371 - fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore) 372 - 373 370 if opts.UseDocumentRanks && len(d.ranks) > int(nextDoc) { 374 - fileMatch.Ranks = d.ranks[nextDoc] 371 + weight := scoreFileRankFactor 372 + if opts.DocumentRanksWeight > 0.0 { 373 + weight = opts.DocumentRanksWeight 374 + } 375 + 376 + ranks := d.ranks[nextDoc] 377 + // This is a temporary workaround -- we only really want the PageRank score, and ignore 378 + // everything else. In a follow-up we'll simplify the rank format and remove this hack. 379 + if len(ranks) > 4 { 380 + fileMatch.addScore("file-rank", weight*d.ranks[nextDoc][4], opts.DebugScore) 381 + } 375 382 } 376 383 384 + fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore) 377 385 fileMatch.addScore("shard-order", scoreShardRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 378 - 379 - if opts.DebugScore && opts.UseDocumentRanks { 380 - fileMatch.Debug += fmt.Sprintf("ranks: %v, ", fileMatch.Ranks) 381 - } 382 386 383 387 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 384 388 sortMatchesByScore(fileMatch.LineMatches) ··· 426 430 // I am slightly worried about negative interactions with TotalMaxMatchCount 427 431 // so feature flagging this behaviour behind UseDocumentRanks. 428 432 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) { 429 - SortFiles(res.Files, opts) 433 + SortFiles(res.Files) 430 434 res.Files = res.Files[:limit] 431 435 } 432 436
+1 -1
shards/aggregate.go
··· 76 76 agg := c.aggregate 77 77 c.aggregate = nil 78 78 79 - zoekt.SortFiles(agg.Files, c.opts) 79 + zoekt.SortFiles(agg.Files) 80 80 81 81 if max := c.opts.MaxDocDisplayCount; max > 0 && len(agg.Files) > max { 82 82 agg.Files = agg.Files[:max]
+2 -2
shards/shards.go
··· 761 761 func sendByRepository(result *zoekt.SearchResult, opts *zoekt.SearchOptions, sender zoekt.Sender) { 762 762 763 763 if len(result.RepoURLs) <= 1 || len(result.Files) == 0 { 764 - zoekt.SortFiles(result.Files, opts) 764 + zoekt.SortFiles(result.Files) 765 765 sender.Send(result) 766 766 return 767 767 } 768 768 769 769 send := func(repoName string, a, b int, stats zoekt.Stats) { 770 - zoekt.SortFiles(result.Files[a:b], opts) 770 + zoekt.SortFiles(result.Files[a:b]) 771 771 sender.Send(&zoekt.SearchResult{ 772 772 Stats: stats, 773 773 Progress: zoekt.Progress{