Ranking: simplify score combination strategy (#523) · boltless.me/zoekt@f6d0aa0

+4 -15

api.go

··· 38 38 // Ranking; the higher, the better. 39 39 Score float64 // TODO - hide this field? 40 40 41 - // Experimental. Ranks is a vector containing floats in the interval [0, 1]. The 42 - // length of the vector depends on the output from the ranking function at index 43 - // time. 44 - // 45 - // This field is only set if the shard contains ranking information and 46 - // SearchOptions.UseDocumentRanks is true. 47 - Ranks []float64 48 - 49 41 // For debugging. Needs DebugScore set, but public so tests in 50 42 // other packages can print some diagnostics. 51 43 Debug string ··· 94 86 func (m *FileMatch) sizeBytes() (sz uint64) { 95 87 // Score 96 88 sz += 8 97 - 98 - // ranks 99 - sz += 8 * uint64(len(m.Ranks)) 100 89 101 90 for _, s := range []string{ 102 91 m.Debug, ··· 894 883 // sorting matches. 895 884 UseDocumentRanks bool 896 885 897 - // RanksDampingFactor determines the contribution of documents ranks to the 898 - // final ranking based on RRF. A value in (0,1] reduces the contribution, 899 - // while a value in (-inf,0) increases it. 900 - RanksDampingFactor float64 886 + // EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust 887 + // their weight in the file match score. If the value is <= 0.0, the default weight value 888 + // will be used. This option is temporary and is only exposed for testing/ tuning purposes. 889 + DocumentRanksWeight float64 901 890 902 891 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 903 892 // a command-line flag

+1 -35

build/builder.go

··· 978 978 } 979 979 } 980 980 981 - const epsilon = 0.00000001 982 - 983 - // sortDocuments2 sorts []*zoekt.Document according to their Ranks. In general, 984 - // documents can have a nil rank vector if the document to be indexed was added 985 - // after the ranking took place. A nil rank vector translates to the lowest 986 - // possible rank. Longer vectors are more important than shorter vectors, given 987 - // all other ranks are equal. 988 - // 989 - // Note: the logic here is inverted to sortDocuments because rank in 990 - // sortDocuments returns a vector of the form [1-rank, ...]. 991 - func sortDocuments2(rs []*zoekt.Document) { 992 - sort.Slice(rs, func(i, j int) bool { 993 - r1 := rs[i].Ranks 994 - r2 := rs[j].Ranks 995 - 996 - l := len(r1) 997 - if len(r2) < l { 998 - l = len(r2) 999 - } 1000 - for i := 0; i < l; i++ { 1001 - if math.Abs(r1[i]-r2[i]) > epsilon { 1002 - return r1[i] > r2[i] 1003 - } 1004 - } 1005 - // if r1 has more entries it is more important. ie imagine right padding shorter 1006 - // arrays with zeros, so they are the same length. 1007 - return len(r1) > len(r2) 1008 - }) 1009 - } 1010 - 1011 981 func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 1012 982 if !b.opts.DisableCTags && b.opts.CTagsPath != "" { 1013 983 err := ctagsAddSymbols(todo, b.parser, b.opts.CTagsPath) ··· 1026 996 return nil, err 1027 997 } 1028 998 1029 - if b.opts.DocumentRanksPath != "" { 1030 - sortDocuments2(todo) 1031 - } else { 1032 - sortDocuments(todo) 1033 - } 999 + sortDocuments(todo) 1034 1000 1035 1001 for _, t := range todo { 1036 1002 if err := shardBuilder.Add(*t); err != nil {

-103

build/builder_test.go

··· 858 858 } 859 859 } 860 860 861 - func Test_sortDocuments2(t *testing.T) { 862 - tests := []struct { 863 - name string 864 - in []*zoekt.Document 865 - want []string 866 - }{ 867 - { 868 - name: "same length", 869 - in: []*zoekt.Document{ 870 - { 871 - Name: "a", 872 - Ranks: []float64{0, 0, 0}, 873 - }, 874 - { 875 - Name: "b", 876 - Ranks: []float64{1, 1, 1}, 877 - }, 878 - { 879 - Name: "c", 880 - Ranks: []float64{1, 0, 1}, 881 - }, 882 - }, 883 - want: []string{"b", "c", "a"}, 884 - }, 885 - { 886 - name: "1 nil", 887 - in: []*zoekt.Document{ 888 - { 889 - Name: "a", 890 - Ranks: []float64{1, 1, 0}, 891 - }, 892 - { 893 - Name: "b", 894 - }, 895 - { 896 - Name: "c", 897 - Ranks: []float64{1, 1, 1}, 898 - }, 899 - }, 900 - want: []string{"c", "a", "b"}, 901 - }, 902 - { 903 - name: "different lengths", 904 - in: []*zoekt.Document{ 905 - { 906 - Name: "a", 907 - Ranks: []float64{0}, 908 - }, 909 - { 910 - Name: "b", 911 - Ranks: []float64{0, 0}, 912 - }, 913 - { 914 - Name: "c", 915 - Ranks: []float64{0, 0, 0}, 916 - }, 917 - }, 918 - want: []string{"c", "b", "a"}, 919 - }, 920 - { 921 - name: "different lengths and nil", 922 - in: []*zoekt.Document{ 923 - { 924 - Name: "a", 925 - Ranks: []float64{0}, 926 - }, 927 - { 928 - Name: "b", 929 - Ranks: []float64{0, 0}, 930 - }, 931 - { 932 - Name: "c", 933 - }, 934 - }, 935 - want: []string{"b", "a", "c"}, 936 - }, 937 - } 938 - 939 - for _, tt := range tests { 940 - t.Run(tt.name, func(t *testing.T) { 941 - sortDocuments2(tt.in) 942 - 943 - for i, name := range tt.want { 944 - if tt.in[i].Name != name { 945 - var got []string 946 - for _, d := range tt.in { 947 - got = append(got, d.Name) 948 - } 949 - t.Fatalf("want %+v, got %+v\n", tt.want, got) 950 - } 951 - } 952 - }) 953 - } 954 - 955 - t.Run("test for panics", func(t *testing.T) { 956 - // Special case: test for panics if all documents have nil rank vectors. 957 - sortDocuments2([]*zoekt.Document{{}, {}}) 958 - sortDocuments2([]*zoekt.Document{{}}) 959 - sortDocuments2(nil) 960 - }) 961 - 962 - } 963 - 964 861 func TestIgnoreSizeMax(t *testing.T) { 965 862 966 863 for _, test := range []struct {

+89

build/e2e_test.go

··· 1115 1115 }) 1116 1116 } 1117 1117 } 1118 + 1119 + func TestScoringWithDocumentRanks(t *testing.T) { 1120 + if os.Getenv("CI") == "" && checkCTags() == "" { 1121 + t.Skip("ctags not available") 1122 + } 1123 + dir := t.TempDir() 1124 + 1125 + opts := Options{ 1126 + IndexDir: dir, 1127 + RepositoryDescription: zoekt.Repository{ 1128 + Name: "repo", 1129 + }, 1130 + DocumentRanksVersion: "ranking", 1131 + } 1132 + 1133 + searchQuery := &query.Substring{Content: true, Pattern: "Inner"} 1134 + exampleJava, err := os.ReadFile("./testdata/example.java") 1135 + if err != nil { 1136 + t.Fatal(err) 1137 + } 1138 + 1139 + cases := []struct { 1140 + name string 1141 + documentRanks []float64 1142 + documentRanksWeight float64 1143 + wantScore float64 1144 + }{ 1145 + { 1146 + name: "score with no document ranks", 1147 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 10 (file order) 1148 + wantScore: 7412.00, 1149 + }, 1150 + { 1151 + name: "score with document ranks", 1152 + documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1153 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 7200 (file rank) + 10 (file order) 1154 + wantScore: 14612.00, 1155 + }, 1156 + { 1157 + name: "score with custom document ranks weight", 1158 + documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1159 + documentRanksWeight: 1000.0, 1160 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 800 (file rank) + 10 (file order) 1161 + wantScore: 8212.00, 1162 + }, 1163 + } 1164 + 1165 + for _, c := range cases { 1166 + t.Run(c.name, func(t *testing.T) { 1167 + b, err := NewBuilder(opts) 1168 + if err != nil { 1169 + t.Fatalf("NewBuilder: %v", err) 1170 + } 1171 + 1172 + err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: c.documentRanks}) 1173 + if err != nil { 1174 + t.Fatal(err) 1175 + } 1176 + 1177 + if err := b.Finish(); err != nil { 1178 + t.Fatalf("Finish: %v", err) 1179 + } 1180 + 1181 + ss, err := shards.NewDirectorySearcher(dir) 1182 + if err != nil { 1183 + t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) 1184 + } 1185 + defer ss.Close() 1186 + 1187 + srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{ 1188 + UseDocumentRanks: true, 1189 + DocumentRanksWeight: c.documentRanksWeight, 1190 + DebugScore: true, 1191 + }) 1192 + 1193 + if err != nil { 1194 + t.Fatal(err) 1195 + } 1196 + 1197 + if got, want := len(srs.Files), 1; got != want { 1198 + t.Fatalf("file matches: want %d, got %d", want, got) 1199 + } 1200 + 1201 + if got := srs.Files[0].Score; got != c.wantScore { 1202 + t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) 1203 + } 1204 + }) 1205 + } 1206 + }

+16 -89

contentprovider.go

··· 18 18 "bytes" 19 19 "fmt" 20 20 "log" 21 - "math" 22 21 "sort" 23 22 "strings" 24 23 "unicode/utf8" ··· 453 452 } 454 453 455 454 const ( 456 - // TODO - how to scale this relative to rank? 455 + // Query-dependent scoring signals. All of these together are bounded at ~9000 456 + // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 457 457 scorePartialWordMatch = 50.0 458 458 scoreWordMatch = 500.0 459 459 scoreBase = 7000.0 ··· 461 461 scoreSymbol = 7000.0 462 462 scorePartialSymbol = 4000.0 463 463 scoreKindMatch = 100.0 464 + scoreRepetitionFactor = 1.0 464 465 scoreFactorAtomMatch = 400.0 465 - scoreShardRankFactor = 20.0 466 - scoreFileOrderFactor = 10.0 467 - scoreLineOrderFactor = 1.0 468 - scoreRepetitionFactor = 1.0 466 + 467 + // File-only scoring signals. For now these are also bounded ~9000 to give them 468 + // equal weight with the query-dependent signals. 469 + scoreFileRankFactor = 9000.0 470 + scoreFileOrderFactor = 10.0 471 + scoreShardRankFactor = 20.0 472 + 473 + // Used for ordering line and chunk matches within a file. 474 + scoreLineOrderFactor = 1.0 469 475 ) 470 476 471 477 // findSection checks whether a section defined by offset and size lies within ··· 873 879 sort.Sort(chunkMatchScoreSlice(ms)) 874 880 } 875 881 876 - // k = 60 is arbitrary but reportedly works well (RRF; Cormack et al., 2009). 877 - const k = 60 878 - 879 - // SortFiles sorts files matches. The order depends on the match score and, if 880 - // available, on the pre-computed document ranks. 881 - // 882 - // Rankings derived from match scores and rank vectors are combined based on 883 - // "Reciprocal Rank Fusion" (RRF). 884 - func SortFiles(ms []FileMatch, opts *SearchOptions) { 882 + // SortFiles sorts files matches. The order depends on the match score, which includes both 883 + // query-dependent signals like word overlap, and file-only signals like the file ranks (if 884 + // file ranks are enabled). 885 + func SortFiles(ms []FileMatch) { 885 886 sort.Sort(fileMatchesByScore(ms)) 886 - 887 - if opts.UseDocumentRanks { 888 - rrfScore := make([]float64, len(ms)) 889 - 890 - for i := 0; i < len(ms); i++ { 891 - rrfScore[i] = 1 / (k + float64(i)) 892 - if opts.DebugScore { 893 - ms[i].Debug += fmt.Sprintf("(%d,", i) 894 - } 895 - } 896 - 897 - // We use stable sort in case we don't have ranks. Without stable sort the order 898 - // of file matches would be random which would sully the ranking induces by the 899 - // scores. 900 - sort.Stable(fileMatchesByRank{fileMatches: ms, rrfScore: rrfScore}) 901 - 902 - for i := range rrfScore { 903 - rrfScore[i] += (1 - opts.RanksDampingFactor) / (k + float64(i)) 904 - if opts.DebugScore { 905 - ms[i].Debug += fmt.Sprintf("%d), ", i) 906 - } 907 - } 908 - 909 - sort.Sort(fileMatchesByRRFScore{fileMatches: ms, rrfScore: rrfScore}) 910 - } 911 - } 912 - 913 - type fileMatchesByRank struct { 914 - fileMatches []FileMatch 915 - rrfScore []float64 916 - } 917 - 918 - func (m fileMatchesByRank) Len() int { return len(m.fileMatches) } 919 - 920 - func (m fileMatchesByRank) Swap(i, j int) { 921 - m.fileMatches[i], m.fileMatches[j] = m.fileMatches[j], m.fileMatches[i] 922 - m.rrfScore[i], m.rrfScore[j] = m.rrfScore[j], m.rrfScore[i] 923 - } 924 - 925 - const epsilon = 0.00000001 926 - 927 - func (m fileMatchesByRank) Less(i, j int) bool { 928 - r1 := m.fileMatches[i].Ranks 929 - r2 := m.fileMatches[j].Ranks 930 - 931 - l := len(r1) 932 - if len(r2) < l { 933 - l = len(r2) 934 - } 935 - for i := 0; i < l; i++ { 936 - if math.Abs(r1[i]-r2[i]) > epsilon { 937 - return r1[i] > r2[i] 938 - } 939 - } 940 - // if r1 has more entries it is more important. ie imagine right padding shorter 941 - // arrays with zeros, so they are the same length. 942 - return len(r1) > len(r2) 943 - } 944 - 945 - type fileMatchesByRRFScore struct { 946 - fileMatches []FileMatch 947 - rrfScore []float64 948 - } 949 - 950 - func (m fileMatchesByRRFScore) Len() int { return len(m.fileMatches) } 951 - 952 - func (m fileMatchesByRRFScore) Swap(i, j int) { 953 - m.fileMatches[i], m.fileMatches[j] = m.fileMatches[j], m.fileMatches[i] 954 - m.rrfScore[i], m.rrfScore[j] = m.rrfScore[j], m.rrfScore[i] 955 - } 956 - 957 - func (m fileMatchesByRRFScore) Less(i, j int) bool { 958 - // Higher scores are better. 959 - return m.rrfScore[i] > m.rrfScore[j] 960 887 }

-53

contentprovider_test.go

··· 3 3 import ( 4 4 "bytes" 5 5 "fmt" 6 - "math" 7 6 "testing" 8 7 9 8 "github.com/google/go-cmp/cmp" ··· 327 326 }) 328 327 } 329 328 } 330 - 331 - func TestSortFiles(t *testing.T) { 332 - in := []FileMatch{ 333 - {FileName: "d1", Score: 2, Ranks: []float64{0.75}}, 334 - {FileName: "d2", Score: 4, Ranks: []float64{0.25}}, 335 - {FileName: "d3", Score: 3, Ranks: []float64{1.0}}, 336 - {FileName: "d4", Score: 1, Ranks: []float64{0.5}}, 337 - } 338 - 339 - cases := []struct { 340 - name string 341 - dampingFactor float64 342 - wantOrder []string 343 - }{ 344 - // Document RRF(Score) RRF(Ranks) SUM Rank 345 - // d3 1/(60+1) 1/(60+0) 0,0330601092896175 0 346 - // d2 1/(60+0) 1/(60+3) 0,0325396825396826 1 347 - // d1 1/(60+2) 1/(60+1) 0,0325224748810153 2 348 - // d4 1/(60+3) 1/(60+2) 0,0320020481310804 3 349 - { 350 - "equal", 351 - 0, 352 - []string{"d3", "d2", "d1", "d4"}, 353 - }, 354 - { 355 - "scores only", 356 - 1, 357 - []string{"d2", "d3", "d1", "d4"}, 358 - }, 359 - { 360 - "ranks only", 361 - math.Inf(-1), 362 - []string{"d3", "d1", "d4", "d2"}, 363 - }, 364 - } 365 - 366 - for _, tt := range cases { 367 - t.Run("", func(t *testing.T) { 368 - 369 - SortFiles(in, &SearchOptions{UseDocumentRanks: true, DebugScore: true, RanksDampingFactor: tt.dampingFactor}) 370 - 371 - var haveOrder = []string{} 372 - for _, f := range in { 373 - haveOrder = append(haveOrder, f.FileName) 374 - } 375 - 376 - if d := cmp.Diff(tt.wantOrder, haveOrder); d != "" { 377 - t.Fatalf("-want, +got\n%s\n", d) 378 - } 379 - }) 380 - } 381 - }

+13 -9

eval.go

··· 367 367 368 368 fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch, opts.DebugScore) 369 369 370 - // Prefer earlier docs. 371 - fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore) 372 - 373 370 if opts.UseDocumentRanks && len(d.ranks) > int(nextDoc) { 374 - fileMatch.Ranks = d.ranks[nextDoc] 371 + weight := scoreFileRankFactor 372 + if opts.DocumentRanksWeight > 0.0 { 373 + weight = opts.DocumentRanksWeight 374 + } 375 + 376 + ranks := d.ranks[nextDoc] 377 + // This is a temporary workaround -- we only really want the PageRank score, and ignore 378 + // everything else. In a follow-up we'll simplify the rank format and remove this hack. 379 + if len(ranks) > 4 { 380 + fileMatch.addScore("file-rank", weight*d.ranks[nextDoc][4], opts.DebugScore) 381 + } 375 382 } 376 383 384 + fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore) 377 385 fileMatch.addScore("shard-order", scoreShardRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 378 - 379 - if opts.DebugScore && opts.UseDocumentRanks { 380 - fileMatch.Debug += fmt.Sprintf("ranks: %v, ", fileMatch.Ranks) 381 - } 382 386 383 387 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 384 388 sortMatchesByScore(fileMatch.LineMatches) ··· 426 430 // I am slightly worried about negative interactions with TotalMaxMatchCount 427 431 // so feature flagging this behaviour behind UseDocumentRanks. 428 432 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) { 429 - SortFiles(res.Files, opts) 433 + SortFiles(res.Files) 430 434 res.Files = res.Files[:limit] 431 435 } 432 436

+1 -1

shards/aggregate.go

··· 76 76 agg := c.aggregate 77 77 c.aggregate = nil 78 78 79 - zoekt.SortFiles(agg.Files, c.opts) 79 + zoekt.SortFiles(agg.Files) 80 80 81 81 if max := c.opts.MaxDocDisplayCount; max > 0 && len(agg.Files) > max { 82 82 agg.Files = agg.Files[:max]

+2 -2

shards/shards.go

··· 761 761 func sendByRepository(result *zoekt.SearchResult, opts *zoekt.SearchOptions, sender zoekt.Sender) { 762 762 763 763 if len(result.RepoURLs) <= 1 || len(result.Files) == 0 { 764 - zoekt.SortFiles(result.Files, opts) 764 + zoekt.SortFiles(result.Files) 765 765 sender.Send(result) 766 766 return 767 767 } 768 768 769 769 send := func(repoName string, a, b int, stats zoekt.Stats) { 770 - zoekt.SortFiles(result.Files[a:b], opts) 770 + zoekt.SortFiles(result.Files[a:b]) 771 771 sender.Send(&zoekt.SearchResult{ 772 772 Stats: stats, 773 773 Progress: zoekt.Progress{

Configure Feed

Configure Feed