scoring: score files based on absolute number of atoms (#542) · boltless.me/zoekt@b65e3e6

+100 -63

build/e2e_test.go

··· 20 20 "encoding/json" 21 21 "fmt" 22 22 "log" 23 + "math" 23 24 "os" 24 25 "path/filepath" 25 26 "reflect" ··· 834 835 content: exampleKotlin, 835 836 query: &query.Substring{Content: true, Pattern: "oxyPreloader"}, 836 837 wantLanguage: "Kotlin", 837 - // 5500 (partial symbol at boundary) + 1000 (Kotlin class) + 50 (partial word) + 400 (atom) + 10 (file order) 838 - wantScore: 6960, 838 + // 5500 (partial symbol at boundary) + 1000 (Kotlin class) + 50 (partial word) + 10 (file order) 839 + wantScore: 6560, 839 840 }, 840 841 { 841 842 fileName: "example.kt", 842 843 content: exampleKotlin, 843 844 query: &query.Substring{Content: true, Pattern: "ViewMetadata"}, 844 845 wantLanguage: "Kotlin", 845 - // 7000 (symbol) + 900 (Kotlin interface) + 500 (word) + 400 (atom) + 10 (file order) 846 - wantScore: 8810, 846 + // 7000 (symbol) + 900 (Kotlin interface) + 500 (word) + 10 (file order) 847 + wantScore: 8410, 847 848 }, 848 849 { 849 850 fileName: "example.kt", 850 851 content: exampleKotlin, 851 852 query: &query.Substring{Content: true, Pattern: "onScrolled"}, 852 853 wantLanguage: "Kotlin", 853 - // 7000 (symbol) + 800 (Kotlin method) + 500 (word) + 400 (atom) + 10 (file order) 854 - wantScore: 8710, 854 + // 7000 (symbol) + 800 (Kotlin method) + 500 (word) + 10 (file order) 855 + wantScore: 8310, 855 856 }, 856 857 { 857 858 fileName: "example.kt", 858 859 content: exampleKotlin, 859 860 query: &query.Substring{Content: true, Pattern: "PreloadErrorHandler"}, 860 861 wantLanguage: "Kotlin", 861 - // 7000 (symbol) + 700 (Kotlin typealias) + 500 (word) + 400 (atom) + 10 (file order) 862 - wantScore: 8610, 862 + // 7000 (symbol) + 700 (Kotlin typealias) + 500 (word) + 10 (file order) 863 + wantScore: 8210, 863 864 }, 864 865 { 865 866 fileName: "example.kt", 866 867 content: exampleKotlin, 867 868 query: &query.Substring{Content: true, Pattern: "FLING_THRESHOLD_PX"}, 868 869 wantLanguage: "Kotlin", 869 - // 7000 (symbol) + 600 (Kotlin constant) + 500 (word) + 400 (atom) + 10 (file order) 870 - wantScore: 8510, 870 + // 7000 (symbol) + 600 (Kotlin constant) + 500 (word) + 10 (file order) 871 + wantScore: 8110, 871 872 }, 872 873 { 873 874 fileName: "example.kt", 874 875 content: exampleKotlin, 875 876 query: &query.Substring{Content: true, Pattern: "scrollState"}, 876 877 wantLanguage: "Kotlin", 877 - // 7000 (symbol) + 500 (Kotlin variable) + 500 (word) + 400 (atom) + 10 (file order) 878 - wantScore: 8410, 878 + // 7000 (symbol) + 500 (Kotlin variable) + 500 (word) + 10 (file order) 879 + wantScore: 8010, 879 880 }, 880 881 // 881 882 // Java ··· 885 886 content: exampleJava, 886 887 query: &query.Substring{Content: true, Pattern: "nerClass"}, 887 888 wantLanguage: "Java", 888 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 50 (partial word) + 400 (atom) + 10 (file order) 889 - wantScore: 6960, 889 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 50 (partial word) + 10 (file order) 890 + wantScore: 6560, 890 891 }, 891 892 { 892 893 fileName: "example.java", 893 894 content: exampleJava, 894 895 query: &query.Substring{Content: true, Pattern: "StaticClass"}, 895 896 wantLanguage: "Java", 896 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word) + 400 (atom) + 10 (file order) 897 - wantScore: 7410, 897 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word) + 10 (file order) 898 + wantScore: 7010, 898 899 }, 899 900 { 900 901 fileName: "example.java", 901 902 content: exampleJava, 902 903 query: &query.Substring{Content: true, Pattern: "innerEnum"}, 903 904 wantLanguage: "Java", 904 - // 7000 (symbol) + 900 (Java enum) + 500 (word) + 400 (atom) + 10 (file order) 905 - wantScore: 8810, 905 + // 7000 (symbol) + 900 (Java enum) + 500 (word) + 10 (file order) 906 + wantScore: 8410, 906 907 }, 907 908 { 908 909 fileName: "example.java", 909 910 content: exampleJava, 910 911 query: &query.Substring{Content: true, Pattern: "innerInterface"}, 911 912 wantLanguage: "Java", 912 - // 7000 (symbol) + 800 (Java interface) + 500 (word) + 400 (atom) + 10 (file order) 913 - wantScore: 8710, 913 + // 7000 (symbol) + 800 (Java interface) + 500 (word) + 10 (file order) 914 + wantScore: 8310, 914 915 }, 915 916 { 916 917 fileName: "example.java", 917 918 content: exampleJava, 918 919 query: &query.Substring{Content: true, Pattern: "innerMethod"}, 919 920 wantLanguage: "Java", 920 - // 7000 (symbol) + 700 (Java method) + 500 (word) + 400 (atom) + 10 (file order) 921 - wantScore: 8610, 921 + // 7000 (symbol) + 700 (Java method) + 500 (word) + 10 (file order) 922 + wantScore: 8210, 922 923 }, 923 924 { 924 925 fileName: "example.java", 925 926 content: exampleJava, 926 927 query: &query.Substring{Content: true, Pattern: "field"}, 927 928 wantLanguage: "Java", 928 - // 7000 (symbol) + 600 (Java field) + 500 (word) + 400 (atom) + 10 (file order) 929 - wantScore: 8510, 929 + // 7000 (symbol) + 600 (Java field) + 500 (word) + 10 (file order) 930 + wantScore: 8110, 930 931 }, 931 932 { 932 933 fileName: "example.java", 933 934 content: exampleJava, 934 935 query: &query.Substring{Content: true, Pattern: "B"}, 935 936 wantLanguage: "Java", 936 - // 7000 (symbol) + 500 (Java enum constant) + 500 (word) + 400 (atom) + 10 (file order) 937 - wantScore: 8410, 937 + // 7000 (symbol) + 500 (Java enum constant) + 500 (word) + 10 (file order) 938 + wantScore: 8010, 939 + }, 940 + // 2 Atoms (1x content and 1x filename) 941 + { 942 + fileName: "example.java", 943 + content: exampleJava, 944 + query: &query.Substring{Pattern: "example"}, // matches filename and a Java field 945 + wantLanguage: "Java", 946 + // 5500 (edge symbol) + 600 (Java field) + 500 (word) + 200 (atom) + 10 (file order) 947 + wantScore: 6810, 948 + }, 949 + // 3 Atoms (2x content, 1x filename) 950 + { 951 + fileName: "example.java", 952 + content: exampleJava, 953 + query: &query.Or{Children: []query.Q{ 954 + &query.Substring{Pattern: "example"}, // matches filename and Java field 955 + &query.Substring{Content: true, Pattern: "runInnerInterface"}, // matches a Java method 956 + }}, 957 + wantLanguage: "Java", 958 + // 7000 (symbol) + 700 (Java method) + 500 (word) + 266.67 (atom) + 10 (file order) 959 + wantScore: 8476.667, 960 + }, 961 + // 4 Atoms (4x content) 962 + { 963 + fileName: "example.java", 964 + content: exampleJava, 965 + query: &query.Or{Children: []query.Q{ 966 + &query.Substring{Content: true, Pattern: "testAnon"}, 967 + &query.Substring{Content: true, Pattern: "Override"}, 968 + &query.Substring{Content: true, Pattern: "InnerEnum"}, 969 + &query.Substring{Content: true, Pattern: "app"}, 970 + }}, 971 + wantLanguage: "Java", 972 + // 7000 (symbol) + 900 (Java enum) + 500 (word) + 300 (atom) + 10 (file order) 973 + wantScore: 8710, 938 974 }, 939 975 // 940 976 // Go ··· 943 979 fileName: "a/b/c/config.go", 944 980 query: &query.Substring{FileName: true, Pattern: "config"}, 945 981 wantLanguage: "Go", 946 - // 5500 (partial base at boundary) + 500 (word) + 400 (atom) + 10 (file order) 947 - wantScore: 6410, 982 + // 5500 (partial base at boundary) + 500 (word) + 10 (file order) 983 + wantScore: 6010, 948 984 }, 949 985 { 950 986 fileName: "a/b/c/config.go", 951 987 query: &query.Substring{FileName: true, Pattern: "config.go"}, 952 988 wantLanguage: "Go", 953 - // 7000 (full base match) + 500 (word) + 400 (atom) + 10 (file order) 954 - wantScore: 7910, 989 + // 7000 (full base match) + 500 (word) + 10 (file order) 990 + wantScore: 7510, 955 991 }, 956 992 { 957 993 fileName: "a/config/c/d.go", 958 994 query: &query.Substring{FileName: true, Pattern: "config"}, 959 995 wantLanguage: "Go", 960 - // 500 (word) + 400 (atom) + 10 (file order) 961 - wantScore: 910, 996 + // 500 (word) + 10 (file order) 997 + wantScore: 510, 962 998 }, 963 999 { 964 1000 fileName: "src/net/http/client.go", ··· 968 1004 `), 969 1005 query: &query.Substring{Content: true, Pattern: "aInterface"}, 970 1006 wantLanguage: "Go", 971 - // 7000 (full base match) + 1000 (Go interface) + 500 (word) + 400 (atom) + 10 (file order) 972 - wantScore: 8910, 1007 + // 7000 (full base match) + 1000 (Go interface) + 500 (word) + 10 (file order) 1008 + wantScore: 8510, 973 1009 }, 974 1010 { 975 1011 fileName: "src/net/http/client.go", ··· 979 1015 `), 980 1016 query: &query.Substring{Content: true, Pattern: "aStruct"}, 981 1017 wantLanguage: "Go", 982 - // 7000 (full base match) + 900 (Go interface) + 500 (word) + 400 (atom) + 10 (file order) 983 - wantScore: 8810, 1018 + // 7000 (full base match) + 900 (Go interface) + 500 (word) + 10 (file order) 1019 + wantScore: 8410, 984 1020 }, 985 1021 { 986 1022 fileName: "src/net/http/client.go", ··· 994 1030 &query.Symbol{Expr: &query.Substring{Pattern: "http", Content: true}}, 995 1031 &query.Symbol{Expr: &query.Substring{Pattern: "Get", Content: true}}}}, 996 1032 wantLanguage: "Go", 997 - // 7000 (full base match) + 800 (Go func) + 500 (word) + 400 (atom) + 10 (file order) 998 - wantScore: 8710, 1033 + // 7000 (full base match) + 800 (Go func) + 500 (word) + 200 (atom) + 10 (file order) 1034 + wantScore: 8510, 999 1035 }, 1000 1036 // 1001 1037 // C++ ··· 1005 1041 content: exampleCpp, 1006 1042 query: &query.Substring{Content: true, Pattern: "FooClass"}, 1007 1043 wantLanguage: "C++", 1008 - // 7000 (Symbol) + 1000 (C++ class) + 500 (full word) + 400 (atom) + 10 (file order) 1009 - wantScore: 8910, 1044 + // 7000 (Symbol) + 1000 (C++ class) + 500 (full word) + 10 (file order) 1045 + wantScore: 8510, 1010 1046 }, 1011 1047 { 1012 1048 fileName: "example.cc", 1013 1049 content: exampleCpp, 1014 1050 query: &query.Substring{Content: true, Pattern: "NestedEnum"}, 1015 1051 wantLanguage: "C++", 1016 - // 7000 (Symbol) + 900 (C++ enum) + 500 (full word) + 400 (atom) + 10 (file order) 1017 - wantScore: 8810, 1052 + // 7000 (Symbol) + 900 (C++ enum) + 500 (full word) + 10 (file order) 1053 + wantScore: 8410, 1018 1054 }, 1019 1055 { 1020 1056 fileName: "example.cc", 1021 1057 content: exampleCpp, 1022 1058 query: &query.Substring{Content: true, Pattern: "main"}, 1023 1059 wantLanguage: "C++", 1024 - // 7000 (Symbol) + 800 (C++ function) + 500 (full word) + 400 (atom) + 10 (file order) 1025 - wantScore: 8710, 1060 + // 7000 (Symbol) + 800 (C++ function) + 500 (full word) + 10 (file order) 1061 + wantScore: 8310, 1026 1062 }, 1027 1063 { 1028 1064 fileName: "example.cc", 1029 1065 content: exampleCpp, 1030 1066 query: &query.Substring{Content: true, Pattern: "FooStruct"}, 1031 1067 wantLanguage: "C++", 1032 - // 7000 (Symbol) + 700 (C++ struct) + 500 (full word) + 400 (atom) + 10 (file order) 1033 - wantScore: 8610, 1068 + // 7000 (Symbol) + 700 (C++ struct) + 500 (full word) + 10 (file order) 1069 + wantScore: 8210, 1034 1070 }, 1035 1071 { 1036 1072 fileName: "example.cc", 1037 1073 content: exampleCpp, 1038 1074 query: &query.Substring{Content: true, Pattern: "TheUnion"}, 1039 1075 wantLanguage: "C++", 1040 - // 7000 (Symbol) + 600 (C++ union) + 500 (full word) + 400 (atom) + 10 (file order) 1041 - wantScore: 8510, 1076 + // 7000 (Symbol) + 600 (C++ union) + 500 (full word) + 10 (file order) 1077 + wantScore: 8110, 1042 1078 }, 1043 1079 // 1044 1080 // Scala ··· 1048 1084 content: exampleScala, 1049 1085 query: &query.Substring{Content: true, Pattern: "SymbolIndexBucket"}, 1050 1086 wantLanguage: "Scala", 1051 - // 7000 (symbol) + 1000 (Scala class) + 500 (word) + 400 (atom) + 10 (file order) 1052 - wantScore: 8910, 1087 + // 7000 (symbol) + 1000 (Scala class) + 500 (word) + 10 (file order) 1088 + wantScore: 8510, 1053 1089 }, 1054 1090 { 1055 1091 fileName: "example.scala", 1056 1092 content: exampleScala, 1057 1093 query: &query.Substring{Content: true, Pattern: "stdLibPatches"}, 1058 1094 wantLanguage: "Scala", 1059 - // 7000 (symbol) + 800 (Scala object) + 500 (word) + 400 (atom) + 10 (file order) 1060 - wantScore: 8710, 1095 + // 7000 (symbol) + 800 (Scala object) + 500 (word) + 10 (file order) 1096 + wantScore: 8310, 1061 1097 }, 1062 1098 { 1063 1099 fileName: "example.scala", 1064 1100 content: exampleScala, 1065 1101 query: &query.Substring{Content: true, Pattern: "close"}, 1066 1102 wantLanguage: "Scala", 1067 - // 7000 (symbol) + 700 (Scala method) + 500 (word) + 400 (atom) + 10 (file order) 1068 - wantScore: 8610, 1103 + // 7000 (symbol) + 700 (Scala method) + 500 (word) + 10 (file order) 1104 + wantScore: 8210, 1069 1105 }, 1070 1106 { 1071 1107 fileName: "example.scala", 1072 1108 content: exampleScala, 1073 1109 query: &query.Substring{Content: true, Pattern: "javaSymbol"}, 1074 1110 wantLanguage: "Scala", 1075 - // 7000 (symbol) + 500 (Scala method) + 500 (word) + 400 (atom) + 10 (file order) 1076 - wantScore: 8410, 1111 + // 7000 (symbol) + 500 (Scala method) + 500 (word) + 10 (file order) 1112 + wantScore: 8010, 1077 1113 }, 1078 1114 } 1079 1115 1116 + epsilon := 0.01 1080 1117 for _, c := range cases { 1081 1118 t.Run(c.wantLanguage, func(t *testing.T) { 1082 1119 b, err := NewBuilder(opts) ··· 1105 1142 t.Fatalf("file matches: want %d, got %d", want, got) 1106 1143 } 1107 1144 1108 - if got := srs.Files[0].Score; got != c.wantScore { 1145 + if got := srs.Files[0].Score; math.Abs(got-c.wantScore) > epsilon { 1109 1146 t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) 1110 1147 } 1111 1148 ··· 1144 1181 }{ 1145 1182 { 1146 1183 name: "score with no document ranks", 1147 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 10 (file order) 1148 - wantScore: 7412.00, 1184 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) 1185 + wantScore: 7012.00, 1149 1186 }, 1150 1187 { 1151 1188 name: "score with document ranks", 1152 1189 documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1153 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 7200 (file rank) + 10 (file order) 1154 - wantScore: 14612.00, 1190 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 7200 (file rank) + 10 (file order) 1191 + wantScore: 14212.00, 1155 1192 }, 1156 1193 { 1157 1194 name: "score with custom document ranks weight", 1158 1195 documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1159 1196 documentRanksWeight: 1000.0, 1160 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 400 (atom) + 800 (file rank) + 10 (file order) 1161 - wantScore: 8212.00, 1197 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 800 (file rank) + 10 (file order) 1198 + wantScore: 7812.00, 1162 1199 }, 1163 1200 } 1164 1201

+5 -6

eval.go

··· 182 182 return &res, nil 183 183 } 184 184 185 - totalAtomCount := 0 186 - visitMatchTree(mt, func(t matchTree) { 187 - totalAtomCount++ 188 - }) 189 - 190 185 res.Stats.ShardsScanned++ 191 186 192 187 cp := &contentProvider{ ··· 369 364 // Prefer docs with several top-scored matches. 370 365 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 371 366 372 - fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch, opts.DebugScore) 367 + // atom-count boosts files with matches from more than 1 atom. The 368 + // maximum boost is scoreFactorAtomMatch. 369 + if atomMatchCount > 0 { 370 + fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 371 + } 373 372 374 373 if opts.UseDocumentRanks && len(d.ranks) > int(nextDoc) { 375 374 weight := scoreFileRankFactor

+4 -8

testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden

··· 4 4 "FileMatches": [ 5 5 [ 6 6 { 7 - "Score": 910, 8 - "Ranks": null, 7 + "Score": 510, 9 8 "Debug": "", 10 9 "FileName": "main.go", 11 10 "Repository": "repo", ··· 44 43 ], 45 44 [ 46 45 { 47 - "Score": 710, 48 - "Ranks": null, 46 + "Score": 510, 49 47 "Debug": "", 50 48 "FileName": "main.go", 51 49 "Repository": "repo", ··· 84 82 ], 85 83 [ 86 84 { 87 - "Score": 8410, 88 - "Ranks": null, 85 + "Score": 8010, 89 86 "Debug": "", 90 87 "FileName": "main.go", 91 88 "Repository": "repo", ··· 129 126 ], 130 127 [ 131 128 { 132 - "Score": 6260, 133 - "Ranks": null, 129 + "Score": 6060, 134 130 "Debug": "", 135 131 "FileName": "main.go", 136 132 "Repository": "repo",

+4 -8

testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden

··· 4 4 "FileMatches": [ 5 5 [ 6 6 { 7 - "Score": 910, 8 - "Ranks": null, 7 + "Score": 510, 9 8 "Debug": "", 10 9 "FileName": "main.go", 11 10 "Repository": "repo", ··· 44 43 ], 45 44 [ 46 45 { 47 - "Score": 710, 48 - "Ranks": null, 46 + "Score": 510, 49 47 "Debug": "", 50 48 "FileName": "main.go", 51 49 "Repository": "repo", ··· 84 82 ], 85 83 [ 86 84 { 87 - "Score": 8410, 88 - "Ranks": null, 85 + "Score": 8010, 89 86 "Debug": "", 90 87 "FileName": "main.go", 91 88 "Repository": "repo", ··· 129 126 ], 130 127 [ 131 128 { 132 - "Score": 6260, 133 - "Ranks": null, 129 + "Score": 6060, 134 130 "Debug": "", 135 131 "FileName": "main.go", 136 132 "Repository": "repo",

+2 -4

testdata/golden/TestReadSearch/repo17_v17.00000.golden

··· 4 4 "FileMatches": [ 5 5 [ 6 6 { 7 - "Score": 910, 8 - "Ranks": null, 7 + "Score": 510, 9 8 "Debug": "", 10 9 "FileName": "main.go", 11 10 "Repository": "repo17", ··· 44 43 ], 45 44 [ 46 45 { 47 - "Score": 710, 48 - "Ranks": null, 46 + "Score": 510, 49 47 "Debug": "", 50 48 "FileName": "main.go", 51 49 "Repository": "repo17",

+2 -4

testdata/golden/TestReadSearch/repo2_v16.00000.golden

··· 4 4 "FileMatches": [ 5 5 [ 6 6 { 7 - "Score": 910, 8 - "Ranks": null, 7 + "Score": 510, 9 8 "Debug": "", 10 9 "FileName": "main.go", 11 10 "Repository": "repo2", ··· 44 43 ], 45 44 [ 46 45 { 47 - "Score": 710, 48 - "Ranks": null, 46 + "Score": 510, 49 47 "Debug": "", 50 48 "FileName": "main.go", 51 49 "Repository": "repo2",

+2 -4

testdata/golden/TestReadSearch/repo_v16.00000.golden

··· 4 4 "FileMatches": [ 5 5 [ 6 6 { 7 - "Score": 910, 8 - "Ranks": null, 7 + "Score": 510, 9 8 "Debug": "", 10 9 "FileName": "main.go", 11 10 "Repository": "repo", ··· 44 43 ], 45 44 [ 46 45 { 47 - "Score": 710, 48 - "Ranks": null, 46 + "Score": 510, 49 47 "Debug": "", 50 48 "FileName": "main.go", 51 49 "Repository": "repo",

Configure Feed

Configure Feed