ranking: enhance scoring based on ctags (#296) · boltless.me/zoekt@69eb002

+68

build/e2e_test.go

··· 798 798 }) 799 799 } 800 800 } 801 + 802 + // With this test we want to capture regressions in the names returned by our 803 + // language detection. We rely on the detected language and its spelling, for 804 + // example, in scoring (see scoreKind). 805 + func TestDetectLanguage(t *testing.T) { 806 + dir := t.TempDir() 807 + 808 + opts := Options{ 809 + IndexDir: dir, 810 + RepositoryDescription: zoekt.Repository{ 811 + Name: "repo", 812 + }, 813 + } 814 + 815 + cases := []struct { 816 + fileName string 817 + content []byte 818 + wantLanguage string 819 + }{ 820 + { 821 + fileName: "hw.java", 822 + content: []byte(` 823 + public class HelloWorld 824 + { 825 + public static void main (String[] args) 826 + { 827 + System.out.println("Hello World!"); 828 + } 829 + } 830 + `), 831 + wantLanguage: "Java", 832 + }, 833 + } 834 + 835 + for _, c := range cases { 836 + t.Run(c.wantLanguage, func(t *testing.T) { 837 + b, err := NewBuilder(opts) 838 + if err != nil { 839 + t.Fatalf("NewBuilder: %v", err) 840 + } 841 + if err := b.AddFile(c.fileName, c.content); err != nil { 842 + t.Fatal(err) 843 + } 844 + if err := b.Finish(); err != nil { 845 + t.Fatalf("Finish: %v", err) 846 + } 847 + 848 + ss, err := shards.NewDirectorySearcher(dir) 849 + if err != nil { 850 + t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) 851 + } 852 + defer ss.Close() 853 + 854 + srs, err := ss.Search(context.Background(), &query.Const{true}, new(zoekt.SearchOptions)) 855 + if err != nil { 856 + t.Fatal(err) 857 + } 858 + 859 + if got, want := len(srs.Files), 1; got != want { 860 + t.Fatalf("file matches: want %d, got %d", want, got) 861 + } 862 + 863 + if got := srs.Files[0].Language; got != c.wantLanguage { 864 + t.Fatalf("want %s, got %s", c.wantLanguage, got) 865 + } 866 + }) 867 + } 868 + }

+39 -10

contentprovider.go

··· 130 130 return byteOff 131 131 } 132 132 133 - func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 133 + func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string) []LineMatch { 134 134 var result []LineMatch 135 135 if ms[0].fileName { 136 136 // There is only "line" in a filename. ··· 155 155 156 156 sects := p.docSections() 157 157 for i, m := range result { 158 - result[i].Score = matchScore(sects, &m) 158 + result[i].Score = p.matchScore(sects, &m, language) 159 159 } 160 160 161 161 return result ··· 268 268 scorePartialWordMatch = 50.0 269 269 scoreWordMatch = 500.0 270 270 scoreImportantThreshold = 2000.0 271 + scoreSymbol = 7000.0 271 272 scorePartialSymbol = 4000.0 272 - scoreSymbol = 7000.0 273 + scoreKindMatch = 1000.0 273 274 scoreFactorAtomMatch = 400.0 274 275 scoreShardRankFactor = 20.0 275 276 scoreFileOrderFactor = 10.0 276 277 scoreLineOrderFactor = 1.0 277 278 ) 278 279 279 - func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection { 280 + // findSection checks whether a section defined by offset and size lies within 281 + // one of the sections in secs. 282 + func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 280 283 j := sort.Search(len(secs), func(i int) bool { 281 284 return secs[i].End >= off+sz 282 285 }) 283 286 284 287 if j == len(secs) { 285 - return nil 288 + return 0, false 286 289 } 287 290 288 291 if secs[j].Start <= off && off+sz <= secs[j].End { 289 - return &secs[j] 292 + return j, true 290 293 } 291 - return nil 294 + return 0, false 292 295 } 293 296 294 - func matchScore(secs []DocumentSection, m *LineMatch) float64 { 297 + func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string) float64 { 295 298 var maxScore float64 296 299 for _, f := range m.LineFragments { 297 300 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) ··· 306 309 score = scorePartialWordMatch 307 310 } 308 311 309 - sec := findSection(secs, f.Offset, uint32(f.MatchLength)) 310 - if sec != nil { 312 + if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 313 + sec := secs[secIdx] 311 314 startMatch := sec.Start == f.Offset 312 315 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 313 316 if startMatch && endMatch { ··· 317 320 } else { 318 321 score += scorePartialSymbol 319 322 } 323 + 324 + si := f.SymbolInfo 325 + if si == nil { 326 + // for non-symbol queries, we need to hydrate in SymbolInfo. 327 + start := p.id.fileEndSymbol[p.idx] 328 + si = p.id.symbols.data(start + uint32(secIdx)) 329 + } 330 + if si != nil { 331 + // the LineFragment may not be on a symbol, then si will be nil. 332 + score += scoreKind(language, si.Kind) 333 + } 320 334 } 321 335 322 336 if score > maxScore { ··· 324 338 } 325 339 } 326 340 return maxScore 341 + } 342 + 343 + // scoreKind boosts a match based on the combination of language and kind. The 344 + // language string comes from go-enry, the kind string from ctags. 345 + func scoreKind(language string, kind string) float64 { 346 + // Refer to universal-ctags --list-kinds=<language> to learn about the mappings 347 + // for a language. 348 + switch language { 349 + case "Java": 350 + switch kind { 351 + case "c": // classes 352 + return scoreKindMatch 353 + } 354 + } 355 + return 0 327 356 } 328 357 329 358 type matchScoreSlice []LineMatch

+1 -1

eval.go

··· 349 349 byteMatchSz: uint32(len(nm)), 350 350 }) 351 351 } 352 - fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines) 352 + fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language) 353 353 354 354 maxFileScore := 0.0 355 355 for i := range fileMatch.LineMatches {

Configure Feed

Configure Feed