fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

ranking: enhance scoring based on ctags (#296)

ctags provides information about symbols, such as their position, language, and kind. For scoring, we currently use only the position, to increase the score of matches that are contained within a symbol.

With this change we introduce a scoring function that boosts results based on the kind of the symbol. The core idea is that, depending on the language, some language entities are more important than others. For example, based on user feedback, classes in Java are considered more relevant than instance variables when both match the query.

For now the scoring function is just a skeleton targeting one language.

Co-authored-by: Keegan Carruthers-Smith <keegan.csmith@gmail.com>

+108 -11
+68
build/e2e_test.go
··· 798 798 }) 799 799 } 800 800 } 801 + 802 + // With this test we want to capture regressions in the names returned by our 803 + // language detection. We rely on the detected language and its spelling, for 804 + // example, in scoring (see scoreKind). 805 + func TestDetectLanguage(t *testing.T) { 806 + dir := t.TempDir() 807 + 808 + opts := Options{ 809 + IndexDir: dir, 810 + RepositoryDescription: zoekt.Repository{ 811 + Name: "repo", 812 + }, 813 + } 814 + 815 + cases := []struct { 816 + fileName string 817 + content []byte 818 + wantLanguage string 819 + }{ 820 + { 821 + fileName: "hw.java", 822 + content: []byte(` 823 + public class HelloWorld 824 + { 825 + public static void main (String[] args) 826 + { 827 + System.out.println("Hello World!"); 828 + } 829 + } 830 + `), 831 + wantLanguage: "Java", 832 + }, 833 + } 834 + 835 + for _, c := range cases { 836 + t.Run(c.wantLanguage, func(t *testing.T) { 837 + b, err := NewBuilder(opts) 838 + if err != nil { 839 + t.Fatalf("NewBuilder: %v", err) 840 + } 841 + if err := b.AddFile(c.fileName, c.content); err != nil { 842 + t.Fatal(err) 843 + } 844 + if err := b.Finish(); err != nil { 845 + t.Fatalf("Finish: %v", err) 846 + } 847 + 848 + ss, err := shards.NewDirectorySearcher(dir) 849 + if err != nil { 850 + t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) 851 + } 852 + defer ss.Close() 853 + 854 + srs, err := ss.Search(context.Background(), &query.Const{true}, new(zoekt.SearchOptions)) 855 + if err != nil { 856 + t.Fatal(err) 857 + } 858 + 859 + if got, want := len(srs.Files), 1; got != want { 860 + t.Fatalf("file matches: want %d, got %d", want, got) 861 + } 862 + 863 + if got := srs.Files[0].Language; got != c.wantLanguage { 864 + t.Fatalf("want %s, got %s", c.wantLanguage, got) 865 + } 866 + }) 867 + } 868 + }
+39 -10
contentprovider.go
··· 130 130 return byteOff 131 131 } 132 132 133 - func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 133 + func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string) []LineMatch { 134 134 var result []LineMatch 135 135 if ms[0].fileName { 136 136 // There is only "line" in a filename. ··· 155 155 156 156 sects := p.docSections() 157 157 for i, m := range result { 158 - result[i].Score = matchScore(sects, &m) 158 + result[i].Score = p.matchScore(sects, &m, language) 159 159 } 160 160 161 161 return result ··· 268 268 scorePartialWordMatch = 50.0 269 269 scoreWordMatch = 500.0 270 270 scoreImportantThreshold = 2000.0 271 + scoreSymbol = 7000.0 271 272 scorePartialSymbol = 4000.0 272 - scoreSymbol = 7000.0 273 + scoreKindMatch = 1000.0 273 274 scoreFactorAtomMatch = 400.0 274 275 scoreShardRankFactor = 20.0 275 276 scoreFileOrderFactor = 10.0 276 277 scoreLineOrderFactor = 1.0 277 278 ) 278 279 279 - func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection { 280 + // findSection checks whether a section defined by offset and size lies within 281 + // one of the sections in secs. 
282 + func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 280 283 j := sort.Search(len(secs), func(i int) bool { 281 284 return secs[i].End >= off+sz 282 285 }) 283 286 284 287 if j == len(secs) { 285 - return nil 288 + return 0, false 286 289 } 287 290 288 291 if secs[j].Start <= off && off+sz <= secs[j].End { 289 - return &secs[j] 292 + return j, true 290 293 } 291 - return nil 294 + return 0, false 292 295 } 293 296 294 - func matchScore(secs []DocumentSection, m *LineMatch) float64 { 297 + func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string) float64 { 295 298 var maxScore float64 296 299 for _, f := range m.LineFragments { 297 300 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) ··· 306 309 score = scorePartialWordMatch 307 310 } 308 311 309 - sec := findSection(secs, f.Offset, uint32(f.MatchLength)) 310 - if sec != nil { 312 + if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 313 + sec := secs[secIdx] 311 314 startMatch := sec.Start == f.Offset 312 315 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 313 316 if startMatch && endMatch { ··· 317 320 } else { 318 321 score += scorePartialSymbol 319 322 } 323 + 324 + si := f.SymbolInfo 325 + if si == nil { 326 + // for non-symbol queries, we need to hydrate in SymbolInfo. 327 + start := p.id.fileEndSymbol[p.idx] 328 + si = p.id.symbols.data(start + uint32(secIdx)) 329 + } 330 + if si != nil { 331 + // the LineFragment may not be on a symbol, then si will be nil. 332 + score += scoreKind(language, si.Kind) 333 + } 320 334 } 321 335 322 336 if score > maxScore { ··· 324 338 } 325 339 } 326 340 return maxScore 341 + } 342 + 343 + // scoreKind boosts a match based on the combination of language and kind. The 344 + // language string comes from go-enry, the kind string from ctags. 
345 + func scoreKind(language string, kind string) float64 { 346 + // Refer to universal-ctags --list-kinds=<language> to learn about the mappings 347 + // for a language. 348 + switch language { 349 + case "Java": 350 + switch kind { 351 + case "c": // classes 352 + return scoreKindMatch 353 + } 354 + } 355 + return 0 327 356 } 328 357 329 358 type matchScoreSlice []LineMatch
+1 -1
eval.go
··· 349 349 byteMatchSz: uint32(len(nm)), 350 350 }) 351 351 } 352 - fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines) 352 + fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language) 353 353 354 354 maxFileScore := 0.0 355 355 for i := range fileMatch.LineMatches {