ranking: favor short paths and multiple matches (#417) · boltless.me/zoekt@543ddf9

+10 -5

build/builder.go

··· 38 38 "time" 39 39 40 40 "github.com/bmatcuk/doublestar" 41 - "github.com/sourcegraph/zoekt" 42 - "github.com/sourcegraph/zoekt/ctags" 43 41 "github.com/grafana/regexp" 44 42 "github.com/rs/xid" 45 43 "gopkg.in/natefinch/lumberjack.v2" 44 + 45 + "github.com/sourcegraph/zoekt" 46 + "github.com/sourcegraph/zoekt/ctags" 46 47 ) 47 48 48 49 var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") ··· 870 871 rank []float64 871 872 } 872 873 874 + // rank returns a vector of scores which is used at index-time to sort documents 875 + // before writing them to disk. The order of documents in the shard is important 876 + // at query time, because earlier documents receive a boost at query time and 877 + // have a higher chance of being searched before limits kick in. 873 878 func rank(d *zoekt.Document, origIdx int) []float64 { 874 879 generated := 0.0 875 880 if strings.HasSuffix(d.Name, "min.js") || strings.HasSuffix(d.Name, "js.map") { ··· 897 902 // Prefer docs that are not tests 898 903 test, 899 904 905 + // With short names 906 + squashRange(len(d.Name)), 907 + 900 908 // With many symbols 901 909 1.0 - squashRange(len(d.Symbols)), 902 910 903 911 // With short content 904 912 squashRange(len(d.Content)), 905 - 906 - // With short names 907 - squashRange(len(d.Name)), 908 913 909 914 // That is present in as many branches as possible 910 915 1.0 - squashRange(len(d.Branches)),

+15

build/e2e_test.go

··· 844 844 // 500 (word) + 400 (atom) + 10 (file order) 845 845 wantScore: 910, 846 846 }, 847 + { 848 + fileName: "src/net/http/client.go", 849 + content: []byte(` 850 + package http 851 + func Get() { 852 + panic("") 853 + } 854 + `), 855 + query: &query.And{Children: []query.Q{ 856 + &query.Symbol{Expr: &query.Substring{Pattern: "http", Content: true}}, 857 + &query.Symbol{Expr: &query.Substring{Pattern: "Get", Content: true}}}}, 858 + wantLanguage: "Go", 859 + // 7000 (full base match) + 500 (word) + 400 (atom) + 10 (file order) + 1 (repetition-boost) 860 + wantScore: 7911, 861 + }, 847 862 } 848 863 849 864 for _, c := range cases {

+1

contentprovider.go

··· 465 465 scoreShardRankFactor = 20.0 466 466 scoreFileOrderFactor = 10.0 467 467 scoreLineOrderFactor = 1.0 468 + scoreRepetitionFactor = 1.0 468 469 ) 469 470 470 471 // findSection checks whether a section defined by offset and size lies within

+8

eval.go

··· 343 343 } 344 344 345 345 maxFileScore := 0.0 346 + repetitions := 0 346 347 for i := range fileMatch.LineMatches { 347 348 if maxFileScore < fileMatch.LineMatches[i].Score { 348 349 maxFileScore = fileMatch.LineMatches[i].Score 350 + repetitions = 0 351 + } else if maxFileScore == fileMatch.LineMatches[i].Score { 352 + repetitions += 1 349 353 } 350 354 351 355 // Order by ordering in file. ··· 365 369 // strictly dominates the in-file ordering of 366 370 // the matches. 367 371 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 372 + 373 + // Prefer docs with several top-scored matches. 374 + fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 375 + 368 376 fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch, opts.DebugScore) 369 377 370 378 // Prefer earlier docs.

Configure Feed

Configure Feed