fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: handle files with missing ranks (#555)

Even when a repo has ranking data, certain files will not have ranks, such as
Markdown or YAML files. Currently these files get rank 0, which puts them at a big
disadvantage and means they're usually ranked last.

This PR proposes to use the mean repo rank instead of 0. The rules:
* If we have a concrete rank for the file, always use it
* If there's no rank, and it's a low-priority file (a test, vendored, or generated file), then use rank 0
* Otherwise use the mean rank for the repository

We don't attempt to handle repos that lack precise code intel and are
therefore missing ranks entirely.

+144 -7
+21 -3
build/builder.go
··· 898 898 return x / (1 + x) 899 899 } 900 900 901 - var testRe = regexp.MustCompile("test") 901 + // IsLowPriority takes a file name and makes an educated guess about its priority 902 + // in search results. A file is considered low priority if it looks like a test, 903 + // vendored, or generated file. 904 + // 905 + // These 'priority' criteria affects how documents are ordered within a shard. It's 906 + // also used to help guess a file's rank when we're missing ranking information. 907 + func IsLowPriority(file string) bool { 908 + return testRe.MatchString(file) || isGenerated(file) || isVendored(file) 909 + } 910 + 911 + var testRe = regexp.MustCompile("[Tt]est") 912 + 913 + func isGenerated(file string) bool { 914 + return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") 915 + } 916 + 917 + func isVendored(file string) bool { 918 + return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") 919 + } 902 920 903 921 type rankedDoc struct { 904 922 *zoekt.Document ··· 911 929 // have a higher chance of being searched before limits kick in. 912 930 func rank(d *zoekt.Document, origIdx int) []float64 { 913 931 generated := 0.0 914 - if strings.HasSuffix(d.Name, "min.js") || strings.HasSuffix(d.Name, "js.map") { 932 + if isGenerated(d.Name) { 915 933 generated = 1.0 916 934 } 917 935 918 936 vendor := 0.0 919 - if strings.Contains(d.Name, "vendor/") || strings.Contains(d.Name, "node_modules/") { 937 + if isVendored(d.Name) { 920 938 vendor = 1.0 921 939 } 922 940
+34
build/builder_test.go
··· 757 757 } 758 758 } 759 759 760 + func TestIsLowPriority(t *testing.T) { 761 + cases := []string{ 762 + "builder_test.go", 763 + "TestQuery.java", 764 + "test/mocks.go", 765 + "search/vendor/thirdparty.cc", 766 + "search/node_modules/search/index.js", 767 + "search.min.js", 768 + "internal/search.js.map", 769 + } 770 + 771 + for _, tt := range cases { 772 + t.Run(tt, func(t *testing.T) { 773 + if !IsLowPriority(tt) { 774 + t.Errorf("expected file '%s' to be low priority", tt) 775 + } 776 + }) 777 + } 778 + 779 + negativeCases := []string{ 780 + "builder.go", 781 + "RoutesTrigger.java", 782 + "search.js", 783 + } 784 + 785 + for _, tt := range negativeCases { 786 + t.Run(tt, func(t *testing.T) { 787 + if IsLowPriority(tt) { 788 + t.Errorf("did not expect file '%s' to be low priority", tt) 789 + } 790 + }) 791 + } 792 + } 793 + 760 794 func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { 761 795 t.Helper() 762 796
+32 -4
gitindex/index.go
··· 495 495 } 496 496 497 497 var ranks repoPathRanks 498 + var meanRank float64 498 499 if opts.BuildOptions.DocumentRanksPath != "" { 499 500 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 500 501 if err != nil { ··· 504 505 err = json.Unmarshal(data, &ranks) 505 506 if err != nil { 506 507 return err 508 + } 509 + 510 + // Compute the mean rank for this repository. Note: we overwrite the rank 511 + // mean that's stored in the document ranks file, since that currently 512 + // represents a global mean rank across repos, which is not what we want. 513 + numRanks := len(ranks.Paths) 514 + if numRanks > 0 { 515 + for _, rank := range ranks.Paths { 516 + meanRank += rank 517 + } 518 + ranks.MeanRank = meanRank / float64(numRanks) 507 519 } 508 520 } 509 521 ··· 555 567 return err 556 568 } 557 569 558 - var pathRank []float64 559 - if rank, ok := ranks.Paths[keyFullPath]; ok { 560 - pathRank = []float64{rank} 570 + var pathRanks []float64 571 + if len(ranks.Paths) > 0 { 572 + // If the repository has ranking data, then store the file's rank. 573 + pathRank := ranks.rank(keyFullPath) 574 + pathRanks = []float64{pathRank} 561 575 } 562 576 563 577 if err := builder.Add(zoekt.Document{ ··· 565 579 Name: keyFullPath, 566 580 Content: contents, 567 581 Branches: brs, 568 - Ranks: pathRank, 582 + Ranks: pathRanks, 569 583 }); err != nil { 570 584 return fmt.Errorf("error adding document with name %s: %w", keyFullPath, err) 571 585 } ··· 578 592 type repoPathRanks struct { 579 593 MeanRank float64 `json:"mean_reference_count"` 580 594 Paths map[string]float64 `json:"paths"` 595 + } 596 + 597 + // rank returns the rank for a given path. 
It uses these rules: 598 + // - If we have a concrete rank for this file, always use it 599 + // - If there's no rank, and it's a low priority file like a test, then use rank 0 600 + // - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 601 + func (r repoPathRanks) rank(path string) float64 { 602 + if rank, ok := r.Paths[path]; ok { 603 + return rank 604 + } else if build.IsLowPriority(path) { 605 + return 0.0 606 + } else { 607 + return r.MeanRank 608 + } 581 609 } 582 610 583 611 func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
+57
gitindex/index_test.go
··· 695 695 } 696 696 } 697 697 698 + func TestRepoPathRanks(t *testing.T) { 699 + pathRanks := repoPathRanks{ 700 + Paths: map[string]float64{ 701 + "search.go": 10.23, 702 + "internal/index.go": 5.5, 703 + "internal/scratch.go": 0.0, 704 + "backend/search_test.go": 2.1, 705 + }, 706 + MeanRank: 3.3, 707 + } 708 + cases := []struct { 709 + name string 710 + path string 711 + rank float64 712 + }{ 713 + { 714 + name: "rank for standard file", 715 + path: "search.go", 716 + rank: 10.23, 717 + }, 718 + { 719 + name: "file with rank 0", 720 + path: "internal/scratch.go", 721 + rank: 0.0, 722 + }, 723 + { 724 + name: "rank for test file", 725 + path: "backend/search_test.go", 726 + rank: 2.1, 727 + }, 728 + { 729 + name: "file with missing rank", 730 + path: "internal/docs.md", 731 + rank: 3.3, 732 + }, 733 + { 734 + name: "test file with missing rank", 735 + path: "backend/index_test.go", 736 + rank: 0.0, 737 + }, 738 + { 739 + name: "third-party file with missing rank", 740 + path: "node_modules/search/index.js", 741 + rank: 0.0, 742 + }, 743 + } 744 + 745 + for _, tt := range cases { 746 + t.Run(tt.name, func(t *testing.T) { 747 + got := pathRanks.rank(tt.path) 748 + if got != tt.rank { 749 + t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got) 750 + } 751 + }) 752 + } 753 + } 754 + 698 755 func runScript(t *testing.T, cwd string, script string) { 699 756 err := os.MkdirAll(cwd, 0755) 700 757 if err != nil {