fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: standardize ctags kind names before scoring (#674)

SCIP ctags can output different kind names than universal-ctags (for example
`typeAlias` instead of `talias`). This change makes sure we handle different
names for the same kind. To do so, it refactors the logic so we first match
strings to standard kinds, then decide how these are scored for each language.
That way, you don't need to remember to cover all the possible kind names each
time you adjust scoring for a new language.

Also added basic tests for Ruby and Python to ensure we don't accidentally
change the scoring.

+424 -88
+64
build/e2e_test.go
··· 815 815 t.Fatal(err) 816 816 } 817 817 818 + examplePython, err := os.ReadFile("./testdata/example.py") 819 + if err != nil { 820 + t.Fatal(err) 821 + } 822 + 823 + exampleRuby, err := os.ReadFile("./testdata/example.rb") 824 + if err != nil { 825 + t.Fatal(err) 826 + } 827 + 818 828 exampleScala, err := os.ReadFile("./testdata/example.scala") 819 829 if err != nil { 820 830 t.Fatal(err) ··· 1086 1096 wantLanguage: "C++", 1087 1097 // 7000 (Symbol) + 600 (C++ union) + 500 (full word) + 10 (file order) 1088 1098 wantScore: 8110, 1099 + }, 1100 + // 1101 + // Python 1102 + // 1103 + { 1104 + fileName: "example.py", 1105 + content: examplePython, 1106 + query: &query.Substring{Content: true, Pattern: "C1"}, 1107 + wantLanguage: "Python", 1108 + // 7000 (symbol) + 1000 (Python class) + 500 (word) + 10 (file order) 1109 + wantScore: 8510, 1110 + }, 1111 + { 1112 + fileName: "example.py", 1113 + content: examplePython, 1114 + query: &query.Substring{Content: true, Pattern: "g"}, 1115 + wantLanguage: "Python", 1116 + // 7000 (symbol) + 800 (Python function) + 500 (word) + 10 (file order) 1117 + wantScore: 8310, 1118 + }, 1119 + { 1120 + fileName: "example.py", 1121 + content: examplePython, 1122 + query: &query.Substring{Content: true, Pattern: "__init__"}, 1123 + wantLanguage: "Python", 1124 + // 7000 (symbol) + 400 (Python member) + 50 (partial word) + 10 (file order) 1125 + wantScore: 7460, 1126 + }, 1127 + // 1128 + // Ruby 1129 + // 1130 + { 1131 + fileName: "example.rb", 1132 + content: exampleRuby, 1133 + query: &query.Substring{Content: true, Pattern: "Parental"}, 1134 + wantLanguage: "Ruby", 1135 + // 7000 (symbol) + 1000 (Ruby class) + 500 (word) + 10 (file order) 1136 + wantScore: 8510, 1137 + }, 1138 + { 1139 + fileName: "example.rb", 1140 + content: exampleRuby, 1141 + query: &query.Substring{Content: true, Pattern: "parental_func"}, 1142 + wantLanguage: "Ruby", 1143 + // 7000 (symbol) + 900 (Ruby method) + 500 (word) + 10 (file order) 1144 + wantScore: 
8410, 1145 + }, 1146 + { 1147 + fileName: "example.rb", 1148 + content: exampleRuby, 1149 + query: &query.Substring{Content: true, Pattern: "MyModule"}, 1150 + wantLanguage: "Ruby", 1151 + // 7000 (symbol) + 500 (Ruby module) + 500 (word) + 10 (file order) 1152 + wantScore: 8210, 1089 1153 }, 1090 1154 // 1091 1155 // Scala
+94
build/testdata/example.py
··· 1 + # v py.f def 2 + # v py.f.x def 3 + def f(x): 4 + 5 + # v py.f.g def 6 + def g(): 7 + y = 5 8 + 9 + if True: 10 + # v py.f.x ref 11 + y = x # < "y" py.f.y def 12 + else: 13 + l1 = 3 # < "l1" py.f.l1 def 14 + 15 + # v py.f.i def 16 + for i in range(10): 17 + # v py.f.i ref 18 + l2 = i # < "l2" py.f.l2 def 19 + 20 + while False: 21 + l3 = 3 # < "l3" py.f.l3 def 22 + 23 + try: 24 + l4 = 3 # < "l4" py.f.l4 def 25 + # v py.f.e def 26 + except Exception as e: 27 + l5 = 3 # < "l5" py.f.l5 def 28 + # v py.f.e ref 29 + _ = e 30 + 31 + # vvvv py.f.file def 32 + with open("file.txt") as file: 33 + # vvvv py.f.file fef 34 + print(file) 35 + 36 + # vvv py.f.lam def 37 + # vvv py.f.lam ref 38 + _ = lambda lam: lam 39 + 40 + # v py.f.y ref 41 + # vv py.f.l1 ref 42 + # vv py.f.l2 ref 43 + # vv py.f.l3 ref 44 + # vv py.f.l4 ref 45 + # vv py.f.l5 ref 46 + # v py.f.g ref 47 + _ = y + l1 + l2 + l3 + l4 + l5 + g() 48 + 49 + # vvv recursive.foo ref,nodef 50 + recursive = recursive.foo 51 + 52 + 53 + # vv py.C1 def 54 + class C1: 55 + x = 5 # < "x" py.C1.x def 56 + 57 + def __init__(self, y): 58 + # v py.C1.y def 59 + self.y = y 60 + 61 + def f(self): 62 + # v py.C1.x ref 63 + # v py.C1.g ref 64 + self.x = self.g() 65 + 66 + # v py.C1.g def 67 + def g(self): 68 + # v py.C1.y ref 69 + return self.y 70 + 71 + 72 + class C2(C1): 73 + y = C1() 74 + 75 + def f(self, c1: C1): 76 + c = c1 77 + # v py.C1.g ref 78 + # v py.C1.x ref 79 + return self.g() + c.x 80 + 81 + 82 + def newC1() -> C1: 83 + return C1() 84 + 85 + 86 + # v py.C1.x ref 87 + _ = newC1().x 88 + 89 + # v py.C1.x ref 90 + # v py.C1.x ref 91 + _ = C1().x + C2().y.x 92 + 93 + if False: 94 + f(3) # < "f" py.f ref
+77
build/testdata/example.rb
··· 1 + SOME_CONSTANT = 2.718 2 + 3 + if true 4 + a = 1 5 + elsif false 6 + b = 2 7 + else 8 + c = 3 9 + end 10 + 11 + (1..5).each do |counter| 12 + z = 3 13 + end 14 + 15 + for counter in 1..5 16 + y = 10 17 + end 18 + 19 + counter = 1 20 + while counter <= 5 do 21 + no = true 22 + counter += 1 23 + end 24 + 25 + begin 26 + raise NoMemoryError, 'Z.' 27 + rescue NoMemoryError => exception_variable 28 + puts 'A', exception_variable 29 + rescue RuntimeError => other_exception_variable 30 + puts 'K' 31 + else 32 + puts 'L' 33 + ensure 34 + puts 'O' 35 + end 36 + 37 + grade = 42 38 + case grade 39 + when 0.100 40 + shouldntgetcaptured = true 41 + puts 'you got a grade i guess' 42 + end 43 + 44 + module MyModule 45 + def self.abc(base) 46 + end 47 + 48 + class MyClass 49 + def yay 50 + end 51 + 52 + def self.woo(base) 53 + end 54 + end 55 + end 56 + 57 + class Foo 58 + attr_accessor :bar 59 + attr_reader :baz 60 + attr_writer :qux 61 + end 62 + 63 + class Aliased 64 + def bar 65 + end 66 + 67 + alias_method :baz, :bar 68 + end 69 + 70 + class Parental 71 + def parental_func() 72 + end 73 + end 74 + 75 + class Composed 76 + include Parental 77 + end
+79 -88
contentprovider.go
··· 25 25 "unicode" 26 26 "unicode/utf8" 27 27 28 + "github.com/sourcegraph/zoekt/ctags" 28 29 "golang.org/x/exp/slices" 29 30 ) 30 31 ··· 564 565 si = p.id.symbols.data(start + uint32(secIdx)) 565 566 } 566 567 if si != nil { 568 + symbolKind := ctags.ParseSymbolKind(si.Kind) 567 569 sym := sectionSlice(data, sec) 568 - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) 570 + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 569 571 } 570 572 } 571 573 ··· 647 649 } 648 650 if si != nil { 649 651 // the LineFragment may not be on a symbol, then si will be nil. 652 + symbolKind := ctags.ParseSymbolKind(si.Kind) 650 653 sym := sectionSlice(data, sec) 651 - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) 654 + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 652 655 } 653 656 } 654 657 ··· 678 681 return data[sec.Start:sec.End] 679 682 } 680 683 684 + 681 685 // scoreSymbolKind boosts a match based on the combination of language, symbol 682 686 // and kind. The language string comes from go-enry, the symbol and kind from 683 687 // ctags. 
684 - func scoreSymbolKind(language string, filename []byte, sym []byte, kind string) float64 { 688 + func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 685 689 var factor float64 686 690 687 691 // Generic ranking which will be overriden by language specific ranking 688 692 switch kind { 689 - case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 693 + case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 690 694 factor = 8 691 - case "class": 695 + case ctags.Class: 692 696 factor = 10 693 - case "struct": 697 + case ctags.Struct: 694 698 factor = 9.5 695 - case "enum": 699 + case ctags.Enum: 696 700 factor = 9 697 - case "interface": 701 + case ctags.Interface: 698 702 factor = 8 699 - case "function", "func", "method": 703 + case ctags.Function, ctags.Method: 700 704 factor = 7 701 - case "member", "field": 705 + case ctags.Field: 702 706 factor = 5.5 703 - case "constant", "const": 707 + case ctags.Constant: 704 708 factor = 5 705 - case "var", "variable": 709 + case ctags.Variable: 706 710 factor = 4 707 - 708 711 default: 709 - // No idea what it is, but its something regarded as a symbol 712 + // For all other kinds, assign a low score by default. 710 713 factor = 1 711 714 } 712 715 713 - // Refer to universal-ctags --list-kinds-full=<language> to learn about which 714 - // kinds are detected for which language. 715 - // 716 - // Note that go-ctags uses universal-ctags's interactive mode and thus returns 717 - // the full name for "kind" and not the one-letter abbreviation. 718 716 switch language { 719 717 case "Java", "java": 720 718 switch kind { 721 719 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 722 720 // to "classes" instead of "c". We have to cover both cases to support existing 723 721 // indexes. 
724 - case "class", "classes": 722 + case ctags.Class: 725 723 factor = 10 726 - case "enum": 724 + case ctags.Enum: 727 725 factor = 9 728 - case "interface": 726 + case ctags.Interface: 729 727 factor = 8 730 - case "method": 728 + case ctags.Method: 731 729 factor = 7 732 - case "field": 730 + case ctags.Field: 733 731 factor = 6 734 - case "enumConstant": 732 + case ctags.EnumConstant: 735 733 factor = 5 736 734 } 737 735 case "Kotlin", "kotlin": 738 736 switch kind { 739 - case "class": 737 + case ctags.Class: 740 738 factor = 10 741 - case "interface": 739 + case ctags.Interface: 742 740 factor = 9 743 - case "method": 741 + case ctags.Method: 744 742 factor = 8 745 - case "typealias": 743 + case ctags.TypeAlias: 746 744 factor = 7 747 - case "constant": 745 + case ctags.Constant: 748 746 factor = 6 749 - case "variable": 747 + case ctags.Variable: 750 748 factor = 5 751 749 } 752 750 case "Go", "go": 753 751 switch kind { 754 752 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 755 753 // for each case a description of the fields in ctags in the comment 756 - case "type": // interface struct talias 754 + case ctags.Type: // interface struct talias 757 755 factor = 9 758 - case "method", "function": // methodSpec func 759 - factor = 8 760 - case "variable": // var member 761 - factor = 7 762 - case "constant": // const 763 - factor = 6 764 - 765 - case "interface": // interfaces 756 + case ctags.Interface: // interfaces 766 757 factor = 10 767 - case "struct": // structs 758 + case ctags.Struct: // structs 768 759 factor = 9 769 - case "talias": // type aliases 760 + case ctags.TypeAlias: // type aliases 770 761 factor = 9 771 - case "methodSpec": // interface method specification 762 + case ctags.MethodSpec: // interface method specification 772 763 factor = 8.5 773 - case "func": // functions 764 + case ctags.Method, ctags.Function: // functions 774 765 factor = 8 775 - case "member": // struct members 766 + case 
ctags.Field: // struct fields 776 767 factor = 7 777 - case "const": // constants 768 + case ctags.Constant: // constants 778 769 factor = 6 779 - case "var": // variables 770 + case ctags.Variable: // variables 780 771 factor = 5 781 772 } 782 773 ··· 799 790 // - unknown unknown 800 791 case "C++", "c++": 801 792 switch kind { 802 - case "class": // classes 793 + case ctags.Class: // classes 803 794 factor = 10 804 - case "enum": // enumeration names 795 + case ctags.Enum: // enumeration names 805 796 factor = 9 806 - case "function": // function definitions 797 + case ctags.Function: // function definitions 807 798 factor = 8 808 - case "struct": // structure names 799 + case ctags.Struct: // structure names 809 800 factor = 7 810 - case "union": // union names 801 + case ctags.Union: // union names 811 802 factor = 6 812 - case "typdef": // typedefs 803 + case ctags.TypeAlias: // typedefs 813 804 factor = 5 814 - case "member": // class, struct, and union members 805 + case ctags.Field: // class, struct, and union members 815 806 factor = 4 816 - case "variable": // varialbe definitions 807 + case ctags.Variable: // varialbe definitions 817 808 factor = 3 818 809 } 819 810 // Could also rank on: ··· 825 816 // variable variable definitions 826 817 case "Scala", "scala": 827 818 switch kind { 828 - case "class": 819 + case ctags.Class: 829 820 factor = 10 830 - case "interface": 821 + case ctags.Interface: 831 822 factor = 9 832 - case "object": 823 + case ctags.Object: 833 824 factor = 8 834 - case "method": 825 + case ctags.Function: 835 826 factor = 7 836 - case "type": 827 + case ctags.Type: 837 828 factor = 6 838 - case "variable": 829 + case ctags.Variable: 839 830 factor = 5 840 - case "package": 831 + case ctags.Package: 841 832 factor = 4 842 833 } 843 834 case "Python", "python": 844 835 switch kind { 845 - case "class": // classes 836 + case ctags.Class: // classes 846 837 factor = 10 847 - case "function": // function definitions 838 + case 
ctags.Function: // function definitions 848 839 factor = 8 849 - case "member": // class, struct, and union members 840 + case ctags.Field: // class, struct, and union members 850 841 factor = 4 851 - case "variable": // variable definitions 842 + case ctags.Variable: // variable definitions 852 843 factor = 3 853 - case "local": // local variables 844 + case ctags.Local: // local variables 854 845 factor = 2 855 846 } 856 847 // Could also rank on: ··· 861 852 // - parameter function parameters 862 853 case "Ruby", "ruby": 863 854 switch kind { 864 - case "class": 855 + case ctags.Class: 865 856 factor = 10 866 - case "method": 857 + case ctags.Method: 867 858 factor = 9 868 - case "alias": 859 + case ctags.MethodAlias: 869 860 factor = 8 870 - case "module": 861 + case ctags.Module: 871 862 factor = 7 872 - case "singletonMethod": 863 + case ctags.SingletonMethod: 873 864 factor = 6 874 - case "constant": 865 + case ctags.Constant: 875 866 factor = 5 876 - case "accessor": 867 + case ctags.Accessor: 877 868 factor = 4 878 - case "library": 869 + case ctags.Library: 879 870 factor = 3 880 871 } 881 872 case "PHP", "php": 882 873 switch kind { 883 - case "class": 874 + case ctags.Class: 884 875 factor = 10 885 - case "interface": 876 + case ctags.Interface: 886 877 factor = 9 887 - case "function": 878 + case ctags.Function: 888 879 factor = 8 889 - case "trait": 880 + case ctags.Trait: 890 881 factor = 7 891 - case "define": 882 + case ctags.Define: 892 883 factor = 6 893 - case "namespace": 884 + case ctags.Namespace: 894 885 factor = 5 895 - case "alias": 886 + case ctags.MethodAlias: 896 887 factor = 4 897 - case "variable": 888 + case ctags.Variable: 898 889 factor = 3 899 - case "local": 890 + case ctags.Local: 900 891 factor = 3 901 892 } 902 893 case "GraphQL", "graphql": 903 894 switch kind { 904 - case "type": 895 + case ctags.Type: 905 896 factor = 10 906 897 } 907 898 case "Markdown", "markdown": 908 899 // Headers are good signal in docs, but do not 
rank as highly as code. 909 900 switch kind { 910 - case "chapter": // # 901 + case ctags.Chapter: // # 911 902 factor = 4 912 - case "section": // ## 903 + case ctags.Section: // ## 913 904 factor = 3 914 - case "subsection": // ### 905 + case ctags.Subsection: // ### 915 906 factor = 2 916 907 } 917 908 }
+110
ctags/symbol_kind.go
package ctags

import "strings"

// SymbolKind is a normalized symbol kind. It is the result of mapping the
// kind string reported by a ctags implementation onto a single set of
// constants via ParseSymbolKind.
type SymbolKind uint8

// The normalized symbol kinds, in alphabetical order.
//
// Note that the zero value of SymbolKind is Accessor, not Other. Other is the
// value ParseSymbolKind returns for kind strings it does not recognize.
const (
	Accessor SymbolKind = iota
	Chapter
	Class
	Constant
	Define
	Enum
	EnumConstant
	Field
	Function
	Interface
	Library
	Local
	Method
	MethodAlias
	MethodSpec
	Module
	Namespace
	Object
	Other
	Package
	Section
	SingletonMethod
	Struct
	Subsection
	Trait
	Type
	TypeAlias
	Union
	Variable
)

// ParseSymbolKind maps the output from different ctags implementations into a
// single set of constants. This is important because universal-ctags and SCIP
// ctags can return different names for the same kind.
//
// To get a sense for which kinds are detected for which language, you can
// refer to universal-ctags --list-kinds-full=<language>.
//
// Note that go-ctags uses universal-ctags's interactive mode and thus returns
// the full name for "kind" and not the one-letter abbreviation.
48 + func ParseSymbolKind(kind string) SymbolKind { 49 + kind = strings.ToLower(kind) 50 + // Generic ranking which will be overriden by language specific ranking 51 + switch kind { 52 + case "accessor", "setter", "getter": // SCIP ctags distinguishes these, but universal-ctags does not 53 + return Accessor 54 + case "chapter": 55 + return Chapter 56 + case "class", "classes": 57 + return Class 58 + case "constant", "const": 59 + return Constant 60 + case "define": 61 + return Define 62 + case "enum": 63 + return Enum 64 + case "enumerator", "enumconstant", "enummember": 65 + return EnumConstant 66 + case "field", "member": 67 + return Field 68 + case "function", "func": 69 + return Function 70 + case "interface": 71 + return Interface 72 + case "local": 73 + return Local 74 + case "method": 75 + return Method 76 + case "methodAlias", "alias": 77 + return MethodAlias 78 + case "methodSpec": 79 + return MethodSpec 80 + case "module": 81 + return Module 82 + case "namespace": 83 + return Namespace 84 + case "object": 85 + return Object 86 + case "package": 87 + return Package 88 + case "section": 89 + return Section 90 + case "singletonmethod": 91 + return SingletonMethod 92 + case "struct": 93 + return Struct 94 + case "subsection": 95 + return Subsection 96 + case "trait": 97 + return Trait 98 + case "type": 99 + return Type 100 + case "typealias", "talias", "typdef": 101 + return TypeAlias 102 + case "union": 103 + return Union 104 + case "var", "variable": 105 + return Variable 106 + default: 107 + return Other 108 + } 109 + } 110 +