fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

build: use enry to detect low priority files (#829)

This is a much more robust detection mechanism. Additionally, we have
these signals that we could also add in:

func IsConfiguration(path string) bool
func IsDocumentation(path string) bool
func IsDotFile(path string) bool
func IsImage(path string) bool

My main concern with this change is that content-based generated-file
detection may use up RAM or CPU. I will monitor the impact via pprof in production.

Test Plan: go test.

+41 -41
+6 -16
build/builder.go
··· 37 37 "time" 38 38 39 39 "github.com/bmatcuk/doublestar" 40 - "github.com/grafana/regexp" 40 + "github.com/go-enry/go-enry/v2" 41 41 "github.com/rs/xid" 42 42 43 43 "github.com/sourcegraph/zoekt" ··· 901 901 // 902 902 // These 'priority' criteria affects how documents are ordered within a shard. It's 903 903 // also used to help guess a file's rank when we're missing ranking information. 904 - func IsLowPriority(file string) bool { 905 - return testRe.MatchString(file) || isGenerated(file) || isVendored(file) 906 - } 907 - 908 - var testRe = regexp.MustCompile("[Tt]est") 909 - 910 - func isGenerated(file string) bool { 911 - return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") 912 - } 913 - 914 - func isVendored(file string) bool { 915 - return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") 904 + func IsLowPriority(path string, content []byte) bool { 905 + return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 916 906 } 917 907 918 908 type rankedDoc struct { ··· 931 921 } 932 922 933 923 generated := 0.0 934 - if isGenerated(d.Name) { 924 + if enry.IsGenerated(d.Name, d.Content) { 935 925 generated = 1.0 936 926 } 937 927 938 928 vendor := 0.0 939 - if isVendored(d.Name) { 929 + if enry.IsVendor(d.Name) { 940 930 vendor = 1.0 941 931 } 942 932 943 933 test := 0.0 944 - if testRe.MatchString(d.Name) { 934 + if enry.IsTest(d.Name) { 945 935 test = 1.0 946 936 } 947 937
+14 -4
build/builder_test.go
··· 764 764 func TestIsLowPriority(t *testing.T) { 765 765 cases := []string{ 766 766 "builder_test.go", 767 - "TestQuery.java", 768 - "test/mocks.go", 767 + "test/TestQuery.java", 769 768 "search/vendor/thirdparty.cc", 770 769 "search/node_modules/search/index.js", 771 770 "search.min.js", ··· 774 773 775 774 for _, tt := range cases { 776 775 t.Run(tt, func(t *testing.T) { 777 - if !IsLowPriority(tt) { 776 + if !IsLowPriority(tt, nil) { 778 777 t.Errorf("expected file '%s' to be low priority", tt) 779 778 } 780 779 }) ··· 788 787 789 788 for _, tt := range negativeCases { 790 789 t.Run(tt, func(t *testing.T) { 791 - if IsLowPriority(tt) { 790 + if IsLowPriority(tt, nil) { 792 791 t.Errorf("did not expect file '%s' to be low priority", tt) 793 792 } 794 793 }) 794 + } 795 + 796 + // Explicitly check that content is important by using the same filename but 797 + // different content. 798 + normal := "package mock\n\nvar Mock struct {}" 799 + generated := "// Code generated by mock\npackage mock\n\nvar Mock struct {}" 800 + if IsLowPriority("mock.go", []byte(normal)) { 801 + t.Error("expected non-generated content to not be low priority") 802 + } 803 + if !IsLowPriority("mock.go", []byte(generated)) { 804 + t.Error("expected generated content to be low priority") 795 805 } 796 806 } 797 807
+1 -1
build/e2e_test.go
··· 502 502 name: "test", 503 503 docs: []*zoekt.Document{ 504 504 { 505 - Name: "test", 505 + Name: "foo_test.go", 506 506 Content: []byte("bla"), 507 507 }, 508 508 {
+3 -3
gitindex/index.go
··· 589 589 // - If we have a concrete rank for this file, always use it 590 590 // - If there's no rank, and it's a low priority file like a test, then use rank 0 591 591 // - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage 592 - func (r repoPathRanks) rank(path string) float64 { 592 + func (r repoPathRanks) rank(path string, content []byte) float64 { 593 593 if rank, ok := r.Paths[path]; ok { 594 594 return rank 595 - } else if build.IsLowPriority(path) { 595 + } else if build.IsLowPriority(path, content) { 596 596 return 0.0 597 597 } else { 598 598 return r.MeanRank ··· 910 910 var pathRanks []float64 911 911 if len(ranks.Paths) > 0 { 912 912 // If the repository has ranking data, then store the file's rank. 913 - pathRank := ranks.rank(keyFullPath) 913 + pathRank := ranks.rank(keyFullPath, contents) 914 914 pathRanks = []float64{pathRank} 915 915 } 916 916
+1 -1
gitindex/index_test.go
··· 744 744 745 745 for _, tt := range cases { 746 746 t.Run(tt.name, func(t *testing.T) { 747 - got := pathRanks.rank(tt.path) 747 + got := pathRanks.rank(tt.path, nil) 748 748 if got != tt.rank { 749 749 t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got) 750 750 }
+6 -6
internal/e2e/testdata/coverage_data_writer.txt
··· 14 14 32:// as output the coverage data broken down by function, like this: 15 15 hidden 8 more line matches 16 16 17 + github.com/golang/go/src/testing/fuzz.go 18 + 93: Data []byte 19 + 205:// modify the underlying data of the arguments provided by the fuzzing engine. 20 + 275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) { 21 + hidden 7 more line matches 22 + 17 23 github.com/golang/go/src/cmd/cover/html.go 18 24 199: Coverage float64 19 25 170:type templateData struct { ··· 25 31 487:func corpusEntryData(ce CorpusEntry) ([]byte, error) { 26 32 908:func (c *coordinator) updateCoverage(newCoverage []byte) int { 27 33 hidden 91 more line matches 28 - 29 - github.com/golang/go/src/testing/fuzz.go 30 - 93: Data []byte 31 - 205:// modify the underlying data of the arguments provided by the fuzzing engine. 32 - 275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) { 33 - hidden 7 more line matches 34 34 35 35 github.com/golang/go/src/cmd/vendor/golang.org/x/sys/unix/ztypes_linux.go 36 36 227: Data [7]byte
+5 -5
internal/e2e/testdata/test_server.txt
··· 32 32 741:func (s *Server) serverContext() (context.Context, context.CancelFunc) { 33 33 hidden 166 more line matches 34 34 35 - github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/main.go 36 - 150:type Server struct { 37 - 1232:func startServer(conf rootConfig) error { 38 - 1309:func newServer(conf rootConfig) (*Server, error) { 39 - hidden 52 more line matches 35 + github.com/sourcegraph/sourcegraph-public-snapshot/cmd/frontend/graphqlbackend/testing.go 36 + 46:type Test struct { 37 + 79:func RunTest(t *testing.T, test *Test) { 38 + 58:func RunTests(t *testing.T, tests []*Test) { 39 + hidden 27 more line matches 40 40 41 41 hidden 494 more file matches
+5 -5
internal/e2e/testdata/time_compare.txt
··· 29 29 1381: re := Compare(x.re, token.EQL, y.re) 30 30 hidden 1 more line matches 31 31 32 - github.com/golang/go/src/syscall/zsyscall_windows.go 33 - 878:func GetSystemTimeAsFileTime(time *Filetime) { 34 - 1088:func SetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) { 35 - 132: procGetSystemTimeAsFileTime = modkernel32.NewProc("GetSystemTimeAsFileTime") 36 - hidden 19 more line matches 32 + github.com/golang/go/src/cmd/go/internal/gover/gover.go 33 + 36:func Compare(x, y string) int { 34 + 20:// but at the time this code was written, there was an existing test that used 35 + 49: if c := cmp.Compare(vx.kind, vy.kind); c != 0 { // "" < alpha < beta < rc 36 + hidden 4 more line matches 37 37 38 38 hidden 139 more file matches