fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

ranking: support offline ranking (#441)

Most of the new code is behind a feature flag.

To test, set `OFFLINE_RANKING_REPOS_ALLOW_LIST=<comma-separated list of repos>` for zoekt-sourcegraph-indexserver.

Design:
- indexserver pops a repo from the index queue
- if the repo is on the allow list, indexserver downloads the document scores and stores them as JSON in the gitDir of the repo to be indexed
- The flag `-offline-ranking` tells `zoekt-git-index` to read the new file and sort the documents accordingly.

Becauses `sourcegraphFake` implements the new methods I added to the interface `Sourcegraph`, everything can be tested by just running Zoekt.

author
Stefan Hengl
committer
GitHub
date (Oct 14, 2022, 10:03 AM +0200) commit e60e03b2 parent e091ee33
+403 -11
+41 -1
build/builder.go
··· 100 100 // last run. 101 101 IsDelta bool 102 102 103 + // DocumentRanksPath is the path to the file with document ranks. If empty, 104 + // ranks will be computed on-the-fly. 105 + DocumentRanksPath string 106 + 103 107 // changedOrRemovedFiles is a list of file paths that have been changed or removed 104 108 // since the last indexing job for this repository. These files will be tombstoned 105 109 // in the older shards for this repository. ··· 942 946 } 943 947 } 944 948 949 + const epsilon = 0.00000001 950 + 951 + // sortDocuments2 sorts []*zoekt.Document according to their Ranks. In general, 952 + // documents can have a nil rank vector if the document to be indexed was added 953 + // after the ranking took place. A nil rank vector translates to the lowest 954 + // possible rank. Longer vectors are more important than shorter vectors, given 955 + // all other ranks are equal. 956 + // 957 + // Note: the logic here is inverted to sortDocuments because rank in 958 + // sortDocuments returns a vector of the form [1-rank, ...]. 959 + func sortDocuments2(rs []*zoekt.Document) { 960 + sort.Slice(rs, func(i, j int) bool { 961 + r1 := rs[i].Ranks 962 + r2 := rs[j].Ranks 963 + 964 + l := len(r1) 965 + if len(r2) < l { 966 + l = len(r2) 967 + } 968 + for i := 0; i < l; i++ { 969 + if math.Abs(r1[i]-r2[i]) > epsilon { 970 + return r1[i] > r2[i] 971 + } 972 + } 973 + // if r1 has more entries it is more important. ie imagine right padding shorter 974 + // arrays with zeros, so they are the same length. 975 + return len(r1) > len(r2) 976 + }) 977 + } 978 + 945 979 func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 946 980 if !b.opts.DisableCTags && b.opts.CTagsPath != "" { 947 981 err := ctagsAddSymbols(todo, b.parser, b.opts.CTagsPath) ··· 959 993 if err != nil { 960 994 return nil, err 961 995 } 962 - sortDocuments(todo) 996 + 997 + if b.opts.DocumentRanksPath != "" { 998 + sortDocuments2(todo) 999 + } else { 1000 + sortDocuments(todo) 1001 + } 1002 + 963 1003 for _, t := range todo { 964 1004 if err := shardBuilder.Add(*t); err != nil { 965 1005 return nil, err
+104
build/builder_test.go
··· 14 14 15 15 "github.com/google/go-cmp/cmp" 16 16 "github.com/google/go-cmp/cmp/cmpopts" 17 + 17 18 "github.com/sourcegraph/zoekt" 18 19 ) 19 20 ··· 844 845 t.Fatal(err) 845 846 } 846 847 } 848 + 849 + func Test_sortDocuments2(t *testing.T) { 850 + tests := []struct { 851 + name string 852 + in []*zoekt.Document 853 + want []string 854 + }{ 855 + { 856 + name: "same length", 857 + in: []*zoekt.Document{ 858 + { 859 + Name: "a", 860 + Ranks: []float64{0, 0, 0}, 861 + }, 862 + { 863 + Name: "b", 864 + Ranks: []float64{1, 1, 1}, 865 + }, 866 + { 867 + Name: "c", 868 + Ranks: []float64{1, 0, 1}, 869 + }, 870 + }, 871 + want: []string{"b", "c", "a"}, 872 + }, 873 + { 874 + name: "1 nil", 875 + in: []*zoekt.Document{ 876 + { 877 + Name: "a", 878 + Ranks: []float64{1, 1, 0}, 879 + }, 880 + { 881 + Name: "b", 882 + }, 883 + { 884 + Name: "c", 885 + Ranks: []float64{1, 1, 1}, 886 + }, 887 + }, 888 + want: []string{"c", "a", "b"}, 889 + }, 890 + { 891 + name: "different lengths", 892 + in: []*zoekt.Document{ 893 + { 894 + Name: "a", 895 + Ranks: []float64{0}, 896 + }, 897 + { 898 + Name: "b", 899 + Ranks: []float64{0, 0}, 900 + }, 901 + { 902 + Name: "c", 903 + Ranks: []float64{0, 0, 0}, 904 + }, 905 + }, 906 + want: []string{"c", "b", "a"}, 907 + }, 908 + { 909 + name: "different lengths and nil", 910 + in: []*zoekt.Document{ 911 + { 912 + Name: "a", 913 + Ranks: []float64{0}, 914 + }, 915 + { 916 + Name: "b", 917 + Ranks: []float64{0, 0}, 918 + }, 919 + { 920 + Name: "c", 921 + }, 922 + }, 923 + want: []string{"b", "a", "c"}, 924 + }, 925 + } 926 + 927 + for _, tt := range tests { 928 + t.Run(tt.name, func(t *testing.T) { 929 + sortDocuments2(tt.in) 930 + 931 + for i, name := range tt.want { 932 + if tt.in[i].Name != name { 933 + var got []string 934 + for _, d := range tt.in { 935 + got = append(got, d.Name) 936 + } 937 + t.Fatalf("want %+v, got %+v\n", tt.want, got) 938 + } 939 + } 940 + }) 941 + } 942 + 943 + t.Run("test for panics", func(t *testing.T) { 944 + // Special case: test for panics if all documents have nil rank vectors. 945 + sortDocuments2([]*zoekt.Document{{}, {}}) 946 + sortDocuments2([]*zoekt.Document{{}}) 947 + sortDocuments2(nil) 948 + }) 949 + 950 + }
+4 -1
cmd/zoekt-git-index/main.go
··· 22 22 "runtime/pprof" 23 23 "strings" 24 24 25 + "go.uber.org/automaxprocs/maxprocs" 26 + 25 27 "github.com/sourcegraph/zoekt/cmd" 26 28 "github.com/sourcegraph/zoekt/gitindex" 27 - "go.uber.org/automaxprocs/maxprocs" 28 29 ) 29 30 30 31 func run() int { ··· 41 42 "It also affects name if the indexed repository is under this directory.") 42 43 isDelta := flag.Bool("delta", false, "whether we should use delta build") 43 44 deltaShardNumberFallbackThreshold := flag.Uint64("delta_threshold", 0, "upper limit on the number of preexisting shards that can exist before attempting a delta build (0 to disable fallback behavior)") 45 + offlineRanking := flag.String("offline_ranking", "", "the name of the file that contains the ranking info.") 44 46 flag.Parse() 45 47 46 48 // Tune GOMAXPROCS to match Linux container CPU quota. ··· 67 69 } 68 70 opts := cmd.OptionsFromFlags() 69 71 opts.IsDelta = *isDelta 72 + opts.DocumentRanksPath = *offlineRanking 70 73 71 74 var branches []string 72 75 if *branchesStr != "" {
+26 -1
cmd/zoekt-sourcegraph-indexserver/index.go
··· 4 4 "bytes" 5 5 "context" 6 6 "crypto/sha1" 7 + "encoding/json" 7 8 "errors" 8 9 "fmt" 9 10 "io" ··· 83 84 // only be true for repositories we explicitly enable. 84 85 UseDelta bool 85 86 87 + UseOfflineRanking bool 88 + 86 89 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 87 90 // before attempting a delta build. 88 91 DeltaShardNumberFallbackThreshold uint64 ··· 151 154 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, ok bool, err error) 152 155 } 153 156 154 - func gitIndex(c gitIndexConfig, o *indexArgs, l sglog.Logger) error { 157 + func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 155 158 logger := l.Scoped("gitIndex", "fetch commits and then run zoekt-git-index against contents") 156 159 157 160 if len(o.Branches) == 0 { ··· 319 322 320 323 args := []string{ 321 324 "-submodules=false", 325 + } 326 + 327 + // We store the document ranks as JSON in gitDir and tell zoekt-git-index where 328 + // to find the file. 329 + if o.UseOfflineRanking { 330 + documentsRankFile := filepath.Join(gitDir, "documents.rank") 331 + 332 + args = append(args, "-offline_ranking", documentsRankFile) 333 + 334 + r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name) 335 + if err != nil { 336 + return fmt.Errorf("GetDocumentRanks: %w", err) 337 + } 338 + 339 + b, err := json.Marshal(r) 340 + if err != nil { 341 + return err 342 + } 343 + 344 + if err := os.WriteFile(documentsRankFile, b, 0600); err != nil { 345 + return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err) 346 + } 322 347 } 323 348 324 349 // Even though we check for incremental in this process, we still pass it
+3 -2
cmd/zoekt-sourcegraph-indexserver/index_test.go
··· 2 2 3 3 import ( 4 4 "fmt" 5 - "github.com/sourcegraph/log/logtest" 6 5 "net/http" 7 6 "net/http/httptest" 8 7 "net/url" ··· 12 11 "reflect" 13 12 "strings" 14 13 "testing" 14 + 15 + "github.com/sourcegraph/log/logtest" 15 16 16 17 "github.com/google/go-cmp/cmp" 17 18 "github.com/google/go-cmp/cmp/cmpopts" ··· 271 272 findRepositoryMetadata: findRepositoryMetadata, 272 273 } 273 274 274 - if err := gitIndex(c, &tc.args, logtest.Scoped(t)); err != nil { 275 + if err := gitIndex(c, &tc.args, sourcegraphNop{}, logtest.Scoped(t)); err != nil { 275 276 t.Fatal(err) 276 277 } 277 278 if !cmp.Equal(got, tc.want) {
+14 -1
cmd/zoekt-sourcegraph-indexserver/main.go
··· 187 187 // repositoriesSkipSymbolsCalculationAllowList is an allowlist for repositories that 188 188 // we skip calculating symbols metadata for during builds 189 189 repositoriesSkipSymbolsCalculationAllowList map[string]struct{} 190 + 191 + offlineRankingAllowList map[string]struct{} 190 192 } 191 193 192 194 var debug = log.New(io.Discard, "", log.LstdFlags) ··· 538 540 args.UseDelta = true 539 541 } 540 542 543 + if _, ok := s.offlineRankingAllowList[repositoryName]; ok { 544 + tr.LazyPrintf("marking this repository for offline ranking") 545 + args.UseOfflineRanking = true 546 + } 547 + 541 548 args.DeltaShardNumberFallbackThreshold = s.deltaShardNumberFallbackThreshold 542 549 543 550 if _, ok := s.repositoriesSkipSymbolsCalculationAllowList[repositoryName]; ok { ··· 586 593 }, 587 594 } 588 595 589 - return indexStateSuccess, gitIndex(c, args, s.logger) 596 + return indexStateSuccess, gitIndex(c, args, s.Sourcegraph, s.logger) 590 597 } 591 598 592 599 func (s *Server) indexArgs(opts IndexOptions) *indexArgs { ··· 1124 1131 debug.Printf("using delta shard builds for: %s", joinStringSet(deltaBuildRepositoriesAllowList, ", ")) 1125 1132 } 1126 1133 1134 + offlineRankingAllowList := getEnvWithDefaultEmptySet("OFFLINE_RANKING_REPOS_ALLOWLIST") 1135 + if len(offlineRankingAllowList) > 0 { 1136 + debug.Printf("using offline ranking for: %s", joinStringSet(offlineRankingAllowList, ", ")) 1137 + } 1138 + 1127 1139 deltaShardNumberFallbackThreshold := getEnvWithDefaultUint64("DELTA_SHARD_NUMBER_FALLBACK_THRESHOLD", 150) 1128 1140 if deltaShardNumberFallbackThreshold > 0 { 1129 1141 debug.Printf("setting delta shard fallback threshold to %d shard(s)", deltaShardNumberFallbackThreshold) ··· 1180 1192 deltaBuildRepositoriesAllowList: deltaBuildRepositoriesAllowList, 1181 1193 deltaShardNumberFallbackThreshold: deltaShardNumberFallbackThreshold, 1182 1194 repositoriesSkipSymbolsCalculationAllowList: reposShouldSkipSymbolsCalculation, 1195 + offlineRankingAllowList: offlineRankingAllowList, 1183 1196 }, err 1184 1197 } 1185 1198
+171
cmd/zoekt-sourcegraph-indexserver/sg.go
··· 1 1 package main 2 2 3 3 import ( 4 + "bufio" 4 5 "bytes" 5 6 "context" 6 7 "encoding/json" ··· 68 69 // is the forced version of IterateIndexOptions, so will always calculate 69 70 // options for each id in repos. 70 71 ForceIterateIndexOptions(onSuccess func(IndexOptions), onError func(uint32, error), repos ...uint32) 72 + 73 + // GetRepoRank returns a rank vector for the given repository. Repositories are 74 + // assumed to be ordered by each pairwise component of the resulting vector, 75 + // higher ranks coming earlier. 76 + GetRepoRank(ctx context.Context, repoName string) ([]float64, error) 77 + 78 + // GetDocumentRanks returns a map from paths within the given repo to their 79 + // rank vectors. Paths are assumed to be ordered by each pairwise component of 80 + // the resulting vector, higher ranks coming earlier 81 + GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) 71 82 } 72 83 73 84 func newSourcegraphClient(rootURL *url.URL, hostname string, batchSize int) *sourcegraphClient { ··· 129 140 configFingerprintReset time.Time 130 141 } 131 142 143 + // GetRepoRank asks Sourcegraph for the rank vector of repoName. 144 + func (s *sourcegraphClient) GetRepoRank(ctx context.Context, repoName string) ([]float64, error) { 145 + u := s.Root.ResolveReference(&url.URL{ 146 + Path: "/.internal/ranks/" + strings.Trim(repoName, "/"), 147 + }) 148 + 149 + b, err := s.get(ctx, u) 150 + if err != nil { 151 + return nil, err 152 + } 153 + 154 + var ranks []float64 155 + err = json.Unmarshal(b, &ranks) 156 + if err != nil { 157 + return nil, err 158 + } 159 + 160 + return ranks, nil 161 + } 162 + 163 + // GetDocumentRanks asks Sourcegraph for a mapping of file paths to rank 164 + // vectors. 165 + func (s *sourcegraphClient) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) { 166 + u := s.Root.ResolveReference(&url.URL{ 167 + Path: "/.internal/ranks/" + strings.Trim(repoName, "/") + "/documents", 168 + }) 169 + 170 + b, err := s.get(ctx, u) 171 + if err != nil { 172 + return nil, err 173 + } 174 + 175 + ranks := make(map[string][]float64) 176 + err = json.Unmarshal(b, &ranks) 177 + if err != nil { 178 + return nil, err 179 + } 180 + 181 + // Invariant: All rank vectors have the same length. 182 + first := true 183 + wantLen := -1 184 + for _, v := range ranks { 185 + if first { 186 + first = false 187 + wantLen = len(v) 188 + continue 189 + } 190 + if len(v) != wantLen { 191 + return nil, fmt.Errorf("found rank vectors of different length %d<>%d\n", wantLen, len(v)) 192 + } 193 + } 194 + 195 + return ranks, nil 196 + } 197 + 198 + func (s *sourcegraphClient) get(ctx context.Context, u *url.URL) ([]byte, error) { 199 + req, err := retryablehttp.NewRequestWithContext(ctx, "GET", u.String(), nil) 200 + if err != nil { 201 + return nil, err 202 + } 203 + 204 + resp, err := s.doRequest(req) 205 + if err != nil { 206 + return nil, err 207 + } 208 + defer resp.Body.Close() 209 + 210 + if resp.StatusCode != http.StatusOK { 211 + b, err := io.ReadAll(io.LimitReader(resp.Body, 1024)) 212 + _ = resp.Body.Close() 213 + if err != nil { 214 + return nil, err 215 + } 216 + return nil, &url.Error{ 217 + Op: "Get", 218 + URL: u.String(), 219 + Err: fmt.Errorf("%s: %s", resp.Status, string(b)), 220 + } 221 + } 222 + 223 + b, err := io.ReadAll(resp.Body) 224 + if err != nil { 225 + return nil, err 226 + } 227 + 228 + return b, nil 229 + } 230 + 132 231 func (s *sourcegraphClient) List(ctx context.Context, indexed []uint32) (*SourcegraphListResult, error) { 133 232 repos, err := s.listRepoIDs(ctx, indexed) 134 233 if err != nil { ··· 362 461 Log *log.Logger 363 462 } 364 463 464 + // GetRepoRank expects a file with exactly 1 line containing a comma separated 465 + // list of float64 as ranks. 466 + func (sf sourcegraphFake) GetRepoRank(ctx context.Context, repoName string) ([]float64, error) { 467 + dir := filepath.Join(sf.RootDir, filepath.FromSlash(repoName)) 468 + 469 + b, err := os.ReadFile(filepath.Join(dir, "SG_REPO_RANKS")) 470 + if err != nil { 471 + return nil, err 472 + } 473 + 474 + return floats64(string(b)), nil 475 + } 476 + 477 + // GetDocumentRanks expects a file where each line has the following format: 478 + // path<tab>rank... where rank is a float64 in [0,1]. Multiple ranks are 479 + // separated by a comma. Each line must have the same number of ranks. 480 + func (sf sourcegraphFake) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) { 481 + dir := filepath.Join(sf.RootDir, filepath.FromSlash(repoName)) 482 + 483 + fd, err := os.Open(filepath.Join(dir, "SG_DOCUMENT_RANKS")) 484 + if err != nil { 485 + return nil, err 486 + } 487 + 488 + ranks := make(map[string][]float64) 489 + 490 + scanner := bufio.NewScanner(fd) 491 + for scanner.Scan() { 492 + s := scanner.Text() 493 + pathRanks := strings.Split(s, "\t") 494 + ranks[pathRanks[0]] = floats64(pathRanks[1]) 495 + } 496 + if err := scanner.Err(); err != nil { 497 + return nil, err 498 + } 499 + 500 + return ranks, nil 501 + } 502 + 503 + func floats64(s string) []float64 { 504 + parts := strings.Split(s, ",") 505 + 506 + var r []float64 507 + for _, rank := range parts { 508 + f, err := strconv.ParseFloat(rank, 64) 509 + if err != nil { 510 + return nil 511 + } 512 + r = append(r, f) 513 + } 514 + 515 + return r 516 + } 517 + 365 518 func (sf sourcegraphFake) List(ctx context.Context, indexed []uint32) (*SourcegraphListResult, error) { 366 519 repos, err := sf.ListRepoIDs(ctx, indexed) 367 520 if err != nil { ··· 571 724 // magic at the end is to ensure we get a positive number when casting. 572 725 return uint32(crc32.ChecksumIEEE([]byte(name))%(1<<31-1) + 1) 573 726 } 727 + 728 + type sourcegraphNop struct{} 729 + 730 + func (s sourcegraphNop) List(ctx context.Context, indexed []uint32) (*SourcegraphListResult, error) { 731 + return nil, nil 732 + } 733 + 734 + func (s sourcegraphNop) ForceIterateIndexOptions(onSuccess func(IndexOptions), onError func(uint32, error), repos ...uint32) { 735 + return 736 + } 737 + 738 + func (s sourcegraphNop) GetRepoRank(ctx context.Context, repoName string) ([]float64, error) { 739 + return nil, nil 740 + } 741 + 742 + func (s sourcegraphNop) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) { 743 + return nil, nil 744 + }
+31 -5
gitindex/index.go
··· 18 18 import ( 19 19 "bytes" 20 20 "context" 21 + "encoding/json" 21 22 "errors" 22 23 "fmt" 23 24 "io" ··· 113 114 repo.URL = u.String() 114 115 switch typ { 115 116 case "gitiles": 116 - /// eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 117 + // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 117 118 repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" 118 119 repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" 119 120 repo.LineFragmentTemplate = "#{{.LineNumber}}" ··· 483 484 if err != nil { 484 485 return fmt.Errorf("build.NewBuilder: %w", err) 485 486 } 487 + 488 + var ranks map[string][]float64 489 + if opts.BuildOptions.DocumentRanksPath != "" { 490 + data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 491 + if err != nil { 492 + return err 493 + } 494 + 495 + err = json.Unmarshal(data, &ranks) 496 + if err != nil { 497 + return err 498 + } 499 + } 500 + 501 + rankVecForPath := func(path string) []float64 { 502 + s, ok := ranks[path] 503 + if !ok { 504 + return nil 505 + } 506 + return s 507 + } 508 + 486 509 // we don't need to check error, since we either already have an error, or 487 510 // we returning the first call to builder.Finish. 488 511 defer builder.Finish() // nolint:errcheck ··· 512 535 return err 513 536 } 514 537 515 - if blob.Size > int64(opts.BuildOptions.SizeMax) && !opts.BuildOptions.IgnoreSizeMax(key.FullPath()) { 538 + keyFullPath := key.FullPath() 539 + 540 + if blob.Size > int64(opts.BuildOptions.SizeMax) && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) { 516 541 if err := builder.Add(zoekt.Document{ 517 542 SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.BuildOptions.SizeMax), 518 - Name: key.FullPath(), 543 + Name: keyFullPath, 519 544 Branches: brs, 520 545 SubRepositoryPath: key.SubRepoPath, 521 546 }); err != nil { ··· 530 555 } 531 556 if err := builder.Add(zoekt.Document{ 532 557 SubRepositoryPath: key.SubRepoPath, 533 - Name: key.FullPath(), 558 + Name: keyFullPath, 534 559 Content: contents, 535 560 Branches: brs, 561 + Ranks: rankVecForPath(keyFullPath), 536 562 }); err != nil { 537 - return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) 563 + return fmt.Errorf("error adding document with name %s: %w", keyFullPath, err) 538 564 } 539 565 } 540 566 }
+9
indexbuilder.go
··· 286 286 // Document sections for symbols. Offsets should use bytes. 287 287 Symbols []DocumentSection 288 288 SymbolsMetaData []*Symbol 289 + 290 + // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile 291 + // file in the git repo. 292 + // 293 + // Two documents can be ordered by comparing the components of their rank 294 + // vectors. Bigger entries are better, as are longer vectors. 295 + // 296 + // This field is experimental and may change at any time without warning. 297 + Ranks []float64 289 298 } 290 299 291 300 type symbolSlice struct {