fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

shards: selectRepoSet supports queries which are not wrapped in and (#751)

The previous commit which added support for selectRepoSet in List was
ineffective since the queries we get for List do not look like (and ...)
but instead directly specify the reposet atom.

This commit adds support for it. Additionally to help with our manual
testing we add support for "repo:" query in the selectRepoSet
optimization.

While pairing on this we decided to improve the debug output to tracing
and rename the misnamed query.Q variable in List from r to q.

Test Plan: updated the test case to directly test the sort of queries we
get in List calls. Additionally ran "zoekt-webserver -pprof" and
observed in net/trace output that selectRepoSet optimization reduced the
number of searched shards.

Co-authored-by: Stefan Hengl <stefan@sourcegraph.com>

+63 -21
+44 -20
shards/shards.go
··· 22 22 "os" 23 23 "runtime" 24 24 "runtime/debug" 25 + "slices" 25 26 "sort" 26 27 "strconv" 27 28 "sync" ··· 364 365 365 366 func selectRepoSet(shards []*rankedShard, q query.Q) ([]*rankedShard, query.Q) { 366 367 and, ok := q.(*query.And) 367 - if !ok { 368 - return shards, q 368 + if ok { 369 + return doSelectRepoSet(shards, and) 369 370 } 370 371 372 + // We have queries which look like (reposet ...) and we want to do the same 373 + // optimizations. To simplify we just always wrap the query in And and then 374 + // on the return value call Simplify to unwrap. In particular this is 375 + // important for List calls. 376 + and = &query.And{Children: []query.Q{q}} 377 + shards, q = doSelectRepoSet(shards, and) 378 + return shards, query.Simplify(q) 379 + } 380 + 381 + func doSelectRepoSet(shards []*rankedShard, and *query.And) ([]*rankedShard, query.Q) { 371 382 // (and (reposet ...) (q)) 372 383 // (and true (q)) with a filtered shards 373 384 // (and false) // noop ··· 375 386 // (and (repobranches ...) (q)) 376 387 // (and (repobranches ...) (q)) 377 388 389 + // Note: we also support (and (repo ...) (q)) even though sourcegraph does 390 + // not generate those sorts of queries. This is to support manual testing. 391 + 378 392 hasReposForPredicate := func(pred func(repo *zoekt.Repository) bool) func(repos []*zoekt.Repository) (any, all bool) { 379 393 return func(repos []*zoekt.Repository) (any, all bool) { 380 394 any = false ··· 401 415 setSize = int(setQuery.Repos.GetCardinality()) 402 416 hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { 403 417 return setQuery.Repos.Contains(repo.ID) 418 + }) 419 + case *query.Repo: 420 + setSize = 0 421 + hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { 422 + return setQuery.Regexp.MatchString(repo.Name) 404 423 }) 405 424 case *query.BranchesRepos: 406 425 for _, br := range setQuery.List { ··· 447 466 return filtered, and 448 467 } 449 468 469 + // We don't want to mutate the original and, so we clone it before 470 + // mutating it. 471 + and = &query.And{Children: slices.Clone(and.Children)} 472 + 450 473 // This optimization allows us to avoid the work done by 451 474 // indexData.simplify for each shard. 452 475 // ··· 455 478 // shard indexData.simplify will simplify to (and true (content baz)) -> 456 479 // (content baz). This work can be done now once, rather than per shard. 457 480 switch c := c.(type) { 458 - case *query.RepoSet: 459 - and.Children[i] = &query.Const{Value: true} 460 - return filtered, query.Simplify(and) 461 - 462 - case *query.RepoIDs: 481 + case *query.RepoSet, *query.RepoIDs, *query.Repo: 463 482 and.Children[i] = &query.Const{Value: true} 464 483 return filtered, query.Simplify(and) 465 484 ··· 629 648 tr.Finish() 630 649 }() 631 650 632 - tr.LazyPrintf("before selectRepoSet shards:%d", len(shards)) 633 651 // Select the subset of shards that we will search over for the given query. 634 - shards, q = selectRepoSet(shards, q) 635 - tr.LazyPrintf("after selectRepoSet shards:%d %s", len(shards), q) 652 + { 653 + beforeLen := len(shards) 654 + beforeQ := q 655 + shards, q = selectRepoSet(shards, q) 656 + tr.LazyPrintf("selectRepoSet shards=%d->%d q=%s->%s", beforeLen, len(shards), beforeQ, q) 657 + } 636 658 637 659 if len(shards) == 0 { 638 660 return func() {}, nil ··· 888 910 return s.List(ctx, q, opts) 889 911 } 890 912 891 - func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) { 913 + func (ss *shardedSearcher) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) { 892 914 tr, ctx := trace.New(ctx, "shardedSearcher.List", "") 893 915 metricListRunning.Inc() 894 916 defer func() { 895 917 metricListRunning.Dec() 896 918 if rl != nil { 897 - tr.LazyPrintf("repos size: %d", len(rl.Repos)) 898 - tr.LazyPrintf("reposmap size: %d", len(rl.ReposMap)) 899 - tr.LazyPrintf("crashes: %d", rl.Crashes) 919 + tr.LazyPrintf("repos.size=%d reposmap.size=%d crashes=%d", len(rl.Repos), len(rl.ReposMap), rl.Crashes) 900 920 } 901 921 if err != nil { 902 922 tr.LazyPrintf("error: %v", err) ··· 905 925 tr.Finish() 906 926 }() 907 927 908 - r = query.Simplify(r) 928 + q = query.Simplify(q) 909 929 isAll := false 910 - if c, ok := r.(*query.Const); ok { 930 + if c, ok := q.(*query.Const); ok { 911 931 isAll = c.Value 912 932 } 913 933 ··· 919 939 tr.LazyPrintf("acquired process") 920 940 921 941 loaded := ss.getLoaded() 942 + shards := loaded.shards 922 943 923 944 // Setup what we return now, since we may short circuit if there are no 924 945 // shards to search. ··· 936 957 // PERF: Select the subset of shards that we will search over for the given 937 958 // query. A common List query only asks for a specific repo, so this is an 938 959 // important optimization. 939 - tr.LazyPrintf("before selectRepoSet shards:%d", len(loaded.shards)) 940 - shards, r := selectRepoSet(loaded.shards, r) 941 - tr.LazyPrintf("after selectRepoSet shards:%d %s", len(shards), r) 960 + { 961 + beforeLen := len(shards) 962 + beforeQ := q 963 + shards, q = selectRepoSet(shards, q) 964 + tr.LazyPrintf("selectRepoSet shards=%d->%d q=%s->%s", beforeLen, len(shards), beforeQ, q) 965 + } 942 966 943 967 if len(shards) == 0 { 944 968 return &agg, nil ··· 977 1001 for range workers { 978 1002 g.Go(func() error { 979 1003 for s := range feeder { 980 - result, err := listOneShard(ctx, s, r, opts) 1004 + result, err := listOneShard(ctx, s, q, opts) 981 1005 if err != nil { 982 1006 return err 983 1007 }
+19 -1
shards/shards_test.go
··· 288 288 func TestFilteringShardsByRepoSetOrBranchesReposOrRepoIDs(t *testing.T) { 289 289 ss := newShardedSearcher(1) 290 290 291 + // namePrefix is so we can create a repo:foo filter and match the same set 292 + // of repos. 293 + namePrefix := [3]string{"foo", "bar", "baz"} 294 + 291 295 repoSetNames := []string{} 292 296 repoIDs := []uint32{} 293 297 n := 10 * runtime.GOMAXPROCS(0) 294 298 for i := 0; i < n; i++ { 295 299 shardName := fmt.Sprintf("shard%d", i) 296 - repoName := fmt.Sprintf("repository%.3d", i) 300 + repoName := fmt.Sprintf("%s-repository%.3d", namePrefix[i%3], i) 297 301 repoID := hash(repoName) 298 302 299 303 if i%3 == 0 { ··· 329 333 sub := &query.Substring{Pattern: "bla"} 330 334 331 335 repoIDsQuery := query.NewRepoIDs(repoIDs...) 336 + repoQuery := &query.Repo{Regexp: regexp.MustCompile("^foo-.*")} 332 337 333 338 queries := []query.Q{ 334 339 query.NewAnd(set, sub), ··· 342 347 query.NewAnd(repoIDsQuery, sub), 343 348 // Test with the same repoIDs again 344 349 query.NewAnd(repoIDsQuery, sub), 350 + 351 + query.NewAnd(repoQuery, sub), 352 + query.NewAnd(repoQuery, sub), 353 + 354 + // List has queries which are just the reposet atoms. We also test twice. 355 + set, 356 + set, 357 + branchesRepos, 358 + branchesRepos, 359 + repoIDsQuery, 360 + repoIDsQuery, 361 + repoQuery, 362 + repoQuery, 345 363 } 346 364 347 365 for _, q := range queries {