fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

repoID bitmap for speeding up findShard in compound shards (#899)

We add a new section to shards which contains a roaring bitmap for quickly checking if a shard contains a repo ID. We then can load just this (small amount) of data to rule out a compound shard. We use roaring bitmaps since we already have that dependency in our codebase.

The reason we speed up this operation is we found on a large instance which contained thousands of tiny repos we spent so much time in findShard that our indexing queue would always fall behind.

It is possible this new section won't speed this up enough and we need some sort of global oracle (or in-memory cache in indexserver?). This is noted in the code for future travellers.

Test Plan: the existing unit tests already cover if this is forwards and backwards compatible. Additionally I added some logging to zoekt to test if older version of shards still work correctly in findShard, as well as if older versions of zoekt can read the new shards.

Added a benchmark to check the impact. See comments in the code.

---------

Co-authored-by: Stefan Hengl <stefan@sourcegraph.com>

+228 -22
+1
go.mod
··· 61 61 github.com/davidmz/go-pageant v1.0.2 // indirect 62 62 github.com/go-fed/httpsig v1.1.0 // indirect 63 63 github.com/hashicorp/go-version v1.7.0 // indirect 64 + github.com/kylelemons/godebug v1.1.0 // indirect 64 65 go.opentelemetry.io/auto/sdk v1.1.0 // indirect 65 66 ) 66 67
+7 -13
index/builder.go
··· 467 467 } 468 468 469 469 // Brute force finding the shard in compound shards. We should only hit this 470 - // code path for repositories that are not already existing or are in 471 - // compound shards. 472 - // 473 - // TODO add an oracle which can speed this up in the case of repositories 474 - // already in compound shards. 470 + // code path for repositories that don't exist yet or are in compound shards. 471 + return o.findCompoundShard() 472 + } 473 + 474 + func (o *Options) findCompoundShard() string { 475 475 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 476 476 if err != nil { 477 477 return "" 478 478 } 479 479 for _, fn := range compoundShards { 480 - repos, _, err := ReadMetadataPathAlive(fn) 481 - if err != nil { 482 - continue 483 - } 484 - for _, repo := range repos { 485 - if repo.ID == o.RepositoryDescription.ID { 486 - return fn 487 - } 480 + if containsRepo(fn, o.RepositoryDescription.ID) { 481 + return fn 488 482 } 489 483 } 490 484
+85 -8
index/builder_test.go
··· 16 16 17 17 "github.com/google/go-cmp/cmp" 18 18 "github.com/google/go-cmp/cmp/cmpopts" 19 + "github.com/prometheus/client_golang/prometheus/testutil" 20 + "github.com/stretchr/testify/require" 21 + 19 22 "github.com/sourcegraph/zoekt" 20 23 ) 21 24 ··· 288 291 } 289 292 } 290 293 294 + // Tests that we skip looping over repos in compound shards when we know that 295 + // the repository we are looking for is not in the shard. 296 + func TestSkipCompoundShards(t *testing.T) { 297 + metricCompoundShardLookups.Reset() 298 + 299 + compoundShards := [][]zoekt.Repository{ 300 + { 301 + {Name: "repoA", ID: 1}, 302 + {Name: "repoB", ID: 2}, 303 + {Name: "repoC", ID: 3}, 304 + }, 305 + { 306 + {Name: "repoD", ID: 4}, 307 + {Name: "repoE", ID: 5}, 308 + {Name: "repoF", ID: 6}, 309 + {Name: "repoF", ID: 7}, 310 + {Name: "repoF", ID: 8}, 311 + }, 312 + } 313 + var lookForRepoID uint32 = 99 314 + wantSkippedCount := 2 315 + 316 + indexDir := t.TempDir() 317 + for _, repositoryGroup := range compoundShards { 318 + createTestCompoundShard(t, indexDir, repositoryGroup) 319 + } 320 + o := &Options{ 321 + IndexDir: indexDir, 322 + RepositoryDescription: zoekt.Repository{ID: lookForRepoID}, 323 + } 324 + 325 + shard := o.findCompoundShard() 326 + require.Empty(t, shard) 327 + 328 + // Check if the "skipped" counter was incremented 329 + skippedCount := int(testutil.ToFloat64(metricCompoundShardLookups.WithLabelValues("skipped"))) 330 + require.Equal(t, wantSkippedCount, skippedCount) 331 + } 332 + 333 + // With optimization 334 + // BenchmarkFindCompoundShard-16 33505 36016 ns/op 335 + // 336 + // Without optimization 337 + // BenchmarkFindCompoundShard-16 76 15568589 ns/op 338 + func BenchmarkFindCompoundShard(b *testing.B) { 339 + // Generate a large compound shard 340 + const numRepos = 5000 341 + repositories := make([]zoekt.Repository, numRepos) 342 + for i := 0; i < numRepos; i++ { 343 + repositories[i] = zoekt.Repository{ 344 + Name: fmt.Sprintf("repo%d", i+1), 345 + ID: uint32(i + 1), 346 + } 347 + } 348 + indexDir := b.TempDir() 349 + createTestCompoundShard(b, indexDir, repositories) 350 + 351 + // pick id that is not in the shard 352 + var searchRepoID uint32 = numRepos + 1 353 + 354 + b.ResetTimer() 355 + for i := 0; i < b.N; i++ { 356 + o := &Options{ 357 + IndexDir: indexDir, 358 + RepositoryDescription: zoekt.Repository{ID: searchRepoID}, 359 + } 360 + 361 + shard := o.findCompoundShard() 362 + if shard != "" { 363 + b.Fatal("expected empty result") 364 + } 365 + } 366 + } 367 + 291 368 func TestOptions_FindAllShards(t *testing.T) { 292 369 type simpleShard struct { 293 370 Repository zoekt.Repository ··· 361 438 compoundShards: [][]zoekt.Repository{ 362 439 { 363 440 {Name: "repoA", ID: 1}, 364 - {Name: "sameName", ID: 2}, 365 - {Name: "sameName", ID: 3}, 441 + {Name: "repoB", ID: 2}, 442 + {Name: "repoC", ID: 3}, 366 443 }, 367 444 { 368 - {Name: "repoB", ID: 4}, 369 - {Name: "sameName", ID: 5}, 370 - {Name: "sameName", ID: 6}, 445 + {Name: "repoD", ID: 4}, 446 + {Name: "repoE", ID: 5}, 447 + {Name: "repoF", ID: 6}, 371 448 }, 372 449 }, 373 450 expectedShardCount: 1, 374 - expectedRepository: zoekt.Repository{Name: "sameName", ID: 5}, 451 + expectedRepository: zoekt.Repository{Name: "something-else", ID: 5}, 375 452 }, 376 453 } 377 454 for _, tt := range tests { ··· 840 917 } 841 918 } 842 919 843 - func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { 920 + func createTestShard(t testing.TB, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { 844 921 t.Helper() 845 922 846 923 if err := os.MkdirAll(filepath.Dir(indexDir), 0o700); err != nil { ··· 891 968 return o.FindAllShards() 892 969 } 893 970 894 - func createTestCompoundShard(t *testing.T, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { 971 + func createTestCompoundShard(t testing.TB, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { 895 972 t.Helper() 896 973 897 974 var shardNames []string
+100
index/read.go
··· 24 24 "slices" 25 25 "sort" 26 26 27 + "github.com/RoaringBitmap/roaring" 28 + "github.com/prometheus/client_golang/prometheus" 29 + "github.com/prometheus/client_golang/prometheus/promauto" 27 30 "github.com/rs/xid" 31 + 28 32 "github.com/sourcegraph/zoekt" 29 33 ) 30 34 ··· 647 651 } 648 652 } 649 653 return exist, nil 654 + } 655 + 656 + // maybeContainsRepo is a performance optimization mainly intended to be used by 657 + // containsRepo to avoid unmarshalling large metadata files for compound shards. 658 + // It is best-effort, so if it encounters any error returns true (ie indicating 659 + // you need to do more checks). 660 + func maybeContainsRepo(inf IndexFile, repoID uint32) bool { 661 + rd := &reader{r: inf} 662 + var toc indexTOC 663 + err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) 664 + if err != nil { 665 + return true 666 + } 667 + 668 + // shard does not yet contain reposIDsBitmap so we can't tell if it contains 669 + // repo. 670 + if toc.reposIDsBitmap.sz == 0 { 671 + return true 672 + } 673 + 674 + blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz) 675 + if err != nil { 676 + return true 677 + } 678 + 679 + var rb roaring.Bitmap 680 + _, err = rb.FromUnsafeBytes(blob) 681 + if err != nil { 682 + return true 683 + } 684 + 685 + return rb.Contains(repoID) 686 + } 687 + 688 + var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ 689 + Name: "zoekt_compound_shard_lookups", 690 + Help: "Number of compound shard lookups and how much work was done.", 691 + }, []string{"state"}) 692 + 693 + // containsRepo returns true if the shard at path contains a repo with id. The 694 + // function returns false if the shard does not contain the repo or if it 695 + // encounters an error. 696 + func containsRepo(p string, id uint32) bool { 697 + var err error 698 + earlyReturn := false 699 + 700 + defer func() { 701 + if err != nil { 702 + metricCompoundShardLookups.WithLabelValues("error").Inc() 703 + return 704 + } 705 + if earlyReturn { 706 + metricCompoundShardLookups.WithLabelValues("skipped").Inc() 707 + return 708 + } 709 + metricCompoundShardLookups.WithLabelValues("full_lookup").Inc() 710 + }() 711 + 712 + f, err := os.Open(p) 713 + if err != nil { 714 + return false 715 + } 716 + defer f.Close() 717 + 718 + inf, err := NewIndexFile(f) 719 + if err != nil { 720 + return false 721 + } 722 + defer inf.Close() 723 + 724 + // PERF: Looping over repos can be relatively slow on instances with thousands 725 + // of tiny repos in compound shards. This is a much faster check to see if we 726 + // need to do more work. 727 + // 728 + // If we are still seeing performance issues, we should consider adding 729 + // some sort of global oracle here to avoid filepath.Glob and checking 730 + // each compound shard. 731 + if !maybeContainsRepo(inf, id) { 732 + earlyReturn = true 733 + return false 734 + } 735 + 736 + repos, _, err := ReadMetadata(inf) 737 + if err != nil { 738 + return false 739 + } 740 + for _, repo := range repos { 741 + if repo.Tombstone { 742 + continue 743 + } 744 + if repo.ID == id { 745 + return true 746 + } 747 + } 748 + 749 + return false 650 750 } 651 751 652 752 func loadIndexData(r IndexFile) (*indexData, error) {
+1
index/read_test.go
··· 29 29 "testing" 30 30 31 31 "github.com/google/go-cmp/cmp" 32 + 32 33 "github.com/sourcegraph/zoekt" 33 34 "github.com/sourcegraph/zoekt/query" 34 35 )
+17
index/shard_builder.go
··· 511 511 return 0 512 512 } 513 513 514 + // repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 515 + // is missing or there are no repos, this returns false. 516 + func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 517 + if len(b.repoList) == 0 { 518 + return nil, false 519 + } 520 + 521 + ids := make([]uint32, 0, len(b.repoList)) 522 + for _, repo := range b.repoList { 523 + if repo.ID == 0 { 524 + return nil, false 525 + } 526 + ids = append(ids, repo.ID) 527 + } 528 + return ids, true 529 + } 530 + 514 531 type DocChecker struct { 515 532 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 516 533 trigrams map[ngram]struct{}
+3 -1
index/toc.go
··· 96 96 contentChecksums simpleSection 97 97 runeDocSections simpleSection 98 98 99 - repos simpleSection 99 + repos simpleSection 100 + reposIDsBitmap simpleSection 100 101 101 102 ranks simpleSection 102 103 } ··· 181 182 {"languages", &t.languages}, 182 183 {"runeDocSections", &t.runeDocSections}, 183 184 {"repos", &t.repos}, 185 + {"reposIDsBitmap", &t.reposIDsBitmap}, 184 186 185 187 // We no longer write these sections, but we still return them here to avoid 186 188 // warnings about unknown sections.
+14
index/write.go
··· 25 25 "time" 26 26 27 27 "github.com/sourcegraph/zoekt" 28 + 29 + "github.com/RoaringBitmap/roaring" 28 30 ) 29 31 30 32 func (w *writer) writeTOC(toc *indexTOC) { ··· 66 68 return m[string(keys[i].data)] < m[string(keys[j].data)] 67 69 }) 68 70 s.writeStrings(w, keys) 71 + } 72 + 73 + func writeUint32Bitmap(w *writer, dat []uint32) { 74 + rb := roaring.BitmapOf(dat...) 75 + rb.RunOptimize() 76 + rb.WriteTo(w) 69 77 } 70 78 71 79 func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, ··· 169 177 toc.repos.start(w) 170 178 w.Write(toSizedDeltas16(b.repos)) 171 179 toc.repos.end(w) 180 + } 181 + 182 + if repoIDs, ok := b.repoIDs(); ok && next { 183 + toc.reposIDsBitmap.start(w) 184 + writeUint32Bitmap(w, repoIDs) 185 + toc.reposIDsBitmap.end(w) 172 186 } 173 187 174 188 indexTime := b.IndexTime
testdata/shards/repo2_v16.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo_v16.00000.zoekt

This is a binary file and will not be displayed.