fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

merging: delete tmp code that removed duplicates (#445)

@mpimenov added this code a while ago to remove duplicate indexes in compound shards on .com. The code path was active for a short period of time and fulfilled its purpose. Now we can remove it.

Background:
For a while we saw a handfull of duplicate indexes per shard on .com. The root cause turned out to be a missing lock on index dir during merging. This has since been fixed. However, the duplicates still lingered which is why we added this cleanup code.

I have checked several times over the last couple of weeks and can confirm that no new duplicates materialized.

As a reference, this is how I checked for duplicates

```
cd /data/index/ && apk add jq && for i in compound-*.zoekt; do zoekt-sourcegraph-indexserver debug meta $i | tail -n1 | jq -r '.[] | select(.Tombstone==false) .Name' >> sh_repos; done && cat sh_repos | sort | uniq -d && rm sh_repos
```

I believe the issue is resolved.

author
Stefan Hengl
committer
GitHub
date (Oct 17, 2022, 12:00 PM +0200) commit 4f7781e1 parent 57c3159c
-139
-67
cmd/zoekt-sourcegraph-indexserver/cleanup.go
··· 172 172 } 173 173 } 174 174 175 - // feature flag: place file TOMBSTONE_DUPLICATES in IndexDir 176 - if _, err := os.Stat(filepath.Join(indexDir, "TOMBSTONE_DUPLICATES")); err == nil && shardMerging { 177 - // This breaks an invariant of cleanup() where we guarantee that right 178 - // after a cleanup the index state is consistent with the repos parameter. 179 - // If the duplicates are tombstoned, the same repoID may seem both 180 - // alive and tombstoned, depending on which compound shard you look into. 181 - // However, introducing duplicates in the first place breaks another invariant 182 - // (of having no duplicates), so 183 - // 1. This is bad, but we are fixing an even worse situation. 184 - // 2. This code should be removed as soon as the situation is fixed. 185 - tombstoneDuplicates(indexDir) 186 - } 187 - 188 175 metricCleanupDuration.Observe(time.Since(start).Seconds()) 189 176 } 190 177 ··· 235 222 } 236 223 } 237 224 return shards 238 - } 239 - 240 - // tombstoneDuplicates finds duplicate shards in indexDir and makes it so 241 - // that all of them except one have a tombstone. It tries, on a best effort basis, 242 - // to select the non-tombstoned shard with the latest modification time as the one to 243 - // remain alive. 244 - // The assumption is that duplicate shards exist only as parts of compound shards: 245 - // we haven't seen simple duplicate shards in the wild. 246 - // Duplicate shards should not occur at all but we rarely see them due to a bug somewhere (probably fixed). 247 - // This function is intended to wait for a subsequent call to vacuum that will delete the 248 - // tombstoned repos. 249 - func tombstoneDuplicates(indexDir string) { 250 - // We are only receiving alive shards in this call, so we may skip a 251 - // newer shard if it already has a tombstone. 252 - index := getShards(indexDir) 253 - 254 - for repoID, shardsAll := range index { 255 - shards := make([]shard, 0, len(shardsAll)) 256 - for _, s := range shardsAll { 257 - // One repoID corresponds to multiple shards if either 258 - // a) it is a large repo split across multiple shards, or 259 - // b) it is a repo that is duplicated in several compound shards. 260 - // We are only interested in case b). 261 - if strings.HasPrefix(filepath.Base(s.Path), "compound-") { 262 - shards = append(shards, s) 263 - } 264 - } 265 - if len(shards) <= 1 { 266 - continue 267 - } 268 - 269 - log.Printf("found %d duplicate shards for repoID=%d. Tombstoning all but one", len(shards), repoID) 270 - 271 - latest := 0 272 - for i, s := range shards { 273 - if shards[latest].ModTime.Before(s.ModTime) { 274 - latest = i 275 - } 276 - } 277 - if latest != 0 { 278 - shards[0], shards[latest] = shards[latest], shards[0] 279 - } 280 - 281 - if err := zoekt.UnsetTombstone(shards[0].Path, repoID); err != nil { 282 - log.Printf("error removing tombstone for %v: %s", repoID, err) 283 - } else { 284 - shardsLog(indexDir, "untomb", shards[:1]) 285 - } 286 - for _, s := range shards[1:] { 287 - if !s.RepoTombstone && maybeSetTombstone([]shard{s}, repoID) { 288 - shardsLog(indexDir, "tomb", []shard{s}) 289 - } 290 - } 291 - } 292 225 } 293 226 294 227 // getTombstonedRepos return a map of tombstoned repositories in dir. If a
-72
cmd/zoekt-sourcegraph-indexserver/cleanup_test.go
··· 455 455 } 456 456 } 457 457 458 - func TestTombstoneDuplicateShards(t *testing.T) { 459 - dir := t.TempDir() 460 - 461 - // enable feature flag 462 - if _, err := os.Create(filepath.Join(dir, "TOMBSTONE_DUPLICATES")); err != nil { 463 - t.Fatal(err) 464 - } 465 - 466 - now := time.Now() 467 - recent := now.Add(-1 * time.Hour) 468 - old := now.Add(-2 * time.Hour) 469 - 470 - cs1 := createCompoundShard(t, dir, []uint32{1, 2, 3}, func(*zoekt.Repository) {}) 471 - 472 - // Hack part 1: hide repos from being tombstoned by Builder.Finish() when creating cs2. 473 - for i := 1; i <= 3; i++ { 474 - if err := zoekt.SetTombstone(cs1, uint32(i)); err != nil { 475 - t.Fatal(err) 476 - } 477 - } 478 - 479 - cs2 := createCompoundShard(t, dir, []uint32{1, 2}, func(*zoekt.Repository) {}) 480 - 481 - // Hack part 2: remove tombstones to create duplicates. 482 - for i := 1; i <= 3; i++ { 483 - if err := zoekt.UnsetTombstone(cs1, uint32(i)); err != nil { 484 - t.Fatal(err) 485 - } 486 - } 487 - 488 - if err := os.Chtimes(cs1, old, old); err != nil { 489 - t.Fatal(err) 490 - } 491 - if err := os.Chtimes(cs2, recent, recent); err != nil { 492 - t.Fatal(err) 493 - } 494 - 495 - // want indexed 496 - repos := []uint32{1, 2, 3} 497 - 498 - cleanup(dir, repos, now, true) 499 - 500 - index := getShards(dir) 501 - trash := getShards(filepath.Join(dir, ".trash")) 502 - 503 - if len(trash) != 0 { 504 - t.Fatalf("expected empty trash, got %+v", trash) 505 - } 506 - 507 - wantIndex := map[uint32][]shard{ 508 - 1: {{ 509 - RepoID: 1, 510 - RepoName: "repo1", 511 - Path: cs2, 512 - }}, 513 - 2: {{ 514 - RepoID: 2, 515 - RepoName: "repo2", 516 - Path: cs2, 517 - }}, 518 - 3: {{ 519 - RepoID: 3, 520 - RepoName: "repo3", 521 - Path: cs1, 522 - }}, 523 - } 524 - 525 - if d := cmp.Diff(wantIndex, index, cmpopts.IgnoreFields(shard{}, "ModTime")); d != "" { 526 - t.Fatalf("-want, +got: %s", d) 527 - } 528 - } 529 - 530 458 // createCompoundShard returns a path to a compound shard containing repos with 531 459 // ids. Use optsFns to overwrite fields of zoekt.Repository for all repos. 532 460 func createCompoundShard(t *testing.T, dir string, ids []uint32, optFns ...func(in *zoekt.Repository)) string {