fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

indexserver: tombstone duplicate repos in the cleanup cycle (#387)

This removes duplicate repos in compound shards during the cleanup cycle. Activate by placing a file named TOMBSTONE_DUPLICATES in index dir.

Co-authored-by: Stefan Hengl <stefan@sourcegraph.com>

author
Maxim Pimenov
co-author
Stefan Hengl
committer
GitHub
date (Jul 28, 2022, 3:00 PM +0200) commit 310c80f0 parent bf4a5ab0
+167 -20
+83 -12
cmd/zoekt-sourcegraph-indexserver/cleanup.go
··· 170 170 } 171 171 } 172 172 } 173 + 174 + // feature flag: place file TOMBSTONE_DUPLICATES in IndexDir 175 + if _, err := os.Stat(filepath.Join(indexDir, "TOMBSTONE_DUPLICATES")); err == nil && shardMerging { 176 + // This breaks an invariant of cleanup() where we guarantee that right 177 + // after a cleanup the index state is consistent with the repos parameter. 178 + // If the duplicates are tombstoned, the same repoID may seem both 179 + // alive and tombstoned, depending on which compound shard you look into. 180 + // However, introducing duplicates in the first place breaks another invariant 181 + // (of having no duplicates), so 182 + // 1. This is bad, but we are fixing an even worse situation. 183 + // 2. This code should be removed as soon as the situation is fixed. 184 + tombstoneDuplicates(indexDir) 185 + } 186 + 173 187 metricCleanupDuration.Observe(time.Since(start).Seconds()) 174 188 } 175 189 176 190 type shard struct { 177 - RepoID uint32 178 - RepoName string 179 - Path string 180 - ModTime time.Time 191 + RepoID uint32 192 + RepoName string 193 + Path string 194 + ModTime time.Time 195 + RepoTombstone bool 181 196 } 182 197 183 198 func getShards(dir string) map[uint32][]shard { ··· 210 225 211 226 for _, repo := range repos { 212 227 shards[repo.ID] = append(shards[repo.ID], shard{ 213 - RepoID: repo.ID, 214 - RepoName: repo.Name, 215 - Path: path, 216 - ModTime: fi.ModTime(), 228 + RepoID: repo.ID, 229 + RepoName: repo.Name, 230 + Path: path, 231 + ModTime: fi.ModTime(), 232 + RepoTombstone: repo.Tombstone, 217 233 }) 218 234 } 219 235 } 220 236 return shards 221 237 } 222 238 239 + // tombstoneDuplicates finds duplicate shards in indexDir and makes it so 240 + // that all of them except one have a tombstone. It tries, on a best effort basis, 241 + // to select the non-tombstoned shard with the latest modification time as the one to 242 + // remain alive. 243 + // The assumption is that duplicate shards exist only as parts of compound shards: 244 + // we haven't seen simple duplicate shards in the wild. 245 + // Duplicate shards should not occur at all but we rarely see them due to a bug somewhere (probably fixed). 246 + // This function is intended to wait for a subsequent call to vacuum that will delete the 247 + // tombstoned repos. 248 + func tombstoneDuplicates(indexDir string) { 249 + // We are only receiving alive shards in this call, so we may skip a 250 + // newer shard if it already has a tombstone. 251 + index := getShards(indexDir) 252 + 253 + for repoID, shardsAll := range index { 254 + shards := make([]shard, 0, len(shardsAll)) 255 + for _, s := range shardsAll { 256 + // One repoID corresponds to multiple shards if either 257 + // a) it is a large repo split across multiple shards, or 258 + // b) it is a repo that is duplicated in several compound shards. 259 + // We are only interested in case b). 260 + if strings.HasPrefix(filepath.Base(s.Path), "compound-") { 261 + shards = append(shards, s) 262 + } 263 + } 264 + if len(shards) <= 1 { 265 + continue 266 + } 267 + 268 + log.Printf("found %d duplicate shards for repoID=%d. Tombstoning all but one", len(shards), repoID) 269 + 270 + latest := 0 271 + for i, s := range shards { 272 + if shards[latest].ModTime.Before(s.ModTime) { 273 + latest = i 274 + } 275 + } 276 + if latest != 0 { 277 + shards[0], shards[latest] = shards[latest], shards[0] 278 + } 279 + 280 + if err := zoekt.UnsetTombstone(shards[0].Path, repoID); err != nil { 281 + log.Printf("error removing tombstone for %v: %s", repoID, err) 282 + } else { 283 + shardsLog(indexDir, "untomb", shards[:1]) 284 + } 285 + for _, s := range shards[1:] { 286 + if !s.RepoTombstone && maybeSetTombstone([]shard{s}, repoID) { 287 + shardsLog(indexDir, "tomb", []shard{s}) 288 + } 289 + } 290 + } 291 + } 292 + 223 293 // getTombstonedRepos return a map of tombstoned repositories in dir. If a 224 294 // repository is tombstoned in more than one compound shard, only the latest one, 225 295 // as determined by the date of the latest commit, is returned. ··· 247 317 continue 248 318 } 249 319 m[repo.ID] = shard{ 250 - RepoID: repo.ID, 251 - RepoName: repo.Name, 252 - Path: p, 253 - ModTime: repo.LatestCommitDate, 320 + RepoID: repo.ID, 321 + RepoName: repo.Name, 322 + Path: p, 323 + ModTime: repo.LatestCommitDate, 324 + RepoTombstone: repo.Tombstone, 254 325 } 255 326 } 256 327 }
+79 -5
cmd/zoekt-sourcegraph-indexserver/cleanup_test.go
··· 13 13 14 14 "github.com/google/go-cmp/cmp" 15 15 "github.com/google/go-cmp/cmp/cmpopts" 16 + 16 17 "github.com/google/zoekt" 17 18 "github.com/google/zoekt/build" 18 19 ) ··· 20 21 func TestCleanup(t *testing.T) { 21 22 mk := func(name string, n int, mtime time.Time) shard { 22 23 return shard{ 23 - RepoID: fakeID(name), 24 - RepoName: name, 25 - Path: fmt.Sprintf("%s_v%d.%05d.zoekt", url.QueryEscape(name), 15, n), 26 - ModTime: mtime, 24 + RepoID: fakeID(name), 25 + RepoName: name, 26 + Path: fmt.Sprintf("%s_v%d.%05d.zoekt", url.QueryEscape(name), 15, n), 27 + ModTime: mtime, 28 + RepoTombstone: false, 27 29 } 28 30 } 29 31 // We don't use getShards so that we have two implementations of the same ··· 432 434 } 433 435 } 434 436 437 + func TestTombstoneDuplicateShards(t *testing.T) { 438 + dir := t.TempDir() 439 + 440 + // enable feature flag 441 + if _, err := os.Create(filepath.Join(dir, "TOMBSTONE_DUPLICATES")); err != nil { 442 + t.Fatal(err) 443 + } 444 + 445 + now := time.Now() 446 + recent := now.Add(-1 * time.Hour) 447 + old := now.Add(-2 * time.Hour) 448 + 449 + cs1 := createCompoundShard(t, dir, []uint32{1, 2, 3}, func(*zoekt.Repository) {}) 450 + 451 + // Hack part 1: hide repos from being tombstoned by Builder.Finish() when creating cs2. 452 + for i := 1; i <= 3; i++ { 453 + if err := zoekt.SetTombstone(cs1, uint32(i)); err != nil { 454 + t.Fatal(err) 455 + } 456 + } 457 + 458 + cs2 := createCompoundShard(t, dir, []uint32{1, 2}, func(*zoekt.Repository) {}) 459 + 460 + // Hack part 2: remove tombstones to create duplicates. 461 + for i := 1; i <= 3; i++ { 462 + if err := zoekt.UnsetTombstone(cs1, uint32(i)); err != nil { 463 + t.Fatal(err) 464 + } 465 + } 466 + 467 + if err := os.Chtimes(cs1, old, old); err != nil { 468 + t.Fatal(err) 469 + } 470 + if err := os.Chtimes(cs2, recent, recent); err != nil { 471 + t.Fatal(err) 472 + } 473 + 474 + // want indexed 475 + repos := []uint32{1, 2, 3} 476 + 477 + cleanup(dir, repos, now, true) 478 + 479 + index := getShards(dir) 480 + trash := getShards(filepath.Join(dir, ".trash")) 481 + 482 + if len(trash) != 0 { 483 + t.Fatalf("expected empty trash, got %+v", trash) 484 + } 485 + 486 + wantIndex := map[uint32][]shard{ 487 + 1: []shard{{ 488 + RepoID: 1, 489 + RepoName: "repo1", 490 + Path: cs2, 491 + }}, 492 + 2: []shard{{ 493 + RepoID: 2, 494 + RepoName: "repo2", 495 + Path: cs2, 496 + }}, 497 + 3: []shard{{ 498 + RepoID: 3, 499 + RepoName: "repo3", 500 + Path: cs1, 501 + }}, 502 + } 503 + 504 + if d := cmp.Diff(wantIndex, index, cmpopts.IgnoreFields(shard{}, "ModTime")); d != "" { 505 + t.Fatalf("-want, +got: %s", d) 506 + } 507 + } 508 + 435 509 // createCompoundShard returns a path to a compound shard containing repos with 436 510 // ids. Use optsFns to overwrite fields of zoekt.Repository for all repos. 437 511 func createCompoundShard(t *testing.T, dir string, ids []uint32, optFns ...func(in *zoekt.Repository)) string { ··· 471 545 } 472 546 473 547 // create a compound shard. 474 - tmpFn, dstFn, err := merge(dir, repoFns) 548 + tmpFn, dstFn, err := merge(t, dir, repoFns) 475 549 if err != nil { 476 550 t.Fatal(err) 477 551 }
+4 -2
cmd/zoekt-sourcegraph-indexserver/meta_test.go
··· 62 62 // create a compound shard. Use a new indexdir to avoid the need to cleanup 63 63 // old shards. 64 64 dir = t.TempDir() 65 - tmpFn, dstFn, err := merge(dir, repoFns) 65 + tmpFn, dstFn, err := merge(t, dir, repoFns) 66 66 if err != nil { 67 67 t.Fatal(err) 68 68 } ··· 102 102 } 103 103 } 104 104 105 - func merge(dstDir string, names []string) (string, string, error) { 105 + func merge(t *testing.T, dstDir string, names []string) (string, string, error) { 106 + t.Helper() 107 + 106 108 var files []zoekt.IndexFile 107 109 for _, fn := range names { 108 110 f, err := os.Open(fn)
+1 -1
cmd/zoekt-sourcegraph-indexserver/sg.go
··· 375 375 } 376 376 for _, opt := range opts { 377 377 if opt.Error != "" { 378 - sf.Log.Printf("WARN: ignoring GetIndexOptions error for %s: %v", opt.Name, err) 378 + sf.Log.Printf("WARN: ignoring GetIndexOptions error for %s: %v", opt.Name, opt.Error) 379 379 continue 380 380 } 381 381 f(opt.IndexOptions)