From 9d3859b246977c307a22f5930755fe77e374a634 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov@tarantool.org> Date: Tue, 11 Jun 2024 17:05:58 +0300 Subject: [PATCH] vinyl: fix gc vs vylog race leading to duplicate record Vinyl run files aren't always deleted immediately after compaction, because we need to keep run files corresponding to checkpoints for backups. Such run files are deleted by the garbage collection procedure, which performs the following steps: 1. Loads information about all run files from the last vylog file. 2. For each loaded run record that is marked as dropped: a. Tries to remove the run files. b. On success, writes a "forget" record for the dropped run, which will make vylog purge the run record on the next vylog rotation (checkpoint). (see `vinyl_engine_collect_garbage()`) The garbage collection procedure writes the "forget" records asynchronously using `vy_log_tx_try_commit()`, see `vy_gc_run()`. This procedure can be successfully executed during vylog rotation, because it doesn't take the vylog latch. It simply appends records to a memory buffer which is flushed either on the next synchronous vylog write or vylog recovery. The problem is that the garbage collection isn't necessarily loads the latest vylog file because the vylog file may be rotated between it calls `vy_log_signature()` and `vy_recovery_new()`. This may result in a "forget" record written twice to the same vylog file for the same run file, as follows: 1. GC loads last vylog N 2. GC starts removing dropped run files. 3. CHECKPOINT starts vylog rotation. 4. CHECKPOINT loads vylog N. 5. GC writes a "forget" record for run A to the buffer. 6. GC is completed. 7. GC is restarted. 8. GC finds that the last vylog is N and blocks on the vylog latch trying to load it. 9. CHECKPOINT saves vylog M (M > N). 10. GC loads vylog N. This triggers flushing the forget record for run A to vylog M (not to vylog N), because vylog M is the last vylog at this point of time. 11. GC starts removing dropped run files. 12. GC writes a "forget" record for run A to the buffer again, because in vylog N it's still marked as dropped and not forgotten. (The previous "forget" record was written to vylog M). 13. Now we have two "forget" records for run A in vylog M. Such duplicate run records aren't tolerated by the vylog recovery procedure, resulting in a permanent error on the next checkpoint: ``` ER_INVALID_VYLOG_FILE: Invalid VYLOG file: Run XXXX forgotten but not registered ``` To fix this issue, we move `vy_log_signature()` under the vylog latch to `vy_recovery_new()`. This makes sure that GC will see vylog records that it's written during the previous execution. Catching this race in a function test would require a bunch of ugly error injections so let's assume that it'll be tested by fuzzing. Closes #10128 NO_DOC=bug fix NO_TEST=tested manually with fuzzer --- changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md | 4 ++++ src/box/vinyl.c | 2 +- src/box/vy_log.c | 9 +++++---- src/box/vy_log.h | 9 +++++++++ 4 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md diff --git a/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md b/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md new file mode 100644 index 0000000000..ef4c9a3b00 --- /dev/null +++ b/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md @@ -0,0 +1,4 @@ +## bugfix/vinyl + +* Fixed a bug when recovery failed with the error "Invalid VYLOG file: Run XXXX + forgotten but not registered" (gh-10128). diff --git a/src/box/vinyl.c b/src/box/vinyl.c index 19c6ff9ea0..40c47a3cab 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -3257,7 +3257,7 @@ vinyl_engine_collect_garbage(struct engine *engine, const struct vclock *vclock) vy_log_collect_garbage(vclock); /* Cleanup run files. */ - struct vy_recovery *recovery = vy_recovery_new(vy_log_signature(), 0); + struct vy_recovery *recovery = vy_recovery_new(-1, 0); if (recovery == NULL) { say_error("failed to recover vylog for garbage collection"); return; diff --git a/src/box/vy_log.c b/src/box/vy_log.c index 40270bf789..b9cb67bb70 100644 --- a/src/box/vy_log.c +++ b/src/box/vy_log.c @@ -1003,8 +1003,7 @@ static int vy_log_rebootstrap(void) { struct vy_recovery *recovery; - recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint), - VY_RECOVERY_ABORT_REBOOTSTRAP); + recovery = vy_recovery_new(-1, VY_RECOVERY_ABORT_REBOOTSTRAP); if (recovery == NULL) return -1; @@ -1169,8 +1168,7 @@ vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery) * failed, and we need to mark rebootstrap as aborted. */ struct vy_recovery *recovery; - recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint), - VY_RECOVERY_ABORT_REBOOTSTRAP); + recovery = vy_recovery_new(-1, VY_RECOVERY_ABORT_REBOOTSTRAP); if (recovery == NULL) return NULL; @@ -2463,6 +2461,7 @@ vy_recovery_new_locked(int64_t signature, int flags) int rc; struct vy_recovery *recovery; + assert(signature >= 0); assert(latch_owner(&vy_log.latch) == fiber()); /* * Before proceeding to log recovery, make sure that all @@ -2490,6 +2489,8 @@ vy_recovery_new(int64_t signature, int flags) { /* Lock out concurrent writers while we are loading the log. */ latch_lock(&vy_log.latch); + if (signature < 0) + signature = vclock_sum(&vy_log.last_checkpoint); struct vy_recovery *recovery; recovery = vy_recovery_new_locked(signature, flags); latch_unlock(&vy_log.latch); diff --git a/src/box/vy_log.h b/src/box/vy_log.h index c98cb37f73..603298fef8 100644 --- a/src/box/vy_log.h +++ b/src/box/vy_log.h @@ -593,6 +593,15 @@ enum vy_recovery_flag { * Create a recovery context from the metadata log created * by checkpoint with the given signature. * + * If the signature is -1, the function loads the metadata log corresponding + * to the last created checkpoint. Note, it isn't quite the same as passing + * vy_log_signature(). The latter opens a time window for a log rotation so + * it may load a stale metadata log while using -1 locks out concurrent log + * rotation and thus guarantees that the function loads the latest log. + * This is important if the caller writes new log records asynchronously + * (with vy_log_tx_try_commit()) basing on the recovered state and expects + * them to be recovered on the next execution. + * * For valid values of @flags, see vy_recovery_flag. * * Returns NULL on failure. -- GitLab