From 9d3859b246977c307a22f5930755fe77e374a634 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@tarantool.org>
Date: Tue, 11 Jun 2024 17:05:58 +0300
Subject: [PATCH] vinyl: fix gc vs vylog race leading to duplicate record

Vinyl run files aren't always deleted immediately after compaction,
because we need to keep run files corresponding to checkpoints for
backups. Such run files are deleted by the garbage collection procedure,
which performs the following steps:

 1. Loads information about all run files from the last vylog file.
 2. For each loaded run record that is marked as dropped:
    a. Tries to remove the run files.
    b. On success, writes a "forget" record for the dropped run,
       which will make vylog purge the run record on the next
       vylog rotation (checkpoint).

(see `vinyl_engine_collect_garbage()`)

The garbage collection procedure writes the "forget" records
asynchronously using `vy_log_tx_try_commit()`, see `vy_gc_run()`.
This procedure can be successfully executed during vylog rotation,
because it doesn't take the vylog latch. It simply appends records
to a memory buffer which is flushed either on the next synchronous
vylog write or vylog recovery.

The problem is that the garbage collection isn't necessarily loads
the latest vylog file because the vylog file may be rotated between
it calls `vy_log_signature()` and `vy_recovery_new()`. This may
result in a "forget" record written twice to the same vylog file
for the same run file, as follows:

  1. GC loads last vylog N
  2. GC starts removing dropped run files.
  3. CHECKPOINT starts vylog rotation.
  4. CHECKPOINT loads vylog N.
  5. GC writes a "forget" record for run A to the buffer.
  6. GC is completed.
  7. GC is restarted.
  8. GC finds that the last vylog is N and blocks on the vylog latch
     trying to load it.
  9. CHECKPOINT saves vylog M (M > N).
 10. GC loads vylog N. This triggers flushing the forget record for
     run A to vylog M (not to vylog N), because vylog M is the last
     vylog at this point of time.
 11. GC starts removing dropped run files.
 12. GC writes a "forget" record for run A to the buffer again,
     because in vylog N it's still marked as dropped and not forgotten.
     (The previous "forget" record was written to vylog M).
 13. Now we have two "forget" records for run A in vylog M.

Such duplicate run records aren't tolerated by the vylog recovery
procedure, resulting in a permanent error on the next checkpoint:

```
ER_INVALID_VYLOG_FILE: Invalid VYLOG file: Run XXXX forgotten but not registered
```

To fix this issue, we move `vy_log_signature()` under the vylog latch
to `vy_recovery_new()`. This makes sure that GC will see vylog records
that it's written during the previous execution.

Catching this race in a function test would require a bunch of ugly
error injections so let's assume that it'll be tested by fuzzing.

Closes #10128

NO_DOC=bug fix
NO_TEST=tested manually with fuzzer
---
 changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md | 4 ++++
 src/box/vinyl.c                                        | 2 +-
 src/box/vy_log.c                                       | 9 +++++----
 src/box/vy_log.h                                       | 9 +++++++++
 4 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md

diff --git a/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md b/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md
new file mode 100644
index 0000000000..ef4c9a3b00
--- /dev/null
+++ b/changelogs/unreleased/gh-10128-vy-invalid-vylog-fix.md
@@ -0,0 +1,4 @@
+## bugfix/vinyl
+
+* Fixed a bug when recovery failed with the error "Invalid VYLOG file: Run XXXX
+  forgotten but not registered" (gh-10128).
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 19c6ff9ea0..40c47a3cab 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -3257,7 +3257,7 @@ vinyl_engine_collect_garbage(struct engine *engine, const struct vclock *vclock)
 	vy_log_collect_garbage(vclock);
 
 	/* Cleanup run files. */
-	struct vy_recovery *recovery = vy_recovery_new(vy_log_signature(), 0);
+	struct vy_recovery *recovery = vy_recovery_new(-1, 0);
 	if (recovery == NULL) {
 		say_error("failed to recover vylog for garbage collection");
 		return;
diff --git a/src/box/vy_log.c b/src/box/vy_log.c
index 40270bf789..b9cb67bb70 100644
--- a/src/box/vy_log.c
+++ b/src/box/vy_log.c
@@ -1003,8 +1003,7 @@ static int
 vy_log_rebootstrap(void)
 {
 	struct vy_recovery *recovery;
-	recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint),
-				   VY_RECOVERY_ABORT_REBOOTSTRAP);
+	recovery = vy_recovery_new(-1, VY_RECOVERY_ABORT_REBOOTSTRAP);
 	if (recovery == NULL)
 		return -1;
 
@@ -1169,8 +1168,7 @@ vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery)
 	 * failed, and we need to mark rebootstrap as aborted.
 	 */
 	struct vy_recovery *recovery;
-	recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint),
-				   VY_RECOVERY_ABORT_REBOOTSTRAP);
+	recovery = vy_recovery_new(-1, VY_RECOVERY_ABORT_REBOOTSTRAP);
 	if (recovery == NULL)
 		return NULL;
 
@@ -2463,6 +2461,7 @@ vy_recovery_new_locked(int64_t signature, int flags)
 	int rc;
 	struct vy_recovery *recovery;
 
+	assert(signature >= 0);
 	assert(latch_owner(&vy_log.latch) == fiber());
 	/*
 	 * Before proceeding to log recovery, make sure that all
@@ -2490,6 +2489,8 @@ vy_recovery_new(int64_t signature, int flags)
 {
 	/* Lock out concurrent writers while we are loading the log. */
 	latch_lock(&vy_log.latch);
+	if (signature < 0)
+		signature = vclock_sum(&vy_log.last_checkpoint);
 	struct vy_recovery *recovery;
 	recovery = vy_recovery_new_locked(signature, flags);
 	latch_unlock(&vy_log.latch);
diff --git a/src/box/vy_log.h b/src/box/vy_log.h
index c98cb37f73..603298fef8 100644
--- a/src/box/vy_log.h
+++ b/src/box/vy_log.h
@@ -593,6 +593,15 @@ enum vy_recovery_flag {
  * Create a recovery context from the metadata log created
  * by checkpoint with the given signature.
  *
+ * If the signature is -1, the function loads the metadata log corresponding
+ * to the last created checkpoint. Note, it isn't quite the same as passing
+ * vy_log_signature(). The latter opens a time window for a log rotation so
+ * it may load a stale metadata log while using -1 locks out concurrent log
+ * rotation and thus guarantees that the function loads the latest log.
+ * This is important if the caller writes new log records asynchronously
+ * (with vy_log_tx_try_commit()) basing on the recovered state and expects
+ * them to be recovered on the next execution.
+ *
  * For valid values of @flags, see vy_recovery_flag.
  *
  * Returns NULL on failure.
-- 
GitLab