From f9bbfff9ce46e04c9d4073cd175e6867f7a8ae05 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@tarantool.org>
Date: Mon, 25 Oct 2021 20:00:56 +0300
Subject: [PATCH] box: don't delete inprogress files during normal GC

*.inprogress files shouldn't be deleted during normal GC, because they
may be in use. E.g. GC may happen to run while WAL is rotating xlog;
if GC removes the transient xlog.inprogress file created by WAL (see
xdir_create_xlog), WAL will fail to rename it and abort the current
transaction.

Initially, inprogress files were removed only after recovery. For this
reason, xdir_collect_inprogress, which is used for deleting inprogress
files, accesses FS directly, without COIO. This was changed by commit
5aa243deebbad248566fc421b42cbbe426e83982 ("recovery: build secondary
index in hot standby mode") which moved xdir_collect_inprogress from
memtx_engine_end_recovery to memtx_engine_collect_garbage so that a hot
standby instance doesn't delete inprogress files of the master instance
by mistake.

To fix this issue, let's move xdir_collect_inprogress back where it
belongs, to engine_end_recovery, and introduce a new callback for memtx
to build its secondary keys before entering the hot standby mode -
engine_begin_hot_standby.

Let's also remove engine_collect_garbage from gc_init, which was added
there by the aforementioned commit, probably to make tests pass.

The bug was reported by the vinyl/deferred_delete test (#5089).

Closes #6554
---
 ...h-6554-fix-gc-removing-inprogress-xlogs.md |   4 +
 src/box/blackhole.c                           |   1 +
 src/box/box.cc                                |   4 +-
 src/box/engine.c                              |  18 +++
 src/box/engine.h                              |  20 ++-
 src/box/gc.c                                  |   1 -
 src/box/memtx_engine.cc                       |  27 +++-
 src/box/service_engine.c                      |   1 +
 src/box/sysview.c                             |   1 +
 src/box/vinyl.c                               |   1 +
 src/box/xlog.c                                |   2 +
 src/lib/core/errinj.h                         |   1 +
 test/box/errinj.result                        |   1 +
 ...gh-6554-gc-removes-inprogress-xlogs.result | 118 ++++++++++++++++++
 ...-6554-gc-removes-inprogress-xlogs.test.lua |  56 +++++++++
 test/box/suite.ini                            |   2 +-
 16 files changed, 250 insertions(+), 8 deletions(-)
 create mode 100644 changelogs/unreleased/gh-6554-fix-gc-removing-inprogress-xlogs.md
 create mode 100644 test/box/gh-6554-gc-removes-inprogress-xlogs.result
 create mode 100644 test/box/gh-6554-gc-removes-inprogress-xlogs.test.lua

diff --git a/changelogs/unreleased/gh-6554-fix-gc-removing-inprogress-xlogs.md b/changelogs/unreleased/gh-6554-fix-gc-removing-inprogress-xlogs.md
new file mode 100644
index 0000000000..58e055c271
--- /dev/null
+++ b/changelogs/unreleased/gh-6554-fix-gc-removing-inprogress-xlogs.md
@@ -0,0 +1,4 @@
+## bugfix/core
+
+* Fixed a bug because of which the garbage collector could remove an xlog file
+  that is still in use (gh-6554).
diff --git a/src/box/blackhole.c b/src/box/blackhole.c
index 69f1deba17..b8788580e3 100644
--- a/src/box/blackhole.c
+++ b/src/box/blackhole.c
@@ -188,6 +188,7 @@ static const struct engine_vtab blackhole_engine_vtab = {
 	/* .bootstrap = */ generic_engine_bootstrap,
 	/* .begin_initial_recovery = */ generic_engine_begin_initial_recovery,
 	/* .begin_final_recovery = */ generic_engine_begin_final_recovery,
+	/* .begin_hot_standby = */ generic_engine_begin_hot_standby,
 	/* .end_recovery = */ generic_engine_end_recovery,
 	/* .begin_checkpoint = */ generic_engine_begin_checkpoint,
 	/* .wait_checkpoint = */ generic_engine_wait_checkpoint,
diff --git a/src/box/box.cc b/src/box/box.cc
index 20361db3a3..0aa61b6c1a 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -3401,7 +3401,6 @@ local_recovery(const struct tt_uuid *instance_uuid,
 			diag_raise();
 		diag_log();
 	}
-	engine_end_recovery_xc();
 	/*
 	 * Leave hot standby mode, if any, only after
 	 * acquiring the lock.
@@ -3409,6 +3408,7 @@ local_recovery(const struct tt_uuid *instance_uuid,
 	if (wal_dir_lock < 0) {
 		title("hot_standby");
 		say_info("Entering hot standby mode");
+		engine_begin_hot_standby_xc();
 		recovery_follow_local(recovery, &wal_stream.base, "hot_standby",
 				      cfg_getd("wal_dir_rescan_delay"));
 		while (true) {
@@ -3449,6 +3449,8 @@ local_recovery(const struct tt_uuid *instance_uuid,
 	if (wal_enable() != 0)
 		diag_raise();
 
+	engine_end_recovery_xc();
+
 	/* Check replica set UUID. */
 	if (!tt_uuid_is_nil(replicaset_uuid) &&
 	    !tt_uuid_is_equal(replicaset_uuid, &REPLICASET_UUID)) {
diff --git a/src/box/engine.c b/src/box/engine.c
index 88ed928613..e7375e3c8a 100644
--- a/src/box/engine.c
+++ b/src/box/engine.c
@@ -113,6 +113,17 @@ engine_begin_final_recovery(void)
 	return 0;
 }
 
+int
+engine_begin_hot_standby(void)
+{
+	struct engine *engine;
+	engine_foreach(engine) {
+		if (engine->vtab->begin_hot_standby(engine) != 0)
+			return -1;
+	}
+	return 0;
+}
+
 int
 engine_end_recovery(void)
 {
@@ -348,6 +359,13 @@ generic_engine_begin_final_recovery(struct engine *engine)
 	return 0;
 }
 
+int
+generic_engine_begin_hot_standby(struct engine *engine)
+{
+	(void)engine;
+	return 0;
+}
+
 int
 generic_engine_end_recovery(struct engine *engine)
 {
diff --git a/src/box/engine.h b/src/box/engine.h
index c4da01e139..8e3381f8b5 100644
--- a/src/box/engine.h
+++ b/src/box/engine.h
@@ -147,6 +147,11 @@ struct engine_vtab {
 	 * of WAL catch up durin join on slave side
 	 */
 	int (*begin_final_recovery)(struct engine *);
+	/**
+	 * Notify the engine that the instance is about to enter
+	 * the hot standby mode to complete recovery from WALs.
+	 */
+	int (*begin_hot_standby)(struct engine *);
 	/**
 	 * Inform the engine about the end of recovery from the
 	 * binary log.
@@ -337,9 +342,14 @@ engine_begin_initial_recovery(const struct vclock *recovery_vclock);
 int
 engine_begin_final_recovery(void);
 
+/**
+ * Called before entering the hot standby mode.
+ */
+int
+engine_begin_hot_standby(void);
+
 /**
  * Called at the end of recovery.
- * Build secondary keys in all spaces.
  */
 int
 engine_end_recovery(void);
@@ -395,6 +405,7 @@ int generic_engine_bootstrap(struct engine *);
 int generic_engine_begin_initial_recovery(struct engine *,
 					  const struct vclock *);
 int generic_engine_begin_final_recovery(struct engine *);
+int generic_engine_begin_hot_standby(struct engine *);
 int generic_engine_end_recovery(struct engine *);
 int generic_engine_begin_checkpoint(struct engine *, bool);
 int generic_engine_wait_checkpoint(struct engine *, const struct vclock *);
@@ -478,6 +489,13 @@ engine_begin_final_recovery_xc(void)
 		diag_raise();
 }
 
+static inline void
+engine_begin_hot_standby_xc(void)
+{
+	if (engine_begin_hot_standby() != 0)
+		diag_raise();
+}
+
 static inline void
 engine_end_recovery_xc(void)
 {
diff --git a/src/box/gc.c b/src/box/gc.c
index 10f8999231..9ebe28421b 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -117,7 +117,6 @@ gc_init(void)
 	gc_tree_new(&gc.consumers);
 	fiber_cond_create(&gc.cleanup_cond);
 	checkpoint_schedule_cfg(&gc.checkpoint_schedule, 0, 0);
-	engine_collect_garbage(&gc.vclock);
 
 	gc.cleanup_fiber = fiber_new("gc", gc_cleanup_fiber_f);
 	if (gc.cleanup_fiber == NULL)
diff --git a/src/box/memtx_engine.cc b/src/box/memtx_engine.cc
index de918c3353..c51f52a7ae 100644
--- a/src/box/memtx_engine.cc
+++ b/src/box/memtx_engine.cc
@@ -356,15 +356,33 @@ memtx_engine_begin_final_recovery(struct engine *engine)
 	return 0;
 }
 
+static int
+memtx_engine_begin_hot_standby(struct engine *engine)
+{
+	struct memtx_engine *memtx = (struct memtx_engine *)engine;
+	/*
+	 * Build secondary indexes before entering the hot standby mode
+	 * to quickly switch to the hot standby instance after the master
+	 * instance exits.
+	 */
+	if (memtx->state != MEMTX_OK) {
+		assert(memtx->state == MEMTX_FINAL_RECOVERY);
+		memtx->state = MEMTX_OK;
+		if (space_foreach(memtx_build_secondary_keys, memtx) != 0)
+			return -1;
+	}
+	return 0;
+}
+
 static int
 memtx_engine_end_recovery(struct engine *engine)
 {
 	struct memtx_engine *memtx = (struct memtx_engine *)engine;
 	/*
-	 * Recovery is started with enabled keys when:
-	 * - either of force_recovery
-	 *   is false
+	 * Secondary keys have already been built in the following cases:
+	 * - force_recovery is set
 	 * - it's a replication join
+	 * - instance was in the hot standby mode
 	 */
 	if (memtx->state != MEMTX_OK) {
 		assert(memtx->state == MEMTX_FINAL_RECOVERY);
@@ -372,6 +390,7 @@ memtx_engine_end_recovery(struct engine *engine)
 		if (space_foreach(memtx_build_secondary_keys, memtx) != 0)
 			return -1;
 	}
+	xdir_collect_inprogress(&memtx->snap_dir);
 	return 0;
 }
 
@@ -864,7 +883,6 @@ memtx_engine_collect_garbage(struct engine *engine, const struct vclock *vclock)
 	struct memtx_engine *memtx = (struct memtx_engine *)engine;
 	xdir_collect_garbage(&memtx->snap_dir, vclock_sum(vclock),
 			     XDIR_GC_ASYNC);
-	xdir_collect_inprogress(&memtx->snap_dir);
 }
 
 static int
@@ -1044,6 +1062,7 @@ static const struct engine_vtab memtx_engine_vtab = {
 	/* .bootstrap = */ memtx_engine_bootstrap,
 	/* .begin_initial_recovery = */ memtx_engine_begin_initial_recovery,
 	/* .begin_final_recovery = */ memtx_engine_begin_final_recovery,
+	/* .begin_hot_standby = */ memtx_engine_begin_hot_standby,
 	/* .end_recovery = */ memtx_engine_end_recovery,
 	/* .begin_checkpoint = */ memtx_engine_begin_checkpoint,
 	/* .wait_checkpoint = */ memtx_engine_wait_checkpoint,
diff --git a/src/box/service_engine.c b/src/box/service_engine.c
index 5a33a735a8..216d2abc74 100644
--- a/src/box/service_engine.c
+++ b/src/box/service_engine.c
@@ -106,6 +106,7 @@ static const struct engine_vtab service_engine_vtab = {
 	/* .bootstrap = */ generic_engine_bootstrap,
 	/* .begin_initial_recovery = */ generic_engine_begin_initial_recovery,
 	/* .begin_final_recovery = */ generic_engine_begin_final_recovery,
+	/* .begin_hot_standby = */ generic_engine_begin_hot_standby,
 	/* .end_recovery = */ generic_engine_end_recovery,
 	/* .begin_checkpoint = */ generic_engine_begin_checkpoint,
 	/* .wait_checkpoint = */ generic_engine_wait_checkpoint,
diff --git a/src/box/sysview.c b/src/box/sysview.c
index 00c320b6fc..05aa9ab643 100644
--- a/src/box/sysview.c
+++ b/src/box/sysview.c
@@ -578,6 +578,7 @@ static const struct engine_vtab sysview_engine_vtab = {
 	/* .bootstrap = */ generic_engine_bootstrap,
 	/* .begin_initial_recovery = */ generic_engine_begin_initial_recovery,
 	/* .begin_final_recovery = */ generic_engine_begin_final_recovery,
+	/* .begin_hot_standby = */ generic_engine_begin_hot_standby,
 	/* .end_recovery = */ generic_engine_end_recovery,
 	/* .begin_checkpoint = */ generic_engine_begin_checkpoint,
 	/* .wait_checkpoint = */ generic_engine_wait_checkpoint,
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index c885f4b29f..fabe05849d 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -4521,6 +4521,7 @@ static const struct engine_vtab vinyl_engine_vtab = {
 	/* .bootstrap = */ vinyl_engine_bootstrap,
 	/* .begin_initial_recovery = */ vinyl_engine_begin_initial_recovery,
 	/* .begin_final_recovery = */ vinyl_engine_begin_final_recovery,
+	/* .begin_hot_standby = */ generic_engine_begin_hot_standby,
 	/* .end_recovery = */ vinyl_engine_end_recovery,
 	/* .begin_checkpoint = */ vinyl_engine_begin_checkpoint,
 	/* .wait_checkpoint = */ vinyl_engine_wait_checkpoint,
diff --git a/src/box/xlog.c b/src/box/xlog.c
index 03d2757132..9c6cd59b38 100644
--- a/src/box/xlog.c
+++ b/src/box/xlog.c
@@ -771,6 +771,8 @@ xlog_rename(struct xlog *l)
 	memcpy(new_filename, filename, suffix - filename);
 	new_filename[suffix - filename] = '\0';
 
+	ERROR_INJECT_SLEEP(ERRINJ_XLOG_RENAME_DELAY);
+
 	if (rename(filename, new_filename) != 0) {
 		say_syserror("can't rename %s to %s", filename, new_filename);
 		diag_set(SystemError, "failed to rename '%s' file",
diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
index 12d2814e35..f3f74c6674 100644
--- a/src/lib/core/errinj.h
+++ b/src/lib/core/errinj.h
@@ -166,6 +166,7 @@ struct errinj {
 	_(ERRINJ_XLOG_GARBAGE, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_XLOG_META, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_XLOG_READ, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_XLOG_RENAME_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 529da15781..7cf1d7a639 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -136,6 +136,7 @@ evals
   - ERRINJ_XLOG_GARBAGE: false
   - ERRINJ_XLOG_META: false
   - ERRINJ_XLOG_READ: -1
+  - ERRINJ_XLOG_RENAME_DELAY: false
 ...
 errinj.set("some-injection", true)
 ---
diff --git a/test/box/gh-6554-gc-removes-inprogress-xlogs.result b/test/box/gh-6554-gc-removes-inprogress-xlogs.result
new file mode 100644
index 0000000000..e76505bdab
--- /dev/null
+++ b/test/box/gh-6554-gc-removes-inprogress-xlogs.result
@@ -0,0 +1,118 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+fio = require('fio')
+ | ---
+ | ...
+
+--
+-- gh-6554: GC removes xlog.inprogress files.
+--
+assert(box.cfg.wal_dir == box.cfg.memtx_dir)
+ | ---
+ | - true
+ | ...
+
+function count_inprogress() \
+    return #fio.glob(box.cfg.wal_dir .. '/*.xlog.inprogress') \
+end
+ | ---
+ | ...
+
+-- Run GC after each checkpoint.
+checkpoint_count = box.cfg.checkpoint_count
+ | ---
+ | ...
+box.cfg{checkpoint_count = 1}
+ | ---
+ | ...
+
+s = box.schema.create_space('test')
+ | ---
+ | ...
+_ = s:create_index('pk')
+ | ---
+ | ...
+box.snapshot()
+ | ---
+ | - ok
+ | ...
+
+-- Suspend GC.
+files = box.backup.start()
+ | ---
+ | ...
+
+-- Create a checkpoint.
+_ = s:replace{1}
+ | ---
+ | ...
+box.snapshot()
+ | ---
+ | - ok
+ | ...
+
+-- Block a writer on xlog.inprogress -> xlog rename.
+box.error.injection.set('ERRINJ_XLOG_RENAME_DELAY', true)
+ | ---
+ | - ok
+ | ...
+c = fiber.channel()
+ | ---
+ | ...
+_ = fiber.create(function() local r = pcall(s.replace, s, {1}) c:put(r) end)
+ | ---
+ | ...
+_ = test_run:wait_cond(function() return count_inprogress() > 0 end)
+ | ---
+ | ...
+assert(count_inprogress() == 1)
+ | ---
+ | - true
+ | ...
+
+-- Resume GC and wait for it to delete old files.
+box.backup.stop()
+ | ---
+ | ...
+for _, f in ipairs(files) do \
+    test_run:wait_cond(function() \
+        return not fio.path.exists(f) \
+    end) \
+end
+ | ---
+ | ...
+
+-- The xlog.inprogress file shouldn't be deleted by GC.
+assert(count_inprogress() == 1)
+ | ---
+ | - true
+ | ...
+
+-- Resume the blocked writer and check that it succeeds.
+box.error.injection.set('ERRINJ_XLOG_RENAME_DELAY', false)
+ | ---
+ | - ok
+ | ...
+assert(c:get() == true)
+ | ---
+ | - true
+ | ...
+
+-- The xlog.inprogress file was renamed.
+assert(count_inprogress() == 0)
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+s:drop()
+ | ---
+ | ...
+box.cfg{checkpoint_count = checkpoint_count}
+ | ---
+ | ...
diff --git a/test/box/gh-6554-gc-removes-inprogress-xlogs.test.lua b/test/box/gh-6554-gc-removes-inprogress-xlogs.test.lua
new file mode 100644
index 0000000000..59651ace60
--- /dev/null
+++ b/test/box/gh-6554-gc-removes-inprogress-xlogs.test.lua
@@ -0,0 +1,56 @@
+test_run = require('test_run').new()
+fiber = require('fiber')
+fio = require('fio')
+
+--
+-- gh-6554: GC removes xlog.inprogress files.
+--
+assert(box.cfg.wal_dir == box.cfg.memtx_dir)
+
+function count_inprogress() \
+    return #fio.glob(box.cfg.wal_dir .. '/*.xlog.inprogress') \
+end
+
+-- Run GC after each checkpoint.
+checkpoint_count = box.cfg.checkpoint_count
+box.cfg{checkpoint_count = 1}
+
+s = box.schema.create_space('test')
+_ = s:create_index('pk')
+box.snapshot()
+
+-- Suspend GC.
+files = box.backup.start()
+
+-- Create a checkpoint.
+_ = s:replace{1}
+box.snapshot()
+
+-- Block a writer on xlog.inprogress -> xlog rename.
+box.error.injection.set('ERRINJ_XLOG_RENAME_DELAY', true)
+c = fiber.channel()
+_ = fiber.create(function() local r = pcall(s.replace, s, {1}) c:put(r) end)
+_ = test_run:wait_cond(function() return count_inprogress() > 0 end)
+assert(count_inprogress() == 1)
+
+-- Resume GC and wait for it to delete old files.
+box.backup.stop()
+for _, f in ipairs(files) do \
+    test_run:wait_cond(function() \
+        return not fio.path.exists(f) \
+    end) \
+end
+
+-- The xlog.inprogress file shouldn't be deleted by GC.
+assert(count_inprogress() == 1)
+
+-- Resume the blocked writer and check that it succeeds.
+box.error.injection.set('ERRINJ_XLOG_RENAME_DELAY', false)
+assert(c:get() == true)
+
+-- The xlog.inprogress file was renamed.
+assert(count_inprogress() == 0)
+
+-- Cleanup.
+s:drop()
+box.cfg{checkpoint_count = checkpoint_count}
diff --git a/test/box/suite.ini b/test/box/suite.ini
index a53ba4d41c..c3c1b75a2f 100644
--- a/test/box/suite.ini
+++ b/test/box/suite.ini
@@ -5,7 +5,7 @@ script = box.lua
 disabled = rtree_errinj.test.lua tuple_bench.test.lua
 long_run = huge_field_map_long.test.lua alter-primary-index-tuple-leak-long.test.lua
 config = engine.cfg
-release_disabled = errinj.test.lua errinj_index.test.lua rtree_errinj.test.lua upsert_errinj.test.lua iproto_stress.test.lua gh-4648-func-load-unload.test.lua net.box_discard_console_request_gh-6249.test.lua gh-5998-one-tx-for-ddl-errinj.test.lua net.box_closing_without_lost_gh-6338.test.lua net.box_iproto_id.test.lua net.box_error_extension_feature.test.lua gh-6092-invalid-listen-uri.test.lua
+release_disabled = errinj.test.lua errinj_index.test.lua rtree_errinj.test.lua upsert_errinj.test.lua iproto_stress.test.lua gh-4648-func-load-unload.test.lua net.box_discard_console_request_gh-6249.test.lua gh-5998-one-tx-for-ddl-errinj.test.lua net.box_closing_without_lost_gh-6338.test.lua net.box_iproto_id.test.lua net.box_error_extension_feature.test.lua gh-6092-invalid-listen-uri.test.lua gh-6554-gc-removes-inprogress-xlogs.test.lua
 lua_libs = lua/fifo.lua lua/utils.lua lua/bitset.lua lua/index_random_test.lua lua/push.lua lua/identifier.lua lua/txn_proxy.lua
 use_unix_sockets = True
 use_unix_sockets_iproto = True
-- 
GitLab