From 9c446a205f11ca400deef04ff6c767b5a29458b6 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Date: Fri, 16 Feb 2018 15:58:03 +0300
Subject: [PATCH] vinyl: delete runs compacted during join immediately

We keep run files corresponding to (at least) the last snapshot, because
we need them for backups and replication. Deletion of compacted run
files is postponed until the next snapshot. As a consequence, we don't
delete run files created on a replica during the join stage. However, in
contrast to run files created during normal operation, these are pure
garbage and should be deleted right away. Not deleting them can result
in depletion of disk space, because vinyl has quite high write
amplification by design.

We can't write a functional test for this, because there's no way to
guarantee that compaction started during join will finish before join
completion - if it doesn't, compacted runs won't be removed, because
they will be assigned to the snapshot created by join.

Closes #3162
---
 src/box/vinyl.c                               | 17 ++---------
 src/box/vy_run.c                              | 19 +++++++++++++
 src/box/vy_run.h                              |  9 ++++++
 src/box/vy_scheduler.c                        | 17 +++++++++++
 .../{join_quota.lua => replica_quota.lua}     |  0
 test/vinyl/replica_quota.result               | 28 +++++++++++++++++--
 test/vinyl/replica_quota.test.lua             | 16 +++++++++--
 7 files changed, 87 insertions(+), 19 deletions(-)
 rename test/vinyl/{join_quota.lua => replica_quota.lua} (100%)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index d5d5adc11c..1ceeb847cf 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -54,7 +54,6 @@
 #include <small/region.h>
 #include <small/mempool.h>
 
-#include "coio_file.h"
 #include "coio_task.h"
 #include "cbus.h"
 #include "histogram.h"
@@ -3340,20 +3339,8 @@ vy_gc_cb(const struct vy_log_record *record, void *cb_arg)
 				(long long)record->run_id); goto out;});
 
 	/* Try to delete files. */
-	bool forget = true;
-	char path[PATH_MAX];
-	for (int type = 0; type < vy_file_MAX; type++) {
-		vy_run_snprint_path(path, sizeof(path), arg->env->path,
-				    arg->space_id, arg->index_id,
-				    record->run_id, type);
-		say_info("removing %s", path);
-		if (coio_unlink(path) < 0 && errno != ENOENT) {
-			say_syserror("error while removing %s", path);
-			forget = false;
-		}
-	}
-
-	if (!forget)
+	if (vy_run_remove_files(arg->env->path, arg->space_id,
+				arg->index_id, record->run_id) != 0)
 		goto out;
 
 	/* Forget the run on success. */
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index fa4c5f9b9f..32fdd67bc9 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -37,6 +37,7 @@
 #include "fio.h"
 #include "cbus.h"
 #include "memory.h"
+#include "coio_file.h"
 
 #include "replication.h"
 #include "tuple_hash.h" /* for bloom filter */
@@ -2604,6 +2605,24 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 	return -1;
 }
 
+int
+vy_run_remove_files(const char *dir, uint32_t space_id,
+		    uint32_t iid, int64_t run_id)
+{
+	int ret = 0;
+	char path[PATH_MAX];
+	for (int type = 0; type < vy_file_MAX; type++) {
+		vy_run_snprint_path(path, sizeof(path), dir,
+				    space_id, iid, run_id, type);
+		say_info("removing %s", path);
+		if (coio_unlink(path) < 0 && errno != ENOENT) {
+			say_syserror("error while removing %s", path);
+			ret = -1;
+		}
+	}
+	return ret;
+}
+
 /**
  * Read a page with stream->page_no from the run and save it in stream->page.
  * Support function of slice stream.
diff --git a/src/box/vy_run.h b/src/box/vy_run.h
index cf0569c271..1dde077662 100644
--- a/src/box/vy_run.h
+++ b/src/box/vy_run.h
@@ -403,6 +403,15 @@ vy_run_snprint_path(char *buf, int size, const char *dir,
 	return total;
 }
 
+/**
+ * Remove all files (data, index) corresponding to a run
+ * with the given id. Return 0 on success, -1 if unlink()
+ * failed.
+ */
+int
+vy_run_remove_files(const char *dir, uint32_t space_id,
+		    uint32_t iid, int64_t run_id);
+
 int
 vy_run_write(struct vy_run *run, const char *dirpath,
 	     uint32_t space_id, uint32_t iid,
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 35fc24f43c..df8eb87fab 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1071,6 +1071,23 @@ vy_task_compact_complete(struct vy_scheduler *scheduler, struct vy_task *task)
 		return -1;
 	}
 
+	if (gc_lsn < 0) {
+		/*
+		 * If there is no last snapshot, i.e. we are in
+		 * the middle of join, we can delete compacted
+		 * run files right away.
+		 */
+		vy_log_tx_begin();
+		rlist_foreach_entry(run, &unused_runs, in_unused) {
+			if (vy_run_remove_files(index->env->path,
+						index->space_id, index->id,
+						run->id) == 0) {
+				vy_log_forget_run(run->id);
+			}
+		}
+		vy_log_tx_try_commit();
+	}
+
 	/*
 	 * Account the new run if it is not empty,
 	 * otherwise discard it.
diff --git a/test/vinyl/join_quota.lua b/test/vinyl/replica_quota.lua
similarity index 100%
rename from test/vinyl/join_quota.lua
rename to test/vinyl/replica_quota.lua
diff --git a/test/vinyl/replica_quota.result b/test/vinyl/replica_quota.result
index b85c7398b9..460cc1e61e 100644
--- a/test/vinyl/replica_quota.result
+++ b/test/vinyl/replica_quota.result
@@ -10,7 +10,7 @@ box.schema.user.grant('guest', 'replication')
 s = box.schema.space.create('test', { engine = 'vinyl' })
 ---
 ...
-_ = s:create_index('pk')
+_ = s:create_index('pk', {run_count_per_level = 1})
 ---
 ...
 -- Send > 2 MB to replica.
@@ -36,7 +36,7 @@ for i = 1001,2000 do s:insert{i, pad} end
 -- a dump to complete and hence would result in bootstrap failure
 -- were the timeout not ignored.
 --
-_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'")
+_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'")
 ---
 ...
 _ = test_run:cmd("start server replica")
@@ -58,6 +58,30 @@ _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 ---
 ...
+-- During join we remove compacted run files immediately (gh-3162).
+-- Check that we don't delete files that are still in use.
+_ = test_run:cmd("stop server replica")
+---
+...
+_ = test_run:cmd("cleanup server replica")
+---
+...
+box.snapshot()
+---
+- ok
+...
+for i = 3001,4000 do s:insert{i, pad} end
+---
+...
+_ = test_run:cmd("start server replica") -- join
+---
+...
+_ = test_run:cmd("stop server replica")
+---
+...
+_ = test_run:cmd("start server replica") -- recovery
+---
+...
 _ = test_run:cmd("stop server replica")
 ---
 ...
diff --git a/test/vinyl/replica_quota.test.lua b/test/vinyl/replica_quota.test.lua
index ab89c1bc71..eade6f2f75 100644
--- a/test/vinyl/replica_quota.test.lua
+++ b/test/vinyl/replica_quota.test.lua
@@ -4,7 +4,7 @@ box.schema.user.grant('guest', 'read,write,execute', 'universe')
 box.schema.user.grant('guest', 'replication')
 
 s = box.schema.space.create('test', { engine = 'vinyl' })
-_ = s:create_index('pk')
+_ = s:create_index('pk', {run_count_per_level = 1})
 
 -- Send > 2 MB to replica.
 pad = string.rep('x', 1100)
@@ -21,7 +21,7 @@ for i = 1001,2000 do s:insert{i, pad} end
 -- a dump to complete and hence would result in bootstrap failure
 -- were the timeout not ignored.
 --
-_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'")
+_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'")
 _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 
@@ -31,6 +31,18 @@ for i = 2001,3000 do s:insert{i, pad} end
 _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 
+-- During join we remove compacted run files immediately (gh-3162).
+-- Check that we don't delete files that are still in use.
+_ = test_run:cmd("stop server replica")
+_ = test_run:cmd("cleanup server replica")
+
+box.snapshot()
+for i = 3001,4000 do s:insert{i, pad} end
+
+_ = test_run:cmd("start server replica") -- join
+_ = test_run:cmd("stop server replica")
+_ = test_run:cmd("start server replica") -- recovery
+
 _ = test_run:cmd("stop server replica")
 _ = test_run:cmd("cleanup server replica")
 
-- 
GitLab