From 9c446a205f11ca400deef04ff6c767b5a29458b6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov.dev@gmail.com> Date: Fri, 16 Feb 2018 15:58:03 +0300 Subject: [PATCH] vinyl: delete runs compacted during join immediately We keep run files corresponding to (at least) the last snapshot, because we need them for backups and replication. Deletion of compacted run files is postponed until the next snapshot. As a consequence, we don't delete run files created on a replica during the join stage. However, in contrast to run files created during normal operation, these are pure garbage and should be deleted right away. Not deleting them can result in depletion of disk space, because vinyl has quite high write amplification by design. We can't write a functional test for this, because there's no way to guarantee that compaction started during join will finish before join completion - if it doesn't, compacted runs won't be removed, because they will be assigned to the snapshot created by join. Closes #3162 --- src/box/vinyl.c | 17 ++--------- src/box/vy_run.c | 19 +++++++++++++ src/box/vy_run.h | 9 ++++++ src/box/vy_scheduler.c | 17 +++++++++++ .../{join_quota.lua => replica_quota.lua} | 0 test/vinyl/replica_quota.result | 28 +++++++++++++++++-- test/vinyl/replica_quota.test.lua | 16 +++++++++-- 7 files changed, 87 insertions(+), 19 deletions(-) rename test/vinyl/{join_quota.lua => replica_quota.lua} (100%) diff --git a/src/box/vinyl.c b/src/box/vinyl.c index d5d5adc11c..1ceeb847cf 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -54,7 +54,6 @@ #include <small/region.h> #include <small/mempool.h> -#include "coio_file.h" #include "coio_task.h" #include "cbus.h" #include "histogram.h" @@ -3340,20 +3339,8 @@ vy_gc_cb(const struct vy_log_record *record, void *cb_arg) (long long)record->run_id); goto out;}); /* Try to delete files. */ - bool forget = true; - char path[PATH_MAX]; - for (int type = 0; type < vy_file_MAX; type++) { - vy_run_snprint_path(path, sizeof(path), arg->env->path, - arg->space_id, arg->index_id, - record->run_id, type); - say_info("removing %s", path); - if (coio_unlink(path) < 0 && errno != ENOENT) { - say_syserror("error while removing %s", path); - forget = false; - } - } - - if (!forget) + if (vy_run_remove_files(arg->env->path, arg->space_id, + arg->index_id, record->run_id) != 0) goto out; /* Forget the run on success. */ diff --git a/src/box/vy_run.c b/src/box/vy_run.c index fa4c5f9b9f..32fdd67bc9 100644 --- a/src/box/vy_run.c +++ b/src/box/vy_run.c @@ -37,6 +37,7 @@ #include "fio.h" #include "cbus.h" #include "memory.h" +#include "coio_file.h" #include "replication.h" #include "tuple_hash.h" /* for bloom filter */ @@ -2604,6 +2605,24 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir, return -1; } +int +vy_run_remove_files(const char *dir, uint32_t space_id, + uint32_t iid, int64_t run_id) +{ + int ret = 0; + char path[PATH_MAX]; + for (int type = 0; type < vy_file_MAX; type++) { + vy_run_snprint_path(path, sizeof(path), dir, + space_id, iid, run_id, type); + say_info("removing %s", path); + if (coio_unlink(path) < 0 && errno != ENOENT) { + say_syserror("error while removing %s", path); + ret = -1; + } + } + return ret; +} + /** * Read a page with stream->page_no from the run and save it in stream->page. * Support function of slice stream. diff --git a/src/box/vy_run.h b/src/box/vy_run.h index cf0569c271..1dde077662 100644 --- a/src/box/vy_run.h +++ b/src/box/vy_run.h @@ -403,6 +403,15 @@ vy_run_snprint_path(char *buf, int size, const char *dir, return total; } +/** + * Remove all files (data, index) corresponding to a run + * with the given id. Return 0 on success, -1 if unlink() + * failed. + */ +int +vy_run_remove_files(const char *dir, uint32_t space_id, + uint32_t iid, int64_t run_id); + int vy_run_write(struct vy_run *run, const char *dirpath, uint32_t space_id, uint32_t iid, diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c index 35fc24f43c..df8eb87fab 100644 --- a/src/box/vy_scheduler.c +++ b/src/box/vy_scheduler.c @@ -1071,6 +1071,23 @@ vy_task_compact_complete(struct vy_scheduler *scheduler, struct vy_task *task) return -1; } + if (gc_lsn < 0) { + /* + * If there is no last snapshot, i.e. we are in + * the middle of join, we can delete compacted + * run files right away. + */ + vy_log_tx_begin(); + rlist_foreach_entry(run, &unused_runs, in_unused) { + if (vy_run_remove_files(index->env->path, + index->space_id, index->id, + run->id) == 0) { + vy_log_forget_run(run->id); + } + } + vy_log_tx_try_commit(); + } + /* * Account the new run if it is not empty, * otherwise discard it. diff --git a/test/vinyl/join_quota.lua b/test/vinyl/replica_quota.lua similarity index 100% rename from test/vinyl/join_quota.lua rename to test/vinyl/replica_quota.lua diff --git a/test/vinyl/replica_quota.result b/test/vinyl/replica_quota.result index b85c7398b9..460cc1e61e 100644 --- a/test/vinyl/replica_quota.result +++ b/test/vinyl/replica_quota.result @@ -10,7 +10,7 @@ box.schema.user.grant('guest', 'replication') s = box.schema.space.create('test', { engine = 'vinyl' }) --- ... -_ = s:create_index('pk') +_ = s:create_index('pk', {run_count_per_level = 1}) --- ... -- Send > 2 MB to replica. @@ -36,7 +36,7 @@ for i = 1001,2000 do s:insert{i, pad} end -- a dump to complete and hence would result in bootstrap failure -- were the timeout not ignored. -- -_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'") +_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'") --- ... _ = test_run:cmd("start server replica") @@ -58,6 +58,30 @@ _ = test_run:cmd("start server replica") _ = test_run:wait_lsn('replica', 'default') --- ... +-- During join we remove compacted run files immediately (gh-3162). +-- Check that we don't delete files that are still in use. +_ = test_run:cmd("stop server replica") +--- +... +_ = test_run:cmd("cleanup server replica") +--- +... +box.snapshot() +--- +- ok +... +for i = 3001,4000 do s:insert{i, pad} end +--- +... +_ = test_run:cmd("start server replica") -- join +--- +... +_ = test_run:cmd("stop server replica") +--- +... +_ = test_run:cmd("start server replica") -- recovery +--- +... _ = test_run:cmd("stop server replica") --- ... diff --git a/test/vinyl/replica_quota.test.lua b/test/vinyl/replica_quota.test.lua index ab89c1bc71..eade6f2f75 100644 --- a/test/vinyl/replica_quota.test.lua +++ b/test/vinyl/replica_quota.test.lua @@ -4,7 +4,7 @@ box.schema.user.grant('guest', 'read,write,execute', 'universe') box.schema.user.grant('guest', 'replication') s = box.schema.space.create('test', { engine = 'vinyl' }) -_ = s:create_index('pk') +_ = s:create_index('pk', {run_count_per_level = 1}) -- Send > 2 MB to replica. pad = string.rep('x', 1100) @@ -21,7 +21,7 @@ for i = 1001,2000 do s:insert{i, pad} end -- a dump to complete and hence would result in bootstrap failure -- were the timeout not ignored. -- -_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'") +_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'") _ = test_run:cmd("start server replica") _ = test_run:wait_lsn('replica', 'default') @@ -31,6 +31,18 @@ for i = 2001,3000 do s:insert{i, pad} end _ = test_run:cmd("start server replica") _ = test_run:wait_lsn('replica', 'default') +-- During join we remove compacted run files immediately (gh-3162). +-- Check that we don't delete files that are still in use. +_ = test_run:cmd("stop server replica") +_ = test_run:cmd("cleanup server replica") + +box.snapshot() +for i = 3001,4000 do s:insert{i, pad} end + +_ = test_run:cmd("start server replica") -- join +_ = test_run:cmd("stop server replica") +_ = test_run:cmd("start server replica") -- recovery + _ = test_run:cmd("stop server replica") _ = test_run:cmd("cleanup server replica") -- GitLab