diff --git a/changelogs/unreleased/skip-newer-than-snap-vylog.md b/changelogs/unreleased/skip-newer-than-snap-vylog.md new file mode 100644 index 0000000000000000000000000000000000000000..5b21121f07deb2341358086b5d6429d86719f98f --- /dev/null +++ b/changelogs/unreleased/skip-newer-than-snap-vylog.md @@ -0,0 +1,5 @@ +## bugfix/vinyl + +* Make recovery with the force_recovery option delete vylog files newer than + the snapshot, so that an instance can recover after incidents during + checkpoint (gh-5823). diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c index 7d4c3789c5908a14092e7028dd68cd9817a282ce..8881f7eecbcd2db56b91197f72d80f4c8522804b 100644 --- a/src/box/memtx_engine.c +++ b/src/box/memtx_engine.c @@ -759,6 +759,7 @@ static void memtx_engine_commit_checkpoint(struct engine *engine, const struct vclock *vclock) { + ERROR_INJECT_TERMINATE(ERRINJ_SNAP_COMMIT_FAIL); (void) vclock; struct memtx_engine *memtx = (struct memtx_engine *)engine; diff --git a/src/box/vy_log.c b/src/box/vy_log.c index 81db6f56eba32a420809c7bd906aa6975049a469..c17c00e192bfa24d5402473938d4617ca82585ab 100644 --- a/src/box/vy_log.c +++ b/src/box/vy_log.c @@ -1030,7 +1030,6 @@ struct vy_recovery * vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery) { assert(vy_log.recovery == NULL); - (void) force_recovery; /* * Do not fail recovery if vinyl directory does not exist, @@ -1057,11 +1056,40 @@ vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery) /* * Last vy_log log is newer than the last snapshot. * This can't normally happen, as vy_log is rotated - * after snapshot is created. Looks like somebody + * in a short gap between checkpoint wait and commit. + * However, if memtx for some reason fails to commit its + * changes, instance will crash leaving .inprogress snap + * and corresponding (already rotated) vylog. + * Another and simpler reason is the case when somebody * deleted snap file, but forgot to delete vy_log.
+ * So in case we are anyway in force recovery mode, let's + * try to delete last .vylog file and continue recovery process. */ - diag_set(ClientError, ER_MISSING_SNAPSHOT); - return NULL; + if (!force_recovery) { + diag_set(ClientError, ER_MISSING_SNAPSHOT); + say_info("To bootstrap instance try to remove last " + ".vylog file or run in force_recovery mode"); + return NULL; + } + if (xdir_remove_file_by_vclock(&vy_log.dir, + &vy_log.last_checkpoint) != 0) { + say_info(".vylog is newer than snapshot. Failed to " + "remove it. Try to delete last .vylog " + "manually"); + /* Never return NULL with empty diag. */ + diag_set(ClientError, ER_MISSING_SNAPSHOT); + return NULL; + } + const struct vclock *prev_checkpoint = + vy_log_prev_checkpoint(&vy_log.last_checkpoint); + if (prev_checkpoint == NULL) { + say_info("Can't find previous vylog"); + /* Never return NULL with empty diag. */ + diag_set(ClientError, ER_MISSING_SNAPSHOT); + return NULL; + } + vclock_copy(&vy_log.last_checkpoint, prev_checkpoint); + assert(vclock_compare(&vy_log.last_checkpoint, vclock) == 0); } if (cmp < 0) { /* diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h index 2b61c13045c0099a33cc577e04c00c0a96e8cf98..756899eff4c34d34197c3a6b75fc0bfe82bea90f 100644 --- a/src/lib/core/errinj.h +++ b/src/lib/core/errinj.h @@ -150,6 +150,7 @@ struct errinj { _(ERRINJ_COIO_WRITE_CHUNK, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_APPLIER_SLOW_ACK, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_STDIN_ISATTY, ERRINJ_INT, {.iparam = -1}) \ + _(ERRINJ_SNAP_COMMIT_FAIL, ERRINJ_BOOL, {.bparam = false}) \ ENUM0(errinj_id, ERRINJ_LIST); extern struct errinj errinjs[]; diff --git a/test/box/errinj.result b/test/box/errinj.result index a962dbe2db4046633f87b1179f2c3fc552aeb6fa..d1cbacd15782b440856cd4f283766a48fac3fca9 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -72,6 +72,7 @@ evals - ERRINJ_REPLICA_JOIN_DELAY: false - ERRINJ_SIO_READ_MAX: -1 - ERRINJ_SNAP_COMMIT_DELAY: false + - ERRINJ_SNAP_COMMIT_FAIL: false - ERRINJ_SNAP_WRITE_DELAY: false - ERRINJ_SQL_NAME_NORMALIZATION: false - ERRINJ_STDIN_ISATTY: -1 diff --git a/test/vinyl/gh-5823-crash_snapshot.lua
b/test/vinyl/gh-5823-crash_snapshot.lua new file mode 100644 index 0000000000000000000000000000000000000000..fc48f94b2a42670f3ba02041c3a49513226a0692 --- /dev/null +++ b/test/vinyl/gh-5823-crash_snapshot.lua @@ -0,0 +1,25 @@ +#!/usr/bin/env tarantool + +-- +-- mode == 0: casual bootstrap; +-- mode == 1: force recovery bootstrap; +-- mode == 2: casual bootstrap and fill in data. +-- +local mode = tonumber(arg[1]) +box.cfg ({ + force_recovery = (mode == 1), +}) + +if mode == 2 then + local v = box.schema.space.create('test_v', {engine = 'vinyl'}) + v:create_index('pk') + local m = box.schema.space.create('test_m') + m:create_index('pk') + local str = string.rep('!', 100) + for i = 1,10 do v:insert{i, str} end + for i = 1,10 do m:insert{i, str} end + box.error.injection.set("ERRINJ_SNAP_COMMIT_FAIL", true); + box.snapshot() +end + +require('console').listen(os.getenv('ADMIN')) diff --git a/test/vinyl/gh-5823-skip-newer-than-snap-vylog.result b/test/vinyl/gh-5823-skip-newer-than-snap-vylog.result new file mode 100644 index 0000000000000000000000000000000000000000..ed5ae19046b963f23fe082121e186bef3024c967 --- /dev/null +++ b/test/vinyl/gh-5823-skip-newer-than-snap-vylog.result @@ -0,0 +1,60 @@ +-- test-run result file version 2 +test_run = require('test_run').new() + | --- + | ... + +-- Test is about following scenario: +-- 1. There's both memtx and vinyl data; +-- 2. User starts checkpoint process; +-- 3. In the most unsuitable moment instance crashes; +-- 4. Recovering in the casual mode does not help; +-- 5. Recovering in the force recovery mode solves the problem (deletes +-- redundant vylog file). +-- +test_run:cmd("create server test with script='vinyl/gh-5823-crash_snapshot.lua'") + | --- + | - true + | ... +test_run:cmd("start server test with args='2' with crash_expected=True") + | --- + | - false + | ... +-- Can't bootstrap instance without force_recovery. +-- +test_run:cmd("start server test with args='0' with crash_expected=True") + | --- + | - false + | ...
+test_run:grep_log('test', "Can\'t find snapshot", nil, {filename='gh-5823-crash_snapshot.log'}) ~= nil + | --- + | - true + | ... + +test_run:cmd("start server test with args='1'") + | --- + | - true + | ... +test_run:cmd("switch test") + | --- + | - true + | ... +box.space.test_v:select({5}) + | --- + | - - [5, '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'] + | ... +test_run:cmd("switch default") + | --- + | - true + | ... +test_run:cmd("stop server test") + | --- + | - true + | ... +test_run:cmd("cleanup server test") + | --- + | - true + | ... +test_run:cmd("delete server test") + | --- + | - true + | ... diff --git a/test/vinyl/gh-5823-skip-newer-than-snap-vylog.test.lua b/test/vinyl/gh-5823-skip-newer-than-snap-vylog.test.lua new file mode 100644 index 0000000000000000000000000000000000000000..829224f59b94cbeaa380710d72b219406a42b457 --- /dev/null +++ b/test/vinyl/gh-5823-skip-newer-than-snap-vylog.test.lua @@ -0,0 +1,24 @@ +test_run = require('test_run').new() + +-- Test is about following scenario: +-- 1. There's both memtx and vinyl data; +-- 2. User starts checkpoint process; +-- 3. In the most unsuitable moment instance crashes; +-- 4. Recovering in the casual mode does not help; +-- 5. Recovering in the force recovery mode solves the problem (deletes +-- redundant vylog file). +-- +test_run:cmd("create server test with script='vinyl/gh-5823-crash_snapshot.lua'") +test_run:cmd("start server test with args='2' with crash_expected=True") +-- Can't bootstrap instance without force_recovery.
+-- +test_run:cmd("start server test with args='0' with crash_expected=True") +test_run:grep_log('test', "Can\'t find snapshot", nil, {filename='gh-5823-crash_snapshot.log'}) ~= nil + +test_run:cmd("start server test with args='1'") +test_run:cmd("switch test") +box.space.test_v:select({5}) +test_run:cmd("switch default") +test_run:cmd("stop server test") +test_run:cmd("cleanup server test") +test_run:cmd("delete server test") diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini index b12ae2e9606b8195c18aca97c4a4c0aee0b61aa8..dc5e0ec5ebc7de5d4f67615ed5119514d9f8190f 100644 --- a/test/vinyl/suite.ini +++ b/test/vinyl/suite.ini @@ -2,7 +2,7 @@ core = tarantool description = vinyl integration tests script = vinyl.lua -release_disabled = errinj.test.lua errinj_ddl.test.lua errinj_gc.test.lua errinj_stat.test.lua errinj_tx.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua replica_rejoin.test.lua gh-4864-stmt-alloc-fail-compact.test.lua gh-4805-open-run-err-recovery.test.lua gh-4821-ddl-during-throttled-dump.test.lua gh-3395-read-prepared-uncommitted.test.lua +release_disabled = errinj.test.lua errinj_ddl.test.lua errinj_gc.test.lua errinj_stat.test.lua errinj_tx.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua replica_rejoin.test.lua gh-4864-stmt-alloc-fail-compact.test.lua gh-4805-open-run-err-recovery.test.lua gh-4821-ddl-during-throttled-dump.test.lua gh-3395-read-prepared-uncommitted.test.lua gh-5823-skip-newer-than-snap-vylog.test.lua config = suite.cfg lua_libs = suite.lua stress.lua large.lua ../box/lua/txn_proxy.lua ../box/lua/utils.lua use_unix_sockets = True