From bf620650c429d0b6c0bae8588f54ea56a0299fd6 Mon Sep 17 00:00:00 2001
From: Nikolay Shirokovskiy <nshirokovskiy@tarantool.org>
Date: Fri, 19 Jan 2024 12:33:08 +0300
Subject: [PATCH] box: finish client fibers on shutdown

In the process of graceful shutdown it is convenient to first finish all
client (non-system) fibers. Otherwise we would have to be ready for any
subsystem to handle requests from client fibers during or after that
subsystem's shutdown, which would make the code more complex.

We first cancel the client fibers and then wait for them to finish. A
fiber may not respond to the cancel and hang, which causes a shutdown
hang, but this is the approach we already chose for iproto shutdown.

Note that as a result of this approach the application will panic if it
is shut down during execution of the initialization script (in
particular if this script is doing box.cfg).

There are changes in the application/tests to adapt to client fiber
shutdown:

- make code cancellable (only to pass existing tests; we did not
  investigate all the possible places that should be made such).
- make the console stop sending echo to the client before client fiber
  shutdown. Otherwise, as the console server fiber is a client one, we
  would send a message that the fiber is cancelled on shutdown, which
  breaks a lot of existing tests. This approach is on par with iproto
  shutdown.
- some tests (7743, replication-luatest/shutdown, replication/anon,
  replication/force_recovery etc.) test shutdown during execution of
  the init script. Now a panic is expected, so change them accordingly.
- some tests (8530, errinj_vylog) use an injection that blocks client
  fiber finishing. In those tests we don't need graceful shutdown, so
  let's just kill tarantool instead.
- we change the test in vinyl/errinj for gh-3225. We don't really need
  to check when the vinyl reader is blocked as it executes small tasks
  (we assume the reading syscall will not hang). Also change the test
  for vinyl dump shutdown by slowing the dump down instead of blocking
  it entirely. 
This is required to finish in time client fibers in the test. - other similar changes Also we can drop code from replication shutdown which is required to handle client requests during/after shutdown. Part of #8423 NO_CHANGELOG=internal NO_DOC=internal --- src/box/box.cc | 12 ++ src/box/lua/console.lua | 8 + src/box/memtx_engine.cc | 11 +- src/box/replication.cc | 34 +---- src/box/vy_quota.c | 4 + src/box/vy_scheduler.c | 12 +- src/lib/core/errinj.h | 1 - src/lib/core/fiber.c | 48 +++++- src/lib/core/fiber.h | 12 ++ src/lib/core/fiber_pool.c | 14 +- src/lua/fiber.c | 12 ++ src/lua/fiber.lua | 9 ++ src/lua/init.lua | 1 + src/main.cc | 9 -- .../gh_7743_term_initial_cfg_snap_test.lua | 16 +- .../gh_8530_alter_space_snapshot_test.lua | 6 + test/box/errinj.result | 1 - test/replication-luatest/shutdown_test.lua | 26 +--- test/replication-py/cluster.test.py | 1 + test/replication/anon.result | 2 +- test/replication/anon.test.lua | 2 +- test/replication/force_recovery.result | 2 +- test/replication/force_recovery.test.lua | 2 +- ...637-misc-error-on-replica-auth-fail.result | 13 ++ ...7-misc-error-on-replica-auth-fail.test.lua | 5 + test/replication/gh-4739-vclock-assert.result | 2 +- .../gh-4739-vclock-assert.test.lua | 2 +- .../gh-5613-bootstrap-prefer-booted.result | 2 +- .../gh-5613-bootstrap-prefer-booted.test.lua | 2 +- test/replication/gh-5806-xlog-cleanup.result | 2 +- .../replication/gh-5806-xlog-cleanup.test.lua | 2 +- test/replication/prune.result | 2 +- test/replication/prune.test.lua | 2 +- test/replication/replica_auth.lua | 3 + test/replication/replica_rejoin.result | 2 +- test/replication/replica_rejoin.test.lua | 2 +- test/unit/fiber.cc | 144 +++++++++++++++++- test/unit/fiber.result | 6 + test/vinyl/errinj.result | 26 +--- test/vinyl/errinj.test.lua | 17 +-- test/vinyl/errinj_vylog.result | 4 +- test/vinyl/errinj_vylog.test.lua | 4 +- test/xlog/panic_on_wal_error.result | 2 +- test/xlog/panic_on_wal_error.test.lua | 2 +- 44 files changed, 357 
insertions(+), 134 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 9767349ac8..936d58f545 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -4941,6 +4941,8 @@ bootstrap_from_master(struct replica *master) try { applier_resume_to_state(applier, APPLIER_READY, TIMEOUT_INFINITY); + } catch (FiberIsCancelled *e) { + throw e; } catch (...) { return false; } @@ -4958,6 +4960,8 @@ bootstrap_from_master(struct replica *master) try { applier_resume_to_state(applier, APPLIER_FETCH_SNAPSHOT, TIMEOUT_INFINITY); + } catch (FiberIsCancelled *e) { + throw e; } catch (...) { return false; } @@ -5926,6 +5930,14 @@ box_storage_shutdown() if (!is_storage_initialized) return; iproto_shutdown(); + /* + * Finish client fibers after iproto_shutdown otherwise new fibers + * can be started through new iproto requests. Also we should + * finish client fibers before other subsystems shutdown so that + * we won't need to handle requests from client fibers after/during + * subsystem shutdown. + */ + fiber_shutdown(); replication_shutdown(); } diff --git a/src/box/lua/console.lua b/src/box/lua/console.lua index 90111ccade..06e5a85033 100644 --- a/src/box/lua/console.lua +++ b/src/box/lua/console.lua @@ -985,7 +985,15 @@ local function client_handler(client, _peer) state:print(string.format("%-63s\n%-63s\n", "Tarantool ".. version.." (Lua console)", "type 'help' for interactive help")) + local on_shutdown = function() + -- Fiber is going to be cancelled on shutdown. Do not report + -- cancel induced error to the peer. 
+ client:close(); + end + state.fiber = fiber.self() + box.ctl.on_shutdown(on_shutdown) repl(state) + box.ctl.on_shutdown(nil, on_shutdown) session_internal.run_on_disconnect() end diff --git a/src/box/memtx_engine.cc b/src/box/memtx_engine.cc index a0530aae6b..0ace4a5283 100644 --- a/src/box/memtx_engine.cc +++ b/src/box/memtx_engine.cc @@ -1028,12 +1028,19 @@ checkpoint_f(va_list ap) return -1; } - struct mh_i32_t *temp_space_ids = mh_i32_new(); + struct mh_i32_t *temp_space_ids; say_info("saving snapshot `%s'", snap->filename); - ERROR_INJECT_SLEEP(ERRINJ_SNAP_WRITE_DELAY); + ERROR_INJECT_WHILE(ERRINJ_SNAP_WRITE_DELAY, { + fiber_sleep(0.001); + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + goto fail; + } + }); ERROR_INJECT(ERRINJ_SNAP_SKIP_ALL_ROWS, goto done); struct space_read_view *space_rv; + temp_space_ids = mh_i32_new(); read_view_foreach_space(space_rv, &ckpt->rv) { FiberGCChecker gc_check; bool skip = false; diff --git a/src/box/replication.cc b/src/box/replication.cc index 261b5e5cd3..83bc4b6cd4 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -62,18 +62,6 @@ double replication_sync_timeout = 300.0; /* seconds */ bool replication_skip_conflict = false; int replication_threads = 1; -/** - * Fiber executing replicaset_connect. NULL if the function - * is not being executed. - */ -static struct fiber *replication_connect_fiber; - -/** Condition that replicaset_connect finished execution. */ -static struct fiber_cond replication_connect_cond; - -/** If set then replication shutdown is started. */ -static bool replication_is_shutting_down; - bool cfg_replication_anon = true; struct tt_uuid cfg_bootstrap_leader_uuid; struct uri cfg_bootstrap_leader_uri; @@ -231,7 +219,6 @@ replication_init(int num_threads) diag_create(&replicaset.applier.diag); replication_threads = num_threads; - fiber_cond_create(&replication_connect_cond); /* The local instance is always part of the quorum. 
*/ replicaset.healthy_count = 1; @@ -242,12 +229,6 @@ replication_init(int num_threads) void replication_shutdown(void) { - replication_is_shutting_down = true; - if (replication_connect_fiber != NULL) - fiber_cancel(replication_connect_fiber); - while (replication_connect_fiber != NULL) - fiber_cond_wait(&replication_connect_cond); - struct replica *replica; rlist_foreach_entry(replica, &replicaset.anon, in_anon) applier_stop(replica->applier); @@ -263,7 +244,6 @@ replication_free(void) diag_destroy(&replicaset.applier.diag); trigger_destroy(&replicaset.on_ack); trigger_destroy(&replicaset.on_relay_thread_start); - fiber_cond_destroy(&replication_connect_cond); fiber_cond_destroy(&replicaset.applier.cond); latch_destroy(&replicaset.applier.order_latch); applier_free(); @@ -1072,9 +1052,6 @@ void replicaset_connect(const struct uri_set *uris, bool connect_quorum, bool keep_connect) { - if (replication_is_shutting_down) - tnt_raise(ClientError, ER_SHUTDOWN); - if (uris->uri_count == 0) { /* Cleanup the replica set. */ replicaset_update(NULL, 0, false); @@ -1087,12 +1064,6 @@ replicaset_connect(const struct uri_set *uris, tnt_raise(ClientError, ER_CFG, "replication", "too many replicas"); } - assert(replication_connect_fiber == NULL); - replication_connect_fiber = fiber(); - auto connect_fiber_guard = make_scoped_guard([&]{ - replication_connect_fiber = NULL; - fiber_cond_signal(&replication_connect_cond); - }); int count = 0; struct applier *appliers[VCLOCK_MAX] = {}; auto appliers_guard = make_scoped_guard([&]{ @@ -1342,6 +1313,11 @@ replicaset_sync(void) say_info("replica set sync complete"); box_set_orphan(false); } + /* + * If fiber is cancelled raise error here so that orphan status is + * correct. 
+ */ + fiber_testcancel(); } void diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c index 8f86be915c..ebbbde0960 100644 --- a/src/box/vy_quota.c +++ b/src/box/vy_quota.c @@ -346,6 +346,10 @@ vy_quota_use(struct vy_quota *q, enum vy_quota_consumer_type type, diag_set(ClientError, ER_VY_QUOTA_TIMEOUT); return -1; } + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + return -1; + } double wait_time = ev_monotonic_now(loop()) - wait_start; if (wait_time > q->too_long_threshold) { diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c index daddff071c..677bf57491 100644 --- a/src/box/vy_scheduler.c +++ b/src/box/vy_scheduler.c @@ -730,13 +730,20 @@ vy_scheduler_wait_checkpoint(struct vy_scheduler *scheduler) /* A dump error occurred, abort checkpoint. */ struct error *e = diag_last_error(&scheduler->diag); diag_set_error(diag_get(), e); - say_error("vinyl checkpoint failed: %s", e->errmsg); - return -1; + goto error; } fiber_cond_wait(&scheduler->dump_cond); + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + goto error; + } } say_info("vinyl checkpoint completed"); return 0; +error: + say_error("vinyl checkpoint failed: %s", + diag_last_error(diag_get())->errmsg); + return -1; } void @@ -886,6 +893,7 @@ vy_deferred_delete_batch_process_f(struct cmsg *cmsg) struct vy_deferred_delete_batch *batch = container_of(cmsg, struct vy_deferred_delete_batch, cmsg); struct vy_task *task = batch->task; + fiber_set_system(fiber(), true); /* * Wait for memory quota if necessary before starting to * process the batch (we can't yield between statements). 
diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h index 89d81a606d..12e5412fdd 100644 --- a/src/lib/core/errinj.h +++ b/src/lib/core/errinj.h @@ -104,7 +104,6 @@ struct errinj { _(ERRINJ_IPROTO_TX_DELAY, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_IPROTO_WRITE_ERROR_DELAY, ERRINJ_BOOL, {.bparam = false})\ _(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \ - _(ERRINJ_MAIN_MAKE_FILE_ON_RETURN, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_MEMTX_DELAY_GC, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_NETBOX_DISABLE_ID, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_NETBOX_FLIP_FEATURE, ERRINJ_INT, {.iparam = -1}) \ diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c index 9ec5d2fa69..26bc4459e0 100644 --- a/src/lib/core/fiber.c +++ b/src/lib/core/fiber.c @@ -1187,6 +1187,13 @@ fiber_loop(MAYBE_UNUSED void *data) assert(f != fiber); fiber_wakeup(f); } + if (!(fiber->flags & FIBER_IS_SYSTEM)) { + assert(cord()->client_fiber_count > 0); + cord()->client_fiber_count--; + if (cord()->shutdown_fiber != NULL && + cord()->client_fiber_count == 0) + fiber_wakeup(cord()->shutdown_fiber); + } fiber_on_stop(fiber); /* reset pending wakeups */ rlist_del(&fiber->state); @@ -1590,6 +1597,8 @@ fiber_new_ex(const char *name, const struct fiber_attr *fiber_attr, fiber_gc_checker_init(fiber); cord->next_fid++; assert(cord->next_fid > FIBER_ID_MAX_RESERVED); + if (!(fiber->flags & FIBER_IS_SYSTEM)) + cord()->client_fiber_count++; return fiber; @@ -1849,7 +1858,7 @@ cord_create(struct cord *cord, const char *name) cord->sched.name = NULL; fiber_set_name(&cord->sched, "sched"); cord->fiber = &cord->sched; - cord->sched.flags = FIBER_IS_RUNNING; + cord->sched.flags = FIBER_IS_RUNNING | FIBER_IS_SYSTEM; cord->sched.max_slice = zero_slice; cord->max_slice = default_slice; @@ -1884,6 +1893,8 @@ cord_create(struct cord *cord, const char *name) cord->sched.stack_watermark = NULL; #endif signal_stack_init(); + cord->shutdown_fiber = NULL; + cord->client_fiber_count = 0; } void @@ -2339,3 
+2350,38 @@ fiber_lua_state(struct fiber *f) { return f->storage.lua.stack; } + +void +fiber_set_system(struct fiber *f, bool yesno) +{ + if (yesno) { + if (!(f->flags & FIBER_IS_SYSTEM)) { + f->flags |= FIBER_IS_SYSTEM; + assert(cord()->client_fiber_count > 0); + cord()->client_fiber_count--; + if (cord()->shutdown_fiber != NULL && + cord()->client_fiber_count == 0) + fiber_wakeup(cord()->shutdown_fiber); + } + } else { + if (f->flags & FIBER_IS_SYSTEM) { + f->flags &= ~FIBER_IS_SYSTEM; + cord()->client_fiber_count++; + } + } +} + +void +fiber_shutdown(void) +{ + assert(cord()->shutdown_fiber == NULL); + struct fiber *fiber; + rlist_foreach_entry(fiber, &cord()->alive, link) { + if (!(fiber->flags & FIBER_IS_SYSTEM)) + fiber_cancel(fiber); + } + cord()->shutdown_fiber = fiber(); + while (cord()->client_fiber_count != 0) + fiber_yield(); + cord()->shutdown_fiber = NULL; +} diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h index 2835220501..e58553bdd2 100644 --- a/src/lib/core/fiber.h +++ b/src/lib/core/fiber.h @@ -852,6 +852,10 @@ struct cord { struct fiber *main_fiber; /** An event triggered to cancel cord main fiber. */ ev_async cancel_event; + /** Number of alive client (non system) fibers. */ + int client_fiber_count; + /** Fiber calling fiber_shutdown. NULL if there is no such. */ + struct fiber *shutdown_fiber; }; extern __thread struct cord *cord_ptr; @@ -1243,6 +1247,14 @@ fiber_check_gc(void); struct lua_State * fiber_lua_state(struct fiber *f); +/** Change whether fiber is system or not. */ +void +fiber_set_system(struct fiber *f, bool yesno); + +/** Cancel all client (non system) fibers and wait until they finished. 
*/ +void +fiber_shutdown(void); + #if defined(__cplusplus) } /* extern "C" */ diff --git a/src/lib/core/fiber_pool.c b/src/lib/core/fiber_pool.c index cdf067a7f7..bd5f47284a 100644 --- a/src/lib/core/fiber_pool.c +++ b/src/lib/core/fiber_pool.c @@ -61,7 +61,9 @@ fiber_pool_f(va_list ap) f->caller->flags |= FIBER_IS_READY; assert(f->caller->caller == &cord->sched); } + fiber_set_system(fiber(), false); cmsg_deliver(msg); + fiber_set_system(fiber(), true); fiber_check_gc(); /* * Normally fibers die after their function @@ -131,7 +133,17 @@ fiber_pool_cb(ev_loop *loop, struct ev_watcher *watcher, int events) f = rlist_shift_entry(&pool->idle, struct fiber, state); fiber_call(f); } else if (pool->size < pool->max_size) { - f = fiber_new(cord_name(cord()), fiber_pool_f); + /* + * We don't want fibers to be cancellable by client + * while they are in the pool. However system flag is + * reset during processing message from pool endpoint + * so that fiber is made cancellable back. + * + * If some message processing should not be cancellable + * by client then it can just set system flag during + * it's execution. + */ + f = fiber_new_system(cord_name(cord()), fiber_pool_f); if (f == NULL) { diag_log(); break; diff --git a/src/lua/fiber.c b/src/lua/fiber.c index 8a8f413084..bed60bd1ff 100644 --- a/src/lua/fiber.c +++ b/src/lua/fiber.c @@ -883,6 +883,17 @@ lbox_fiber_stall(struct lua_State *L) return 0; } +/** Make fiber system. Takes the fiber as a single argument. */ +static int +lbox_fiber_set_system(struct lua_State *L) +{ + if (lua_gettop(L) != 1) + luaL_error(L, "fiber.set_system(id): bad arguments"); + struct fiber *fiber = lbox_checkfiber(L, 1); + fiber_set_system(fiber, true); + return 0; +} + /** Helper for fiber slice parsing. */ static struct fiber_slice lbox_fiber_slice_parse(struct lua_State *L, int idx) @@ -1018,6 +1029,7 @@ static const struct luaL_Reg fiberlib[] = { {"extend_slice", lbox_fiber_extend_slice}, /* Internal functions, to hide in fiber.lua. 
*/ {"stall", lbox_fiber_stall}, + {"set_system", lbox_fiber_set_system}, {NULL, NULL} }; diff --git a/src/lua/fiber.lua b/src/lua/fiber.lua index faaae31cfd..48eda368ad 100644 --- a/src/lua/fiber.lua +++ b/src/lua/fiber.lua @@ -73,7 +73,9 @@ fiber.clock = fiber_clock fiber.clock64 = fiber_clock64 local stall = fiber.stall +local fiber_set_system = fiber.set_system fiber.stall = nil +fiber.set_system = nil local worker_next_task = nil local worker_last_task @@ -101,15 +103,21 @@ local function worker_f() end end +local worker_name = 'tasks_worker_fiber' + local function worker_safe_f() pcall(worker_f) -- Worker_f never returns. If the execution is here, this -- fiber is probably canceled and now is not able to sleep. -- Create a new one. worker_fiber = fiber.new(worker_safe_f) + fiber_set_system(worker_fiber) + worker_fiber:name(worker_name) end worker_fiber = fiber.new(worker_safe_f) +fiber_set_system(worker_fiber) +worker_fiber:name(worker_name) local function worker_schedule_task(f, arg) local task = {f = f, arg = arg} @@ -125,6 +133,7 @@ end -- Start from '_' to hide it from auto completion. fiber._internal = fiber._internal or {} fiber._internal.schedule_task = worker_schedule_task +fiber._internal.set_system = fiber_set_system setmetatable(fiber, {__serialize = function(self) local res = table.copy(self) diff --git a/src/lua/init.lua b/src/lua/init.lua index 352e7a8c37..e61772a695 100644 --- a/src/lua/init.lua +++ b/src/lua/init.lua @@ -157,6 +157,7 @@ local function exit(code) -- os.exit() never yields. After on_shutdown -- fiber completes, we will never wake up again. 
local TIMEOUT_INFINITY = 500 * 365 * 86400 + fiber._internal.set_system(fiber.self()) while true do fiber.sleep(TIMEOUT_INFINITY) end end rawset(os, "exit", exit) diff --git a/src/main.cc b/src/main.cc index 42f3a68a2f..2bdf675932 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1109,14 +1109,5 @@ main(int argc, char **argv) free((void *)instance.name); free((void *)instance.config); tarantool_free(); - ERROR_INJECT(ERRINJ_MAIN_MAKE_FILE_ON_RETURN, do { - int fd = open("tt_exit_file.txt.inprogress", - O_WRONLY | O_CREAT | O_TRUNC, -1); - if (fd < 0) - break; - dprintf(fd, "ExitCode: %d\n", exit_code); - close(fd); - rename("tt_exit_file.txt.inprogress", "tt_exit_file.txt"); - } while (false)); return exit_code; } diff --git a/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua b/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua index daebfcd699..c3e87490b0 100644 --- a/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua +++ b/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua @@ -20,7 +20,6 @@ g.test_sigterm_during_initial_snapshot = function() -- uses usleep() which is a pthread cancellation point. 
TARANTOOL_RUN_BEFORE_BOX_CFG = [[ box.ctl.set_on_shutdown_timeout(1000) - box.error.injection.set('ERRINJ_MAIN_MAKE_FILE_ON_RETURN', true) box.error.injection.set('ERRINJ_SNAP_WRITE_DELAY', true) ]] } @@ -30,18 +29,7 @@ g.test_sigterm_during_initial_snapshot = function() t.helpers.retrying({}, function() assert(g.server:grep_log('saving snapshot', nil, {filename = logname})) end) - g.server.process:kill('TERM') - local path = fio.pathjoin(g.server.workdir, 'tt_exit_file.txt') - local exit_text - t.helpers.retrying({}, function() - local f = fio.open(path, 'O_RDONLY') - if f == nil then - error('could not open') - end - exit_text = f:read() - f:close() - end) - g.server.process = nil g.server:stop() - t.assert_str_contains(exit_text, 'ExitCode: 0\n') + local panic_msg = "failed to create a checkpoint" + t.assert(g.server:grep_log(panic_msg, nil, {filename = logname})) end diff --git a/test/box-luatest/gh_8530_alter_space_snapshot_test.lua b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua index cddc401f03..1e9eb0004d 100644 --- a/test/box-luatest/gh_8530_alter_space_snapshot_test.lua +++ b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua @@ -49,6 +49,9 @@ g.test_build_index = function(cg) box.snapshot() t.assert_equals(f:status(), 'suspended') end) + -- Use KILL because server will hang on shutdown due to injection. + -- We don't need graceful shutdown for the test anyway. + cg.server.process:kill('KILL') cg.server:restart() cg.server:exec(function() local s = box.space.test @@ -69,6 +72,9 @@ g.test_change_format = function(cg) box.snapshot() t.assert_equals(f:status(), 'suspended') end) + -- Use KILL because server will hang on shutdown due to injection. + -- We don't need graceful shutdown for the test anyway. 
+ cg.server.process:kill('KILL') cg.server:restart() cg.server:exec(function() local s = box.space.test diff --git a/test/box/errinj.result b/test/box/errinj.result index 8c4c8150f1..979031600d 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -77,7 +77,6 @@ evals - ERRINJ_IPROTO_TX_DELAY: false - ERRINJ_IPROTO_WRITE_ERROR_DELAY: false - ERRINJ_LOG_ROTATE: false - - ERRINJ_MAIN_MAKE_FILE_ON_RETURN: false - ERRINJ_MEMTX_DELAY_GC: false - ERRINJ_NETBOX_DISABLE_ID: false - ERRINJ_NETBOX_FLIP_FEATURE: -1 diff --git a/test/replication-luatest/shutdown_test.lua b/test/replication-luatest/shutdown_test.lua index 0b1a0f4b78..b4901803ff 100644 --- a/test/replication-luatest/shutdown_test.lua +++ b/test/replication-luatest/shutdown_test.lua @@ -17,20 +17,6 @@ g.after_each(function(cg) end end) -local test_no_crash_on_shutdown = function(server) - server.process:kill() - local path = fio.pathjoin(server.workdir, 'tt_exit_file.txt') - t.helpers.retrying({}, function() - t.assert(fio.path.exists(path)) - end) - local fh, err = fio.open(path, 'O_RDONLY') - assert(fh, err) - local str, err = fh:read() - assert(str, err) - fh:close() - t.assert_str_contains(str, 'ExitCode: 0\n') -end - g.test_shutdown_on_rebootstrap = function(cg) t.tarantool.skip_if_not_debug() -- It is critical for test that we can connect to uri but cannot auth. @@ -40,13 +26,7 @@ g.test_shutdown_on_rebootstrap = function(cg) replication = 'no:way@' .. cg.master.net_box_uri, replication_timeout = 100, } - local env = { - -- There will be no connection to replica in test. - TARANTOOL_RUN_BEFORE_BOX_CFG = [[ - box.error.injection.set('ERRINJ_MAIN_MAKE_FILE_ON_RETURN', true) - ]], - } - cg.replica = server:new({box_cfg = cfg, env = env}) + cg.replica = server:new({box_cfg = cfg}) -- Can't not wait because replica will not be bootstrapped. 
cg.replica:start({wait_until_ready = false}) local retry_msg = string.format('will retry every %.2f second', @@ -56,5 +36,7 @@ g.test_shutdown_on_rebootstrap = function(cg) t.helpers.retrying({}, function() t.assert(cg.replica:grep_log(retry_msg, nil, {filename = log})) end) - test_no_crash_on_shutdown(cg.replica) + cg.replica:stop() + local panic_msg = "can't initialize storage: fiber is cancelled" + t.assert(cg.replica:grep_log(panic_msg, nil, {filename = log})) end diff --git a/test/replication-py/cluster.test.py b/test/replication-py/cluster.test.py index 514c874a39..8249a79876 100644 --- a/test/replication-py/cluster.test.py +++ b/test/replication-py/cluster.test.py @@ -230,6 +230,7 @@ failed.rpl_master = master failed.name = "failed" failed.deploy(True, wait=False) +failed.crash_expected = True line = "ER_READONLY" if failed.logfile_pos.seek_wait(line): print("'{}' exists in server log".format(line)) diff --git a/test/replication/anon.result b/test/replication/anon.result index 68e629f61b..997e5f0280 100644 --- a/test/replication/anon.result +++ b/test/replication/anon.result @@ -407,7 +407,7 @@ test_run:cmd([[create server replica with rpl_master=replica_anon1,\ | --- | - true | ... -test_run:cmd('start server replica with wait_load=False, wait=False') +test_run:cmd('start server replica with wait_load=False, wait=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/anon.test.lua b/test/replication/anon.test.lua index 97b2e7d67f..a2fc8b47df 100644 --- a/test/replication/anon.test.lua +++ b/test/replication/anon.test.lua @@ -146,7 +146,7 @@ test_run:cmd('delete server replica_anon2') -- Check that joining to an anonymous replica is prohibited. 
test_run:cmd([[create server replica with rpl_master=replica_anon1,\ script="replication/replica.lua"]]) -test_run:cmd('start server replica with wait_load=False, wait=False') +test_run:cmd('start server replica with wait_load=False, wait=False, crash_expected=True') test_run:wait_log('replica', 'ER_UNSUPPORTED: Anonymous replica does not support registration of non%-anonymous nodes.', nil, 10) test_run:cmd('stop server replica') test_run:cmd('delete server replica') diff --git a/test/replication/force_recovery.result b/test/replication/force_recovery.result index e142e829ab..c278a218a3 100644 --- a/test/replication/force_recovery.result +++ b/test/replication/force_recovery.result @@ -63,7 +63,7 @@ fio.unlink(xlog) box.cfg{force_recovery = true} --- ... -test_run:cmd("start server test with wait=False") +test_run:cmd("start server test with wait=False, crash_expected=True") --- - true ... diff --git a/test/replication/force_recovery.test.lua b/test/replication/force_recovery.test.lua index bd3b439d2a..e6f7ae7160 100644 --- a/test/replication/force_recovery.test.lua +++ b/test/replication/force_recovery.test.lua @@ -27,7 +27,7 @@ fio.unlink(xlog) -- Check that even though box.cfg.force_recovery is set, -- replication will still fail due to LSN gap. box.cfg{force_recovery = true} -test_run:cmd("start server test with wait=False") +test_run:cmd("start server test with wait=False, crash_expected=True") test_run:cmd("switch test") test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'}) box.space.test:select() diff --git a/test/replication/gh-3637-misc-error-on-replica-auth-fail.result b/test/replication/gh-3637-misc-error-on-replica-auth-fail.result index 98880d8e40..9008f88c11 100644 --- a/test/replication/gh-3637-misc-error-on-replica-auth-fail.result +++ b/test/replication/gh-3637-misc-error-on-replica-auth-fail.result @@ -49,6 +49,19 @@ vclock[0] = nil _ = test_run:wait_vclock('replica_auth', vclock) --- ... 
+-- Wait server init script finish or server will panic on stop. +test_run:switch('replica_auth') +--- +- true +... +test_run:wait_cond(function() return _G.startup_finished == true end) +--- +- true +... +test_run:switch('default') +--- +- true +... test_run:cmd("stop server replica_auth") --- - true diff --git a/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua b/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua index c51a2f6289..6028796d74 100644 --- a/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua +++ b/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua @@ -24,6 +24,11 @@ vclock = test_run:get_vclock('default') vclock[0] = nil _ = test_run:wait_vclock('replica_auth', vclock) +-- Wait server init script finish or server will panic on stop. +test_run:switch('replica_auth') +test_run:wait_cond(function() return _G.startup_finished == true end) + +test_run:switch('default') test_run:cmd("stop server replica_auth") test_run:cmd("cleanup server replica_auth") test_run:cmd("delete server replica_auth") diff --git a/test/replication/gh-4739-vclock-assert.result b/test/replication/gh-4739-vclock-assert.result index 83896c4e16..21247e42b8 100644 --- a/test/replication/gh-4739-vclock-assert.result +++ b/test/replication/gh-4739-vclock-assert.result @@ -56,7 +56,7 @@ end, 10) -- Restart the remote instance. This will make the first instance -- resubscribe without entering orphan mode. -test_run:cmd('restart server rebootstrap2 with wait=False') +test_run:cmd('restart server rebootstrap2 with wait=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/gh-4739-vclock-assert.test.lua b/test/replication/gh-4739-vclock-assert.test.lua index 5755ad7528..781b7bc041 100644 --- a/test/replication/gh-4739-vclock-assert.test.lua +++ b/test/replication/gh-4739-vclock-assert.test.lua @@ -24,7 +24,7 @@ end, 10) -- Restart the remote instance. 
This will make the first instance -- resubscribe without entering orphan mode. -test_run:cmd('restart server rebootstrap2 with wait=False') +test_run:cmd('restart server rebootstrap2 with wait=False, crash_expected=True') test_run:cmd('switch rebootstrap1') -- Wait until resubscribe is sent test_run:wait_cond(function()\ diff --git a/test/replication/gh-5613-bootstrap-prefer-booted.result b/test/replication/gh-5613-bootstrap-prefer-booted.result index d31b66c191..077b2992c3 100644 --- a/test/replication/gh-5613-bootstrap-prefer-booted.result +++ b/test/replication/gh-5613-bootstrap-prefer-booted.result @@ -43,7 +43,7 @@ test_run:cmd('create server replica2 with script="replication/gh-5613-replica2.l | --- | - true | ... -test_run:cmd('start server replica2 with wait=False') +test_run:cmd('start server replica2 with wait=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/gh-5613-bootstrap-prefer-booted.test.lua b/test/replication/gh-5613-bootstrap-prefer-booted.test.lua index 6d4fcd1426..9300d4e3d4 100644 --- a/test/replication/gh-5613-bootstrap-prefer-booted.test.lua +++ b/test/replication/gh-5613-bootstrap-prefer-booted.test.lua @@ -17,7 +17,7 @@ box.cfg{read_only = true} test_run:switch('default') test_run:cmd('create server replica2 with script="replication/gh-5613-replica2.lua"') -test_run:cmd('start server replica2 with wait=False') +test_run:cmd('start server replica2 with wait=False, crash_expected=True') opts = {filename = 'gh-5613-replica2.log'} assert(test_run:wait_log(nil, 'ER_READONLY', nil, nil, opts) ~= nil) diff --git a/test/replication/gh-5806-xlog-cleanup.result b/test/replication/gh-5806-xlog-cleanup.result index aa709f8c8b..21d6d18b6f 100644 --- a/test/replication/gh-5806-xlog-cleanup.result +++ b/test/replication/gh-5806-xlog-cleanup.result @@ -153,7 +153,7 @@ assert(not box.info.gc().is_paused) -- -- Start replica and wait for error. 
-test_run:cmd('start server replica with wait=False, wait_load=False') +test_run:cmd('start server replica with wait=False, wait_load=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/gh-5806-xlog-cleanup.test.lua b/test/replication/gh-5806-xlog-cleanup.test.lua index 3c4abe5ee4..310ab6b641 100644 --- a/test/replication/gh-5806-xlog-cleanup.test.lua +++ b/test/replication/gh-5806-xlog-cleanup.test.lua @@ -78,7 +78,7 @@ assert(not box.info.gc().is_paused) -- -- Start replica and wait for error. -test_run:cmd('start server replica with wait=False, wait_load=False') +test_run:cmd('start server replica with wait=False, wait_load=False, crash_expected=True') -- -- Wait error to appear, 60 seconds should be more than enough, diff --git a/test/replication/prune.result b/test/replication/prune.result index e25e9684e2..b2040cc198 100644 --- a/test/replication/prune.result +++ b/test/replication/prune.result @@ -137,7 +137,7 @@ test_run:cmd('stop server replica1') --- - true ... -test_run:cmd('start server replica1 with args="true", wait=False') +test_run:cmd('start server replica1 with args="true", wait=False, crash_expected=True') --- - true ... 
diff --git a/test/replication/prune.test.lua b/test/replication/prune.test.lua index 68300b270c..fd24b70773 100644 --- a/test/replication/prune.test.lua +++ b/test/replication/prune.test.lua @@ -66,7 +66,7 @@ test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"') -- restart replica and check that replica isn't able to join to cluster test_run:cmd('stop server replica1') -test_run:cmd('start server replica1 with args="true", wait=False') +test_run:cmd('start server replica1 with args="true", wait=False, crash_expected=True') test_run:cmd('switch replica1') test_run:wait_upstream(1, {message_re = "Can't subscribe non%-anonymous replica"}) test_run:cmd('switch default') diff --git a/test/replication/replica_auth.lua b/test/replication/replica_auth.lua index 61d046fc47..72898c6186 100644 --- a/test/replication/replica_auth.lua +++ b/test/replication/replica_auth.lua @@ -4,9 +4,12 @@ local USER_PASS = arg[1] local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1 require('console').listen(os.getenv('ADMIN')) +_G.startup_finished = false box.cfg({ listen = os.getenv("LISTEN"), replication = USER_PASS .. "@" .. os.getenv("MASTER"), replication_timeout = TIMEOUT, }) + +_G.startup_finished = true diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result index e489c150a6..0cccc7f0b4 100644 --- a/test/replication/replica_rejoin.result +++ b/test/replication/replica_rejoin.result @@ -238,7 +238,7 @@ test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*. box.cfg{checkpoint_count = checkpoint_count} --- ... -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") --- - true ... 
diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua index 2563177cf5..f9d1b45f63 100644 --- a/test/replication/replica_rejoin.test.lua +++ b/test/replication/replica_rejoin.test.lua @@ -90,7 +90,7 @@ for i = 1, 3 do box.space.test:insert{i * 100} end fio = require('fio') test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') box.cfg{checkpoint_count = checkpoint_count} -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") test_run:cmd("switch replica") test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'}) box.space.test:select() diff --git a/test/unit/fiber.cc b/test/unit/fiber.cc index 2ccd5a8ca7..7caab3a8e2 100644 --- a/test/unit/fiber.cc +++ b/test/unit/fiber.cc @@ -581,6 +581,145 @@ fiber_test_leak_modes() say_logger_free(); } +static void +fiber_test_client_fiber_count(void) +{ + header(); + + int count = cord()->client_fiber_count; + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + fail_unless(++count == cord()->client_fiber_count); + + struct fiber *fiber2 = fiber_new("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + fail_unless(++count == cord()->client_fiber_count); + + struct fiber *fiber3 = fiber_new_system("fiber3", wait_cancel_f); + fail_unless(fiber3 != NULL); + fail_unless(count == cord()->client_fiber_count); + + struct fiber *fiber4 = fiber_new_system("fiber4", wait_cancel_f); + fail_unless(fiber4 != NULL); + fail_unless(count == cord()->client_fiber_count); + + fiber_set_joinable(fiber1, true); + fiber_cancel(fiber1); + fiber_join(fiber1); + fail_unless(--count == cord()->client_fiber_count); + + fiber_set_joinable(fiber4, true); + fiber_cancel(fiber4); + fiber_join(fiber4); + fail_unless(count == cord()->client_fiber_count); + + fiber_set_joinable(fiber2, true); 
+ fiber_cancel(fiber2); + fiber_join(fiber2); + fail_unless(--count == cord()->client_fiber_count); + + fiber_set_joinable(fiber3, true); + fiber_cancel(fiber3); + fiber_join(fiber3); + fail_unless(count == cord()->client_fiber_count); + + footer(); +} + +static void +fiber_test_set_system(void) +{ + header(); + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + int count = cord()->client_fiber_count; + + fiber_set_system(fiber1, true); + fail_unless(--count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber1, true); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber1, false); + fail_unless(++count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber1, false); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) == 0); + + struct fiber *fiber2 = fiber_new_system("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + count = cord()->client_fiber_count; + + fiber_set_system(fiber2, false); + fail_unless(++count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber2, false); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber2, true); + fail_unless(--count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber2, true); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_joinable(fiber1, true); + fiber_cancel(fiber1); + fiber_join(fiber1); + fiber_set_joinable(fiber2, true); + fiber_cancel(fiber2); + fiber_join(fiber2); + + footer(); +} + +static int +hang_on_cancel_f(va_list ap) +{ + while 
(!fiber_is_cancelled()) + fiber_yield(); + fiber_set_system(fiber(), true); + while (true) + fiber_yield(); + return 0; +} + +static void +fiber_test_shutdown(void) +{ + header(); + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + fiber_set_joinable(fiber1, true); + struct fiber *fiber2 = fiber_new_system("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + struct fiber *fiber3 = fiber_new("fiber3", hang_on_cancel_f); + fail_unless(fiber3 != NULL); + + fiber_shutdown(); + fail_unless((fiber1->flags & FIBER_IS_DEAD) != 0); + fail_unless((fiber2->flags & FIBER_IS_DEAD) == 0); + fail_unless((fiber3->flags & FIBER_IS_DEAD) == 0); + + fiber_join(fiber1); + + fiber_set_joinable(fiber2, true); + fiber_cancel(fiber2); + fiber_join(fiber2); + + footer(); +} + static int main_f(va_list ap) { @@ -597,6 +736,9 @@ main_f(va_list ap) cord_cancel_and_join_test(); fiber_test_defaults(); fiber_test_leak_modes(); + fiber_test_client_fiber_count(); + fiber_test_set_system(); + fiber_test_shutdown(); ev_break(loop(), EVBREAK_ALL); return 0; } @@ -611,7 +753,7 @@ int main() memory_init(); fiber_init(fiber_cxx_invoke); fiber_attr_create(&default_attr); - struct fiber *main = fiber_new_xc("main", main_f); + struct fiber *main = fiber_new_system_xc("main", main_f); fiber_wakeup(main); ev_run(loop(), 0); fiber_free(); diff --git a/test/unit/fiber.result b/test/unit/fiber.result index 0066138356..2b5469cfba 100644 --- a/test/unit/fiber.result +++ b/test/unit/fiber.result @@ -40,3 +40,9 @@ OutOfMemory: Failed to allocate 42 bytes in allocator for exception *** fiber_test_leak: done *** *** fiber_test_leak *** *** fiber_test_leak: done *** + *** fiber_test_client_fiber_count *** + *** fiber_test_client_fiber_count: done *** + *** fiber_test_set_system *** + *** fiber_test_set_system: done *** + *** fiber_test_shutdown *** + *** fiber_test_shutdown: done *** diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result index 
18d10b077b..e30c34ab11 100644 --- a/test/vinyl/errinj.result +++ b/test/vinyl/errinj.result @@ -1145,8 +1145,8 @@ s:drop() --- ... -- --- Check that tarantool stops immediately even if a vinyl worker --- thread is blocked (see gh-3225). +-- Check that tarantool stops immediately if large snapshot write +-- is in progress. -- s = box.schema.space.create('test', {engine = 'vinyl'}) --- @@ -1154,33 +1154,17 @@ s = box.schema.space.create('test', {engine = 'vinyl'}) _ = s:create_index('pk') --- ... -s:replace{1, 1} ---- -- [1, 1] -... -box.snapshot() ---- -- ok -... -errinj.set('ERRINJ_VY_READ_PAGE_TIMEOUT', 9000) ---- -- ok -... -_ = fiber.create(function() s:get(1) end) +for i = 1, 10000 do s:replace({i}) end --- ... -s:replace{1, 2} ---- -- [1, 2] -... -errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 9000) +errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 0.01) --- - ok ... _ = fiber.create(function() box.snapshot() end) --- ... -test_run:cmd("restart server default") +test_run:cmd("restart server default") -- don't stuck box.space.test:drop() --- ... diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua index d698b44084..0a7beac682 100644 --- a/test/vinyl/errinj.test.lua +++ b/test/vinyl/errinj.test.lua @@ -414,23 +414,16 @@ box.schema.user.revoke('guest', 'replication') s:drop() -- --- Check that tarantool stops immediately even if a vinyl worker --- thread is blocked (see gh-3225). +-- Check that tarantool stops immediately if large snapshot write +-- is in progress. 
-- s = box.schema.space.create('test', {engine = 'vinyl'}) _ = s:create_index('pk') -s:replace{1, 1} -box.snapshot() - -errinj.set('ERRINJ_VY_READ_PAGE_TIMEOUT', 9000) -_ = fiber.create(function() s:get(1) end) - -s:replace{1, 2} - -errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 9000) +for i = 1, 10000 do s:replace({i}) end +errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 0.01) _ = fiber.create(function() box.snapshot() end) -test_run:cmd("restart server default") +test_run:cmd("restart server default") -- don't stuck box.space.test:drop() -- diff --git a/test/vinyl/errinj_vylog.result b/test/vinyl/errinj_vylog.result index b9ae9332e9..6ac76b2c81 100644 --- a/test/vinyl/errinj_vylog.result +++ b/test/vinyl/errinj_vylog.result @@ -399,7 +399,9 @@ fiber.sleep(0.01) --- ... -- Should ignore the incomplete index on recovery. -test_run:cmd('restart server default') +-- Use KILL because server will hang on shutdown due to injection. +-- We don't need graceful shutdown for the test anyway. +test_run:cmd('restart server default with signal=KILL') s = box.space.test --- ... diff --git a/test/vinyl/errinj_vylog.test.lua b/test/vinyl/errinj_vylog.test.lua index 4401f30150..54a69c6599 100644 --- a/test/vinyl/errinj_vylog.test.lua +++ b/test/vinyl/errinj_vylog.test.lua @@ -198,7 +198,9 @@ _ = fiber.create(function() s:create_index('sk', {parts = {2, 'unsigned'}}) end) fiber.sleep(0.01) -- Should ignore the incomplete index on recovery. -test_run:cmd('restart server default') +-- Use KILL because server will hang on shutdown due to injection. +-- We don't need graceful shutdown for the test anyway. 
+test_run:cmd('restart server default with signal=KILL') s = box.space.test s.index[1] == nil diff --git a/test/xlog/panic_on_wal_error.result b/test/xlog/panic_on_wal_error.result index c4494ac87a..0806a96ed2 100644 --- a/test/xlog/panic_on_wal_error.result +++ b/test/xlog/panic_on_wal_error.result @@ -121,7 +121,7 @@ box.cfg.force_recovery -- try to start the replica, ha-ha -- (replication should fail, some rows are missing) -- -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") --- - true ... diff --git a/test/xlog/panic_on_wal_error.test.lua b/test/xlog/panic_on_wal_error.test.lua index eea6aad300..77bcde7877 100644 --- a/test/xlog/panic_on_wal_error.test.lua +++ b/test/xlog/panic_on_wal_error.test.lua @@ -57,7 +57,7 @@ box.cfg.force_recovery -- try to start the replica, ha-ha -- (replication should fail, some rows are missing) -- -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") test_run:cmd("switch replica") -- Need to wait for box.info.replication[1] defined, otherwise test-run fails to -- wait for the upstream status sometimes. -- GitLab