From 6cc1b1f2572f7b364a57b814d374afd31f508d60 Mon Sep 17 00:00:00 2001
From: Serge Petrenko <sergepetrenko@tarantool.org>
Date: Wed, 15 Jun 2022 17:57:56 +0300
Subject: [PATCH] txn_limbo: do not confirm/rollback anything after restart

It's important for the synchro queue owner not to finalize any of the
pending synchronous transactions after a restart. Since the node was down
for some time, the chances are pretty high that it was deposed by some new
leader during its downtime. This means the node might not know yet that
its transactions were already finalized by someone else, so any arbitrary
finalization might lead to a future split-brain once the remote PROMOTE
finally reaches the local node.

Let's fix this by adding a new reason for the limbo to be frozen: the
queue owner has recovered but has neither issued a new PROMOTE locally nor
received any PROMOTE requests from remote nodes. Once the first PROMOTE is
issued or received, it's safe to return to the old mode of operation.

So now the synchro queue owner starts in the "frozen" state and can't
CONFIRM, ROLLBACK or issue new transactions until it either issues a
PROMOTE or receives one from some remote node.

This also required modifying box.ctl.promote() behaviour: it's no longer
a no-op on a synchro queue owner when elections are disabled and the queue
is frozen due to a restart.

Also fix the tests, which assumed the queue owner is writeable after a
restart. The gh-5298 test was partially deleted, because it became
pointless. And while we are at it, remove the double run of the gh-5288
test: it is storage-engine agnostic, so there's no point in running it for
both memtx and vinyl.

Part-of #5295

NO_CHANGELOG=covered by previous commit

@TarantoolBot document
Title: ER_READONLY error receives new reasons

When box.info.ro_reason is "synchro" and some operation throws an
ER_READONLY error, this error may now include the following reason:

```
Can't modify data on a read-only instance - synchro queue with term 2 belongs to 1 (06c05d18-456e-4db3-ac4c-b8d0f291fd92) and is frozen due to fencing
```

This means that the current instance is indeed the synchro queue owner,
but it has noticed that someone else in the cluster might start new
elections or overtake the synchro queue soon. This may also be detected by
`box.info.election.term` becoming greater than
`box.info.synchro.queue.term` (this is the case for the second error
message).

There is also a slightly different error message:

```
Can't modify data on a read-only instance - synchro queue with term 2 belongs to 1 (06c05d18-456e-4db3-ac4c-b8d0f291fd92) and is frozen until promotion
```

This means that the node simply cannot guarantee that it is still the
synchro queue owner (for example, after a restart, when the node still
thinks it is the queue owner, but someone else in the cluster has already
overtaken the queue).
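For illustration only, here is a rough console sketch of the documented
behaviour (not part of the patch; the synchronous space name `sync` and
the single-instance setup are assumptions):

```
-- Hypothetical session on the former synchro queue owner right after it
-- restarts; 'sync' is an example synchronous space created earlier.
box.info.ro_reason
-- -> 'synchro': the limbo is frozen until promotion.

box.space.sync:replace{1}
-- -> error: Can't modify data on a read-only instance - synchro queue
--    with term ... and is frozen until promotion

-- Issuing a local PROMOTE (or receiving one from a remote node)
-- unfreezes the queue and makes the instance writable again:
box.ctl.promote()
box.space.sync:replace{1}
-- -> [1]
```

This mirrors the test updates in the diff below, where each test now
calls box.ctl.promote() after the restart before touching its
synchronous space.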
---
 src/box/box.cc                                | 15 +++++-
 src/box/txn_limbo.c                           | 19 +++++--
 src/box/txn_limbo.h                           |  6 +++
 .../gh-5140-qsync-casc-rollback.result        |  4 ++
 .../gh-5140-qsync-casc-rollback.test.lua      |  2 +
 .../gh-5163-qsync-restart-crash.result        |  3 ++
 .../gh-5163-qsync-restart-crash.test.lua      |  1 +
 .../replication/gh-5288-qsync-recovery.result |  3 ++
 .../gh-5288-qsync-recovery.test.lua           |  1 +
 .../gh-5298-qsync-recovery-snap.result        | 50 ++-----------------
 .../gh-5298-qsync-recovery-snap.test.lua      | 26 ++--------
 .../gh-5874-qsync-txn-recovery.result         |  4 ++
 .../gh-5874-qsync-txn-recovery.test.lua       |  2 +
 test/replication/suite.cfg                    |  1 +
 14 files changed, 63 insertions(+), 74 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index cc0b8ed739..ed2627771d 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -271,6 +271,15 @@ box_check_writable(void)
             error_set_uuid(e, "queue_owner_uuid", &r->uuid);
             error_append_msg(e, " (%s)", tt_uuid_str(&r->uuid));
         }
+        if (txn_limbo.owner_id == instance_id) {
+            if (txn_limbo.is_frozen_due_to_fencing) {
+                error_append_msg(e, " and is frozen due to "
+                                 "fencing");
+            } else if (txn_limbo.is_frozen_until_promotion) {
+                error_append_msg(e, " and is frozen until "
+                                 "promotion");
+            }
+        }
     } else {
         if (is_ro)
             error_append_msg(e, "box.cfg.read_only is true");
@@ -2005,8 +2014,10 @@ box_promote(void)
      * Currently active leader (the instance that is seen as leader by both
      * raft and txn_limbo) can't issue another PROMOTE.
      */
-    bool is_leader = txn_limbo_replica_term(&txn_limbo, instance_id) ==
-                     raft->term && txn_limbo.owner_id == instance_id;
+    bool is_leader =
+        txn_limbo_replica_term(&txn_limbo, instance_id) == raft->term &&
+        txn_limbo.owner_id == instance_id &&
+        !txn_limbo.is_frozen_until_promotion;
     if (box_election_mode != ELECTION_MODE_OFF)
         is_leader = is_leader && raft->state == RAFT_STATE_LEADER;
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index d85e33bb2c..f8382243cb 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -54,6 +54,7 @@ txn_limbo_create(struct txn_limbo *limbo)
     limbo->is_in_rollback = false;
     limbo->svp_confirmed_lsn = -1;
     limbo->frozen_reasons = 0;
+    limbo->is_frozen_until_promotion = true;
 }
 
 static inline bool
@@ -830,6 +831,16 @@ txn_limbo_req_rollback(struct txn_limbo *limbo,
     }
 }
 
+/** Unfreeze the limbo encountering the first new PROMOTE after a restart. */
+static inline void
+txn_limbo_unfreeze_on_first_promote(struct txn_limbo *limbo)
+{
+    if (box_is_configured()) {
+        limbo->is_frozen_until_promotion = false;
+        box_update_ro_summary();
+    }
+}
+
 void
 txn_limbo_req_commit(struct txn_limbo *limbo, const struct synchro_request *req)
 {
@@ -854,9 +865,11 @@ txn_limbo_req_commit(struct txn_limbo *limbo, const struct synchro_request *req)
         vclock_follow(&limbo->promote_term_map, origin, term);
         if (term > limbo->promote_greatest_term) {
             limbo->promote_greatest_term = term;
-            if (iproto_type_is_promote_request(req->type) &&
-                term >= box_raft()->volatile_term)
-                txn_limbo_unfence(&txn_limbo);
+            if (iproto_type_is_promote_request(req->type)) {
+                if (term >= box_raft()->volatile_term)
+                    txn_limbo_unfence(limbo);
+                txn_limbo_unfreeze_on_first_promote(&txn_limbo);
+            }
         }
     } else if (iproto_type_is_promote_request(req->type) &&
                limbo->promote_greatest_term > 1) {
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index 095c965a0c..7167065c22 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -207,6 +207,12 @@ struct txn_limbo {
              * remote instance.
              */
            bool is_frozen_due_to_fencing : 1;
+           /*
+            * This mode is always on upon node start and is turned
+            * off by any new PROMOTE arriving either via
+            * replication or issued by the node.
+            */
+           bool is_frozen_until_promotion : 1;
         };
     };
 };
diff --git a/test/replication/gh-5140-qsync-casc-rollback.result b/test/replication/gh-5140-qsync-casc-rollback.result
index d3208e1a4f..a71d8f9818 100644
--- a/test/replication/gh-5140-qsync-casc-rollback.result
+++ b/test/replication/gh-5140-qsync-casc-rollback.result
@@ -206,6 +206,10 @@ box.space.sync:select{}
  | - [4]
  | ...
 
+box.ctl.promote()
+ | ---
+ | ...
+
 box.space.sync:drop()
  | ---
  | ...
diff --git a/test/replication/gh-5140-qsync-casc-rollback.test.lua b/test/replication/gh-5140-qsync-casc-rollback.test.lua
index 96ddfd2609..15732e3324 100644
--- a/test/replication/gh-5140-qsync-casc-rollback.test.lua
+++ b/test/replication/gh-5140-qsync-casc-rollback.test.lua
@@ -97,6 +97,8 @@ test_run:switch('default')
 box.space.async:select{}
 box.space.sync:select{}
 
+box.ctl.promote()
+
 box.space.sync:drop()
 box.space.async:drop()
diff --git a/test/replication/gh-5163-qsync-restart-crash.result b/test/replication/gh-5163-qsync-restart-crash.result
index 1b4d3d9b5c..79bf96e474 100644
--- a/test/replication/gh-5163-qsync-restart-crash.result
+++ b/test/replication/gh-5163-qsync-restart-crash.result
@@ -30,6 +30,9 @@ box.space.sync:select{}
  | ---
  | - - [1]
  | ...
+box.ctl.promote()
+ | ---
+ | ...
 box.space.sync:drop()
  | ---
  | ...
diff --git a/test/replication/gh-5163-qsync-restart-crash.test.lua b/test/replication/gh-5163-qsync-restart-crash.test.lua
index c8d54aad23..0298d8ce6e 100644
--- a/test/replication/gh-5163-qsync-restart-crash.test.lua
+++ b/test/replication/gh-5163-qsync-restart-crash.test.lua
@@ -12,5 +12,6 @@ box.ctl.promote()
 box.space.sync:replace{1}
 test_run:cmd('restart server default')
 box.space.sync:select{}
+box.ctl.promote()
 box.space.sync:drop()
 box.ctl.demote()
diff --git a/test/replication/gh-5288-qsync-recovery.result b/test/replication/gh-5288-qsync-recovery.result
index 704b71d930..dc796181d5 100644
--- a/test/replication/gh-5288-qsync-recovery.result
+++ b/test/replication/gh-5288-qsync-recovery.result
@@ -25,6 +25,9 @@ box.snapshot()
  | ...
 test_run:cmd('restart server default')
  | 
+box.ctl.promote()
+ | ---
+ | ...
 box.space.sync:drop()
  | ---
  | ...
diff --git a/test/replication/gh-5288-qsync-recovery.test.lua b/test/replication/gh-5288-qsync-recovery.test.lua
index 2455f7278a..095bc71f9b 100644
--- a/test/replication/gh-5288-qsync-recovery.test.lua
+++ b/test/replication/gh-5288-qsync-recovery.test.lua
@@ -9,5 +9,6 @@ box.ctl.promote()
 s:insert{1}
 box.snapshot()
 test_run:cmd('restart server default')
+box.ctl.promote()
 box.space.sync:drop()
 box.ctl.demote()
diff --git a/test/replication/gh-5298-qsync-recovery-snap.result b/test/replication/gh-5298-qsync-recovery-snap.result
index 0883fe5f5e..52e13d75d0 100644
--- a/test/replication/gh-5298-qsync-recovery-snap.result
+++ b/test/replication/gh-5298-qsync-recovery-snap.result
@@ -43,58 +43,16 @@ box.snapshot()
 test_run:cmd("restart server default")
  | 
--- Could hang if the limbo would incorrectly handle the snapshot end.
-box.space.sync:replace{11}
+-- Would be non-empty if limbo would incorrectly handle the snapshot end.
+box.info.synchro.queue.len
  | ---
- | - [11]
+ | - 0
  | ...
-old_synchro_quorum = box.cfg.replication_synchro_quorum
- | ---
- | ...
-old_synchro_timeout = box.cfg.replication_synchro_timeout
- | ---
- | ...
-
-box.cfg{ \
-    replication_synchro_timeout = 0.001, \
-    replication_synchro_quorum = 2, \
-}
- | ---
- | ...
-box.space.sync:replace{12}
- | ---
- | - error: Quorum collection for a synchronous transaction is timed out
- | ...
-
-box.cfg{ \
-    replication_synchro_timeout = 1000, \
-    replication_synchro_quorum = 1, \
-}
- | ---
- | ...
-box.space.sync:replace{13}
- | ---
- | - [13]
- | ...
-box.space.sync:get({11})
- | ---
- | - [11]
- | ...
-box.space.sync:get({12})
- | ---
- | ...
-box.space.sync:get({13})
+box.ctl.promote()
  | ---
- | - [13]
  | ...
 
-box.cfg{ \
-    replication_synchro_timeout = old_synchro_timeout, \
-    replication_synchro_quorum = old_synchro_quorum, \
-}
- | ---
- | ...
 box.space.sync:drop()
  | ---
  | ...
diff --git a/test/replication/gh-5298-qsync-recovery-snap.test.lua b/test/replication/gh-5298-qsync-recovery-snap.test.lua
index 084cde963d..55d9501647 100644
--- a/test/replication/gh-5298-qsync-recovery-snap.test.lua
+++ b/test/replication/gh-5298-qsync-recovery-snap.test.lua
@@ -20,31 +20,11 @@ box.snapshot()
 test_run:cmd("restart server default")
 
--- Could hang if the limbo would incorrectly handle the snapshot end.
-box.space.sync:replace{11}
+-- Would be non-empty if limbo would incorrectly handle the snapshot end.
+box.info.synchro.queue.len
-old_synchro_quorum = box.cfg.replication_synchro_quorum
-old_synchro_timeout = box.cfg.replication_synchro_timeout
-
-box.cfg{ \
-    replication_synchro_timeout = 0.001, \
-    replication_synchro_quorum = 2, \
-}
-box.space.sync:replace{12}
-
-box.cfg{ \
-    replication_synchro_timeout = 1000, \
-    replication_synchro_quorum = 1, \
-}
-box.space.sync:replace{13}
-box.space.sync:get({11})
-box.space.sync:get({12})
-box.space.sync:get({13})
+box.ctl.promote()
 
-box.cfg{ \
-    replication_synchro_timeout = old_synchro_timeout, \
-    replication_synchro_quorum = old_synchro_quorum, \
-}
 box.space.sync:drop()
 box.space.loc:drop()
 box.ctl.demote()
diff --git a/test/replication/gh-5874-qsync-txn-recovery.result b/test/replication/gh-5874-qsync-txn-recovery.result
index 01328a9e32..11f2fc9c79 100644
--- a/test/replication/gh-5874-qsync-txn-recovery.result
+++ b/test/replication/gh-5874-qsync-txn-recovery.result
@@ -154,6 +154,10 @@ loc:select()
  | - [2]
  | - [3]
  | ...
+
+box.ctl.promote()
+ | ---
+ | ...
 async:drop()
  | ---
  | ...
diff --git a/test/replication/gh-5874-qsync-txn-recovery.test.lua b/test/replication/gh-5874-qsync-txn-recovery.test.lua
index 6ddf164ace..5bf3ba206e 100644
--- a/test/replication/gh-5874-qsync-txn-recovery.test.lua
+++ b/test/replication/gh-5874-qsync-txn-recovery.test.lua
@@ -80,6 +80,8 @@ loc = box.space.loc
 async:select()
 sync:select()
 loc:select()
+
+box.ctl.promote()
 async:drop()
 sync:drop()
 loc:drop()
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 3eee0803c5..8ed9eeb034 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -18,6 +18,7 @@
     "gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {},
     "gh-5213-qsync-applier-order.test.lua": {},
     "gh-5213-qsync-applier-order-3.test.lua": {},
+    "gh-5288-qsync-recovery.test.lua": {},
     "gh-5426-election-on-off.test.lua": {},
     "gh-5430-cluster-mvcc.test.lua": {},
     "gh-5433-election-restart-recovery.test.lua": {},
-- 
GitLab