Skip to content
Snippets Groups Projects
Commit e7559bfe authored by Vladislav Shpilevoy's avatar Vladislav Shpilevoy
Browse files

txn_limbo: handle CONFIRM during ROLLBACK

Limbo could try to CONFIRM LSN whose ROLLBACK is in progress. This
is how it could happen:

- A synchronous transaction is created, written to WAL;
- The fiber sleeps in the limbo waiting for CONFIRM or timeout;
- Timeout happens. ROLLBACK for this and all next LSNs is sent to
  WAL;
- Replica receives the transaction, sends ACK;
- Master receives ACK, starts writing CONFIRM for the LSN, whose
  ROLLBACK is in progress right now.

Another case is an attempt to lower the synchro quorum during a
ROLLBACK write. It could also try to write CONFIRM.

The patch skips CONFIRM if there is a ROLLBACK in progress. It is
not even necessary to check LSNs, because ROLLBACK always reverts
the entire limbo queue: it cancels all pending transactions with
all LSNs, and new commits are rolled back even before they try to
go to WAL. CONFIRM can't help with anything at that point.

Part of #5185
parent 849ba7dc
No related branches found
No related tags found
No related merge requests found
......@@ -336,6 +336,7 @@ static void
txn_limbo_write_confirm(struct txn_limbo *limbo, int64_t lsn)
{
/* Only an LSN strictly newer than the last confirmed one may be confirmed. */
assert(lsn > limbo->confirmed_lsn);
/*
 * Callers must not get here while the limbo is in rollback; they are
 * expected to check limbo->is_in_rollback and skip the CONFIRM themselves.
 */
assert(!limbo->is_in_rollback);
/* Remember the new confirmed LSN before issuing the write. */
limbo->confirmed_lsn = lsn;
/*
 * NOTE(review): the trailing `true` presumably selects CONFIRM (as opposed
 * to ROLLBACK) in the shared helper - verify against its definition.
 */
txn_limbo_write_confirm_rollback(limbo, lsn, true);
}
......@@ -438,6 +439,18 @@ txn_limbo_ack(struct txn_limbo *limbo, uint32_t replica_id, int64_t lsn)
{
if (rlist_empty(&limbo->queue))
return;
/*
* If the limbo is currently writing a rollback, the whole queue is
* going to be rolled back. A rollback is written only on timeout, and
* the timeout always happens first for the oldest entry, i.e. the
* first entry in the queue. The rollback then clears all the newer
* entries as well, so the whole queue is effectively dead already.
* It would be strange to write CONFIRM for rolled back LSNs, even
* though it probably wouldn't break anything: there would just be 2
* conflicting decisions for the same LSNs.
*/
if (limbo->is_in_rollback)
return;
assert(limbo->instance_id != REPLICA_ID_NIL);
int64_t prev_lsn = vclock_get(&limbo->vclock, replica_id);
vclock_follow(&limbo->vclock, replica_id, lsn);
......@@ -601,7 +614,7 @@ txn_limbo_on_parameters_change(struct txn_limbo *limbo)
assert(confirm_lsn > 0);
}
}
if (confirm_lsn > limbo->confirmed_lsn) {
if (confirm_lsn > limbo->confirmed_lsn && !limbo->is_in_rollback) {
txn_limbo_write_confirm(limbo, confirm_lsn);
txn_limbo_read_confirm(limbo, confirm_lsn);
}
......
......@@ -368,6 +368,153 @@ box.space.sync:select{}
| - - [1]
| ...
--
-- See what happens when the quorum is collected during writing ROLLBACK.
-- CONFIRM for the same LSN should not be written.
--
test_run:switch('default')
| ---
| - true
| ...
box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
| ---
| ...
box.space.sync:truncate()
| ---
| ...
-- Write something to flush the master's state to the replica.
_ = box.space.sync:insert({1})
| ---
| ...
_ = box.space.sync:delete({1})
| ---
| ...
test_run:switch('replica')
| ---
| - true
| ...
-- Block WAL write to block ACK sending.
box.error.injection.set("ERRINJ_WAL_DELAY", true)
| ---
| - ok
| ...
test_run:switch('default')
| ---
| - true
| ...
-- Set a trap for ROLLBACK write so as the txn itself won't hang, but ROLLBACK
-- will.
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
| ---
| - ok
| ...
box.cfg{replication_synchro_timeout = 0.001}
| ---
| ...
lsn = box.info.lsn
| ---
| ...
ok, err = nil
| ---
| ...
f = fiber.create(function() \
ok, err = pcall(box.space.sync.replace, box.space.sync, {1}) \
end)
| ---
| ...
-- Wait ROLLBACK WAL write start.
test_run:wait_cond(function() \
return box.error.injection.get("ERRINJ_WAL_DELAY") \
end)
| ---
| - true
| ...
-- The transaction is written to WAL. ROLLBACK is not yet.
lsn = lsn + 1
| ---
| ...
assert(box.info.lsn == lsn)
| ---
| - true
| ...
test_run:switch('replica')
| ---
| - true
| ...
-- Let ACKs go. Master will receive ACK, but shouldn't try to CONFIRM. Because
-- ROLLBACK for the same LSN is in progress right now already.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
| ---
| - ok
| ...
test_run:switch('default')
| ---
| - true
| ...
-- Wait ACK receipt.
function wait_lsn_ack(id, lsn) \
local this_id = box.info.id \
test_run:wait_downstream(id, {status='follow'}) \
test_run:wait_cond(function() \
return box.info.replication[id].downstream.vclock[this_id] >= lsn \
end) \
end
| ---
| ...
replica_id = test_run:get_server_id('replica')
| ---
| ...
wait_lsn_ack(replica_id, lsn)
| ---
| ...
-- See if parameters change will try to write CONFIRM.
box.cfg{replication_synchro_quorum = 1}
| ---
| ...
box.cfg{replication_synchro_quorum = 2}
| ---
| ...
-- Let ROLLBACK go and finish the test.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
| ---
| - ok
| ...
test_run:wait_cond(function() return f:status() == 'dead' end)
| ---
| - true
| ...
ok, err
| ---
| - false
| - Quorum collection for a synchronous transaction is timed out
| ...
box.cfg{replication_synchro_timeout = 1000}
| ---
| ...
box.space.sync:replace{2}
| ---
| - [2]
| ...
box.space.sync:select{}
| ---
| - - [2]
| ...
test_run:switch('replica')
| ---
| - true
| ...
box.space.sync:select{}
| ---
| - - [2]
| ...
test_run:cmd('switch default')
| ---
| - true
......
......@@ -145,6 +145,71 @@ box.space.sync:select{}
test_run:switch('replica')
box.space.sync:select{}
--
-- See what happens when the quorum is collected during writing ROLLBACK.
-- CONFIRM for the same LSN should not be written.
--
test_run:switch('default')
box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
box.space.sync:truncate()
-- Write something to flush the master's state to the replica.
_ = box.space.sync:insert({1})
_ = box.space.sync:delete({1})
test_run:switch('replica')
-- Block WAL write to block ACK sending.
box.error.injection.set("ERRINJ_WAL_DELAY", true)
test_run:switch('default')
-- Set a trap for ROLLBACK write so as the txn itself won't hang, but ROLLBACK
-- will.
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
box.cfg{replication_synchro_timeout = 0.001}
lsn = box.info.lsn
ok, err = nil
f = fiber.create(function() \
ok, err = pcall(box.space.sync.replace, box.space.sync, {1}) \
end)
-- Wait ROLLBACK WAL write start.
test_run:wait_cond(function() \
return box.error.injection.get("ERRINJ_WAL_DELAY") \
end)
-- The transaction is written to WAL. ROLLBACK is not yet.
lsn = lsn + 1
assert(box.info.lsn == lsn)
test_run:switch('replica')
-- Let ACKs go. Master will receive ACK, but shouldn't try to CONFIRM. Because
-- ROLLBACK for the same LSN is in progress right now already.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
test_run:switch('default')
-- Wait ACK receipt.
function wait_lsn_ack(id, lsn) \
local this_id = box.info.id \
test_run:wait_downstream(id, {status='follow'}) \
test_run:wait_cond(function() \
return box.info.replication[id].downstream.vclock[this_id] >= lsn \
end) \
end
replica_id = test_run:get_server_id('replica')
wait_lsn_ack(replica_id, lsn)
-- See if parameters change will try to write CONFIRM.
box.cfg{replication_synchro_quorum = 1}
box.cfg{replication_synchro_quorum = 2}
-- Let ROLLBACK go and finish the test.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
test_run:wait_cond(function() return f:status() == 'dead' end)
ok, err
box.cfg{replication_synchro_timeout = 1000}
box.space.sync:replace{2}
box.space.sync:select{}
test_run:switch('replica')
box.space.sync:select{}
test_run:cmd('switch default')
box.cfg{ \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment