diff --git a/src/box/applier.cc b/src/box/applier.cc index 6bfe5a99a1ced9bbd4f6b0532f53315a02163188..9aa951c3401cc619266017cdf253e88719ccbd5a 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -305,7 +305,7 @@ applier_join(struct applier *applier) * server is 1.6. Since we have * not initialized replication * vclock yet, do it now. In 1.7+ - * this vlcock is not used. + * this vclock is not used. */ xrow_decode_vclock_xc(&row, &replicaset.vclock); } @@ -370,6 +370,7 @@ applier_subscribe(struct applier *applier) struct ev_io *coio = &applier->io; struct ibuf *ibuf = &applier->ibuf; struct xrow_header row; + struct vclock remote_vclock_at_subscribe; xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID, &replicaset.vclock); @@ -411,9 +412,8 @@ applier_subscribe(struct applier *applier) * In case of successful subscribe, the server * responds with its current vclock. */ - struct vclock vclock; - vclock_create(&vclock); - xrow_decode_vclock_xc(&row, &vclock); + vclock_create(&remote_vclock_at_subscribe); + xrow_decode_vclock_xc(&row, &remote_vclock_at_subscribe); } /** * Tarantool < 1.6.7: @@ -452,8 +452,15 @@ applier_subscribe(struct applier *applier) applier_set_state(applier, APPLIER_FOLLOW); } + /* + * Stay 'orphan' until appliers catch up with + * the remote vclock at the time of SUBSCRIBE + * and the lag is less than configured. + */ if (applier->state == APPLIER_SYNC && - applier->lag <= replication_sync_lag) { + applier->lag <= replication_sync_lag && + vclock_compare(&remote_vclock_at_subscribe, + &replicaset.vclock) <= 0) { /* Applier is synced, switch to "follow". */ applier_set_state(applier, APPLIER_FOLLOW); } diff --git a/src/box/relay.cc b/src/box/relay.cc index 2bd05ad5f243ba0e1dfd6455b49604299a499414..d2ceaf1108b4c059379866a126394e7b258827f8 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -110,6 +110,11 @@ struct relay { struct vclock recv_vclock; /** Replicatoin slave version. */ uint32_t version_id; + /** + * Local vclock at the moment of subscribe, used to check + * dataset on the other side and send missing data rows if any. + */ + struct vclock local_vclock_at_subscribe; /** Relay endpoint */ struct cbus_endpoint endpoint; @@ -541,6 +546,7 @@ relay_subscribe(int fd, uint64_t sync, struct replica *replica, relay.version_id = replica_version_id; relay.replica = replica; replica_set_relay(replica, &relay); + vclock_copy(&relay.local_vclock_at_subscribe, &replicaset.vclock); int rc = cord_costart(&relay.cord, tt_sprintf("relay_%p", &relay), relay_subscribe_f, &relay); @@ -583,10 +589,16 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet) /* * We're feeding a WAL, thus responding to SUBSCRIBE request. * In that case, only send a row if it is not from the same replica - * (i.e. don't send replica's own rows back). + * (i.e. don't send replica's own rows back) or if this row is + * missing on the other side (i.e. in case of sudden power-loss, + * data was not written to WAL, so remote master can't recover + * it). In the latter case packet's LSN is less than or equal to + * local master's LSN at the moment it received 'SUBSCRIBE' request. */ if (relay->replica == NULL || - packet->replica_id != relay->replica->id) { + packet->replica_id != relay->replica->id || + packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe, + packet->replica_id)) { relay_send(relay, packet); } } diff --git a/src/box/vinyl.c b/src/box/vinyl.c index 8abfaac9fca04c2cd11a811356acf72dd6f4b894..d2cd61f00d1c450c206c5dd7b54638951069f72b 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -2324,6 +2324,16 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn) vinyl_check_wal(env, "DML") != 0) return -1; + /* + * The configured memory limit will never allow us to commit + * this transaction. Fail early. + */ + if (tx->write_size > env->quota.limit) { + diag_set(OutOfMemory, tx->write_size, + "lsregion", "vinyl transaction"); + return -1; + } + /* * Do not abort join/subscribe on quota timeout - replication * is asynchronous anyway and there's box.info.replication diff --git a/src/box/wal.cc b/src/box/wal.cc index 4576cfe0956bdada6c51600a5d266a69b1055430..099c70caaf509b1cb2c9b13c5806dbd4ef120e26 100644 --- a/src/box/wal.cc +++ b/src/box/wal.cc @@ -770,8 +770,19 @@ wal_write(struct journal *journal, struct journal_entry *entry) * and promote vclock. */ if ((*last)->replica_id == instance_id) { - vclock_follow(&replicaset.vclock, instance_id, - (*last)->lsn); + /* + * In master-master configuration, during sudden + * power-loss, if the data have not been written + * to WAL but have already been sent to others, + * they will send the data back. In this case + * vclock has already been promoted by applier. + */ + if (vclock_get(&replicaset.vclock, + instance_id) < (*last)->lsn) { + vclock_follow(&replicaset.vclock, + instance_id, + (*last)->lsn); + } break; } --last; diff --git a/test/replication/catch.result b/test/replication/catch.result index 7d61ad26f43223c6096551834268bfc54d09a398..91be327258f68b1abb539b0cd6f40af9bb0dda49 100644 --- a/test/replication/catch.result +++ b/test/replication/catch.result @@ -19,11 +19,11 @@ errinj = box.error.injection box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") --- - true ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='1'") --- - true ... @@ -69,7 +69,7 @@ errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) --- - ok ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='0.01'") --- - true ... @@ -77,12 +77,6 @@ test_run:cmd("switch replica") --- - true ... -fiber = require('fiber') ---- -... -while box.space.test:count() < 1 do fiber.sleep(0.01) end ---- -... -- Check that replica doesn't enter read-write mode before -- catching up with the master: to check that we inject sleep into -- the master relay_send function and attempt a data modifying @@ -99,8 +93,9 @@ box.space.test ~= nil ... d = box.space.test:delete{1} --- +- error: Can't modify data because this instance is in read-only mode. ... -box.space.test:get(1) == nil +box.space.test:get(1) ~= nil --- - true ... @@ -116,7 +111,11 @@ test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) --- ... -c.space.test:get(1) == nil +d = c.space.test:delete{1} +--- +- error: Can't modify data because this instance is in read-only mode. +... +c.space.test:get(1) ~= nil --- - true ... diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua index cb865aa3caea307dbd09a309edac932ac3710cc0..2e2e97bc4d9cf9f582e7f0ed8472ff8e78ebe2b8 100644 --- a/test/replication/catch.test.lua +++ b/test/replication/catch.test.lua @@ -8,8 +8,8 @@ net_box = require('net.box') errinj = box.error.injection box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") +test_run:cmd("start server replica with args='1'") test_run:cmd("switch replica") test_run:cmd("switch default") @@ -29,11 +29,9 @@ for i=1,100 do s:insert{i, 'this is test message12345'} end -- sleep after every tuple errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='0.01'") test_run:cmd("switch replica") -fiber = require('fiber') -while box.space.test:count() < 1 do fiber.sleep(0.01) end -- Check that replica doesn't enter read-write mode before -- catching up with the master: to check that we inject sleep into -- the master relay_send function and attempt a data modifying @@ -46,14 +44,15 @@ while box.space.test:count() < 1 do fiber.sleep(0.01) end -- box.space.test ~= nil d = box.space.test:delete{1} -box.space.test:get(1) == nil +box.space.test:get(1) ~= nil -- case #2: delete tuple by net.box test_run:cmd("switch default") test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) -c.space.test:get(1) == nil +d = c.space.test:delete{1} +c.space.test:get(1) ~= nil -- check sync errinj.set("ERRINJ_RELAY_TIMEOUT", 0) diff --git a/test/replication/recover_missing_xlog.result b/test/replication/recover_missing_xlog.result new file mode 100644 index 0000000000000000000000000000000000000000..027f8761ef5d6907dc1b1a99257b3bdb1c7f70ae --- /dev/null +++ b/test/replication/recover_missing_xlog.result @@ -0,0 +1,113 @@ +env = require('test_run') +--- +... +test_run = env.new() +--- +... +SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +--- +... +-- Start servers +test_run:create_cluster(SERVERS) +--- +... +-- Wait for full mesh +test_run:wait_fullmesh(SERVERS) +--- +... +test_run:cmd("switch autobootstrap1") +--- +- true +... +for i = 0, 9 do box.space.test:insert{i, 'test' .. i} end +--- +... +box.space.test:count() +--- +- 10 +... +test_run:cmd('switch default') +--- +- true +... +vclock1 = test_run:get_vclock('autobootstrap1') +--- +... +vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock1) +--- +... +test_run:cmd("switch autobootstrap2") +--- +- true +... +box.space.test:count() +--- +- 10 +... +box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01) +--- +- ok +... +test_run:cmd("stop server autobootstrap1") +--- +- true +... +fio = require('fio') +--- +... +-- This test checks ability to recover missing local data +-- from remote replica. See #3210. +-- Delete data on first master and test that after restart, +-- due to difference in vclock it will be able to recover +-- all missing data from replica. +-- Also check that there is no concurrency, i.e. master is +-- in 'read-only' mode unless it receives all data. +fio.unlink(fio.pathjoin(fio.abspath("."), string.format('autobootstrap1/%020d.xlog', 8))) +--- +- true +... +test_run:cmd("start server autobootstrap1") +--- +- true +... +test_run:cmd("switch autobootstrap1") +--- +- true +... +for i = 10, 19 do box.space.test:insert{i, 'test' .. i} end +--- +... +fiber = require('fiber') +--- +... +box.space.test:select() +--- +- - [0, 'test0'] + - [1, 'test1'] + - [2, 'test2'] + - [3, 'test3'] + - [4, 'test4'] + - [5, 'test5'] + - [6, 'test6'] + - [7, 'test7'] + - [8, 'test8'] + - [9, 'test9'] + - [10, 'test10'] + - [11, 'test11'] + - [12, 'test12'] + - [13, 'test13'] + - [14, 'test14'] + - [15, 'test15'] + - [16, 'test16'] + - [17, 'test17'] + - [18, 'test18'] + - [19, 'test19'] +... +-- Cleanup. +test_run:cmd('switch default') +--- +- true +... +test_run:drop_cluster(SERVERS) +--- +... diff --git a/test/replication/recover_missing_xlog.test.lua b/test/replication/recover_missing_xlog.test.lua new file mode 100644 index 0000000000000000000000000000000000000000..57bc7d31f93166e2421a744f824b8ad065225bf8 --- /dev/null +++ b/test/replication/recover_missing_xlog.test.lua @@ -0,0 +1,40 @@ +env = require('test_run') +test_run = env.new() + +SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +-- Start servers +test_run:create_cluster(SERVERS) +-- Wait for full mesh +test_run:wait_fullmesh(SERVERS) + +test_run:cmd("switch autobootstrap1") +for i = 0, 9 do box.space.test:insert{i, 'test' .. i} end +box.space.test:count() + +test_run:cmd('switch default') +vclock1 = test_run:get_vclock('autobootstrap1') +vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock1) + +test_run:cmd("switch autobootstrap2") +box.space.test:count() +box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01) +test_run:cmd("stop server autobootstrap1") +fio = require('fio') +-- This test checks ability to recover missing local data +-- from remote replica. See #3210. +-- Delete data on first master and test that after restart, +-- due to difference in vclock it will be able to recover +-- all missing data from replica. +-- Also check that there is no concurrency, i.e. master is +-- in 'read-only' mode unless it receives all data. +fio.unlink(fio.pathjoin(fio.abspath("."), string.format('autobootstrap1/%020d.xlog', 8))) +test_run:cmd("start server autobootstrap1") + +test_run:cmd("switch autobootstrap1") +for i = 10, 19 do box.space.test:insert{i, 'test' .. i} end +fiber = require('fiber') +box.space.test:select() + +-- Cleanup. +test_run:cmd('switch default') +test_run:drop_cluster(SERVERS) diff --git a/test/replication/suite.ini b/test/replication/suite.ini index ee76a3b0073ae8900c3707a76275dafd65ceecc6..b489add5810822a40807ae2c9227075a0c07f00f 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -3,7 +3,7 @@ core = tarantool script = master.lua description = tarantool/box, replication disabled = consistent.test.lua -release_disabled = catch.test.lua errinj.test.lua gc.test.lua before_replace.test.lua quorum.test.lua +release_disabled = catch.test.lua errinj.test.lua gc.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua config = suite.cfg lua_libs = lua/fast_replica.lua long_run = prune.test.lua diff --git a/test/vinyl/quota_timeout.result b/test/vinyl/quota_timeout.result index 1e66d39280280152aa3c8a96c0faac65d67c246b..e4bced02a22db10e09bbdffc5080c6cd112884f8 100644 --- a/test/vinyl/quota_timeout.result +++ b/test/vinyl/quota_timeout.result @@ -108,7 +108,7 @@ box.error.injection.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0) --- - ok ... -s:drop() +s:truncate() --- ... box.snapshot() @@ -116,11 +116,36 @@ box.snapshot() - ok ... -- --- Check that exceeding quota triggers dump of all spaces. +-- Check that exceeding quota doesn't hang the scheduler +-- in case there's nothing to dump. -- -box.cfg{vinyl_timeout=0.01} +-- The following operation should fail instantly irrespective +-- of the value of 'vinyl_timeout' (gh-3291). +-- +box.info.vinyl().quota.used == 0 +--- +- true +... +box.cfg{vinyl_timeout = 9000} +--- +... +pad = string.rep('x', box.cfg.vinyl_memory) +--- +... +_ = s:auto_increment{pad} +--- +- error: Failed to allocate 1048615 bytes in lsregion for vinyl transaction +... +s:drop() +--- +... +box.snapshot() --- +- ok ... +-- +-- Check that exceeding quota triggers dump of all spaces. +-- s1 = box.schema.space.create('test1', {engine = 'vinyl'}) --- ... @@ -133,34 +158,28 @@ s2 = box.schema.space.create('test2', {engine = 'vinyl'}) _ = s2:create_index('pk') --- ... -_ = s1:auto_increment{} +pad = string.rep('x', 64) --- ... -box.info.vinyl().quota.used +_ = s1:auto_increment{pad} --- -- 49186 ... -pad = string.rep('x', box.cfg.vinyl_memory) +s1.index.pk:info().memory.bytes > 0 --- +- true ... -_ = s2:auto_increment{pad} +pad = string.rep('x', box.cfg.vinyl_memory - string.len(pad)) --- -- error: Timed out waiting for Vinyl memory quota ... -while box.info.vinyl().quota.used > 0 do fiber.sleep(0.01) end +_ = s2:auto_increment{pad} --- ... -box.info.vinyl().quota.used +while s1.index.pk:info().disk.dump.count == 0 do fiber.sleep(0.01) end --- -- 0 ... --- --- Check that exceeding quota doesn't hang the scheduler --- in case there's nothing to dump. --- -s2:auto_increment{pad} +s1.index.pk:info().memory.bytes == 0 --- -- error: Timed out waiting for Vinyl memory quota +- true ... test_run:cmd('switch default') --- diff --git a/test/vinyl/quota_timeout.test.lua b/test/vinyl/quota_timeout.test.lua index ed5ba79d7cbff223c4d3b05e6f17d26a0a41acea..c3d17b44c35f24e6c0ebb48057f8423dd9e9f2ea 100644 --- a/test/vinyl/quota_timeout.test.lua +++ b/test/vinyl/quota_timeout.test.lua @@ -50,33 +50,41 @@ test_run:cmd("clear filter") box.error.injection.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0) +s:truncate() +box.snapshot() + +-- +-- Check that exceeding quota doesn't hang the scheduler +-- in case there's nothing to dump. +-- +-- The following operation should fail instantly irrespective +-- of the value of 'vinyl_timeout' (gh-3291). +-- +box.info.vinyl().quota.used == 0 +box.cfg{vinyl_timeout = 9000} +pad = string.rep('x', box.cfg.vinyl_memory) +_ = s:auto_increment{pad} + s:drop() box.snapshot() -- -- Check that exceeding quota triggers dump of all spaces. -- -box.cfg{vinyl_timeout=0.01} - s1 = box.schema.space.create('test1', {engine = 'vinyl'}) _ = s1:create_index('pk') s2 = box.schema.space.create('test2', {engine = 'vinyl'}) _ = s2:create_index('pk') -_ = s1:auto_increment{} -box.info.vinyl().quota.used +pad = string.rep('x', 64) +_ = s1:auto_increment{pad} +s1.index.pk:info().memory.bytes > 0 -pad = string.rep('x', box.cfg.vinyl_memory) +pad = string.rep('x', box.cfg.vinyl_memory - string.len(pad)) _ = s2:auto_increment{pad} -while box.info.vinyl().quota.used > 0 do fiber.sleep(0.01) end -box.info.vinyl().quota.used - --- --- Check that exceeding quota doesn't hang the scheduler --- in case there's nothing to dump. --- -s2:auto_increment{pad} +while s1.index.pk:info().disk.dump.count == 0 do fiber.sleep(0.01) end +s1.index.pk:info().memory.bytes == 0 test_run:cmd('switch default') test_run:cmd("stop server test") diff --git a/third_party/luarocks b/third_party/luarocks index 6e6fe62d9409fe2103c0fd091cccb3da0451faf5..b2dde9a4cede89aa985e26aa339e4ea340199101 160000 --- a/third_party/luarocks +++ b/third_party/luarocks @@ -1 +1 @@ -Subproject commit 6e6fe62d9409fe2103c0fd091cccb3da0451faf5 +Subproject commit b2dde9a4cede89aa985e26aa339e4ea340199101 diff --git a/third_party/tarantool_ev.h b/third_party/tarantool_ev.h index 5ec22ff6a2dd8d62497697bf761efdda763c417e..2fea76e6b52c8ad39d023beac10b0f6ecac36a1f 100644 --- a/third_party/tarantool_ev.h +++ b/third_party/tarantool_ev.h @@ -48,6 +48,10 @@ #define EV_FORK_ENABLE 1 #define EV_CONFIG_H 0 #define EV_USE_FLOOR 1 +#ifdef HAVE_CLOCK_GETTIME_DECL +# define EV_USE_REALTIME 1 +# define EV_USE_MONOTONIC 1 +#endif #include "third_party/libev/ev.h" #else /* !defined(ENABLE_BUNDLED_LIBEV) */ #include <ev.h>