Skip to content
Snippets Groups Projects
Commit 52268eda authored by Georgy Kirichenko's avatar Georgy Kirichenko Committed by Roman Tsisyk
Browse files

replication: reconnect applier on cfg error

Sometimes the old relay instance has not yet been deleted when a new
subscribe request from the slave arrives. In this case an invalid
configuration (ER_CFG) error is returned, and the applier should
reconnect after a timeout.

Fixes #2277
parent 7bf7a0d1
No related branches found
No related tags found
No related merge requests found
......@@ -497,6 +497,10 @@ applier_f(va_list ap)
/* System error from master instance. */
applier_log_error(applier, e);
goto reconnect;
} else if (e->errcode() == ER_CFG) {
/* Invalid configuration */
applier_log_error(applier, e);
goto reconnect;
} else {
/* Unrecoverable errors */
applier_log_error(applier, e);
......
......@@ -418,6 +418,10 @@ relay_subscribe_f(va_list ap)
/* An error has occurred while ACKs of xlog reading */
diag_move(&relay->diag, diag_get());
}
struct errinj *inj = errinj(ERRINJ_RELAY_EXIT_DELAY, ERRINJ_DOUBLE);
if (inj != NULL && inj->dparam > 0)
fiber_sleep(inj->dparam);
return diag_is_empty(diag_get()) ? 0: -1;
}
......
......@@ -104,6 +104,7 @@ struct errinj {
_(ERRINJ_VYRUN_DATA_READ, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_BUILD_SECONDARY, ERRINJ_INT, {.iparam = -1}) \
_(ERRINJ_VY_POINT_ITER_WAIT, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_RELAY_EXIT_DELAY, ERRINJ_DOUBLE, {.dparam = 0}) \
ENUM0(errinj_id, ERRINJ_LIST);
extern struct errinj errinjs[];
......
......@@ -56,6 +56,8 @@ errinj.info()
state: false
ERRINJ_WAL_ROTATE:
state: false
ERRINJ_RELAY_EXIT_DELAY:
state: 0
ERRINJ_VY_POINT_ITER_WAIT:
state: false
ERRINJ_TUPLE_FIELD:
......
......@@ -187,25 +187,9 @@ errinj.set("ERRINJ_WAL_WRITE_EOF", false)
---
- ok
...
test_run:cmd("switch replica")
---
- true
...
test_run:cmd("stop server default")
---
- true
...
test_run:cmd("deploy server default")
---
- true
...
test_run:cmd("start server default")
---
- true
...
test_run:cmd("switch default")
box.snapshot()
---
- true
- ok
...
test_run:cmd("stop server replica")
---
......@@ -218,9 +202,6 @@ test_run:cmd("cleanup server replica")
box.cfg{replication_timeout = 0.01}
---
...
box.schema.user.grant("guest", "replication")
---
...
test_run:cmd("start server replica")
---
- true
......@@ -281,6 +262,56 @@ fiber.sleep(0.01)
---
- - follow
...
test_run:cmd("switch default")
---
- true
...
test_run:cmd("stop server replica")
---
- true
...
test_run:cmd("cleanup server replica")
---
- true
...
errinj = box.error.injection
---
...
errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0.01)
---
- ok
...
test_run:cmd("start server replica")
---
- true
...
test_run:cmd("switch replica")
---
- true
...
fiber = require('fiber')
---
...
old_repl = box.cfg.replication
---
...
-- shutdown applier
box.cfg{replication = {}, replication_timeout = 0.1}
---
...
while box.info.replication[1].upstream ~= nil do fiber.sleep(0.0001) end
---
...
-- reconnect
box.cfg{replication = {old_repl}}
---
...
while box.info.replication[1].upstream.status ~= 'disconnected' do fiber.sleep(0.0001) end
---
...
while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
---
...
test_run:cmd("stop server default")
---
- true
......
......@@ -84,17 +84,12 @@ test_run:cmd("switch replica")
wait_repl(60)
test_run:cmd("switch default")
errinj.set("ERRINJ_WAL_WRITE_EOF", false)
box.snapshot()
test_run:cmd("switch replica")
test_run:cmd("stop server default")
test_run:cmd("deploy server default")
test_run:cmd("start server default")
test_run:cmd("switch default")
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
box.cfg{replication_timeout = 0.01}
box.schema.user.grant("guest", "replication")
test_run:cmd("start server replica")
test_run:cmd("switch replica")
......@@ -120,6 +115,27 @@ while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001)
fiber.sleep(0.01)
{box.info.replication[1].upstream.status, box.info.replication[1].upstream.message}
test_run:cmd("switch default")
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
errinj = box.error.injection
errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0.01)
test_run:cmd("start server replica")
test_run:cmd("switch replica")
fiber = require('fiber')
old_repl = box.cfg.replication
-- shutdown applier
box.cfg{replication = {}, replication_timeout = 0.1}
while box.info.replication[1].upstream ~= nil do fiber.sleep(0.0001) end
-- reconnect
box.cfg{replication = {old_repl}}
while box.info.replication[1].upstream.status ~= 'disconnected' do fiber.sleep(0.0001) end
while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
test_run:cmd("stop server default")
test_run:cmd("deploy server default")
test_run:cmd("start server default")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment