Commit abf52e08 authored by Vladislav Shpilevoy's avatar Vladislav Shpilevoy Committed by Serge Petrenko

test: fix flaky downstream lag test

It could fail in the ASAN build. It is unclear why only there.

The main reason was that in a topology server1 + server2->server3
one of the cases
- did a txn on server1,
- then enabled server2->server3 replication,
- then waited for server2->server3 sync,
- and instantly assumed the txn had reached server3.

Surely it did not always do so. At the moment of the server2->server3
sync the txn might not have reached server2 itself yet.

The fix is as simple as explicitly ensuring the txn is on server2
before waiting for the server2->server3 sync.
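In luatest terms, the ordering fix can be sketched as follows. This is a hedged sketch, not the full test: the `cg.*` servers, `update_box_cfg` and `wait_for_downstream_to` helpers are the ones the patched test itself uses; the exact txn is illustrative.

```lua
-- Sketch of the ordering fix (assumes the luatest server helpers
-- used elsewhere in this test file).
-- The txn is made on server1 while server2->server3 replication is off.
cg.server1:exec(function()
    box.space.test:replace{1}
end)
-- NEW: first make sure the txn has actually reached server2 ...
cg.server1:wait_for_downstream_to(cg.server2)
-- ... and only then enable server2->server3 replication and wait
-- for that sync. Now the txn is guaranteed to be on server3 too.
cg.server3:update_box_cfg{replication = {cg.replication[2]}}
cg.server2:wait_for_downstream_to(cg.server3)
```

Without the new `wait_for_downstream_to(cg.server2)` call, the server2->server3 sync could complete before the txn ever left server1.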

Another potential source of flakiness was that the default timeout in
luatest.helpers.retrying is super low, just 5 seconds. The patch
manually bumps it to 60 seconds to be sure any future failures
wouldn't be caused by the timeout being too small.
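The bump itself is mechanical: `t.helpers.retrying` takes an options table as its first argument, and the patch passes an explicit `timeout` instead of relying on the default. A sketch of the before/after, using a check taken from the diff below:

```lua
local t = require('luatest')

local wait_timeout = 60

-- Before: empty options table, i.e. the ~5 second default timeout.
--     t.helpers.retrying({}, check)
-- After: an explicit 60-second timeout, so slow builds (e.g. ASAN)
-- do not fail merely because retries ran out of time.
t.helpers.retrying({timeout = wait_timeout}, function()
    cg.server2:assert_follows_upstream(cg.server1:get_instance_id())
end)
```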

Closes #10031

NO_DOC=test
NO_CHANGELOG=test

(cherry picked from commit d4ea121b)
parent 5d1f8c48
......@@ -4,6 +4,7 @@ local replica_set = require('luatest.replica_set')
local g = t.group('gh_9748')
local delay = 0.1
local wait_timeout = 60
g.before_all(function(cg)
cg.replica_set = replica_set:new({})
......@@ -62,7 +63,7 @@ g.test_lag_on_master_restart = function(cg)
box.space.test:replace{2}
end)
cg.server1:start()
t.helpers.retrying({}, function()
t.helpers.retrying({timeout = wait_timeout}, function()
cg.server2:assert_follows_upstream(cg.server1:get_instance_id())
end)
cg.server1:exec(function(id)
......@@ -107,7 +108,7 @@ g.test_lag_from_third_node = function(cg)
-- Retry, because with a non-huge replication timeout the replicas sometimes
-- might timeout when the system is slow, and that would make downstream lag
-- disappear, breaking the test.
t.helpers.retrying({}, test_lag_from_third_node, cg)
t.helpers.retrying({timeout = wait_timeout}, test_lag_from_third_node, cg)
end
--
......@@ -127,6 +128,7 @@ local function test_lag_is_local_to_sender(cg)
require('fiber').sleep(delay)
box.cfg{replication = replication}
end, {delay, {cg.replication[1], cg.replication[2]}})
cg.server1:wait_for_downstream_to(cg.server2)
-- server1 -> server2 -> server3
cg.server3:update_box_cfg{replication = {cg.replication[2]}}
cg.server2:wait_for_downstream_to(cg.server3)
......@@ -148,7 +150,8 @@ g.test_lag_is_local_to_sender = function(cg)
-- Retry, because with a non-huge replication timeout the replicas sometimes
-- might timeout when the system is slow, and that would make downstream lag
-- disappear, breaking the test.
t.helpers.retrying({}, test_lag_is_local_to_sender, cg)
t.helpers.retrying({timeout = wait_timeout}, test_lag_is_local_to_sender,
cg)
end
--
......@@ -170,12 +173,12 @@ g.test_lag_no_update_when_replica_follows_third_node = function(cg)
box.space.test:replace{1}
return box.info.vclock
end)
cg.server1:exec(function(id, vclock, lag)
t.helpers.retrying({}, function()
cg.server1:exec(function(id, vclock, lag, timeout)
t.helpers.retrying({timeout = timeout}, function()
require('log').info(box.info.replication[id].downstream)
t.assert_equals(box.info.replication[id].downstream.vclock, vclock,
'Server2 did not ack server3 vclock to server1')
end)
t.assert_equals(box.info.replication[id].downstream.lag, lag)
end, {cg.server2:get_instance_id(), vclock, lag})
end, {cg.server2:get_instance_id(), vclock, lag, wait_timeout})
end