From a167a0704e882b80b78ce379b7136a8296fb3dac Mon Sep 17 00:00:00 2001
From: Serge Petrenko <sergepetrenko@tarantool.org>
Date: Tue, 16 Aug 2022 11:15:29 +0300
Subject: [PATCH] replication: fix downstream lag growing when there's no new
 transactions

downstream lag is the difference in time between the moment a
transaction was written to master's WAL and the moment an ack for it
arrived.

Its calculation is supported by replicas sending the last applied row
timestamp. When there is no replication, the last applied row timestamp
stays the same, so in this case downstream lag grows as time passes.

Once an old master is replaced by a new one, it notices changes in peer
vclocks and tries to update downstream lag unconditionally. This makes
the lag appear to be growing indefinitely, showing the time since the
last transaction on the old master:

```
 downstream:
   status: follow
   idle: 0.018218606001028
   vclock: {1: 3, 2: 2}
   lag: 34.623061401367
```

The commit 56571d83172f ("raft: make followers notice leader hang")
made relay exchange information with tx even when there are no new
transactions, so the issue became even easier to reproduce.

The issue itself was present since downstream lag introduction in commit
29025bce7b31 ("relay: provide information about downstream lag").

Closes #7581

NO_DOC=bugfix
---
 .../gh-7581-downstream-lag-calculation-fix.md |  4 +
 src/box/relay.cc                              |  9 +-
 .../gh_7581_downstream_lag_test.lua           | 88 +++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 changelogs/unreleased/gh-7581-downstream-lag-calculation-fix.md
 create mode 100644 test/replication-luatest/gh_7581_downstream_lag_test.lua

diff --git a/changelogs/unreleased/gh-7581-downstream-lag-calculation-fix.md b/changelogs/unreleased/gh-7581-downstream-lag-calculation-fix.md
new file mode 100644
index 0000000000..2bf78b61a9
--- /dev/null
+++ b/changelogs/unreleased/gh-7581-downstream-lag-calculation-fix.md
@@ -0,0 +1,4 @@
+## bugfix/replication
+
+* Fixed `box.info.replication[id].downstream.lag` growing indefinitely on a
+  server when it's not writing any new transactions (gh-7581).
diff --git a/src/box/relay.cc b/src/box/relay.cc
index b2dc47907d..388ca62295 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -717,9 +717,14 @@ relay_reader_f(va_list ap)
 			 * been written to WAL with our current realtime
 			 * clock value, thus when it get reported back we
 			 * can compute time spent regardless of the clock
-			 * value on remote replica.
+			 * value on remote replica. Update the lag only when the
+			 * timestamp corresponds to some transaction the replica
+			 * has just applied, i.e. received vclock is bigger than
+			 * the previous one.
 			 */
-			if (xrow.tm != 0)
+			if (xrow.tm != 0 &&
+			    vclock_get(&relay->status_msg.vclock, instance_id) <
+			    vclock_get(&relay->recv_vclock, instance_id))
 				relay->txn_lag = ev_now(loop()) - xrow.tm;
 			fiber_cond_signal(&relay->reader_cond);
 		}
diff --git a/test/replication-luatest/gh_7581_downstream_lag_test.lua b/test/replication-luatest/gh_7581_downstream_lag_test.lua
new file mode 100644
index 0000000000..d4a89240e3
--- /dev/null
+++ b/test/replication-luatest/gh_7581_downstream_lag_test.lua
@@ -0,0 +1,88 @@
+local fiber = require("fiber")
+local t = require("luatest")
+local server = require("test.luatest_helpers.server")
+local cluster = require("test.luatest_helpers.cluster")
+
+local g = t.group("downstream-lag-after-master-switch")
+
+local POLL_TIMEOUT = 0.1
+local DISCONNECT_TIMEOUT = 4 * POLL_TIMEOUT
+--
+-- Downstream lag calculations were off after master switch or after a period of
+-- time with no transactions: downstream lag showed time since the last
+-- processed transaction.
+--
+g.before_each(function(g)
+    g.cluster = cluster:new{}
+    local box_cfg = {
+        replication_timeout = POLL_TIMEOUT,
+        replication = {
+            server.build_instance_uri("server1"),
+            server.build_instance_uri("server2"),
+        },
+    }
+    g.server1 = g.cluster:build_and_add_server({
+        alias = "server1",
+        box_cfg = box_cfg,
+    })
+    g.server2 = g.cluster:build_and_add_server({
+        alias = "server2",
+        box_cfg = box_cfg,
+    })
+    g.cluster:start()
+    g.server1:exec(function()
+        box.schema.space.create("test")
+        box.space.test:create_index("pk")
+    end)
+end)
+
+g.after_each(function(g)
+    g.cluster:drop()
+end)
+
+local function get_downstream_lag(master, replica)
+    local id = replica:instance_id()
+    return master:exec(function(id)
+        return require("luatest").helpers.retrying({}, function()
+            return box.info.replication[id].downstream.lag
+        end)
+    end, {id})
+end
+
+local function wait_downstream_updated(master, replica)
+    local id = replica:instance_id()
+    t.helpers.retrying({}, function()
+        master:exec(function(id)
+            local t = require("luatest")
+            t.assert_equals(box.info.vclock,
+                            box.info.replication[id].downstream.vclock,
+                            "Downstream vclock is updated")
+        end, {id})
+    end)
+end
+
+g.test_downstream_lag = function(g)
+    g.server1:exec(function()
+        box.space.test:insert{1}
+    end)
+    g.server2:wait_vclock_of(g.server1)
+    wait_downstream_updated(g.server1, g.server2)
+    local lag = get_downstream_lag(g.server1, g.server2)
+    t.assert(lag ~= 0, "Real lag value is updated")
+    t.assert(lag < DISCONNECT_TIMEOUT, "Lag value is sane")
+    -- Let a couple of pings pass through.
+    fiber.sleep(2 * POLL_TIMEOUT);
+    lag = get_downstream_lag(g.server1, g.server2)
+    t.assert_equals(lag, get_downstream_lag(g.server1, g.server2),
+                    "Lag doesn't change on standby")
+    g.server2:exec(function()
+        box.space.test:insert{2}
+    end)
+    -- Wait for relay -> tx status update on server 1. It shouldn"t spoil the
+    -- downstream lag.
+    g.server1:wait_vclock_of(g.server2)
+    wait_downstream_updated(g.server1, g.server2)
+    t.assert_equals(lag, get_downstream_lag(g.server1, g.server2),
+                    "Lag doesn't change when there are updates from remote \
+                     servers")
+end
-- 
GitLab