From b6630a4108512d9267fb4c9a1f32473418a39562 Mon Sep 17 00:00:00 2001 From: Valentin Syrovatskiy <v.syrovatskiy@picodata.io> Date: Tue, 5 Jul 2022 08:36:50 +0000 Subject: [PATCH] refactor: rewrite lua tests in python --- test/couple_test.lua | 71 ------------ test/helper.lua | 7 -- test/helper/picodata.lua | 185 ------------------------------- test/int/conftest.py | 19 +++- test/int/test_network_effects.py | 104 +++++++++++++++++ test/int/test_supervisor.py | 95 ++++++++++++++++ test/single_test.lua | 38 ------- test/supervisor_test.lua | 85 -------------- test/threesome_test.lua | 159 -------------------------- 9 files changed, 217 insertions(+), 546 deletions(-) delete mode 100644 test/couple_test.lua delete mode 100644 test/helper.lua delete mode 100644 test/helper/picodata.lua create mode 100644 test/int/test_network_effects.py create mode 100644 test/int/test_supervisor.py delete mode 100644 test/single_test.lua delete mode 100644 test/supervisor_test.lua delete mode 100644 test/threesome_test.lua diff --git a/test/couple_test.lua b/test/couple_test.lua deleted file mode 100644 index eca4226cdf..0000000000 --- a/test/couple_test.lua +++ /dev/null @@ -1,71 +0,0 @@ -local t = require('luatest') -local h = require('test.helper') -local g = t.group() - -local fio = require('fio') - -g.before_all(function() - g.data_dir = fio.tempdir() - local peer = {'127.0.0.1:13301', '127.0.0.1:13302'} - - g.cluster = { - i1 = h.Picodata:new({ - name = 'i1', - data_dir = g.data_dir .. '/i1', - listen = '127.0.0.1:13301', - peer = peer, - }), - i2 = h.Picodata:new({ - name = 'i2', - data_dir = g.data_dir .. '/i2', - listen = '127.0.0.1:13302', - peer = peer, - }), - } - - for _, node in pairs(g.cluster) do - node:start() - end - - for _, node in pairs(g.cluster) do - node:wait_started() - end -end) - -g.after_all(function() - for _, node in pairs(g.cluster) do - node:stop() - end - fio.rmtree(g.data_dir) -end) - -g.test_follower_proposal = function() - -- Speed up node election - g.cluster.i1:promote_or_fail() - - t.assert( - g.cluster.i2:raft_propose_eval(1, '_G.check = box.info.listen') - ) - t.assert_equals( - g.cluster.i1:connect():eval('return check'), - '127.0.0.1:13301' - ) - t.assert_equals( - g.cluster.i2:connect():eval('return check'), - '127.0.0.1:13302' - ) -end - -g.test_failover = function() - g.cluster.i1:promote_or_fail() - h.retrying({}, function() - g.cluster.i2:assert_raft_status("Follower", 1) - end) - - h.retrying({}, function() - -- Speed up election timeout - g.cluster.i2:connect():eval('picolib.raft_tick(20)') - g.cluster.i2:assert_raft_status("Leader") - g.cluster.i1:assert_raft_status("Follower", 2) - end) -end diff --git a/test/helper.lua b/test/helper.lua deleted file mode 100644 index dfd7d5da1f..0000000000 --- a/test/helper.lua +++ /dev/null @@ -1,7 +0,0 @@ -local log = require('log') -local helper = table.copy(require('luatest.helpers')) - -log.cfg({log_level = 6}) -helper.Picodata = require('test.helper.picodata') - -return helper diff --git a/test/helper/picodata.lua b/test/helper/picodata.lua deleted file mode 100644 index 52f2694fee..0000000000 --- a/test/helper/picodata.lua +++ /dev/null @@ -1,185 +0,0 @@ -local checks = require('checks') -local log = require('log') -local fun = require('fun') -local netbox = require('net.box') - -local luatest = require('luatest') -local Process = require('luatest.process') - --- Defaults. 
-local Picodata = { - workdir = nil, - name = 'default', - listen = '127.0.0.1:13301', - peer = {'127.0.0.1:13301'}, - args = {'run'}, - env = {}, - - command = 'target/debug/picodata', - process = nil, - __type = 'Picodata', - id = -1, -} - -function Picodata:inherit(object) - setmetatable(object, self) - self.__index = self - return object -end - ---- Build picodata node. --- @param object --- @string object.name Human-readable node name. --- @string object.data_dir Path to the data directory. --- @string object.listen Socket bind address. --- @table object.peer URL of other peers in cluster. --- @tab[opt] object.env Environment variables passed to the process. --- @tab[opt] object.args Command-line arguments passed to the process. --- @return object -function Picodata:new(object) - checks('table', { - name = '?string', - data_dir = 'string', - listen = '?string', - peer = '?table', - args = '?table', - env = '?table', - }) - self:inherit(object) - object:initialize() - return object -end - -function Picodata:initialize() - checks('Picodata') - - self.env = fun.chain({ - PICODATA_INSTANCE_ID = self.name, - PICODATA_DATA_DIR = self.data_dir, - PICODATA_LISTEN = self.listen, - PICODATA_PEER = table.concat(self.peer, ','), - }, self.env):tomap() -end - ---- Start the node. -function Picodata:start() - checks('Picodata') - - local env = table.copy(os.environ()) - local log_cmd = {} - for k, v in pairs(self.env) do - table.insert(log_cmd, string.format('%s=%q', k, v)) - env[k] = v - end - table.insert(log_cmd, self.command) - for _, v in ipairs(self.args) do - table.insert(log_cmd, string.format('%q', v)) - end - log.info(table.concat(log_cmd, ' ')) - - self.process = Process:start(self.command, self.args, env, { - output_prefix = self.name, - }) - log.debug('Started server PID: ' .. self.process.pid) -end - -function Picodata:wait_started() - checks('Picodata') - - luatest.helpers.retrying({}, function() - self:connect() - local raft_status = self:raft_status() - luatest.assert(raft_status) - luatest.assert_ge(raft_status.leader_id, 1) - self.id = raft_status.id - end) -end - ---- Connect to the node. --- --- Connection is established synchronously. The result is cached. --- --- @function connect --- @return netbox connection -function Picodata:connect() - checks('Picodata') - - if self.conn ~= nil and self.conn:is_connected() then - return self.conn - end - - local conn = netbox.connect(self.listen) - if conn.error then - error(conn.error) - end - - self.conn = conn - return conn -end - ---- Stop the node. -function Picodata:stop() - local process = self.process - if process == nil then - return - end - - os.execute('pkill -TERM -P ' .. self.process.pid) - luatest.helpers.retrying({}, function() - if process:is_alive() then - return error(self.name .. ' is still running') - end - end) - - log.warn('%s killed', self.name) - self.process = nil -end - ---- Get the status of raft node. --- @function --- @return {id = number, leader_id = number, state = string} --- State can be one of "Follower", "Candidate", "Leader", "PreCandidate". -function Picodata:raft_status() - checks('Picodata') - return self:connect():call('picolib.raft_status') -end - ---- Assert raft status matches expectations. 
--- @function
--- @tparam string raft_state
--- @tparam[opt] number leader_id
-function Picodata:assert_raft_status(raft_state, leader_id)
-    checks('Picodata', 'string', '?number')
-    return luatest.assert_covers(
-        self:raft_status(),
-        {
-            leader_id = leader_id,
-            raft_state = raft_state,
-        }
-    )
-end
-
---- Propose Lua code evaluation on every node in cluster.
--- @tparam number timeout
--- @tparam string code
--- @treturn boolean whether proposal was committed on the current node.
-function Picodata:raft_propose_eval(timeout, code)
-    checks('Picodata', 'number', 'string')
-    return self:connect():call(
-        'picolib.raft_propose_eval',
-        {timeout, code}
-    )
-end
-
---- Forcing leader election as if previous leader was dead.
--- Wait for the node becoming a leader.
--- Raise an exception if promotion fails.
-function Picodata:promote_or_fail()
-    checks('Picodata')
-    return luatest.helpers.retrying({}, function()
-        self:connect():call('picolib.raft_timeout_now')
-        self:assert_raft_status("Leader", self.id)
-    end)
-end
-
-return Picodata
diff --git a/test/int/conftest.py b/test/int/conftest.py
index f73ddf9870..4efbc601fb 100644
--- a/test/int/conftest.py
+++ b/test/int/conftest.py
@@ -9,6 +9,7 @@ import pytest
 import signal
 import subprocess
+from datetime import datetime
 from shutil import rmtree
 from typing import Callable, Generator, Iterator
 from itertools import count
@@ -260,13 +261,16 @@ class Instance:
                 out.write(line)
                 out.flush()
 
-    def start(self):
+    def start(self, peers=[]):
         if self.process:
             # Be idempotent
             return
 
         eprint(f"{self} starting...")
 
+        if peers != []:
+            self.peers = [i.listen for i in peers]
+
         self.process = subprocess.Popen(
             self.command,
             env=self.env or None,
@@ -493,3 +497,16 @@ def cluster(
 def instance(cluster: Cluster) -> Generator[Instance, None, None]:
     cluster.deploy(instance_count=1)
     yield cluster[0]
+
+
+def retrying(fn, timeout=3):
+    # Usage example:
+    #   retrying(lambda: instance.assert_raft_status("Leader"))
+    #   retrying(lambda: instance.assert_raft_status("Leader"), timeout=5)
+    start = datetime.now()
+    while True:
+        try:
+            return fn()
+        except AssertionError:
+            if (datetime.now() - start).seconds > timeout:
+                raise
diff --git a/test/int/test_network_effects.py b/test/int/test_network_effects.py
new file mode 100644
index 0000000000..6130db0cb5
--- /dev/null
+++ b/test/int/test_network_effects.py
@@ -0,0 +1,104 @@
+import pytest
+
+from conftest import (
+    Cluster,
+    Instance,
+    ReturnError,
+    retrying,
+)
+
+
+@pytest.fixture
+def cluster3(cluster: Cluster):
+    cluster.deploy(instance_count=3)
+    return cluster
+
+
+def test_log_rollback(cluster3: Cluster):
+    # Scenario: the Leader can't propose without Followers
+    # Given a cluster
+    # When all Followers are killed without a graceful shutdown
+    # And the Leader proposes a change
+    # Then the proposal fails
+
+    i1, i2, i3 = cluster3.instances
+    i1.assert_raft_status("Leader")
+    i2.assert_raft_status("Follower")
+    i3.assert_raft_status("Follower")
+
+    def propose_state_change(srv: Instance, value):
+        code = 'box.space.raft_state:put({"test-timeline", "%s"})' % value
+        return srv.raft_propose_eval(code, 0.1)
+
+    propose_state_change(i1, "i1 is a leader")
+
+    # Simulate a network partition: i1 can't reach i2 and i3.
+    i2.kill()
+    i3.kill()
+
+    # No operations can be committed, i1 is alone.
+    with pytest.raises(ReturnError, match="timeout"):
+        propose_state_change(i1, "i1 lost the quorum")
+
+    # And now i2 + i3 can't reach i1.
+    i1.terminate()
+    i2.start(peers=[i3])
+    i3.start(peers=[i2])
+    i2.wait_ready()
+    i3.wait_ready()
+
+    # Help i2 become the new leader
+    i2.promote_or_fail()
+    retrying(lambda: i3.assert_raft_status("Follower", i2.raft_id))
+
+    propose_state_change(i2, "i2 takes the leadership")
+
+    # Now i1 has an uncommitted but persisted entry that should be rolled back.
+    i1.start(peers=[i2, i3])
+    i1.wait_ready()
+    retrying(lambda: i1.assert_raft_status("Follower", i2.raft_id))
+
+    propose_state_change(i1, "i1 is alive again")
+
+
+def test_leader_disruption(cluster3: Cluster):
+    # Scenario: a Follower rejoins the cluster after a disconnect
+    # Given a cluster
+    # When a Follower loses the network connection to all other cluster nodes
+    # And this Follower starts a new election
+    # And the network connection is restored
+    # Then the Follower becomes a Follower again, as it was before
+
+    i1, i2, i3 = cluster3.instances
+    i1.assert_raft_status("Leader")
+    i2.assert_raft_status("Follower")
+    i3.assert_raft_status("Follower")
+
+    # Simulate an asymmetric network failure.
+    # Node i3 doesn't receive any messages,
+    # including the heartbeat from the leader.
+    # Then it starts a new election.
+    i3.call("box.schema.func.drop", ".raft_interact")
+
+    # Speed up the election timeout
+    i3.eval(
+        """
+        while picolib.raft_status().raft_state == 'Follower' do
+            picolib.raft_tick(1)
+        end
+        """
+    )
+    i3.assert_raft_status("PreCandidate", None)
+
+    # Advance the raft log. It makes i1 and i2 reject the RequestPreVote.
+    i1.raft_propose_eval("return", timeout_seconds=1)
+
+    # Restore normal network operation
+    i3.call(
+        "box.schema.func.create",
+        ".raft_interact",
+        {"language": "C", "if_not_exists": True},
+    )
+
+    # i3 should become a follower again without disrupting i1
+    retrying(lambda: i3.assert_raft_status("Follower", i1.raft_id))
diff --git a/test/int/test_supervisor.py b/test/int/test_supervisor.py
new file mode 100644
index 0000000000..684bf1fa31
--- /dev/null
+++ b/test/int/test_supervisor.py
@@ -0,0 +1,95 @@
+import signal
+import subprocess
+import os
+import time
+import pytest
+
+from conftest import (
+    Cluster,
+    Instance,
+    retrying,
+)
+from functools import reduce
+
+
+@pytest.fixture
+def instance(cluster: Cluster):
+    cluster.deploy(instance_count=1)
+    [i1] = cluster.instances
+    return i1
+
+
+def pgrep_tree(pid):
+    command = f"exec pgrep -P{pid}"
+    try:
+        ps = subprocess.check_output(command, shell=True)
+        ps = ps.strip().split()
+        ps = list(map(lambda p: int(p), ps))
+        subps = map(lambda p: pgrep_tree(p), ps)  # list of lists of pids
+        subps = reduce(lambda acc, p: [*acc, *p], subps, [])  # list of pids
+        return [pid, *subps]
+    except subprocess.SubprocessError:
+        return [pid]
+
+
+def pid_alive(pid):
+    """Check for the existence of a unix pid."""
+    try:
+        os.kill(pid, 0)
+    except OSError:
+        return False
+    else:
+        return True
+
+
+def assert_all_pids_down(pids):
+    assert all(map(lambda pid: not pid_alive(pid), pids))
+
+
+def test_sigkill(instance: Instance):
+    # Scenario: terminating the process should terminate all child processes
+    # Given an instance
+    # When the process is terminated
+    # Then all subprocesses are terminated too
+
+    assert instance.process
+    pids = pgrep_tree(instance.process.pid)
+    assert len(pids) == 2
+
+    instance.kill()
+
+    retrying(lambda: assert_all_pids_down(pids))
+
+
+def test_sigint(instance: Instance):
+    # Scenario: suspending the child process prevents the parent process from being interrupted
+    # Given an instance
+    # When the child process is stopped
+    # And the parent process gets SIGINT
+    # Then the parent process keeps living
+    # When the child process is continued
+    # Then the parent process gracefully exits
+
+    assert instance.process
+    pids = pgrep_tree(instance.process.pid)
+    child_pid = pids[1]
+
+    os.kill(child_pid, signal.SIGSTOP)
+
+    # Signal the supervisor and give it some time to handle it.
+    # Without a sleep the next assertion is useless. Unfortunately,
+    # there's no alternative to sleep, because the signal
+    # delivery is a mystery of the kernel.
+    os.kill(instance.process.pid, signal.SIGINT)
+    time.sleep(0.1)
+
+    # We've signalled the supervisor. It should forward the signal
+    # to the child and keep waiting. But the child is stopped now,
+    # and can't handle the forwarded signal.
+    # The supervisor must still be alive.
+    assert pid_alive(instance.process.pid)
+
+    os.kill(child_pid, signal.SIGCONT)
+    instance.process.wait(timeout=1)
+
+    retrying(lambda: assert_all_pids_down(pids))
diff --git a/test/single_test.lua b/test/single_test.lua
deleted file mode 100644
index dd2b397adf..0000000000
--- a/test/single_test.lua
+++ /dev/null
@@ -1,38 +0,0 @@
-local t = require('luatest')
-local h = require('test.helper')
-local g = t.group()
-
-local fio = require('fio')
-
-g.before_all(function()
-    g.data_dir = fio.tempdir()
-
-    g.node = h.Picodata:new({
-        name = 'single',
-        data_dir = g.data_dir,
-        listen = '127.0.0.1:13301',
-        peer = {'127.0.0.1:13301'},
-    })
-    g.node:start()
-    g.node:wait_started()
-end)
-
-g.after_all(function()
-    g.node:stop()
-    fio.rmtree(g.data_dir)
-end)
-
-g.test = function()
-    t.assert_equals(
-        {g.node:raft_propose_eval(0, 'return')},
-        {nil, "timeout"} -- Timeout
-    )
-
-    t.assert(
-        g.node:raft_propose_eval(1, '_G.success = true')
-    )
-    t.assert_equals(
-        g.node:connect():eval('return success'),
-        true
-    )
-end
diff --git a/test/supervisor_test.lua b/test/supervisor_test.lua
deleted file mode 100644
index f856ee0e02..0000000000
--- a/test/supervisor_test.lua
+++ /dev/null
@@ -1,85 +0,0 @@
-local t = require('luatest')
-local h = require('test.helper')
-local g = t.group()
-
-local fio = require('fio')
-local log = require('log')
-local popen = require('popen')
-
-local function pgrep_children(pid, result)
-    pid = pid or require('tarantool').pid()
-    result = result or {}
-
-    local ps = t.assert(popen.shell('exec pgrep -P' .. pid, 'r'))
-    for _, child in ipairs(ps:read():strip():split()) do
-        table.insert(result, child)
-        pgrep_children(child, result)
-    end
-    return result
-end
-
-g.before_each(function()
-    g.data_dir = fio.tempdir()
-
-    g.node = h.Picodata:new({
-        name = 'single',
-        data_dir = g.data_dir,
-        listen = '127.0.0.1:13301',
-        peer = {'127.0.0.1:13301'},
-    })
-    g.node:start()
-    g.node:wait_started()
-
-    g.children = pgrep_children()
-    t.assert_equals(#g.children, 2, "something wrong with pgrep")
-end)
-
-g.after_each(function()
-    for _, pid in ipairs(g.children) do
-        t.Process.kill_pid(pid, 9, {quiet = true})
-    end
-    fio.rmtree(g.data_dir)
-end)
-
-g.test_sigkill = function()
-    g.node.process:kill('KILL')
-    log.warn("Sent SIGKILL to the supervisor")
-
-    h.retrying({}, function()
-        for i, pid in ipairs(g.children) do
-            t.assert_not(
-                t.Process.is_pid_alive(pid),
-                string.format("child #%d (pid %s) didn't die", i, pid)
-            )
-        end
-    end)
-end
-
-g.test_sigint = function()
-    t.Process.kill_pid(g.children[2], 'STOP')
-
-    -- Signal the supervisor and give it some time to handle one.
-    -- Without a sleep the next assertion is useless. Unfortunately,
-    -- there're no alternatives to sleep, because the signal
-    -- delivery is a mystery of the kernel.
-    g.node.process:kill('INT')
-    log.warn("Sent SIGINT to the supervisor")
-    require('fiber').sleep(0.1)
-
-    -- We've signalled supervisor. It should forward the signal
-    -- the child and keep waiting. But the child is stopped now,
-    -- and can't handle the forwarded signal.
-    -- Supervisor must still be alive.
-    t.assert(g.node.process:is_alive(), "supervisor treminated prematurely")
-
-    t.Process.kill_pid(g.children[2], 'CONT')
-
-    h.retrying({}, function()
-        for i, pid in ipairs(g.children) do
-            t.assert_not(
-                t.Process.is_pid_alive(pid),
-                string.format("child #%d (pid %s) didn't die", i, pid)
-            )
-        end
-    end)
-end
diff --git a/test/threesome_test.lua b/test/threesome_test.lua
deleted file mode 100644
index 352ccc0fa9..0000000000
--- a/test/threesome_test.lua
+++ /dev/null
@@ -1,159 +0,0 @@
-local t = require('luatest')
-local h = require('test.helper')
-local g = t.group()
-
-local fio = require('fio')
-
-g.before_all(function()
-    g.data_dir = fio.tempdir()
-    local peer = {
-        '127.0.0.1:13301',
-        '127.0.0.1:13302',
-        '127.0.0.1:13303',
-    }
-
-    g.cluster = {
-        i1 = h.Picodata:new({
-            name = 'i1',
-            data_dir = g.data_dir .. '/i1',
-            listen = '127.0.0.1:13301',
-            peer = peer,
-        }),
-        i2 = h.Picodata:new({
-            name = 'i2',
-            data_dir = g.data_dir .. '/i2',
-            listen = '127.0.0.1:13302',
-            peer = peer,
-        }),
-        i3 = h.Picodata:new({
-            name = 'i3',
-            data_dir = g.data_dir .. '/i3',
-            listen = '127.0.0.1:13303',
-            peer = peer,
-        }),
-    }
-
-    for _, node in pairs(g.cluster) do
-        node:start()
-    end
-
-    for _, node in pairs(g.cluster) do
-        node:wait_started()
-    end
-end)
-
-g.after_all(function()
-    for _, node in pairs(g.cluster) do
-        node:stop()
-    end
-    fio.rmtree(g.data_dir)
-end)
-
-local function propose_state_change(srv, value)
-    -- It's just a boilerplate
-    local code = string.format(
-        'box.space.raft_state:put({"test-timeline", %q})',
-        value
-    )
-    return srv:raft_propose_eval(0.1, code)
-end
-
-g.test_log_rollback = function()
-    -- TODO
-    -- This test became incorrect with the introduction of the discovery
-    -- phase. Instances i2 and i3 won't be able to start, because one of
-    -- their peers is the dead i1.
-    -- Nevertheless, the test itself shouldn't be deleted. This problem
-    -- only requires rethinking the way we create the preconditions for
-    -- a raft log rollback. The main purpose of the test (checking how
-    -- picodata behaves on rollback) is still relevant.
-    t.skip('Fix me')
-
-    -- Speed up node election
-    g.cluster.i1:promote_or_fail()
-    h.retrying({}, function()
-        g.cluster.i2:assert_raft_status("Follower", g.cluster.i1.id)
-        g.cluster.i3:assert_raft_status("Follower", g.cluster.i1.id)
-    end)
-
-    t.assert(
-        propose_state_change(g.cluster.i1, "i1 is a leader")
-    )
-
-    -- Simulate the network partitioning: i1 can't reach i2 and i3.
-    g.cluster.i2:stop()
-    g.cluster.i3:stop()
-
-    -- No operations can be committed, i1 is alone.
-    t.assert_equals(
-        {propose_state_change(g.cluster.i1, "i1 lost the quorum")},
-        {nil, "timeout"}
-    )
-
-    -- And now i2 + i3 can't reach i1.
-    g.cluster.i1:stop()
-    g.cluster.i2:start()
-    g.cluster.i3:start()
-
-    -- Help I2 to become a new leader.
-    g.cluster.i2:promote_or_fail()
-    h.retrying({}, function()
-        g.cluster.i3:assert_raft_status("Follower", g.cluster.i2.id)
-    end)
-
-    t.assert(
-        propose_state_change(g.cluster.i2, "i2 takes the leadership")
-    )
-
-    -- Now i1 has an uncommitted, but persisted entry that should be rolled back.
- g.cluster.i1:start() - h.retrying({}, function() - g.cluster.i1:assert_raft_status("Follower", g.cluster.i2.id) - end) - - t.assert( - propose_state_change(g.cluster.i1, "i1 is alive again") - ) -end - -g.test_leader_disruption = function() - g.cluster.i1:promote_or_fail() - h.retrying({}, function() - g.cluster.i2:assert_raft_status("Follower", 1) - g.cluster.i3:assert_raft_status("Follower", 1) - end) - - -- Simulate asymmetric network failure. - -- Node i3 doesn't receive any messages, - -- including the heartbeat from the leader. - -- Then it starts a new election. - g.cluster.i3:connect():call( - 'box.schema.func.drop', - {'.raft_interact'} - ) - - -- Speed up election timeout - g.cluster.i3:connect():eval([[ - while picolib.raft_status().raft_state == 'Follower' do - picolib.raft_tick(1) - end - ]]) - g.cluster.i3:assert_raft_status("PreCandidate", 0) - - -- Advance the raft log. It makes i1 and i2 to reject the RequestPreVote. - g.cluster.i1:raft_propose_eval(1, 'return') - - -- Restore normal network operation - g.cluster.i3:connect():call( - 'box.schema.func.create', - {'.raft_interact', { - language = "C", - if_not_exists = true - }} - ) - - -- i3 should become the follower again without disrupting i1 - h.retrying({}, function() - g.cluster.i3:assert_raft_status("Follower", 1) - end) -end -- GitLab