From 02fae15a3adb8ea450ebbe3c250a4846cf1cca69 Mon Sep 17 00:00:00 2001 From: Sergey Bronnikov <sergeyb@tarantool.org> Date: Mon, 8 Aug 2022 17:06:22 +0300 Subject: [PATCH] test: fix flakiness in app-luatest/http_client_test.lua The problem could be easily reproduced with following command line: ./test/test-run.py $(yes app-luatest/http_client_test.lua | head -n 1000). Before this commit we did a socket binding to know a free network port, then close a socket and started httpd.py on that network port. However it was not reliable and even with socket options SO_REUSE_PORT start of httpd.py has failed. With proposed patch schema is changed: we start httpd.py and pass only a socket family (AF_INET for TCP connection and AF_UNIX for connection via Unix socket) and then reading output from a process. Successfully started httpd.py prints a path to a Unix socket or a pair of IP address and network port split with ":". With proposed patch test has passed 1000 times without any problems. Tests previously marked as "fragile" are passed too: ./test/test-run.py --builddir=$(pwd)/build box-tap/net.box.test.lua \ box-tap/cfg.test.lua box-tap/session.storage.test.lua \ box-tap/session.test.lua app-tap/tarantoolctl.test.lua \ app-tap/debug.test.lua app-tap/inspector.test.lua \ app-tap/logger.test.lua app-tap/transitive1.test.lua \ app-tap/csv.test.lua app-luatest/http_client_test.lua P.S. The problem with "fragile" tests is that rerunning hides other problems. [1] is about "Address already in use" and [2] is about hangs in test. I made a pull request with changes in http client module and triggered CI run. Job has been passed, but in log [3] I see three test restarts due to fails in http_client test related to my changes. 1. https://github.com/tarantool/tarantool-qa/issues/186 2. https://github.com/tarantool/tarantool-qa/issues/31 3. https://github.com/tarantool/tarantool/runs/7726358823?check_suite_focus=true Closes https://github.com/tarantool/tarantool-qa/issues/186 Closes https://github.com/tarantool/tarantool-qa/issues/31 NO_CHANGELOG=testing NO_DOC=testing NO_TEST=testing --- test/app-luatest/http_client_test.lua | 68 +++++++++++---------------- test/app-luatest/httpd.py | 29 +++++++----- test/app-luatest/suite.ini | 8 ---- test/app-tap/suite.ini | 17 +------ test/box-tap/suite.ini | 11 +---- 5 files changed, 46 insertions(+), 87 deletions(-) diff --git a/test/app-luatest/http_client_test.lua b/test/app-luatest/http_client_test.lua index 5778dac51e..d9f1a777f8 100644 --- a/test/app-luatest/http_client_test.lua +++ b/test/app-luatest/http_client_test.lua @@ -1,7 +1,6 @@ local client = require('http.client') local json = require('json') local fiber = require('fiber') -local socketlib = require('socket') local os = require('os') local t = require('luatest') local g = t.group('http_client', { @@ -22,60 +21,47 @@ local function merge(...) return res end -local function start_server(sock_family, sock_addr) - local arg, url, opts - if sock_family == 'AF_INET' then - arg = string.format("--inet %s", sock_addr) - url = string.format("http://%s/", sock_addr) - opts = {} - elseif sock_family == 'AF_UNIX' then - arg = string.format("--unix %s", sock_addr) - url = "http://localhost/" - opts = {unix_socket = sock_addr} - else +local function start_server(sock_family) + if not sock_family == 'AF_INET' and + not sock_family == 'AF_UNIX' then error(string.format('invalid socket family: %s', sock_family)) end + local url + local client_opts = {} -- PYTHON_EXECUTABLE is set in http_client.skipcond. local python_executable = os.getenv('PYTHON_EXECUTABLE') or '' local cmd_prefix = (python_executable .. ' '):lstrip() + local arg = string.format("--conn-type %s", sock_family) local cmd = string.format("%s%s/test/app-luatest/httpd.py %s", cmd_prefix, TARANTOOL_SRC_DIR, arg) local server = io.popen(cmd) - t.assert_equals(server:read("*l"), "heartbeat", "server started") - local r - for _=1,10 do - r = client.get(url, merge(opts, {timeout = 0.01})) - if r.status == 200 then - break - end - fiber.sleep(0.01) + t.assert_not_equals(server, nil, "Process descriptor is not nil") + local sock_addr + t.helpers.retrying({ timeout = 2, delay = 0.01 }, function() + sock_addr = server:read("*l") + t.assert_not_equals(sock_addr, nil, sock_addr) + end) + if sock_family == "AF_UNIX" then + client_opts.unix_socket = sock_addr + sock_addr = "localhost" end - t.assert_equals(r.status, 200, "connection is ok") - return server, url, opts + url = string.format("http://%s/", sock_addr) + + t.helpers.retrying({}, function() + local ok, res = pcall(client.get, url, client_opts) + t.skip_if(not ok, "not supported") + t.assert_equals(res.status, 200, "connection is ok") + end) + + return server, url, client_opts end -g.before_all(function(cg) +g.before_each(function(cg) local sock_family = cg.params.sock_family - local sock_addr - if sock_family == 'AF_INET' then - local s = socketlib('AF_INET', 'SOCK_STREAM', 0) - s:bind('127.0.0.1', 0) - sock_addr = string.format("%s:%d", s:name().host, s:name().port) - s:close() - else - assert(sock_family == 'AF_UNIX') - local path = os.tmpname() - os.remove(path) - local status = pcall(client.get, 'http://localhost/', - {unix_socket = path}) - t.skip_if(not status, "not supported") - sock_addr = path - cg.unix_socket_path = path - end - cg.server, cg.url, cg.opts = start_server(sock_family, sock_addr) + cg.server, cg.url, cg.opts = start_server(sock_family) end) -g.after_all(function(cg) +g.after_each(function(cg) cg.server:close() if cg.unix_socket_path then os.remove(cg.unix_socket_path) diff --git a/test/app-luatest/httpd.py b/test/app-luatest/httpd.py index 00b538df0f..0c6060d3eb 100755 --- a/test/app-luatest/httpd.py +++ b/test/app-luatest/httpd.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import sys +import tempfile from gevent.pywsgi import WSGIServer from gevent import spawn, sleep, socket @@ -109,31 +110,35 @@ def handle(env, response) : return other_handle(env, response, method, "200 Ok") return other_handle(env, response, method, "400 Bad Request") -def heartbeat(): +def heartbeat(sockname): + sockname_str = sockname + if type(sockname) == tuple: + sockname_str = "{}:{}".format(sockname[0], sockname[1]) try: while True: - sys.stdout.write("heartbeat\n") + sys.stdout.write(sockname_str + "\n") sys.stdout.flush() sleep(1e-1) except IOError: sys.exit(1) def usage(): - message = "Usage: {} {{ --inet HOST:PORT | --unix PATH }}\n".format(sys.argv[0]) + message = "Usage: {} {{ --conn-type AF_INET | AF_UNIX }}\n".format(sys.argv[0]) sys.stderr.write(message) sys.exit(1) if len(sys.argv) != 3: usage() -if sys.argv[1] == "--inet": - host, port = sys.argv[2].split(":") - sock_family = socket.AF_INET - sock_addr = (host, int(port)) -elif sys.argv[1] == "--unix": - path = sys.argv[2] - sock_family = socket.AF_UNIX - sock_addr = path +if sys.argv[1] == "--conn-type": + conn_type = sys.argv[2] + if conn_type == "AF_UNIX": + sock_family = socket.AF_UNIX + td = tempfile.TemporaryDirectory() + sock_addr = "{}/httpd.sock".format(td.name) + elif conn_type == "AF_INET": + sock_family = socket.AF_INET + sock_addr = ("127.0.0.1", 0) else: usage() @@ -143,5 +148,5 @@ sock.bind(sock_addr) sock.listen(10) server = WSGIServer(sock, handle, log=None) -spawn(heartbeat) +spawn(heartbeat, sock.getsockname()) server.serve_forever() diff --git a/test/app-luatest/suite.ini b/test/app-luatest/suite.ini index 4b271b6b07..fca7aa970a 100644 --- a/test/app-luatest/suite.ini +++ b/test/app-luatest/suite.ini @@ -2,11 +2,3 @@ core = luatest description = application server tests on luatest is_parallel = True -fragile = { - "retries": 10, - "tests": { - "http_client_test.lua": { - "issues": [ "gh-5346", "gh-5574" ] - } - } - } diff --git a/test/app-tap/suite.ini b/test/app-tap/suite.ini index 3263185067..e550dbb9d6 100644 --- a/test/app-tap/suite.ini +++ b/test/app-tap/suite.ini @@ -9,22 +9,7 @@ fragile = { "retries": 10, "tests": { "tarantoolctl.test.lua": { - "issues": [ "gh-5059", "gh-5346" ] + "issues": [ "gh-5059" ] }, - "debug.test.lua": { - "issues": [ "gh-5346" ] - }, - "inspector.test.lua": { - "issues": [ "gh-5346" ] - }, - "logger.test.lua": { - "issues": [ "gh-5346" ] - }, - "transitive1.test.lua": { - "issues": [ "gh-5346" ] - }, - "csv.test.lua": { - "issues": [ "gh-5346" ] - } } } diff --git a/test/box-tap/suite.ini b/test/box-tap/suite.ini index 4d61bb8cb2..b6329b5162 100644 --- a/test/box-tap/suite.ini +++ b/test/box-tap/suite.ini @@ -9,16 +9,7 @@ fragile = { "retries": 10, "tests": { "cfg.test.lua": { - "issues": [ "gh-5346", "gh-4344" ] - }, - "net.box.test.lua": { - "issues": [ "gh-5346" ] - }, - "session.storage.test.lua": { - "issues": [ "gh-5346" ] - }, - "session.test.lua": { - "issues": [ "gh-5346" ] + "issues": [ "gh-4344" ] }, "gh-4231-box-execute-locking.test.lua": { "issues": [ "gh-5558" ] -- GitLab