From 02fae15a3adb8ea450ebbe3c250a4846cf1cca69 Mon Sep 17 00:00:00 2001
From: Sergey Bronnikov <sergeyb@tarantool.org>
Date: Mon, 8 Aug 2022 17:06:22 +0300
Subject: [PATCH] test: fix flakiness in app-luatest/http_client_test.lua

The problem could be easily reproduced with following command line:
./test/test-run.py
	$(yes app-luatest/http_client_test.lua | head -n 1000).

Before this commit we did a socket binding to know a free network port,
then close a socket and started httpd.py on that network port. However
it was not reliable and even with socket options SO_REUSE_PORT start of
httpd.py has failed. With proposed patch schema is changed: we start
httpd.py and pass only a socket family (AF_INET for TCP connection and
AF_UNIX for connection via Unix socket) and then reading output from a
process. Successfully started httpd.py prints a path to a Unix socket or
a pair of IP address and network port split with ":".

With proposed patch test has passed 1000 times without any problems.
Tests previously marked as "fragile" are passed too:

./test/test-run.py --builddir=$(pwd)/build box-tap/net.box.test.lua \
	box-tap/cfg.test.lua box-tap/session.storage.test.lua \
	box-tap/session.test.lua app-tap/tarantoolctl.test.lua \
	app-tap/debug.test.lua app-tap/inspector.test.lua \
	app-tap/logger.test.lua app-tap/transitive1.test.lua \
	app-tap/csv.test.lua app-luatest/http_client_test.lua

P.S. The problem with "fragile" tests is that rerunning hides other
problems. [1] is about "Address already in use" and [2] is about hangs
in test. I made a pull request with changes in http client module and
triggered CI run. Job has been passed, but in log [3] I see three test
restarts due to fails in http_client test related to my changes.

1. https://github.com/tarantool/tarantool-qa/issues/186
2. https://github.com/tarantool/tarantool-qa/issues/31
3. https://github.com/tarantool/tarantool/runs/7726358823?check_suite_focus=true

Closes https://github.com/tarantool/tarantool-qa/issues/186
Closes https://github.com/tarantool/tarantool-qa/issues/31

NO_CHANGELOG=testing
NO_DOC=testing
NO_TEST=testing
---
 test/app-luatest/http_client_test.lua | 68 +++++++++++----------------
 test/app-luatest/httpd.py             | 29 +++++++-----
 test/app-luatest/suite.ini            |  8 ----
 test/app-tap/suite.ini                | 17 +------
 test/box-tap/suite.ini                | 11 +----
 5 files changed, 46 insertions(+), 87 deletions(-)

diff --git a/test/app-luatest/http_client_test.lua b/test/app-luatest/http_client_test.lua
index 5778dac51e..d9f1a777f8 100644
--- a/test/app-luatest/http_client_test.lua
+++ b/test/app-luatest/http_client_test.lua
@@ -1,7 +1,6 @@
 local client = require('http.client')
 local json = require('json')
 local fiber = require('fiber')
-local socketlib = require('socket')
 local os = require('os')
 local t = require('luatest')
 local g = t.group('http_client', {
@@ -22,60 +21,47 @@ local function merge(...)
     return res
 end
 
-local function start_server(sock_family, sock_addr)
-    local arg, url, opts
-    if sock_family == 'AF_INET' then
-        arg = string.format("--inet %s", sock_addr)
-        url = string.format("http://%s/", sock_addr)
-        opts = {}
-    elseif sock_family == 'AF_UNIX' then
-        arg = string.format("--unix %s", sock_addr)
-        url = "http://localhost/"
-        opts = {unix_socket = sock_addr}
-    else
+local function start_server(sock_family)
+    if not sock_family == 'AF_INET' and
+       not sock_family == 'AF_UNIX' then
         error(string.format('invalid socket family: %s', sock_family))
     end
+    local url
+    local client_opts = {}
     -- PYTHON_EXECUTABLE is set in http_client.skipcond.
     local python_executable = os.getenv('PYTHON_EXECUTABLE') or ''
     local cmd_prefix = (python_executable .. ' '):lstrip()
+    local arg = string.format("--conn-type %s", sock_family)
     local cmd = string.format("%s%s/test/app-luatest/httpd.py %s",
                               cmd_prefix, TARANTOOL_SRC_DIR, arg)
     local server = io.popen(cmd)
-    t.assert_equals(server:read("*l"), "heartbeat", "server started")
-    local r
-    for _=1,10 do
-        r = client.get(url, merge(opts, {timeout = 0.01}))
-        if r.status == 200 then
-            break
-        end
-        fiber.sleep(0.01)
+    t.assert_not_equals(server, nil, "Process descriptor is not nil")
+    local sock_addr
+    t.helpers.retrying({ timeout = 2, delay = 0.01 }, function()
+        sock_addr = server:read("*l")
+        t.assert_not_equals(sock_addr, nil, sock_addr)
+    end)
+    if sock_family == "AF_UNIX" then
+        client_opts.unix_socket = sock_addr
+        sock_addr = "localhost"
     end
-    t.assert_equals(r.status, 200, "connection is ok")
-    return server, url, opts
+    url = string.format("http://%s/", sock_addr)
+
+    t.helpers.retrying({}, function()
+        local ok, res = pcall(client.get, url, client_opts)
+        t.skip_if(not ok, "not supported")
+        t.assert_equals(res.status, 200, "connection is ok")
+    end)
+
+    return server, url, client_opts
 end
 
-g.before_all(function(cg)
+g.before_each(function(cg)
     local sock_family = cg.params.sock_family
-    local sock_addr
-    if sock_family == 'AF_INET' then
-        local s = socketlib('AF_INET', 'SOCK_STREAM', 0)
-        s:bind('127.0.0.1', 0)
-        sock_addr = string.format("%s:%d", s:name().host, s:name().port)
-        s:close()
-    else
-        assert(sock_family == 'AF_UNIX')
-        local path = os.tmpname()
-        os.remove(path)
-        local status = pcall(client.get, 'http://localhost/',
-                             {unix_socket = path})
-        t.skip_if(not status, "not supported")
-        sock_addr = path
-        cg.unix_socket_path = path
-    end
-    cg.server, cg.url, cg.opts = start_server(sock_family, sock_addr)
+    cg.server, cg.url, cg.opts = start_server(sock_family)
 end)
 
-g.after_all(function(cg)
+g.after_each(function(cg)
     cg.server:close()
     if cg.unix_socket_path then
         os.remove(cg.unix_socket_path)
diff --git a/test/app-luatest/httpd.py b/test/app-luatest/httpd.py
index 00b538df0f..0c6060d3eb 100755
--- a/test/app-luatest/httpd.py
+++ b/test/app-luatest/httpd.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import sys
+import tempfile
 from gevent.pywsgi import WSGIServer
 from gevent import spawn, sleep, socket
 
@@ -109,31 +110,35 @@ def handle(env, response) :
         return other_handle(env, response, method, "200 Ok")
     return other_handle(env, response, method, "400 Bad Request")
 
-def heartbeat():
+def heartbeat(sockname):
+    sockname_str = sockname
+    if type(sockname) == tuple:
+        sockname_str = "{}:{}".format(sockname[0], sockname[1])
     try:
         while True:
-            sys.stdout.write("heartbeat\n")
+            sys.stdout.write(sockname_str + "\n")
             sys.stdout.flush()
             sleep(1e-1)
     except IOError:
         sys.exit(1)
 
 def usage():
-    message = "Usage: {} {{ --inet HOST:PORT | --unix PATH }}\n".format(sys.argv[0])
+    message = "Usage: {} {{ --conn-type AF_INET | AF_UNIX }}\n".format(sys.argv[0])
     sys.stderr.write(message)
     sys.exit(1)
 
 if len(sys.argv) != 3:
     usage()
 
-if sys.argv[1] == "--inet":
-    host, port = sys.argv[2].split(":")
-    sock_family = socket.AF_INET
-    sock_addr = (host, int(port))
-elif sys.argv[1] == "--unix":
-    path = sys.argv[2]
-    sock_family = socket.AF_UNIX
-    sock_addr = path
+if sys.argv[1] == "--conn-type":
+    conn_type = sys.argv[2]
+    if conn_type == "AF_UNIX":
+        sock_family = socket.AF_UNIX
+        td = tempfile.TemporaryDirectory()
+        sock_addr = "{}/httpd.sock".format(td.name)
+    elif conn_type == "AF_INET":
+        sock_family = socket.AF_INET
+        sock_addr = ("127.0.0.1", 0)
 else:
     usage()
 
@@ -143,5 +148,5 @@ sock.bind(sock_addr)
 sock.listen(10)
 
 server = WSGIServer(sock, handle, log=None)
-spawn(heartbeat)
+spawn(heartbeat, sock.getsockname())
 server.serve_forever()
diff --git a/test/app-luatest/suite.ini b/test/app-luatest/suite.ini
index 4b271b6b07..fca7aa970a 100644
--- a/test/app-luatest/suite.ini
+++ b/test/app-luatest/suite.ini
@@ -2,11 +2,3 @@
 core = luatest
 description = application server tests on luatest
 is_parallel = True
-fragile = {
-    "retries": 10,
-    "tests": {
-        "http_client_test.lua": {
-            "issues": [ "gh-5346", "gh-5574" ]
-        }
-    }
-  }
diff --git a/test/app-tap/suite.ini b/test/app-tap/suite.ini
index 3263185067..e550dbb9d6 100644
--- a/test/app-tap/suite.ini
+++ b/test/app-tap/suite.ini
@@ -9,22 +9,7 @@ fragile = {
     "retries": 10,
     "tests": {
         "tarantoolctl.test.lua": {
-            "issues": [ "gh-5059", "gh-5346" ]
+            "issues": [ "gh-5059" ]
         },
-        "debug.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "inspector.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "logger.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "transitive1.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "csv.test.lua": {
-            "issues": [ "gh-5346" ]
-        }
     }
   }
diff --git a/test/box-tap/suite.ini b/test/box-tap/suite.ini
index 4d61bb8cb2..b6329b5162 100644
--- a/test/box-tap/suite.ini
+++ b/test/box-tap/suite.ini
@@ -9,16 +9,7 @@ fragile = {
     "retries": 10,
     "tests": {
         "cfg.test.lua": {
-            "issues": [ "gh-5346", "gh-4344" ]
-        },
-        "net.box.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "session.storage.test.lua": {
-            "issues": [ "gh-5346" ]
-        },
-        "session.test.lua": {
-            "issues": [ "gh-5346" ]
+            "issues": [ "gh-4344" ]
         },
         "gh-4231-box-execute-locking.test.lua": {
             "issues": [ "gh-5558" ]
-- 
GitLab