From ccb7a649837a827850f143d5ce3e232971e89a54 Mon Sep 17 00:00:00 2001 From: Sergey Kaplun <skaplun@tarantool.org> Date: Mon, 10 Jun 2024 10:38:16 +0300 Subject: [PATCH] perf: standardize gh-7089-vclock-copy benchmark The output now contains items per second without the mean time in seconds. The number of iterations is reduced to 40 to avoid running the test too long. The `wal_mode` option (default is "none") is set via command line flags, as well as the number of nodes (default is 10). Also, the master nodes are set up via the `popen()` command without using any Makefile. Also, two new options are introduced: * The `--output` option allows you to specify the output file. * The `--output_format` option specifies the format for the printed output. The default is "console". It prints items processed per second to stdout. The "json" format contains all the information about the benchmark in a format similar to Google Benchmark's. Usually, these options should be used together to dump machine-readable results for the benchmarks. 
NO_DOC=perf test NO_CHANGELOG=perf test NO_TEST=perf test --- perf/gh-7089-vclock-copy/Makefile | 23 --- perf/gh-7089-vclock-copy/Readme.md | 3 - perf/gh-7089-vclock-copy/speedtest.lua | 88 ----------- perf/gh-7089-vclock-copy/swarm.lua | 38 ----- perf/lua/CMakeLists.txt | 1 + perf/lua/gh-7089-vclock-copy.lua | 200 +++++++++++++++++++++++++ 6 files changed, 201 insertions(+), 152 deletions(-) delete mode 100644 perf/gh-7089-vclock-copy/Makefile delete mode 100644 perf/gh-7089-vclock-copy/Readme.md delete mode 100755 perf/gh-7089-vclock-copy/speedtest.lua delete mode 100755 perf/gh-7089-vclock-copy/swarm.lua create mode 100644 perf/lua/gh-7089-vclock-copy.lua diff --git a/perf/gh-7089-vclock-copy/Makefile b/perf/gh-7089-vclock-copy/Makefile deleted file mode 100644 index f688fcd797..0000000000 --- a/perf/gh-7089-vclock-copy/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -SHELL := /bin/bash - -test: test_none stop - -test_wal: test_write stop - -dirs: - for i in {1..11}; do mkdir $${i} || true; done - -start_swarm: - for i in {2..11}; do ./swarm.lua $${i}; done - -test_none: clean dirs start_swarm - ./speedtest.lua none 1 - -test_write: clean dirs start_swarm - ./speedtest.lua write 1 - -stop stop1: - for i in {2..11}; do pkill -F $${i}/$${i}.pid || true; done - -clean: stop1 - for i in {1..11}; do rm -rf $${i}; done diff --git a/perf/gh-7089-vclock-copy/Readme.md b/perf/gh-7089-vclock-copy/Readme.md deleted file mode 100644 index 7cbff4dcc1..0000000000 --- a/perf/gh-7089-vclock-copy/Readme.md +++ /dev/null @@ -1,3 +0,0 @@ -gh-7089 extra vclock copy for each ack test. - -Issue `make test` to run. diff --git a/perf/gh-7089-vclock-copy/speedtest.lua b/perf/gh-7089-vclock-copy/speedtest.lua deleted file mode 100755 index 4c4b5e2754..0000000000 --- a/perf/gh-7089-vclock-copy/speedtest.lua +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env tarantool - --- An instance file for the node which tests applier thread ack speed. 
--- There are 10 threads, one per each replication source, so each WAL write --- results in an ack message for each thread. This magnifies the possible --- performance drawbacks of copying vclocks for each thread. - -local mode = arg[1] or 'none' -assert(mode == 'write' or mode == 'none', - "mode should be either 'write' or 'none'") - -local id = tonumber(arg[2]) or 1 - -assert(id < 2 or id > 11, - 'The id should be outside of the occupied range [2, 11]') - -local fiber = require('fiber') - -box.cfg{ - listen = 3300 + id, - replication = { - 3302, - 3303, - 3304, - 3305, - 3306, - 3307, - 3308, - 3309, - 3310, - 3311, - }, - replication_threads = 10, - -- Disable WAL on a node to notice slightest differences in TX thread - -- performance. It's okay to replicate TO a node with disabled WAL. You only - -- can't replicate FROM it. - wal_mode = mode, - work_dir = tostring(id), - log = id..'.log', -} - -box.schema.space.create('test', {if_not_exists = true}) -box.space.test:create_index('pk', {if_not_exists = true}) -box.snapshot() - -local function replace_func(num_iters) - for i = 1, num_iters do - box.space.test:replace{i, i} - end -end - -local function test(num_fibers) - local fibers = {} - local num_replaces = 1e6 - local num_iters = num_replaces / num_fibers - local start = fiber.time() - for _ = 1, num_fibers do - local fib = fiber.new(replace_func, num_iters) - fib:set_joinable(true) - table.insert(fibers, fib) - end - assert(#fibers == num_fibers, "Fibers created successfully") - for _, fib in pairs(fibers) do - fib:join() - end - -- Update fiber.time() if there were no yields. - fiber.yield() - local dt = fiber.time() - start - return dt, num_replaces / dt -end - -local mean_time = 0 -local mean_rps = 0 -local num_iters = 100 - --- Fiber count > 1 makes no sense for wal_mode = 'none'. There are no yields --- on replace when there are no wal writes. 
-local num_fibers = mode == 'none' and 1 or 100 - -for test_iter = 1,num_iters do - local time, rps = test(num_fibers) - print(('Iteration #%d finished in %f seconds. RPS: %f'):format(test_iter, - time, rps)) - mean_time = mean_time + time / num_iters - mean_rps = mean_rps + rps / num_iters -end -print(('Mean iteraion time: %f, mean RPS: %f'):format(mean_time, mean_rps)) -os.exit() diff --git a/perf/gh-7089-vclock-copy/swarm.lua b/perf/gh-7089-vclock-copy/swarm.lua deleted file mode 100755 index 6c6e5a6f34..0000000000 --- a/perf/gh-7089-vclock-copy/swarm.lua +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env tarantool - --- Instance file for one of the 10 swarm nodes. They bootstrap the cluster, each --- bump their vclock component and then do nothing and serve as replication --- masters for the eleventh node. - -local id = tonumber(arg[1]) -assert(id ~= nil, 'Please pass a numeric instance id') -assert(id >= 2 and id <= 11, 'The id should be in ramge [2, 11]') - -box.cfg{ - listen = 3300 + id, - replication = { - 3302, - 3303, - 3304, - 3305, - 3306, - 3307, - 3308, - 3309, - 3310, - 3311, - }, - background = true, - work_dir = tostring(id), - pid_file = id..'.pid', - log = id..'.log', -} - -box.once('bootstrap', function() - box.schema.user.grant('guest', 'replication') -end) - --- This is executed on every instance so that vclock is non-empty in each --- component. This will make the testing instance copy a larger portion of data --- on each write and make the performance degradation, if any, more obvious. 
-box.space._schema:replace{'Something to bump vclock '..id} diff --git a/perf/lua/CMakeLists.txt b/perf/lua/CMakeLists.txt index 5e39f1e839..685bec7366 100644 --- a/perf/lua/CMakeLists.txt +++ b/perf/lua/CMakeLists.txt @@ -33,6 +33,7 @@ endfunction() create_perf_lua_test(NAME 1mops_write) create_perf_lua_test(NAME box_select) create_perf_lua_test(NAME column_scan) +create_perf_lua_test(NAME gh-7089-vclock-copy) create_perf_lua_test(NAME uri_escape_unescape) include_directories(${MSGPUCK_INCLUDE_DIRS}) diff --git a/perf/lua/gh-7089-vclock-copy.lua b/perf/lua/gh-7089-vclock-copy.lua new file mode 100644 index 0000000000..5113653849 --- /dev/null +++ b/perf/lua/gh-7089-vclock-copy.lua @@ -0,0 +1,200 @@ +local fiber = require('fiber') +local fio = require('fio') +local popen = require('popen') +local clock = require('clock') + +local benchmark = require('benchmark') + +local USAGE = [[ + nodes <number, 10> - number of nodes as replication sources + wal_mode <string, 'none'> - WAL mode for tested replica ('none', 'write') + + Being run without options, this benchmark tests applier thread ACK speed. + There are 10 threads, one per replication source, so each WAL write results in + an ACK message for each thread. This magnifies the possible performance + drawbacks of copying vclocks for each thread. The test performs 1000000 + replaces, which are repeated 40 times, and measures the average RPS. +]] + +local params = benchmark.argparse(arg, { + {'nodes', 'number'}, + {'wal_mode', 'string'}, +}, USAGE) + +local bench = benchmark.new(params) + +local wal_mode = params.wal_mode or 'none' +assert(wal_mode == 'write' or wal_mode == 'none', + "mode should be either 'write' or 'none'") + +-- Number of nodes. 
+local nodes = params.nodes or 10 +assert(nodes > 0 and nodes < 32, 'incorrect nodes number') + +local test_dir = fio.tempdir() + +local function rmtree(s) + if (fio.path.is_file(s) or fio.path.is_link(s)) then + fio.unlink(s) + return + end + if fio.path.is_dir(s) then + for _, file in pairs(fio.listdir(s)) do + rmtree(s .. '/' .. file) + end + fio.rmdir(s) + end +end + +-- Number of nodes, storage for popen handles. +local nodes_ph = {} + +local function exit(res, details) + for listen, master in pairs(nodes_ph) do + print(('# killing node on %d'):format(listen)) + master:kill() + master:wait() + end + + if (details ~= nil) then + print(details) + end + if test_dir ~= nil then + rmtree(test_dir) + test_dir = nil + end + os.exit(res) +end + +-- The port for replica. +local LISTEN_PORT = 3301 + +local master_nodes = {} +for i = 3302, 3301 + nodes do + table.insert(master_nodes, ('%d'):format(i)) +end + +local function bootstrap_node(listen) + local work_dir = ('%s/%d'):format(test_dir, listen) + -- Subdirectory for node's data. + os.execute('mkdir ' .. work_dir) + + local cmd = {arg[-1], '-e', string.format([[ + local fiber = require('fiber') + box.cfg { + listen = %d, + work_dir = '%s', + read_only = false, + replication = {%s}, + log = 'log.log', + } + box.once('bootstrap', function() + box.schema.user.grant('guest', 'replication') + end) + + repeat + fiber.sleep(0.1) + until not (#box.info.replication < %d or box.info().status ~= 'running') + + -- This is executed on every instance so that vclock is + -- non-empty in each component. This will make the testing + -- instance copy a larger portion of data on each write + -- and make the performance degradation, if any. + box.space._schema:replace({'Something to bump vclock ' .. %d}) + ]], listen, work_dir, table.concat(master_nodes, ','), nodes, listen)} + local res, err = popen.new(cmd) + + if not res then + exit(1, 'error running replica: ' .. 
err) + end + + nodes_ph[listen] = res +end + + +if (nodes ~= nil and nodes < 32 and nodes > 0) then + print('# starting ' .. nodes .. ' masters') + for listen = 3302, 3301 + nodes do + bootstrap_node(listen) + end +else + exit(1, 'Incorrect number of nodes: "' .. arg[1] .. '" must be 1..31') +end + +box.cfg{ + listen = LISTEN_PORT, + replication_threads = nodes, + -- Disable WAL on a node to notice slightest differences in TX + -- thread performance. It's okay to replicate _to_ a node with + -- disabled WAL. You only can't replicate _from_ it. + wal_mode = wal_mode, + replication = master_nodes, + read_only = false, + log = 'test.log', + work_dir = test_dir, +} + +-- Wait for all nodes to connect. +repeat + fiber.sleep(0.1) + print('# replication', #box.info.replication, + 'status ', box.info().status) +until not (#box.info.replication < nodes or box.info().status ~= 'running') + +box.schema.space.create('test', {if_not_exists = true}) +box.space.test:create_index('pk', {if_not_exists = true}) +box.snapshot() + +local function replace_func(num_iters) + for i = 1, num_iters do + box.space.test:replace({i, i}) + end +end + +local num_replaces = 1e6 + +local function test(num_fibers) + local fibers = {} + local num_iters = num_replaces / num_fibers + local start_realtime = clock.time() + local start_cputime = clock.proc() + for _ = 1, num_fibers do + local fib = fiber.new(replace_func, num_iters) + fib:set_joinable(true) + table.insert(fibers, fib) + end + assert(#fibers == num_fibers, 'Fibers created successfully') + for _, fib in pairs(fibers) do + fib:join() + end + local dt_realtime = clock.time() - start_realtime + local dt_cputime = clock.proc() - start_cputime + return dt_realtime, dt_cputime, num_replaces / dt_realtime +end + +local num_iters = 40 + +-- Fiber count > 1 makes no sense for `wal_mode = 'none'`. There +-- are no yields on replace when there are no WAL writes. 
+local num_fibers = wal_mode == 'none' and 1 or 100 + +local total_realtime = 0 +local total_cputime = 0 +for test_iter = 1, num_iters do + local realtime, cputime, rps = test(num_fibers) + print(('# Iteration #%d finished in %f seconds. RPS: %f'):format( + test_iter, realtime, rps + )) + total_realtime = total_realtime + realtime + total_cputime = total_cputime + cputime +end + +bench:add_result('walmode_' .. wal_mode, { + real_time = total_realtime, + cpu_time = total_cputime, + items = num_iters * num_replaces, +}) + +bench:dump_results() + +exit(0) -- GitLab