From 0479cfafc9615bc6eb69442f98735704c3bc5aa0 Mon Sep 17 00:00:00 2001
From: Ilya Verbin <iverbin@tarantool.org>
Date: Tue, 10 Jan 2023 15:48:52 +0300
Subject: [PATCH] box: dynamically adjust xlog readahead size

XLOG_READ_AHEAD defines the number of bytes added to the `count' argument
of the `pread' syscall; currently it is equal to 16 KB. However, xlog and
snap files are written in 128 KB chunks of data, which turn into ~80 KB
chunks after compression (on average, depending on the data), so the 16 KB
readahead doesn't make any sense.

According to performance experiments, an 8 MB readahead gives the best
results for large files. However, 8 MB read buffers would increase the
memory consumption of the replication relays, which usually read small
portions of data and do not need such big buffers. For this reason, a
dynamically-sized read buffer is implemented by this patch. The minimum
readahead is now 128 KB, and the maximum is 8 MB. As a result, the recovery
time of a 900 MB snapshot decreased from 25 sec to 13 sec (when reading
from HDD). The performance of recovery from SSD is not affected.

Closes #8108

NO_DOC=performance improvement
NO_TEST=performance improvement
---
 .../gh-8108-adjust-xlog_read_ahead.md         |   4 +
 src/box/xlog.c                                |  29 +++-
 src/box/xlog.h                                |   4 +
 src/lib/small                                 |   2 +-
 test/unit/CMakeLists.txt                      |   4 +
 test/unit/xlog.c                              | 161 ++++++++++++++++++
 6 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 changelogs/unreleased/gh-8108-adjust-xlog_read_ahead.md
 create mode 100644 test/unit/xlog.c

diff --git a/changelogs/unreleased/gh-8108-adjust-xlog_read_ahead.md b/changelogs/unreleased/gh-8108-adjust-xlog_read_ahead.md
new file mode 100644
index 0000000000..25e56c1b37
--- /dev/null
+++ b/changelogs/unreleased/gh-8108-adjust-xlog_read_ahead.md
@@ -0,0 +1,4 @@
+## feature/core
+
+* Reduced the recovery time from a snapshot by up to 2x on systems with a hard
+  disk drive (gh-8108).
diff --git a/src/box/xlog.c b/src/box/xlog.c
index ca8f2978d9..ec7c73f6c8 100644
--- a/src/box/xlog.c
+++ b/src/box/xlog.c
@@ -1526,7 +1526,13 @@ xlog_close(struct xlog *l, bool reuse_fd)
 
 /* {{{ struct xlog_cursor */
 
-#define XLOG_READ_AHEAD		(1 << 14)
+enum {
+	/**
+	 * Bounds, in bytes, of the adaptive xlog_cursor::read_ahead value.
+	 */
+	XLOG_READ_AHEAD_MIN = XLOG_TX_AUTOCOMMIT_THRESHOLD,
+	XLOG_READ_AHEAD_MAX = 8 * 1024 * 1024,
+};
 
 /**
  * Ensure that at least count bytes are in read buffer
@@ -1545,7 +1551,7 @@ xlog_cursor_ensure(struct xlog_cursor *cursor, size_t count)
 		return 1;
 
 	size_t to_load = count - ibuf_used(&cursor->rbuf);
-	to_load += XLOG_READ_AHEAD;
+	to_load += cursor->read_ahead;
 
 	void *dst = ibuf_reserve(&cursor->rbuf, to_load);
 	if (dst == NULL) {
@@ -1567,9 +1573,25 @@ xlog_cursor_ensure(struct xlog_cursor *cursor, size_t count)
 			 cursor->name);
 		return -1;
 	}
-	/* ibuf_reserve() has been called above, ibuf_alloc() must not fail */
 	assert((size_t)readen <= to_load);
+	/* ibuf_reserve() has been called above, ibuf_alloc() must not fail */
 	ibuf_alloc(&cursor->rbuf, readen);
+	/* Shrink the read buffer to reduce the memory consumption. */
+	if (cursor->need_rbuf_shrink) {
+		ibuf_shrink(&cursor->rbuf);
+		cursor->need_rbuf_shrink = false;
+	}
+	/*
+	 * Grow readahead size if the requested number of bytes was successfully
+	 * read, and decrease it to the minimum otherwise.
+	 */
+	if ((size_t)readen == to_load) {
+		if (cursor->read_ahead * 2 <= XLOG_READ_AHEAD_MAX)
+			cursor->read_ahead *= 2;
+	} else {
+		cursor->need_rbuf_shrink = true;
+		cursor->read_ahead = XLOG_READ_AHEAD_MIN;
+	}
 	cursor->read_offset += readen;
 	return ibuf_used(&cursor->rbuf) >= count ? 0: 1;
 }
@@ -1982,6 +2004,7 @@ xlog_cursor_openfd(struct xlog_cursor *i, int fd, const char *name)
 {
 	memset(i, 0, sizeof(*i));
 	i->fd = fd;
+	i->read_ahead = XLOG_READ_AHEAD_MIN;
 	ibuf_create(&i->rbuf, &cord()->slabc,
 		    XLOG_TX_AUTOCOMMIT_THRESHOLD << 1);
 
diff --git a/src/box/xlog.h b/src/box/xlog.h
index 1d7f564aba..1cc2934611 100644
--- a/src/box/xlog.h
+++ b/src/box/xlog.h
@@ -661,6 +661,10 @@ struct xlog_cursor {
 	char name[PATH_MAX];
 	/** file read buffer */
 	struct ibuf rbuf;
+	/** whether the buffer needs to be shrunk */
+	bool need_rbuf_shrink;
+	/** how many additional bytes are read to the buffer */
+	size_t read_ahead;
 	/** file read position */
 	off_t read_offset;
 	/** cursor for current tx */
diff --git a/src/lib/small b/src/lib/small
index 5c3a0880ae..dd61202855 160000
--- a/src/lib/small
+++ b/src/lib/small
@@ -1 +1 @@
-Subproject commit 5c3a0880ae4659cb662085563ff0b9eee40565fe
+Subproject commit dd612028552908e06078f1bc94fde017bebd2446
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 353571389a..43bda24690 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -180,6 +180,10 @@ create_unit_test(PREFIX xrow
                  SOURCES xrow.cc core_test_utils.c
                  LIBRARIES xrow unit
 )
+create_unit_test(PREFIX xlog
+                 SOURCES xlog.c core_test_utils.c
+                 LIBRARIES xlog xrow unit
+)
 create_unit_test(PREFIX decimal
                  SOURCES decimal.c
                  LIBRARIES core unit
diff --git a/test/unit/xlog.c b/test/unit/xlog.c
new file mode 100644
index 0000000000..ae4d920ef9
--- /dev/null
+++ b/test/unit/xlog.c
@@ -0,0 +1,161 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright 2010-2023, Tarantool AUTHORS, please see AUTHORS file.
+ */
+
+#define UNIT_TAP_COMPATIBLE 1
+#include "unit.h"
+#include "xlog.h"
+#include "xrow.h"
+#include "crc32.h"
+#include "random.h"
+#include "memory.h"
+#include "iproto_constants.h"
+
+/**
+ * Mirrors the private XLOG_READ_AHEAD_* enum in src/box/xlog.c; keep in sync!
+ */
+enum {
+	XLOG_READ_AHEAD_MIN = 128 * 1024, /* == XLOG_TX_AUTOCOMMIT_THRESHOLD */
+	XLOG_READ_AHEAD_MAX = 8 * 1024 * 1024,
+};
+
+/**
+ * Make a temporary directory, set it up as an xdir and create an xlog in it.
+ */
+static void
+create_xlog(struct xlog *xlog, char *dirname)
+{
+	fail_if(mkdtemp(dirname) == NULL); /* rewrites dirname in place */
+
+	struct xdir xdir;
+	struct tt_uuid tt_uuid;
+	struct vclock vclock;
+	memset(&tt_uuid, 1, sizeof(tt_uuid));
+	memset(&vclock, 0, sizeof(vclock));
+
+	xdir_create(&xdir, dirname, XLOG, &tt_uuid, &xlog_opts_default);
+
+	fail_if(xdir_create_xlog(&xdir, xlog, &vclock) < 0);
+}
+
+/**
+ * Write `data' (`size' bytes) as an IPROTO_INSERT row with the next LSN.
+ */
+static void
+write_tuple(struct xlog *xlog, const char *data, uint32_t size)
+{
+	static int64_t lsn; /* last used LSN, kept across calls */
+	struct request_replace_body body;
+	request_replace_body_create(&body, 0);
+
+	struct xrow_header row;
+	memset(&row, 0, sizeof(struct xrow_header));
+	row.lsn = ++lsn;
+	row.type = IPROTO_INSERT;
+	row.bodycnt = 2; /* request body header + caller's data */
+	row.body[0].iov_base = &body;
+	row.body[0].iov_len = sizeof(body);
+	row.body[1].iov_base = (char *)data;
+	row.body[1].iov_len = size;
+
+	fail_if(xlog_write_row(xlog, &row) < 0);
+}
+
+/**
+ * Write a 1 KB tuple of random bytes to the xlog. Random data is practically
+ * incompressible, so the compressed size is roughly the same.
+ */
+static void
+write_1k(struct xlog *xlog)
+{
+	char data[1024];
+	const size_t data_size = sizeof(data) - 3; /* MsgPack bin16 header */
+	random_bytes(mp_encode_binl(data, data_size), data_size);
+	write_tuple(xlog, data, sizeof(data));
+}
+
+/**
+ * Test that the read buffer grows while a large file is being read, and
+ * shrinks again once the xlog is written and read in small portions.
+ */
+static void
+test_dynamic_sized_ibuf(void)
+{
+	header();
+	plan(4);
+	struct xlog xlog;
+	char dirname[] = "./xlog.XXXXXX";
+	char filename[PATH_MAX];
+	create_xlog(&xlog, dirname);
+	strlcpy(filename, xlog.filename, sizeof(filename));
+
+	/* Write about 20 MB of data to the xlog. */
+	for (int i = 0; i < 20 * 1024; i++)
+		write_1k(&xlog);
+	fail_if(xlog_flush(&xlog) < 0);
+
+	struct xlog_cursor cursor;
+	fail_if(xlog_cursor_open(&cursor, xlog.filename) < 0);
+
+	/*
+	 * Read the whole xlog. read_ahead is reset to the minimum by the
+	 * final short read, so track the maxima on every iteration; a
+	 * negative rc is treated as a read error rather than EOF.
+	 */
+	int rc;
+	int64_t prev_lsn = 0;
+	struct xrow_header row;
+	size_t read_ahead_max = 0;
+	size_t ibuf_used_max = 0;
+	while ((rc = xlog_cursor_next(&cursor, &row, false)) == 0) {
+		fail_if(row.lsn != prev_lsn + 1);
+		prev_lsn = row.lsn;
+
+		read_ahead_max = MAX(cursor.read_ahead, read_ahead_max);
+		ibuf_used_max = MAX(ibuf_used_max, ibuf_used(&cursor.rbuf));
+	}
+	fail_if(rc < 0);
+	is(read_ahead_max, XLOG_READ_AHEAD_MAX,
+	   "read_ahead increased to %d", XLOG_READ_AHEAD_MAX);
+	ok(ibuf_used_max >= XLOG_READ_AHEAD_MAX,
+	   "ibuf size increased to at least %d", XLOG_READ_AHEAD_MAX);
+
+	/*
+	 * A 1 KB write/read must shrink the read buffer to the minimal capacity.
+	 */
+	write_1k(&xlog);
+	fail_if(xlog_flush(&xlog) < 0);
+	while ((rc = xlog_cursor_next(&cursor, &row, false)) == 0) {
+		fail_if(row.lsn != prev_lsn + 1);
+		prev_lsn = row.lsn;
+	}
+	fail_if(rc < 0);
+	is(cursor.read_ahead, XLOG_READ_AHEAD_MIN,
+	   "read_ahead decreased to %d", XLOG_READ_AHEAD_MIN);
+	ok(ibuf_capacity(&cursor.rbuf) == 0, "ibuf capacity decreased to 0");
+
+	xlog_cursor_close(&cursor, false);
+	fail_if(xlog_close(&xlog, false) < 0);
+	unlink(filename);
+	rmdir(dirname);
+
+	check_plan();
+	footer();
+}
+
+int
+main(void)
+{
+	plan(1); /* one subtest: test_dynamic_sized_ibuf() */
+	crc32_init();
+	memory_init();
+	random_init();
+
+	test_dynamic_sized_ibuf();
+	/* Free the subsystems in the reverse order of their initialization. */
+	random_free();
+	memory_free();
+	return check_plan();
+}
-- 
GitLab