From e92db8b7ade1711b88d1403807e730b270841c1a Mon Sep 17 00:00:00 2001
From: Ilya Verbin <iverbin@tarantool.org>
Date: Wed, 7 Jun 2023 17:05:31 +0300
Subject: [PATCH] box: introduce tuple_builder class

It encapsulates the logic that helps to build a new MsgPack array by
concatenating tuple fields from various locations. The idea is to
postpone memory allocation and copying until the finalization.

Needed for #8157

NO_DOC=internal
NO_CHANGELOG=internal
---
 src/box/CMakeLists.txt    |   1 +
 src/box/tuple_builder.c   | 109 ++++++++++++++++++++++++
 src/box/tuple_builder.h   |  77 +++++++++++++++++
 test/unit/CMakeLists.txt  |   5 ++
 test/unit/tuple_builder.c | 172 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 364 insertions(+)
 create mode 100644 src/box/tuple_builder.c
 create mode 100644 src/box/tuple_builder.h
 create mode 100644 test/unit/tuple_builder.c

diff --git a/src/box/CMakeLists.txt b/src/box/CMakeLists.txt
index 535d498d7b..53564068d3 100644
--- a/src/box/CMakeLists.txt
+++ b/src/box/CMakeLists.txt
@@ -105,6 +105,7 @@ set(tuple_sources
     tuple_format.c
     tuple_constraint_def.c
     tuple_constraint.c
+    tuple_builder.c
     xrow_update.c
     xrow_update_field.c
     xrow_update_array.c
diff --git a/src/box/tuple_builder.c b/src/box/tuple_builder.c
new file mode 100644
index 0000000000..e8bfb7c0b5
--- /dev/null
+++ b/src/box/tuple_builder.c
@@ -0,0 +1,109 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright 2010-2023, Tarantool AUTHORS, please see AUTHORS file.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include "tuple.h"
+#include "msgpuck.h"
+#include "small/region.h"
+#include "salad/stailq.h"
+#include "tuple_builder.h"
+
+/**
+ * One list element of the builder: a data span or a run of NULL fields.
+ */
+struct tuple_chunk {
+	/** Start of the data. Not copied: points into the caller's buffer. */
+	const char *data;
+	/** End of the data (points right past its last byte). */
+	const char *data_end;
+	/** Number of NULL fields. If > 0 then data/data_end are not used. */
+	uint32_t null_count;
+	/** Link in `tuple_builder::chunks`. */
+	struct stailq_entry in_builder;
+};
+
+void
+tuple_builder_new(struct tuple_builder *builder, struct region *region)
+{
+	/* An empty builder: no chunks, no fields, zero payload size. */
+	stailq_create(&builder->chunks);
+	builder->size = builder->field_count = 0;
+	builder->region = region;
+}
+
+void
+tuple_builder_add_nil(struct tuple_builder *builder)
+{
+	struct stailq *chunks = &builder->chunks;
+	builder->size += mp_sizeof_nil();
+	builder->field_count += 1;
+
+	if (!stailq_empty(chunks)) {
+		struct tuple_chunk *tail =
+			stailq_last_entry(chunks, struct tuple_chunk, in_builder);
+		/* Extend a trailing run of NULLs instead of allocating. */
+		if (tail->null_count != 0) {
+			tail->null_count += 1;
+			return;
+		}
+	}
+	struct tuple_chunk *nils;
+	nils = xregion_alloc_object(builder->region, typeof(*nils));
+	nils->null_count = 1;
+	nils->data = nils->data_end = NULL;
+	stailq_add_tail_entry(chunks, nils, in_builder);
+}
+
+void
+tuple_builder_add(struct tuple_builder *builder, const char *data,
+		  size_t data_size, uint32_t field_count)
+{
+	struct stailq *chunks = &builder->chunks;
+	builder->size += data_size;
+	builder->field_count += field_count;
+
+	if (!stailq_empty(chunks)) {
+		struct tuple_chunk *tail =
+			stailq_last_entry(chunks, struct tuple_chunk, in_builder);
+		/* Adjacent to the last chunk: extend it in place. */
+		if (tail->data_end == data) {
+			tail->data_end = data + data_size;
+			return;
+		}
+	}
+	struct tuple_chunk *span;
+	span = xregion_alloc_object(builder->region, typeof(*span));
+	span->null_count = 0;
+	span->data = data;
+	span->data_end = data + data_size;
+	stailq_add_tail_entry(chunks, span, in_builder);
+}
+
+void
+tuple_builder_finalize(struct tuple_builder *builder, const char **data,
+		       const char **data_end)
+{
+	/* Chunk payload plus the MsgPack array header, allocated at once. */
+	size_t total = mp_sizeof_array(builder->field_count) + builder->size;
+	char *buf = xregion_alloc(builder->region, total);
+	*data = buf;
+	*data_end = buf + total;
+	buf = mp_encode_array(buf, builder->field_count);
+	struct tuple_chunk *chunk;
+	stailq_foreach_entry(chunk, &builder->chunks, in_builder) {
+		if (chunk->null_count == 0) {
+			/* size_t: a chunk length must not be truncated. */
+			size_t size = chunk->data_end - chunk->data;
+			memcpy(buf, chunk->data, size);
+			buf += size;
+		} else {
+			for (uint32_t i = 0; i < chunk->null_count; i++)
+				buf = mp_encode_nil(buf);
+		}
+	}
+	assert(buf == *data_end);
+	mp_tuple_assert(*data, *data_end);
+}
diff --git a/src/box/tuple_builder.h b/src/box/tuple_builder.h
new file mode 100644
index 0000000000..6d7730a104
--- /dev/null
+++ b/src/box/tuple_builder.h
@@ -0,0 +1,77 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright 2010-2023, Tarantool AUTHORS, please see AUTHORS file.
+ */
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include "salad/stailq.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
+struct region;
+
+/**
+ * A builder that helps to construct a tuple by concatenating chunks of data.
+ * A chunk represents one or more tuple fields (MsgPack objects).
+ *
+ * First, chunks are added to a builder object. The builder neither allocates
+ * memory for the MsgPack nor copies it: only pointers to the start and to the
+ * end of the data are kept, so the data must stay valid until finalization.
+ *
+ * Once all chunks have been added, the builder can be used to encode them into
+ * the final MsgPack array.
+ */
+struct tuple_builder {
+	/** List of chunks, linked by `tuple_chunk::in_builder`. */
+	struct stailq chunks;
+	/**
+	 * Number of tuple fields. It can be greater than the number of chunks
+	 * in the list, because one chunk may hold several fields.
+	 */
+	uint32_t field_count;
+	/** Size of all added fields in bytes, without the array header. */
+	size_t size;
+	/** The region used to allocate chunks and the resulting array. */
+	struct region *region;
+};
+
+/**
+ * Initialize an empty builder. The region is saved and used later to allocate
+ * both the internal structures and the resulting MsgPack array.
+ */
+void
+tuple_builder_new(struct tuple_builder *builder, struct region *region);
+
+/**
+ * Add a NULL tuple field to the builder. Consecutive NULLs share one chunk.
+ */
+void
+tuple_builder_add_nil(struct tuple_builder *builder);
+
+/**
+ * Add a chunk of `data_size` bytes holding `field_count` tuple fields.
+ * The data is not copied and must stay valid until the finalization. If the
+ * chunk starts where the previous one ends, that one is extended, otherwise
+ * a new list element is allocated on builder->region.
+ */
+void
+tuple_builder_add(struct tuple_builder *builder, const char *data,
+		  size_t data_size, uint32_t field_count);
+
+/**
+ * Encode tuple fields added to the builder into a new MsgPack array.
+ * The buffer is allocated on builder->region; its start is returned in
+ * `data` and the position right after its last byte in `data_end`.
+ */
+void
+tuple_builder_finalize(struct tuple_builder *builder, const char **data,
+		       const char **data_end);
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* defined(__cplusplus) */
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 2bf5f47693..911014c6be 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -567,6 +567,11 @@ create_unit_test(PREFIX key_def
                  LIBRARIES unit box core
 )
 
+create_unit_test(PREFIX tuple_builder
+                 SOURCES tuple_builder.c box_test_utils.c
+                 LIBRARIES unit box core
+)
+
 create_unit_test(PREFIX getenv_safe
                  SOURCES getenv_safe.c core_test_utils.c
                  LIBRARIES unit core
diff --git a/test/unit/tuple_builder.c b/test/unit/tuple_builder.c
new file mode 100644
index 0000000000..a3f1b25a9b
--- /dev/null
+++ b/test/unit/tuple_builder.c
@@ -0,0 +1,172 @@
+#include "fiber.h"
+#include "memory.h"
+#include "msgpuck.h"
+#include "tuple.h"
+#include "tuple_builder.h"
+
+#define UNIT_TAP_COMPATIBLE 1
+#include "unit.h"
+
+static void
+test_tuple_builder_empty(void)
+{
+	plan(2);
+	header();
+
+	struct region *gc = &fiber()->gc;
+	size_t gc_svp = region_used(gc);
+	struct tuple_builder builder;
+	const char *data, *data_end;
+
+	tuple_builder_new(&builder, gc);
+	tuple_builder_finalize(&builder, &data, &data_end);
+	/* A builder without chunks must encode an empty array. */
+	is(mp_typeof(*data), MP_ARRAY, "type is MP_ARRAY");
+	is(mp_decode_array(&data), 0, "array is empty");
+	region_truncate(gc, gc_svp);
+
+	footer();
+	check_plan();
+}
+
+static void
+test_tuple_builder_nulls(void)
+{
+	plan(4);
+	header();
+
+	struct region *gc = &fiber()->gc;
+	size_t gc_svp = region_used(gc);
+	struct tuple_builder builder;
+	const char *data, *data_end;
+
+	tuple_builder_new(&builder, gc);
+	/* Three NULLs in a row must come out as three MP_NIL fields. */
+	for (int i = 0; i < 3; i++)
+		tuple_builder_add_nil(&builder);
+	tuple_builder_finalize(&builder, &data, &data_end);
+
+	is(mp_decode_array(&data), 3, "array contains 3 elements");
+	is(mp_typeof(*data), MP_NIL, "[0] MP_NIL");
+	mp_decode_nil(&data);
+	is(mp_typeof(*data), MP_NIL, "[1] MP_NIL");
+	mp_decode_nil(&data);
+	is(mp_typeof(*data), MP_NIL, "[2] MP_NIL");
+	region_truncate(gc, gc_svp);
+
+	footer();
+	check_plan();
+}
+
+static struct tuple *
+create_tuple1(void)
+{
+	/* Encodes [0, 111, 222, 333, 444]. */
+	static const uint64_t values[] = {0, 111, 222, 333, 444};
+	const uint32_t count = sizeof(values) / sizeof(values[0]);
+	char buf[16];
+
+	char *pos = mp_encode_array(buf, count);
+	for (uint32_t i = 0; i < count; i++)
+		pos = mp_encode_uint(pos, values[i]);
+
+	struct tuple *result = tuple_new(tuple_format_runtime, buf, pos);
+	tuple_ref(result);
+	return result;
+}
+
+static struct tuple *
+create_tuple2(void)
+{
+	/* Encodes ["xxx", "yyy", "zzz"]. */
+	char buf[16];
+	char *pos = mp_encode_array(buf, 3);
+	pos = mp_encode_str0(pos, "xxx");
+	pos = mp_encode_str0(pos, "yyy");
+	pos = mp_encode_str0(pos, "zzz");
+
+	struct tuple *result = tuple_new(tuple_format_runtime, buf, pos);
+	tuple_ref(result);
+	return result;
+}
+
+static void
+test_tuple_builder_merge(void)
+{
+	plan(9);
+	header();
+
+	struct region *gc = &fiber()->gc;
+	size_t gc_svp = region_used(gc);
+	const char *str, *data, *data_end;
+	uint32_t len;
+
+	struct tuple *tuple1 = create_tuple1();
+	const char *t1f2 = tuple_field(tuple1, 2);
+	const char *t1f3 = tuple_field(tuple1, 3);
+	const char *t1f4 = tuple_field(tuple1, 4);
+	struct tuple *tuple2 = create_tuple2();
+	const char *t2f0 = tuple_field(tuple2, 0);
+	const char *t2f1 = tuple_field(tuple2, 1);
+	const char *t2f2 = tuple_field(tuple2, 2);
+
+	struct tuple_builder builder;
+	tuple_builder_new(&builder, gc);
+	tuple_builder_add(&builder, t1f2, t1f4 - t1f2, 2);
+	tuple_builder_add(&builder, t2f0, t2f2 - t2f0, 2);
+	tuple_builder_add_nil(&builder);
+	tuple_builder_add(&builder, t2f1, t2f2 - t2f1, 1);
+	tuple_builder_add(&builder, t1f2, t1f3 - t1f2, 1);
+	tuple_builder_add_nil(&builder);
+	tuple_builder_finalize(&builder, &data, &data_end);
+
+	/* The result is a copy, so the source tuples can be released. */
+	tuple_unref(tuple1);
+	tuple_unref(tuple2);
+	is(mp_decode_array(&data), 8, "array contains 8 elements");
+	is(mp_decode_uint(&data), 222, "[0] MP_UINT is 222");
+	is(mp_decode_uint(&data), 333, "[1] MP_UINT is 333");
+	str = mp_decode_str(&data, &len);
+	is(strncmp(str, "xxx", 3), 0, "[2] MP_STR is xxx");
+	str = mp_decode_str(&data, &len);
+	is(strncmp(str, "yyy", 3), 0, "[3] MP_STR is yyy");
+	is(mp_typeof(*data), MP_NIL, "[4] MP_NIL");
+	mp_decode_nil(&data);
+	str = mp_decode_str(&data, &len);
+	is(strncmp(str, "yyy", 3), 0, "[5] MP_STR is yyy");
+	is(mp_decode_uint(&data), 222, "[6] MP_UINT is 222");
+	is(mp_typeof(*data), MP_NIL, "[7] MP_NIL");
+	region_truncate(gc, gc_svp);
+
+	footer();
+	check_plan();
+}
+
+static int
+test_tuple_builder(void)
+{
+	plan(3);
+	header();
+	/* Run every sub-test of the suite. */
+	test_tuple_builder_empty();
+	test_tuple_builder_nulls();
+	test_tuple_builder_merge();
+	footer();
+	int rc = check_plan();
+	return rc;
+}
+
+int
+main(void)
+{
+	/* Bring up the subsystems used by the tests. */
+	memory_init();
+	fiber_init(fiber_c_invoke);
+	tuple_init(NULL);
+	const int status = test_tuple_builder();
+	/* Tear down in the reverse order of initialization. */
+	tuple_free();
+	fiber_free();
+	memory_free();
+	return status;
+}
-- 
GitLab