From 04e25c0958232352434b37e4bfb4557d9537d089 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@tarantool.org>
Date: Thu, 1 Sep 2022 17:32:30 +0300
Subject: [PATCH] memtx: optimize tuple garbage collection

Currently, tuples are never garbage collected if the number of open read
views stays above zero, even if they can't possibly be accessed from any
open read view (e.g. they were freed before the oldest read view was
created). This commit fixes the issue by introducing per-read-view tuple
garbage collection lists. The algorithm is described in the comments in
the code.
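
For example, if read views RV1 and RV2 are opened in that order and a tuple
is allocated in between and freed while both are open, only RV2 can still
access it. With per-read-view lists the tuple is released as soon as RV2 is
closed, even if RV1 stays open; previously it would have lingered until all
read views were closed.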

Closes #7185

NO_DOC=internal
NO_CHANGELOG=internal
---
 src/box/memtx_allocator.cc       | 106 ++++++++++++++++++++
 src/box/memtx_allocator.h        | 163 +++++++++++++++++++++++++------
 test/unit/memtx_allocator.cc     |  66 ++++++++++++-
 test/unit/memtx_allocator.result |  17 +++-
 4 files changed, 322 insertions(+), 30 deletions(-)

diff --git a/src/box/memtx_allocator.cc b/src/box/memtx_allocator.cc
index 2528066c5a..b2004cb932 100644
--- a/src/box/memtx_allocator.cc
+++ b/src/box/memtx_allocator.cc
@@ -31,6 +31,112 @@
 #include "memtx_allocator.h"
 #include "trivia/tuple.h"
 
+struct memtx_tuple_rv *
+memtx_tuple_rv_new(uint32_t version, struct rlist *list)
+{
+	assert(version > 0);
+	int count = 1;
+	struct memtx_tuple_rv *rv;
+	rlist_foreach_entry(rv, list, link)
+		count++;
+	struct memtx_tuple_rv *new_rv = (struct memtx_tuple_rv *)xmalloc(
+			sizeof(*new_rv) + count * sizeof(*new_rv->lists));
+	new_rv->count = count;
+	/* Create one list for each open read view. */
+	struct memtx_tuple_rv_list *l = &new_rv->lists[0];
+	uint32_t prev_version = 0;
+	rlist_foreach_entry(rv, list, link) {
+		l->version = memtx_tuple_rv_version(rv);
+		/* List must be sorted by read view version. */
+		assert(l->version > prev_version);
+		stailq_create(&l->tuples);
+		prev_version = l->version;
+		l++;
+	}
+	/* And one more list for self. */
+	assert(l == &new_rv->lists[count - 1]);
+	l->version = version;
+	assert(l->version > prev_version);
+	(void)prev_version;
+	stailq_create(&l->tuples);
+	rlist_add_tail_entry(list, new_rv, link);
+	return new_rv;
+}
+
+void
+memtx_tuple_rv_delete(struct memtx_tuple_rv *rv, struct rlist *list,
+		      struct stailq *tuples_to_free)
+{
+	struct memtx_tuple_rv *prev_rv = rlist_prev_entry_safe(rv, list, link);
+	uint32_t prev_version = prev_rv == nullptr ? 0 :
+				memtx_tuple_rv_version(prev_rv);
+	/*
+	 * Move tuples from lists with version <= prev_version to the list of
+	 * the previous read view and delete all other tuples.
+	 */
+	int i = 0;
+	int j = 0;
+	while (i < rv->count) {
+		struct memtx_tuple_rv_list *src = &rv->lists[i];
+		if (src->version <= prev_version) {
+			/*
+			 * The tuples were allocated before the previous read
+			 * view was opened. Move them to the previous read
+			 * view's list.
+			 */
+			assert(prev_rv != nullptr);
+			assert(j < prev_rv->count);
+			struct memtx_tuple_rv_list *dst = &prev_rv->lists[j];
+			/*
+			 * The previous read view may have more lists, because
+			 * some read views could have been closed by the time
+			 * this read view was opened. Skip them.
+			 */
+			while (dst->version != src->version) {
+				j++;
+				assert(j < prev_rv->count);
+				dst = &prev_rv->lists[j];
+			}
+			stailq_concat(&dst->tuples, &src->tuples);
+			j++;
+		} else {
+			/*
+			 * The tuples were allocated after the previous read
+			 * view was opened and freed before the next read view
+			 * was opened. Free them immediately.
+			 */
+			stailq_concat(tuples_to_free, &src->tuples);
+		}
+		i++;
+	}
+	rlist_del_entry(rv, link);
+	free(rv);
+}
+
+void
+memtx_tuple_rv_add(struct memtx_tuple_rv *rv, struct memtx_tuple *tuple)
+{
+	/*
+	 * Binary search for the list with the minimal version such that
+	 * list->version > tuple->version.
+	 */
+	int begin = 0;
+	int end = rv->count;
+	struct memtx_tuple_rv_list *found = nullptr;
+	while (begin != end) {
+		int middle = begin + (end - begin) / 2;
+		struct memtx_tuple_rv_list *l = &rv->lists[middle];
+		if (l->version <= tuple->version) {
+			begin = middle + 1;
+		} else {
+			found = l;
+			end = middle;
+		}
+	}
+	assert(found != nullptr);
+	stailq_add_entry(&found->tuples, tuple, in_gc);
+}
+
 void
 memtx_allocators_init(struct allocator_settings *settings)
 {
diff --git a/src/box/memtx_allocator.h b/src/box/memtx_allocator.h
index b72b74b21f..d239f62fd0 100644
--- a/src/box/memtx_allocator.h
+++ b/src/box/memtx_allocator.h
@@ -31,6 +31,7 @@
  */
 #include "allocator.h"
 #include "salad/stailq.h"
+#include "small/rlist.h"
 #include "tuple.h"
 
 /**
@@ -60,6 +61,98 @@ struct memtx_tuple {
 	};
 };
 
+/**
+ * List of tuples owned by a read view.
+ *
+ * See the comment to memtx_tuple_rv for details.
+ */
+struct memtx_tuple_rv_list {
+	/** Read view version. */
+	uint32_t version;
+	/** List of tuples, linked by memtx_tuple::in_gc. */
+	struct stailq tuples;
+};
+
+/**
+ * Tuple list array associated with a read view.
+ *
+ * When a read view is opened:
+ * + We assign a unique, monotonically increasing version to it.
+ * + We create and associate a list array with it. The array consists of one
+ *   tuple list for each open read view, including the new read view.
+ *
+ * When a tuple is allocated, we store the most recent read view version in it.
+ * This allows us to check if it's visible from a read view when it's freed.
+ *
+ * When a tuple is freed:
+ * 1. We look up the most recent open read view.
+ * 2. If there are no open read views or the most recent open read view's version
+ *    is <= the tuple's version, we free the tuple immediately, because it was
+ *    allocated after the most recent open read view was opened.
+ * 3. Otherwise, we add the tuple to the list that has the minimal version
+ *    among all lists in the array such that list->version > tuple->version.
+ *    In other words, we add it to the list corresponding to the oldest read
+ *    view that can access the tuple.
+ *
+ * When a read view is closed:
+ * 1. We look up the most recent read view older than the closed one.
+ * 2. If there's no such read view, we free all tuples from the closed read
+ *    view's lists.
+ * 3. Otherwise,
+ *    + We free all tuples from lists with version > the found read view's
+ *      version, because those tuples were allocated after any older read
+ *      view was opened and freed before any newer read view was opened.
+ *    + We move tuples from all other lists to the corresponding list of
+ *      the found read view.
+ */
+struct memtx_tuple_rv {
+	/** Link in the list of all open read views. */
+	struct rlist link;
+	/** Number of entries in the array. */
+	int count;
+	/**
+	 * Array of tuple lists, one for each read view that was open at the
+	 * time when this read view was created, including this read view.
+	 * Ordered by read view version, ascending (the oldest read view comes
+	 * first).
+	 */
+	struct memtx_tuple_rv_list lists[0];
+};
+
+/** Returns the read view version. */
+static inline uint32_t
+memtx_tuple_rv_version(struct memtx_tuple_rv *rv)
+{
+	/* Last list corresponds to self. */
+	assert(rv->count > 0);
+	return rv->lists[rv->count - 1].version;
+}
+
+/**
+ * Allocates a list array for a read view and initializes it using the list of
+ * all open read views. Adds the new read view to the list.
+ */
+struct memtx_tuple_rv *
+memtx_tuple_rv_new(uint32_t version, struct rlist *list);
+
+/**
+ * Deletes a list array. Tuples that are still visible from other read views
+ * are moved to the older read view's lists. Tuples that are not visible from
+ * any read view are appended to the tuples_to_free list.
+ */
+void
+memtx_tuple_rv_delete(struct memtx_tuple_rv *rv, struct rlist *list,
+		      struct stailq *tuples_to_free);
+
+/**
+ * Adds a freed tuple to a read view's list.
+ *
+ * The tuple must be visible from some read view, that is, the tuple's version
+ * must be less than the most recent open read view's version.
+ */
+void
+memtx_tuple_rv_add(struct memtx_tuple_rv *rv, struct memtx_tuple *tuple);
+
 /** Memtx read view options. */
 struct memtx_read_view_opts {};
 
@@ -72,11 +165,15 @@ class MemtxAllocator {
 	 * Opening a read view pins tuples that were allocated before
 	 * the read view was created. See open_read_view().
 	 */
-	struct ReadView {};
+	struct ReadView {
+		/** Lists of tuples owned by this read view. */
+		struct memtx_tuple_rv *rv;
+	};
 
 	static void create()
 	{
 		stailq_create(&gc);
+		rlist_create(&read_views);
 	}
 
 	static void destroy()
@@ -97,8 +194,9 @@ class MemtxAllocator {
 	{
 		(void)opts;
 		read_view_version++;
-		delayed_free_mode++;
-		return nullptr;
+		ReadView *rv = (ReadView *)xmalloc(sizeof(*rv));
+		rv->rv = memtx_tuple_rv_new(read_view_version, &read_views);
+		return rv;
 	}
 
 	/**
@@ -106,10 +204,9 @@ class MemtxAllocator {
 	 */
 	static void close_read_view(ReadView *rv)
 	{
-		assert(rv == nullptr);
-		(void)rv;
-		assert(delayed_free_mode > 0);
-		--delayed_free_mode;
+		memtx_tuple_rv_delete(rv->rv, &read_views, &gc);
+		TRASH(rv);
+		::free(rv);
 	}
 
 	/**
@@ -130,19 +227,19 @@ class MemtxAllocator {
 	 * Free a tuple allocated with alloc_tuple().
 	 *
 	 * The tuple is freed immediately if there's no read view that may use
-	 * it. Otherwise, it's put in the garbage collection list to be free as
-	 * soon as the last read view using it is destroyed.
+	 * it. Otherwise, it's put in a read view's list to be freed as soon as
+	 * the last read view using it is destroyed.
 	 */
 	static void free_tuple(struct tuple *tuple)
 	{
 		struct memtx_tuple *memtx_tuple = container_of(
 			tuple, struct memtx_tuple, base);
-		if (delayed_free_mode == 0 ||
-		    memtx_tuple->version == read_view_version ||
-		    tuple_has_flag(tuple, TUPLE_IS_TEMPORARY)) {
+		struct memtx_tuple_rv *rv = tuple_rv_last(tuple);
+		if (rv == nullptr ||
+		    memtx_tuple->version >= memtx_tuple_rv_version(rv)) {
 			immediate_free_tuple(memtx_tuple);
 		} else {
-			delayed_free_tuple(memtx_tuple);
+			memtx_tuple_rv_add(rv, memtx_tuple);
 		}
 	}
 
@@ -167,15 +264,8 @@ class MemtxAllocator {
 		free(memtx_tuple, size);
 	}
 
-	static void delayed_free_tuple(struct memtx_tuple *memtx_tuple)
-	{
-		stailq_add_entry(&gc, memtx_tuple, in_gc);
-	}
-
 	static void collect_garbage()
 	{
-		if (delayed_free_mode > 0)
-			return;
 		for (int i = 0; !stailq_empty(&gc) && i < GC_BATCH_SIZE; i++) {
 			struct memtx_tuple *memtx_tuple = stailq_shift_entry(
 					&gc, struct memtx_tuple, in_gc);
@@ -184,31 +274,48 @@ class MemtxAllocator {
 	}
 
 	/**
-	 * Tuple garbage collection list. Contains tuples that were not freed
-	 * immediately because they are currently in use by a read view.
+	 * Returns the most recent open read view that needs this tuple or null
+	 * if the tuple may be freed immediately.
 	 */
-	static struct stailq gc;
+	static struct memtx_tuple_rv *
+	tuple_rv_last(struct tuple *tuple)
+	{
+		/* Temporary tuples are freed immediately. */
+		if (tuple_has_flag(tuple, TUPLE_IS_TEMPORARY))
+			return nullptr;
+		if (rlist_empty(&read_views))
+			return nullptr;
+		return rlist_last_entry(&read_views,
+					struct memtx_tuple_rv, link);
+	}
+
 	/**
-	 * Unless zero, freeing of tuples allocated before the last call to
-	 * open_read_view() is delayed until close_read_view() is called.
+	 * List of tuples that were not freed immediately because they were
+	 * in use by a read view, linked by memtx_tuple::in_gc. Tuples from
+	 * this list are freed in batches on allocation.
 	 */
-	static uint32_t delayed_free_mode;
+	static struct stailq gc;
 	/**
 	 * Most recent read view's version.
 	 *
 	 * Incremented with each open read view. Not supposed to wrap around.
 	 */
 	static uint32_t read_view_version;
+	/**
+	 * List of memtx_tuple_rv objects, ordered by read view version,
+	 * ascending (the oldest read view comes first).
+	 */
+	static struct rlist read_views;
 };
 
 template<class Allocator>
 struct stailq MemtxAllocator<Allocator>::gc;
 
 template<class Allocator>
-uint32_t MemtxAllocator<Allocator>::delayed_free_mode;
+uint32_t MemtxAllocator<Allocator>::read_view_version;
 
 template<class Allocator>
-uint32_t MemtxAllocator<Allocator>::read_view_version;
+struct rlist MemtxAllocator<Allocator>::read_views;
 
 void
 memtx_allocators_init(struct allocator_settings *settings);
diff --git a/test/unit/memtx_allocator.cc b/test/unit/memtx_allocator.cc
index d557e8b3ef..b7c6f66779 100644
--- a/test/unit/memtx_allocator.cc
+++ b/test/unit/memtx_allocator.cc
@@ -212,10 +212,73 @@ test_free_not_delayed_if_temporary()
 	check_plan();
 }
 
+/**
+ * Checks that tuples are freed as soon as all read views that can access them
+ * are closed, even if other (newer or older) read views still exist.
+ */
+static void
+test_tuple_gc()
+{
+	plan(11);
+	header();
+
+	is(alloc_tuple_count(), 0, "count before alloc");
+	struct tuple *tuple11 = alloc_tuple();
+	struct tuple *tuple12 = alloc_tuple();
+	struct tuple *tuple13 = alloc_tuple();
+	struct tuple *tuple14 = alloc_tuple();
+	memtx_allocators_read_view rv1 = memtx_allocators_open_read_view({});
+	is(alloc_tuple_count(), 4, "count after rv1 opened");
+	free_tuple(tuple11);
+	struct tuple *tuple22 = alloc_tuple();
+	struct tuple *tuple23 = alloc_tuple();
+	struct tuple *tuple24 = alloc_tuple();
+	memtx_allocators_read_view rv2 = memtx_allocators_open_read_view({});
+	is(alloc_tuple_count(), 7, "count after rv2 opened");
+	free_tuple(tuple12);
+	free_tuple(tuple22);
+	struct tuple *tuple33 = alloc_tuple();
+	struct tuple *tuple34 = alloc_tuple();
+	memtx_allocators_read_view rv3 = memtx_allocators_open_read_view({});
+	is(alloc_tuple_count(), 9, "count after rv3 opened");
+	free_tuple(tuple13);
+	free_tuple(tuple23);
+	free_tuple(tuple33);
+	struct tuple *tuple44 = alloc_tuple();
+
+	is(alloc_tuple_count(), 10, "count before rv2 closed");
+	memtx_allocators_close_read_view(rv2);
+	/* tuple22 is freed */
+	is(alloc_tuple_count(), 9, "count after rv2 closed");
+
+	memtx_allocators_read_view rv4 = memtx_allocators_open_read_view({});
+	is(alloc_tuple_count(), 9, "count after rv4 opened");
+	free_tuple(tuple14);
+	free_tuple(tuple24);
+	free_tuple(tuple34);
+	free_tuple(tuple44);
+
+	is(alloc_tuple_count(), 9, "count before rv4 closed");
+	memtx_allocators_close_read_view(rv4);
+	/* tuple44 is freed */
+	is(alloc_tuple_count(), 8, "count after rv4 closed");
+
+	memtx_allocators_close_read_view(rv1);
+	/* tuple11 and tuple12 are freed */
+	is(alloc_tuple_count(), 6, "count after rv1 closed");
+
+	/* tuple13, tuple14, tuple23, tuple24, tuple33, tuple34 are freed */
+	memtx_allocators_close_read_view(rv3);
+	is(alloc_tuple_count(), 0, "count after rv3 closed");
+
+	footer();
+	check_plan();
+}
+
 static int
 test_main()
 {
-	plan(5);
+	plan(6);
 	header();
 
 	test_alloc_stats();
@@ -223,6 +286,7 @@ test_main()
 	test_free_delayed_until_all_read_views_closed();
 	test_free_not_delayed_if_alloc_after_read_view();
 	test_free_not_delayed_if_temporary();
+	test_tuple_gc();
 
 	footer();
 	return check_plan();
diff --git a/test/unit/memtx_allocator.result b/test/unit/memtx_allocator.result
index 7e815d8567..f8fe4bff65 100644
--- a/test/unit/memtx_allocator.result
+++ b/test/unit/memtx_allocator.result
@@ -1,4 +1,4 @@
-1..5
+1..6
 	*** test_main ***
     1..5
 	*** test_alloc_stats ***
@@ -40,4 +40,19 @@ ok 4 - subtests
     ok 3 - count after free
 	*** test_free_not_delayed_if_temporary: done ***
 ok 5 - subtests
+    1..11
+	*** test_tuple_gc ***
+    ok 1 - count before alloc
+    ok 2 - count after rv1 opened
+    ok 3 - count after rv2 opened
+    ok 4 - count after rv3 opened
+    ok 5 - count before rv2 closed
+    ok 6 - count after rv2 closed
+    ok 7 - count after rv4 opened
+    ok 8 - count before rv4 closed
+    ok 9 - count after rv4 closed
+    ok 10 - count after rv1 closed
+    ok 11 - count after rv3 closed
+	*** test_tuple_gc: done ***
+ok 6 - subtests
 	*** test_main: done ***
-- 
GitLab