From f927a87b6747d96f877694603c45e0428433dbf8 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Date: Sun, 29 Oct 2017 14:58:46 +0300
Subject: [PATCH] vinyl: rework dump trigger

Currently, dump is triggered (by bumping the memory generation) by the
scheduler fiber while quota consumers just wake it up. As a result, the
scheduler depends on the quota - it has to access the quota to check if
it needs to trigger dump. In order to move the scheduler to a separate
source file, we need to get rid of this dependency.

Let's rework this code as follows:

 - Remove vy_scheduler_trigger_dump() from vy_scheduler_peek_dump(). The
   scheduler fiber now just dumps all indexes eligible for dump and
   completes dump by bumping dump_generation. It doesn't trigger dump by
   bumping generation anymore. As a result, it doesn't need to access
   the quota.

 - Make quota consumers call vy_scheduler_trigger_dump() instead of just
   waking up the scheduler. This function will become a public one once
   the scheduler is moved out of vinyl.c. The function logic is changed
   a bit. First, besides bumping generation, it now also wakes up the
   scheduler fiber. Second, it does nothing if dump is already in
   progress or can't be scheduled because of a concurrent checkpoint.
   In the latter case, though, it sets a special flag that will force
   the scheduler to trigger a dump upon checkpoint completion.

 - vy_scheduler_begin_checkpoint() can't use vy_scheduler_trigger_dump()
   anymore due to additional checks added to the function, so it bumps
   the generation directly. This looks fine.

 - Such a design has a subtlety regarding how quota consumers notify the
   scheduler and how they are notified back about available quota.
   In extreme cases, the quota released by a dump may not be enough to
   satisfy all consumers, in which case we need to reschedule the dump.
   Since the scheduler doesn't check the quota anymore and doesn't
   reschedule dump, it has to be done by the remaining consumers. So
   consumers have to call the quota_exceeded_cb callback (which now
   triggers a dump) every time they are woken up and see there's not
   enough quota. The vy_quota_use() is reworked accordingly.

   Also, since the quota usage may exceed the limit (because of
   vy_quota_force_use()), the quota usage may remain higher than the
   limit after a dump completion, in which case vy_quota_release()
   doesn't wake up consumers and again there's no one to trigger another
   dump. So we must wake up all consumers every time vy_quota_release()
   is called.
---
 src/box/vinyl.c    | 126 ++++++++++++++++++++++++---------------------
 src/box/vy_quota.h |  23 +++------
 2 files changed, 73 insertions(+), 76 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 51a67252be..0d53354b27 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -438,6 +438,13 @@ struct vy_scheduler {
 	bool is_throttled;
 	/** Set if checkpoint is in progress. */
 	bool checkpoint_in_progress;
+	/**
+	 * In order to guarantee checkpoint consistency, we must not
+	 * dump in-memory trees created after checkpoint was started
+	 * so we set this flag instead, which will make the scheduler
+	 * schedule a dump as soon as checkpoint is complete.
+	 */
+	bool dump_pending;
 	/**
 	 * Current generation of in-memory data.
 	 *
@@ -1401,8 +1408,14 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 {
 retry:
 	*ptask = NULL;
-	bool dump_in_progress = (scheduler->dump_generation <
-				 scheduler->generation);
+	assert(scheduler->dump_generation <= scheduler->generation);
+	if (scheduler->dump_generation == scheduler->generation) {
+		/*
+		 * All memory trees of past generations have
+		 * been dumped, nothing to do.
+		 */
+		return 0;
+	}
 	/*
 	 * Look up the oldest index eligible for dump.
 	 */
@@ -1410,18 +1423,13 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 	if (pn == NULL) {
 		/*
 		 * There is no vinyl index and so no task to schedule.
-		 * If a dump round is in progress, complete it.
+		 * Complete the current dump round.
 		 */
-		if (dump_in_progress)
-			vy_scheduler_complete_dump(scheduler);
+		vy_scheduler_complete_dump(scheduler);
 		return 0;
 	}
 	struct vy_index *index = container_of(pn, struct vy_index, in_dump);
-	if (index->is_dumping || index->pin_count > 0) {
-		/* All eligible indexes are already being dumped. */
-		return 0;
-	}
-	if (dump_in_progress &&
+	if (!index->is_dumping && index->pin_count == 0 &&
 	    vy_index_generation(index) == scheduler->dump_generation) {
 		/*
 		 * Dump is in progress and there is an index that
@@ -1439,44 +1447,13 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 		 */
 		goto retry;
 	}
-	if (dump_in_progress) {
-		/*
-		 * Dump is in progress, but there is no index eligible
-		 * for dump in the heap. Wait until the current round
-		 * is complete.
-		 */
-		assert(scheduler->dump_task_count > 0);
-		return 0;
-	}
 	/*
-	 * Nothing being dumped right now. Consider triggering
-	 * dump if memory quota is exceeded.
+	 * Dump is in progress, but all eligible indexes are
+	 * already being dumped. Wait until the current round
+	 * is complete.
 	 */
-	if (scheduler->checkpoint_in_progress) {
-		/*
-		 * Do not trigger another dump until checkpoint
-		 * is complete so as to make sure no statements
-		 * inserted after WAL rotation are written to
-		 * the snapshot.
-		 */
-		return 0;
-	}
-	if (!vy_quota_is_exceeded(&scheduler->env->quota)) {
-		/*
-		 * Memory consumption is below the watermark,
-		 * nothing to do.
-		 */
-		return 0;
-	}
-	if (lsregion_used(&scheduler->env->stmt_env.allocator) == 0) {
-		/*
-		 * Quota must be exceeded by a pending transaction,
-		 * there's nothing we can do about that.
-		 */
-		return 0;
-	}
-	vy_scheduler_trigger_dump(scheduler);
-	goto retry;
+	assert(scheduler->dump_task_count > 0);
+	return 0;
 }
 
 /**
@@ -1812,16 +1789,24 @@ static void
 vy_scheduler_trigger_dump(struct vy_scheduler *scheduler)
 {
 	assert(scheduler->dump_generation <= scheduler->generation);
-	if (scheduler->generation == scheduler->dump_generation) {
+	if (scheduler->dump_generation < scheduler->generation) {
+		/* Dump is already in progress, nothing to do. */
+		return;
+	}
+	if (scheduler->checkpoint_in_progress) {
 		/*
-		 * We are about to start a new dump round.
-		 * Remember the current time so that we can update
-		 * dump bandwidth when the dump round is complete
-		 * (see vy_scheduler_complete_dump()).
+		 * Do not trigger another dump until checkpoint
+		 * is complete so as to make sure no statements
+		 * inserted after WAL rotation are written to
+		 * the snapshot.
 		 */
-		scheduler->dump_start = ev_monotonic_now(loop());
+		scheduler->dump_pending = true;
+		return;
 	}
+	scheduler->dump_start = ev_monotonic_now(loop());
 	scheduler->generation++;
+	scheduler->dump_pending = false;
+	fiber_cond_signal(&scheduler->scheduler_cond);
 }
 
 /**
@@ -1913,7 +1898,17 @@ vy_scheduler_begin_checkpoint(struct vy_scheduler *scheduler)
 		return -1;
 	}
 
-	vy_scheduler_trigger_dump(scheduler);
+	assert(scheduler->dump_generation <= scheduler->generation);
+	if (scheduler->generation == scheduler->dump_generation) {
+		/*
+		 * We are about to start a new dump round.
+		 * Remember the current time so that we can update
+		 * dump bandwidth when the dump round is complete
+		 * (see vy_scheduler_complete_dump()).
+		 */
+		scheduler->dump_start = ev_monotonic_now(loop());
+	}
+	scheduler->generation++;
 	scheduler->checkpoint_in_progress = true;
 	fiber_cond_signal(&scheduler->scheduler_cond);
 	return 0;
@@ -1958,13 +1953,15 @@ vy_scheduler_end_checkpoint(struct vy_scheduler *scheduler)
 	if (!scheduler->checkpoint_in_progress)
 		return;
 
-	/*
-	 * Checkpoint blocks dumping of in-memory trees created after
-	 * checkpoint started, so wake up the scheduler after we are
-	 * done so that it can catch up.
-	 */
 	scheduler->checkpoint_in_progress = false;
-	fiber_cond_signal(&scheduler->scheduler_cond);
+	if (scheduler->dump_pending) {
+		/*
+		 * Dump was triggered while checkpoint was
+		 * in progress and hence it was postponed.
+		 * Schedule it now.
+		 */
+		vy_scheduler_trigger_dump(scheduler);
+	}
 }
 
 /* Scheduler }}} */
@@ -3840,7 +3837,16 @@ static void
 vy_env_quota_exceeded_cb(struct vy_quota *quota)
 {
 	struct vy_env *env = container_of(quota, struct vy_env, quota);
-	fiber_cond_signal(&env->scheduler->scheduler_cond);
+	if (lsregion_used(&env->stmt_env.allocator) == 0) {
+		/*
+		 * The memory limit has been exceeded, but there's
+		 * nothing to dump. This may happen if all available
+		 * quota has been consumed by pending transactions.
+		 * There's nothing we can do about that.
+		 */
+		return;
+	}
+	vy_scheduler_trigger_dump(env->scheduler);
 }
 
 static struct vy_squash_queue *
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 5dc84fbdcb..835d984ac1 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -93,15 +93,6 @@ vy_quota_destroy(struct vy_quota *q)
 	fiber_cond_destroy(&q->cond);
 }
 
-/**
- * Return true if memory reclaim should be triggered.
- */
-static inline bool
-vy_quota_is_exceeded(struct vy_quota *q)
-{
-	return q->used > q->watermark;
-}
-
 /**
  * Set memory limit. If current memory usage exceeds
  * the new limit, invoke the callback.
@@ -146,8 +137,7 @@ vy_quota_release(struct vy_quota *q, size_t size)
 {
 	assert(q->used >= size);
 	q->used -= size;
-	if (q->used < q->limit)
-		fiber_cond_broadcast(&q->cond);
+	fiber_cond_broadcast(&q->cond);
 }
 
 /**
@@ -158,18 +148,19 @@ vy_quota_release(struct vy_quota *q, size_t size)
 static inline int
 vy_quota_use(struct vy_quota *q, size_t size, double timeout)
 {
-	vy_quota_force_use(q, size);
-	while (q->used >= q->limit && timeout > 0) {
+	while (q->used + size > q->limit && timeout > 0) {
+		q->quota_exceeded_cb(q);
 		double wait_start = ev_monotonic_now(loop());
 		if (fiber_cond_wait_timeout(&q->cond, timeout) != 0)
 			break; /* timed out */
 		double wait_end = ev_monotonic_now(loop());
 		timeout -= (wait_end - wait_start);
 	}
-	if (q->used > q->limit) {
-		vy_quota_release(q, size);
+	if (q->used + size > q->limit)
 		return -1;
-	}
+	q->used += size;
+	if (q->used >= q->watermark)
+		q->quota_exceeded_cb(q);
 	return 0;
 }
 
-- 
GitLab