From f927a87b6747d96f877694603c45e0428433dbf8 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Date: Sun, 29 Oct 2017 14:58:46 +0300
Subject: [PATCH] vinyl: rework dump trigger

Currently, dump is triggered (by bumping the memory generation) by the
scheduler fiber while quota consumers just wake it up. As a result, the
scheduler depends on the quota - it has to access the quota to check
whether it needs to trigger dump. In order to move the scheduler to a
separate source file, we need to get rid of this dependency. Let's
rework this code as follows:

 - Remove vy_scheduler_trigger_dump() from vy_scheduler_peek_dump().
   The scheduler fiber now just dumps all indexes eligible for dump and
   completes dump by bumping dump_generation. It doesn't trigger dump
   by bumping generation anymore. As a result, it doesn't need to
   access the quota.

 - Make quota consumers call vy_scheduler_trigger_dump() instead of
   just waking up the scheduler. This function will become a public one
   once the scheduler is moved out of vinyl.c. The function logic is
   changed a bit. First, besides bumping generation, it now also wakes
   up the scheduler fiber. Second, it does nothing if dump is already
   in progress or can't be scheduled because of a concurrent
   checkpoint. In the latter case, however, it sets a special flag that
   will force the scheduler to trigger dump upon checkpoint completion.

 - vy_scheduler_begin_checkpoint() can't use vy_scheduler_trigger_dump()
   anymore due to the additional checks added to that function, so it
   bumps the generation directly. This looks fine.

 - Such a design has a subtlety regarding how quota consumers notify
   the scheduler and how they are notified back about available quota.
   In extreme cases, the quota released by a dump may not be enough to
   satisfy all consumers, in which case the dump needs to be
   rescheduled. Since the scheduler doesn't check the quota anymore and
   doesn't reschedule dump on its own, this has to be done by the
   remaining consumers: they have to call the quota_exceeded_cb
   callback (which now triggers a dump) every time they are woken up
   and see that there's still not enough quota. vy_quota_use() is
   reworked accordingly. Also, since quota usage may exceed the limit
   (because of vy_quota_force_use()), it may remain above the limit
   even after a dump completes, in which case vy_quota_release()
   wouldn't wake up consumers and, again, there would be no one to
   trigger another dump. So we must wake up all consumers every time
   vy_quota_release() is called.
---
 src/box/vinyl.c    | 126 ++++++++++++++++++++++++---------------------
 src/box/vy_quota.h |  23 +++------
 2 files changed, 73 insertions(+), 76 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 51a67252be..0d53354b27 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -438,6 +438,13 @@ struct vy_scheduler {
         bool is_throttled;
         /** Set if checkpoint is in progress. */
         bool checkpoint_in_progress;
+        /**
+         * In order to guarantee checkpoint consistency, we must not
+         * dump in-memory trees created after checkpoint was started
+         * so we set this flag instead, which will make the scheduler
+         * schedule a dump as soon as checkpoint is complete.
+         */
+        bool dump_pending;
         /**
          * Current generation of in-memory data.
          *
@@ -1401,8 +1408,14 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 {
 retry:
         *ptask = NULL;
-        bool dump_in_progress = (scheduler->dump_generation <
-                                 scheduler->generation);
+        assert(scheduler->dump_generation <= scheduler->generation);
+        if (scheduler->dump_generation == scheduler->generation) {
+                /*
+                 * All memory trees of past generations have
+                 * been dumped, nothing to do.
+                 */
+                return 0;
+        }
         /*
          * Look up the oldest index eligible for dump.
          */
@@ -1410,18 +1423,13 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
         if (pn == NULL) {
                 /*
                  * There is no vinyl index and so no task to schedule.
-                 * If a dump round is in progress, complete it.
+                 * Complete the current dump round.
                  */
-                if (dump_in_progress)
-                        vy_scheduler_complete_dump(scheduler);
+                vy_scheduler_complete_dump(scheduler);
                 return 0;
         }
         struct vy_index *index = container_of(pn, struct vy_index, in_dump);
-        if (index->is_dumping || index->pin_count > 0) {
-                /* All eligible indexes are already being dumped. */
-                return 0;
-        }
-        if (dump_in_progress &&
+        if (!index->is_dumping && index->pin_count == 0 &&
             vy_index_generation(index) == scheduler->dump_generation) {
                 /*
                  * Dump is in progress and there is an index that
@@ -1439,44 +1447,13 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
                  */
                 goto retry;
         }
-        if (dump_in_progress) {
-                /*
-                 * Dump is in progress, but there is no index eligible
-                 * for dump in the heap. Wait until the current round
-                 * is complete.
-                 */
-                assert(scheduler->dump_task_count > 0);
-                return 0;
-        }
         /*
-         * Nothing being dumped right now. Consider triggering
-         * dump if memory quota is exceeded.
+         * Dump is in progress, but all eligible indexes are
+         * already being dumped. Wait until the current round
+         * is complete.
          */
-        if (scheduler->checkpoint_in_progress) {
-                /*
-                 * Do not trigger another dump until checkpoint
-                 * is complete so as to make sure no statements
-                 * inserted after WAL rotation are written to
-                 * the snapshot.
-                 */
-                return 0;
-        }
-        if (!vy_quota_is_exceeded(&scheduler->env->quota)) {
-                /*
-                 * Memory consumption is below the watermark,
-                 * nothing to do.
-                 */
-                return 0;
-        }
-        if (lsregion_used(&scheduler->env->stmt_env.allocator) == 0) {
-                /*
-                 * Quota must be exceeded by a pending transaction,
-                 * there's nothing we can do about that.
-                 */
-                return 0;
-        }
-        vy_scheduler_trigger_dump(scheduler);
-        goto retry;
+        assert(scheduler->dump_task_count > 0);
+        return 0;
 }

 /**
@@ -1812,16 +1789,24 @@ static void
 vy_scheduler_trigger_dump(struct vy_scheduler *scheduler)
 {
         assert(scheduler->dump_generation <= scheduler->generation);
-        if (scheduler->generation == scheduler->dump_generation) {
+        if (scheduler->dump_generation < scheduler->generation) {
+                /* Dump is already in progress, nothing to do. */
+                return;
+        }
+        if (scheduler->checkpoint_in_progress) {
                 /*
-                 * We are about to start a new dump round.
-                 * Remember the current time so that we can update
-                 * dump bandwidth when the dump round is complete
-                 * (see vy_scheduler_complete_dump()).
+                 * Do not trigger another dump until checkpoint
+                 * is complete so as to make sure no statements
+                 * inserted after WAL rotation are written to
+                 * the snapshot.
                  */
-                scheduler->dump_start = ev_monotonic_now(loop());
+                scheduler->dump_pending = true;
+                return;
         }
+        scheduler->dump_start = ev_monotonic_now(loop());
         scheduler->generation++;
+        scheduler->dump_pending = false;
+        fiber_cond_signal(&scheduler->scheduler_cond);
 }

 /**
@@ -1913,7 +1898,17 @@ vy_scheduler_begin_checkpoint(struct vy_scheduler *scheduler)
                 return -1;
         }

-        vy_scheduler_trigger_dump(scheduler);
+        assert(scheduler->dump_generation <= scheduler->generation);
+        if (scheduler->generation == scheduler->dump_generation) {
+                /*
+                 * We are about to start a new dump round.
+                 * Remember the current time so that we can update
+                 * dump bandwidth when the dump round is complete
+                 * (see vy_scheduler_complete_dump()).
+                 */
+                scheduler->dump_start = ev_monotonic_now(loop());
+        }
+        scheduler->generation++;
         scheduler->checkpoint_in_progress = true;
         fiber_cond_signal(&scheduler->scheduler_cond);
         return 0;
@@ -1958,13 +1953,15 @@ vy_scheduler_end_checkpoint(struct vy_scheduler *scheduler)
         if (!scheduler->checkpoint_in_progress)
                 return;

-        /*
-         * Checkpoint blocks dumping of in-memory trees created after
-         * checkpoint started, so wake up the scheduler after we are
-         * done so that it can catch up.
-         */
         scheduler->checkpoint_in_progress = false;
-        fiber_cond_signal(&scheduler->scheduler_cond);
+        if (scheduler->dump_pending) {
+                /*
+                 * Dump was triggered while checkpoint was
+                 * in progress and hence it was postponed.
+                 * Schedule it now.
+                 */
+                vy_scheduler_trigger_dump(scheduler);
+        }
 }

 /* Scheduler }}} */
@@ -3840,7 +3837,16 @@ static void
 vy_env_quota_exceeded_cb(struct vy_quota *quota)
 {
         struct vy_env *env = container_of(quota, struct vy_env, quota);
-        fiber_cond_signal(&env->scheduler->scheduler_cond);
+        if (lsregion_used(&env->stmt_env.allocator) == 0) {
+                /*
+                 * The memory limit has been exceeded, but there's
+                 * nothing to dump. This may happen if all available
+                 * quota has been consumed by pending transactions.
+                 * There's nothing we can do about that.
+                 */
+                return;
+        }
+        vy_scheduler_trigger_dump(env->scheduler);
 }

 static struct vy_squash_queue *
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 5dc84fbdcb..835d984ac1 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -93,15 +93,6 @@ vy_quota_destroy(struct vy_quota *q)
         fiber_cond_destroy(&q->cond);
 }

-/**
- * Return true if memory reclaim should be triggered.
- */
-static inline bool
-vy_quota_is_exceeded(struct vy_quota *q)
-{
-        return q->used > q->watermark;
-}
-
 /**
  * Set memory limit. If current memory usage exceeds
  * the new limit, invoke the callback.
@@ -146,8 +137,7 @@ vy_quota_release(struct vy_quota *q, size_t size)
 {
         assert(q->used >= size);
         q->used -= size;
-        if (q->used < q->limit)
-                fiber_cond_broadcast(&q->cond);
+        fiber_cond_broadcast(&q->cond);
 }

 /**
@@ -158,18 +148,19 @@ vy_quota_release(struct vy_quota *q, size_t size)
 static inline int
 vy_quota_use(struct vy_quota *q, size_t size, double timeout)
 {
-        vy_quota_force_use(q, size);
-        while (q->used >= q->limit && timeout > 0) {
+        while (q->used + size > q->limit && timeout > 0) {
+                q->quota_exceeded_cb(q);
                 double wait_start = ev_monotonic_now(loop());
                 if (fiber_cond_wait_timeout(&q->cond, timeout) != 0)
                         break; /* timed out */
                 double wait_end = ev_monotonic_now(loop());
                 timeout -= (wait_end - wait_start);
         }
-        if (q->used > q->limit) {
-                vy_quota_release(q, size);
+        if (q->used + size > q->limit)
                 return -1;
-        }
+        q->used += size;
+        if (q->used >= q->watermark)
+                q->quota_exceeded_cb(q);
         return 0;
 }

--
GitLab
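
The consumer-driven retry loop that the reworked vy_quota_use() implements
can be illustrated with a small standalone model. The sketch below is a
minimal, single-threaded approximation, not Tarantool's actual fiber-based
code: struct toy_quota, toy_quota_use(), toy_dump(), and the max_retries
parameter are hypothetical stand-ins (real consumers wait on q->cond with a
timeout instead of counting retries), and exceeded_cb plays the role of
quota_exceeded_cb / vy_env_quota_exceeded_cb(), which after this patch
triggers a dump.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for struct vy_quota. */
struct toy_quota {
        size_t used;      /* memory currently in use */
        size_t limit;     /* hard limit */
        size_t watermark; /* dump is triggered when usage crosses this */
        /* Stands in for quota_exceeded_cb, which now triggers a dump. */
        void (*exceeded_cb)(struct toy_quota *q);
};

/*
 * Models the reworked vy_quota_use(): the consumer itself invokes
 * exceeded_cb() every time it finds there is not enough quota, so a
 * dump is (re)triggered even if a previous dump did not release
 * enough memory. Waiting on the condition variable is replaced by
 * a bounded retry count.
 */
static int
toy_quota_use(struct toy_quota *q, size_t size, int max_retries)
{
        while (q->used + size > q->limit && max_retries-- > 0)
                q->exceeded_cb(q); /* real code then waits on q->cond */
        if (q->used + size > q->limit)
                return -1; /* "timed out" */
        q->used += size;
        /* Crossing the watermark starts a dump early, before
         * consumers actually have to block at the hard limit. */
        if (q->used >= q->watermark)
                q->exceeded_cb(q);
        return 0;
}

/* Toy callback: pretend a dump instantly frees half of the memory. */
static void
toy_dump(struct toy_quota *q)
{
        q->used /= 2;
}

int
main(void)
{
        struct toy_quota q = {
                .used = 90, .limit = 100, .watermark = 80,
                .exceeded_cb = toy_dump,
        };
        /* 90 + 20 > 100, so toy_dump() runs once and usage drops
         * from 90 to 45; the allocation then succeeds. */
        int rc = toy_quota_use(&q, 20, 3);
        assert(rc == 0);
        printf("used = %zu\n", q.used); /* 65, below the watermark */
        return 0;
}

Note how the callback runs in two places, mirroring the patch: on every
failed attempt while the hard limit is exceeded, so that a dump is
rescheduled even when the previous one did not free enough memory, and once
more after a successful allocation that crosses the watermark, so that
dumping starts before consumers actually have to block.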