diff --git a/src/box/CMakeLists.txt b/src/box/CMakeLists.txt index cab6a227639e918f6ffc8717e0af3ebaff3c8e7e..67750898d2e88f697ac01e3103e5ea4c0848693e 100644 --- a/src/box/CMakeLists.txt +++ b/src/box/CMakeLists.txt @@ -86,6 +86,7 @@ add_library(box STATIC vy_history.c vy_read_set.c vy_scheduler.c + vy_regulator.c vy_quota.c request.c space.c diff --git a/src/box/vinyl.c b/src/box/vinyl.c index 67b9c58922c66efb623aa3e0a6a5a9e6e164e936..b528f3d8c13121cfa762212c8f472285bc0826af 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -43,6 +43,7 @@ #include "vy_point_lookup.h" #include "vy_quota.h" #include "vy_scheduler.h" +#include "vy_regulator.h" #include "vy_stat.h" #include <stdbool.h> @@ -115,6 +116,8 @@ struct vy_env { struct vy_mem_env mem_env; /** Scheduler */ struct vy_scheduler scheduler; + /** Load regulator. */ + struct vy_regulator regulator; /** Local recovery context. */ struct vy_recovery *recovery; /** Local recovery vclock. */ @@ -249,17 +252,15 @@ static struct trigger on_replace_vinyl_deferred_delete; /** {{{ Introspection */ static void -vy_info_append_quota(struct vy_env *env, struct info_handler *h) +vy_info_append_regulator(struct vy_env *env, struct info_handler *h) { - struct vy_quota *q = &env->quota; + struct vy_regulator *r = &env->regulator; - info_table_begin(h, "quota"); - info_append_int(h, "used", q->used); - info_append_int(h, "limit", q->limit); - info_append_int(h, "watermark", q->watermark); - info_append_int(h, "use_rate", q->use_rate); - info_append_int(h, "dump_bandwidth", q->dump_bw); - info_table_end(h); /* quota */ + info_table_begin(h, "regulator"); + info_append_int(h, "write_rate", r->write_rate); + info_append_int(h, "dump_bandwidth", r->dump_bandwidth); + info_append_int(h, "dump_watermark", r->dump_watermark); + info_table_end(h); /* regulator */ } static void @@ -328,10 +329,10 @@ vinyl_engine_stat(struct vinyl_engine *vinyl, struct info_handler *h) struct vy_env *env = vinyl->env; info_begin(h); - vy_info_append_quota(env, h); vy_info_append_tx(env, h); vy_info_append_memory(env, h); vy_info_append_disk(env, h); + vy_info_append_regulator(env, h); info_end(h); } @@ -2366,6 +2367,7 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn) assert(mem_used_after >= mem_used_before); vy_quota_adjust(&env->quota, tx->write_size, mem_used_after - mem_used_before); + vy_regulator_check_dump_watermark(&env->regulator); return rc; } @@ -2390,6 +2392,7 @@ vinyl_engine_commit(struct engine *engine, struct txn *txn) assert(mem_used_after >= mem_used_before); /* We can't abort the transaction at this point, use force. */ vy_quota_force_use(&env->quota, mem_used_after - mem_used_before); + vy_regulator_check_dump_watermark(&env->regulator); txn->engine_tx = NULL; if (!txn->is_autocommit) @@ -2440,6 +2443,13 @@ static void vy_env_quota_exceeded_cb(struct vy_quota *quota) { struct vy_env *env = container_of(quota, struct vy_env, quota); + vy_regulator_quota_exceeded(&env->regulator); +} + +static void +vy_env_trigger_dump_cb(struct vy_regulator *regulator) +{ + struct vy_env *env = container_of(regulator, struct vy_env, regulator); if (lsregion_used(&env->mem_env.allocator) == 0) { /* @@ -2468,7 +2478,7 @@ vy_env_dump_complete_cb(struct vy_scheduler *scheduler, assert(mem_used_after <= mem_used_before); size_t mem_dumped = mem_used_before - mem_used_after; vy_quota_release(quota, mem_dumped); - vy_quota_update_dump_bandwidth(quota, mem_dumped, dump_duration); + vy_regulator_dump_complete(&env->regulator, mem_dumped, dump_duration); say_info("dumped %zu bytes in %.1f sec", mem_dumped, dump_duration); } @@ -2520,8 +2530,9 @@ vy_env_new(const char *path, size_t memory, vy_squash_schedule, e) != 0) goto error_lsm_env; - if (vy_quota_create(&e->quota, vy_env_quota_exceeded_cb) != 0) - goto error_quota; + vy_quota_create(&e->quota, vy_env_quota_exceeded_cb); + vy_regulator_create(&e->regulator, &e->quota, + vy_env_trigger_dump_cb); struct slab_cache *slab_cache = cord_slab_cache(); mempool_create(&e->iterator_pool, slab_cache, @@ -2530,8 +2541,7 @@ vy_env_new(const char *path, size_t memory, vy_run_env_create(&e->run_env); vy_log_init(e->path); return e; -error_quota: - vy_lsm_env_destroy(&e->lsm_env); + error_lsm_env: vy_mem_env_destroy(&e->mem_env); vy_scheduler_destroy(&e->scheduler); @@ -2548,6 +2558,7 @@ vy_env_new(const char *path, size_t memory, static void vy_env_delete(struct vy_env *e) { + vy_regulator_destroy(&e->regulator); vy_scheduler_destroy(&e->scheduler); vy_squash_queue_delete(e->squash_queue); tx_manager_delete(e->xm); @@ -2574,6 +2585,7 @@ vy_env_complete_recovery(struct vy_env *env) { vy_scheduler_start(&env->scheduler); vy_quota_set_limit(&env->quota, env->memory); + vy_regulator_start(&env->regulator); } struct vinyl_engine * @@ -2651,7 +2663,8 @@ vinyl_engine_set_snap_io_rate_limit(struct vinyl_engine *vinyl, double limit) { int64_t limit_in_bytes = limit * 1024 * 1024; vinyl->env->run_env.snap_io_rate_limit = limit_in_bytes; - vy_quota_reset_dump_bandwidth(&vinyl->env->quota, limit_in_bytes); + vy_regulator_reset_dump_bandwidth(&vinyl->env->regulator, + limit_in_bytes); } /** }}} Environment */ @@ -3194,6 +3207,7 @@ vinyl_space_apply_initial_join_row(struct space *space, struct request *request) assert(mem_used_after >= mem_used_before); size_t used = mem_used_after - mem_used_before; vy_quota_adjust(&env->quota, reserved, used); + vy_regulator_check_dump_watermark(&env->regulator); return rc; } @@ -3515,6 +3529,7 @@ vy_squash_process(struct vy_squash *squash) vy_mem_commit_stmt(mem, region_stmt); vy_quota_force_use(&env->quota, mem_used_after - mem_used_before); + vy_regulator_check_dump_watermark(&env->regulator); } return rc; } @@ -3989,6 +4004,7 @@ vy_build_insert_tuple(struct vy_env *env, struct vy_lsm *lsm, size_t mem_used_after = lsregion_used(&env->mem_env.allocator); assert(mem_used_after >= mem_used_before); vy_quota_force_use(&env->quota, mem_used_after - mem_used_before); + vy_regulator_check_dump_watermark(&env->regulator); vy_quota_wait(&env->quota); return rc; } @@ -4419,6 +4435,7 @@ vy_deferred_delete_on_replace(struct trigger *trigger, void *event) size_t mem_used_after = lsregion_used(&env->mem_env.allocator); assert(mem_used_after >= mem_used_before); vy_quota_force_use(&env->quota, mem_used_after - mem_used_before); + vy_regulator_check_dump_watermark(&env->regulator); tuple_unref(delete); if (rc != 0) diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c index 51f0ba71c6af16c5c003a50258ef8af8a33ab03f..e6d223486f9a69a2ef037b2c602a42c3b6bfafd6 100644 --- a/src/box/vy_quota.c +++ b/src/box/vy_quota.c @@ -32,42 +32,13 @@ #include <assert.h> #include <stddef.h> -#include <stdint.h> -#include <math.h> #include <tarantool_ev.h> -#include "diag.h" #include "fiber.h" #include "fiber_cond.h" #include "say.h" -#include "histogram.h" #include "trivia/util.h" -enum { - /** - * Time interval between successive updates of - * quota watermark and use rate, in seconds. - */ - VY_QUOTA_UPDATE_INTERVAL = 1, - /** - * Period of time over which the quota use rate - * is averaged, in seconds. - */ - VY_QUOTA_RATE_AVG_PERIOD = 5, -}; - -/* - * Until we dump anything, assume bandwidth to be 10 MB/s, - * which should be fine for initial guess. - */ -static const size_t VY_DEFAULT_DUMP_BANDWIDTH = 10 * 1024 * 1024; - -/** - * Histogram percentile used for estimating dump bandwidth. - * For details see the comment to vy_quota::dump_bw_hist. - */ -enum { VY_DUMP_BANDWIDTH_PCT = 10 }; - /** * Returns true if the quota limit is exceeded and so consumers * have to wait. @@ -78,84 +49,19 @@ vy_quota_is_exceeded(struct vy_quota *q) return q->used > q->limit; } -static void -vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events) -{ - (void)loop; - (void)events; - - struct vy_quota *q = timer->data; - - /* - * Update the quota use rate with the new measurement. - */ - const double weight = 1 - exp(-VY_QUOTA_UPDATE_INTERVAL / - (double)VY_QUOTA_RATE_AVG_PERIOD); - q->use_rate = (1 - weight) * q->use_rate + - weight * q->use_curr / VY_QUOTA_UPDATE_INTERVAL; - q->use_curr = 0; - - /* - * Due to log structured nature of the lsregion allocator, - * which is used for allocating statements, we cannot free - * memory in chunks, only all at once. Therefore we should - * configure the watermark so that by the time we hit the - * limit, all memory have been dumped, i.e. - * - * limit - watermark watermark - * ----------------- = -------------- - * use_rate dump_bandwidth - */ - q->watermark = ((double)q->limit * q->dump_bw / - (q->dump_bw + q->use_rate + 1)); - if (q->used >= q->watermark) - q->quota_exceeded_cb(q); -} - -int +void vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb) { - enum { KB = 1024, MB = KB * KB }; - static int64_t dump_bandwidth_buckets[] = { - 100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB, - 700 * KB, 800 * KB, 900 * KB, 1 * MB, 2 * MB, 3 * MB, - 4 * MB, 5 * MB, 6 * MB, 7 * MB, 8 * MB, 9 * MB, - 10 * MB, 15 * MB, 20 * MB, 25 * MB, 30 * MB, 35 * MB, - 40 * MB, 45 * MB, 50 * MB, 55 * MB, 60 * MB, 65 * MB, - 70 * MB, 75 * MB, 80 * MB, 85 * MB, 90 * MB, 95 * MB, - 100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB, - 700 * MB, 800 * MB, 900 * MB, - }; - - q->dump_bw_hist = histogram_new(dump_bandwidth_buckets, - lengthof(dump_bandwidth_buckets)); - if (q->dump_bw_hist == NULL) { - diag_set(OutOfMemory, 0, "histogram_new", - "dump bandwidth histogram"); - return -1; - } - q->limit = SIZE_MAX; - q->watermark = SIZE_MAX; q->used = 0; - q->use_curr = 0; - q->use_rate = 0; q->too_long_threshold = TIMEOUT_INFINITY; - q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH; q->quota_exceeded_cb = quota_exceeded_cb; fiber_cond_create(&q->cond); - ev_timer_init(&q->timer, vy_quota_timer_cb, 0, - VY_QUOTA_UPDATE_INTERVAL); - q->timer.data = q; - ev_timer_start(loop(), &q->timer); - return 0; } void vy_quota_destroy(struct vy_quota *q) { - ev_timer_stop(loop(), &q->timer); - histogram_delete(q->dump_bw_hist); fiber_cond_broadcast(&q->cond); fiber_cond_destroy(&q->cond); } @@ -163,41 +69,17 @@ vy_quota_destroy(struct vy_quota *q) void vy_quota_set_limit(struct vy_quota *q, size_t limit) { - q->limit = q->watermark = limit; + q->limit = limit; if (q->used >= limit) q->quota_exceeded_cb(q); fiber_cond_signal(&q->cond); } -void -vy_quota_update_dump_bandwidth(struct vy_quota *q, size_t size, - double duration) -{ - if (duration > 0) { - histogram_collect(q->dump_bw_hist, size / duration); - /* - * To avoid unpredictably long stalls, we need to - * know the worst (smallest) dump bandwidth so use - * a lower-bound percentile estimate. - */ - q->dump_bw = histogram_percentile_lower(q->dump_bw_hist, - VY_DUMP_BANDWIDTH_PCT); - } -} - -void -vy_quota_reset_dump_bandwidth(struct vy_quota *q, size_t max) -{ - histogram_reset(q->dump_bw_hist); - q->dump_bw = MIN(VY_DEFAULT_DUMP_BANDWIDTH, max); -} - void vy_quota_force_use(struct vy_quota *q, size_t size) { q->used += size; - q->use_curr += size; - if (q->used >= q->watermark) + if (q->used >= q->limit) q->quota_exceeded_cb(q); } @@ -213,7 +95,6 @@ int vy_quota_use(struct vy_quota *q, size_t size, double timeout) { q->used += size; - q->use_curr += size; if (vy_quota_is_exceeded(q)) { /* Wait for quota. */ double start_time = ev_monotonic_now(loop()); @@ -222,11 +103,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout) do { q->quota_exceeded_cb(q); q->used -= size; - q->use_curr -= size; if (fiber_cond_wait_deadline(&q->cond, deadline) != 0) return -1; /* timed out */ q->used += size; - q->use_curr += size; } while (vy_quota_is_exceeded(q)); double wait_time = ev_monotonic_now(loop()) - start_time; @@ -240,8 +119,6 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout) */ fiber_cond_signal(&q->cond); } - if (q->used >= q->watermark) - q->quota_exceeded_cb(q); return 0; } @@ -252,10 +129,6 @@ vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used) size_t excess = reserved - used; assert(q->used >= excess); q->used -= excess; - if (q->use_curr >= excess) - q->use_curr -= excess; - else /* was reset by timeout */ - q->use_curr = 0; fiber_cond_signal(&q->cond); } if (reserved < used) diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h index 9ce53fc41277315dc051af9b6d052176de826ca9..59fe075f59e7a27be3fb56a62683e72a767fec27 100644 --- a/src/box/vy_quota.h +++ b/src/box/vy_quota.h @@ -40,7 +40,6 @@ extern "C" { #endif /* defined(__cplusplus) */ struct vy_quota; -struct histogram; typedef void (*vy_quota_exceeded_f)(struct vy_quota *quota); @@ -55,12 +54,6 @@ struct vy_quota { * throttled until memory is reclaimed. */ size_t limit; - /** - * Memory watermark. Exceeding it does not result in - * throttling new transactions, but it does trigger - * background memory reclaim. - */ - size_t watermark; /** Current memory consumption. */ size_t used; /** @@ -74,40 +67,13 @@ struct vy_quota { */ struct fiber_cond cond; /** - * Called when quota is consumed if used >= watermark. + * Called if the limit is hit when quota is consumed. * It is supposed to trigger memory reclaim. */ vy_quota_exceeded_f quota_exceeded_cb; - /** Timer for updating quota watermark. */ - ev_timer timer; - /** - * Amount of quota used since the last - * invocation of the quota timer callback. - */ - size_t use_curr; - /** - * Quota use rate, in bytes per second. - * Calculated as exponentially weighted - * moving average of use_curr. - */ - size_t use_rate; - /** Current dump bandwidth estimate. */ - size_t dump_bw; - /** - * Dump bandwidth is needed for calculating the quota watermark. - * The higher the bandwidth, the later we can start dumping w/o - * suffering from transaction throttling. So we want to be very - * conservative about estimating the bandwidth. - * - * To make sure we don't overestimate it, we maintain a - * histogram of all observed measurements and assume the - * bandwidth to be equal to the 10th percentile, i.e. the - * best result among 10% worst measurements. - */ - struct histogram *dump_bw_hist; }; -int +void vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb); void @@ -120,27 +86,6 @@ vy_quota_destroy(struct vy_quota *q); void vy_quota_set_limit(struct vy_quota *q, size_t limit); -/** Return dump bandwidth. */ -size_t -vy_quota_dump_bandwidth(struct vy_quota *q); - -/** - * Update dump bandwidth. - * - * @size: size of dumped memory. - * @duration: how long memory dump took. - */ -void -vy_quota_update_dump_bandwidth(struct vy_quota *q, size_t size, - double duration); - -/** - * Reset dump bandwidth histogram and update initial estimate. - * Called when box.cfg.snap_io_rate_limit is updated. - */ -void -vy_quota_reset_dump_bandwidth(struct vy_quota *q, size_t max); - /** * Consume @size bytes of memory. In contrast to vy_quota_use() * this function does not throttle the caller. diff --git a/src/box/vy_regulator.c b/src/box/vy_regulator.c new file mode 100644 index 0000000000000000000000000000000000000000..73d17edd4bdfdb3ab394c75d911fed13bd3c4694 --- /dev/null +++ b/src/box/vy_regulator.c @@ -0,0 +1,211 @@ +/* + * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "vy_regulator.h" + +#include <math.h> +#include <stddef.h> +#include <stdint.h> +#include <tarantool_ev.h> + +#include "fiber.h" +#include "histogram.h" +#include "say.h" +#include "trivia/util.h" + +#include "vy_quota.h" + +/** + * Regulator timer period, in seconds. + */ +static const double VY_REGULATOR_TIMER_PERIOD = 1; + +/** + * Time window over which the write rate is averaged, + * in seconds. + */ +static const double VY_WRITE_RATE_AVG_WIN = 5; + +/** + * Histogram percentile used for estimating dump bandwidth. + * For details see the comment to vy_regulator::dump_bandwidth_hist. + */ +static const int VY_DUMP_BANDWIDTH_PCT = 10; + +/* + * Until we dump anything, assume bandwidth to be 10 MB/s, + * which should be fine for initial guess. + */ +static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024; + +static void +vy_regulator_update_write_rate(struct vy_regulator *regulator) +{ + size_t used_curr = regulator->quota->used; + size_t used_last = regulator->quota_used_last; + + /* + * Memory can be dumped between two subsequent timer + * callback invocations, in which case memory usage + * will decrease. Ignore such observations - it's not + * a big deal, because dump is a rare event. + */ + if (used_curr < used_last) { + regulator->quota_used_last = used_curr; + return; + } + + size_t rate_avg = regulator->write_rate; + size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD; + + double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD / + VY_WRITE_RATE_AVG_WIN); + rate_avg = (1 - weight) * rate_avg + weight * rate_curr; + + regulator->write_rate = rate_avg; + regulator->quota_used_last = used_curr; +} + +static void +vy_regulator_update_dump_watermark(struct vy_regulator *regulator) +{ + struct vy_quota *quota = regulator->quota; + + /* + * Due to log structured nature of the lsregion allocator, + * which is used for allocating statements, we cannot free + * memory in chunks, only all at once. Therefore we should + * configure the watermark so that by the time we hit the + * limit, all memory have been dumped, i.e. + * + * limit - watermark watermark + * ----------------- = -------------- + * write_rate dump_bandwidth + */ + regulator->dump_watermark = + (double)quota->limit * regulator->dump_bandwidth / + (regulator->dump_bandwidth + regulator->write_rate + 1); +} + +static void +vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events) +{ + (void)loop; + (void)events; + + struct vy_regulator *regulator = timer->data; + + vy_regulator_update_write_rate(regulator); + vy_regulator_update_dump_watermark(regulator); + vy_regulator_check_dump_watermark(regulator); +} + +void +vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota, + vy_trigger_dump_f trigger_dump_cb) +{ + enum { KB = 1024, MB = KB * KB }; + static int64_t dump_bandwidth_buckets[] = { + 100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB, + 700 * KB, 800 * KB, 900 * KB, 1 * MB, 2 * MB, 3 * MB, + 4 * MB, 5 * MB, 6 * MB, 7 * MB, 8 * MB, 9 * MB, + 10 * MB, 15 * MB, 20 * MB, 25 * MB, 30 * MB, 35 * MB, + 40 * MB, 45 * MB, 50 * MB, 55 * MB, 60 * MB, 65 * MB, + 70 * MB, 75 * MB, 80 * MB, 85 * MB, 90 * MB, 95 * MB, + 100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB, + 700 * MB, 800 * MB, 900 * MB, + }; + regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets, + lengthof(dump_bandwidth_buckets)); + if (regulator->dump_bandwidth_hist == NULL) + panic("failed to allocate dump bandwidth histogram"); + + regulator->quota = quota; + regulator->trigger_dump_cb = trigger_dump_cb; + ev_timer_init(®ulator->timer, vy_regulator_timer_cb, 0, + VY_REGULATOR_TIMER_PERIOD); + regulator->timer.data = regulator; + regulator->write_rate = 0; + regulator->quota_used_last = 0; + regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT; + regulator->dump_watermark = SIZE_MAX; +} + +void +vy_regulator_start(struct vy_regulator *regulator) +{ + regulator->quota_used_last = regulator->quota->used; + ev_timer_start(loop(), ®ulator->timer); +} + +void +vy_regulator_destroy(struct vy_regulator *regulator) +{ + ev_timer_stop(loop(), ®ulator->timer); + histogram_delete(regulator->dump_bandwidth_hist); +} + +void +vy_regulator_quota_exceeded(struct vy_regulator *regulator) +{ + regulator->trigger_dump_cb(regulator); +} + +void +vy_regulator_check_dump_watermark(struct vy_regulator *regulator) +{ + if (regulator->quota->used >= regulator->dump_watermark) + regulator->trigger_dump_cb(regulator); +} + +void +vy_regulator_dump_complete(struct vy_regulator *regulator, + size_t mem_dumped, double dump_duration) +{ + if (dump_duration > 0) { + histogram_collect(regulator->dump_bandwidth_hist, + mem_dumped / dump_duration); + /* + * To avoid unpredictably long stalls caused by + * mispredicting dump time duration, we need to + * know the worst (smallest) dump bandwidth so + * use a lower-bound percentile estimate. + */ + regulator->dump_bandwidth = histogram_percentile_lower( + regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT); + } +} + +void +vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max) +{ + histogram_reset(regulator->dump_bandwidth_hist); + regulator->dump_bandwidth = MIN(VY_DUMP_BANDWIDTH_DEFAULT, max); +} diff --git a/src/box/vy_regulator.h b/src/box/vy_regulator.h new file mode 100644 index 0000000000000000000000000000000000000000..3b2bb52381ce9de255baf9891f497b9471ead1e3 --- /dev/null +++ b/src/box/vy_regulator.h @@ -0,0 +1,146 @@ +#ifndef INCLUDES_TARANTOOL_BOX_VY_REGULATOR_H +#define INCLUDES_TARANTOOL_BOX_VY_REGULATOR_H +/* + * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stddef.h> +#include <tarantool_ev.h> + +#if defined(__cplusplus) +extern "C" { +#endif /* defined(__cplusplus) */ + +struct histogram; +struct vy_quota; +struct vy_regulator; + +typedef void +(*vy_trigger_dump_f)(struct vy_regulator *regulator); + +/** + * The regulator is supposed to keep track of vinyl memory usage + * and dump/compaction progress and adjust transaction write rate + * accordingly. + */ +struct vy_regulator { + /** + * Pointer to a quota object that is used to control + * memory usage. + */ + struct vy_quota *quota; + /** + * Called when the regulator detects that memory usage + * exceeds the computed watermark. Supposed to trigger + * memory dump. + */ + vy_trigger_dump_f trigger_dump_cb; + /** + * Periodic timer that updates the memory watermark + * basing on accumulated statistics. + */ + ev_timer timer; + /** + * Average rate at which transactions are writing to + * the database, in bytes per second. + */ + size_t write_rate; + /** + * Amount of memory that was used when the timer was + * executed last time. Needed to update @write_rate. + */ + size_t quota_used_last; + /** + * Current dump bandwidth estimate, in bytes per second. + * See @dump_bandwidth_hist for more details. + */ + size_t dump_bandwidth; + /** + * Dump bandwidth is needed for calculating the watermark. + * The higher the bandwidth, the later we can start dumping + * w/o suffering from transaction throttling. So we want to + * be very conservative about estimating the bandwidth. + * + * To make sure we don't overestimate it, we maintain a + * histogram of all observed measurements and assume the + * bandwidth to be equal to the 10th percentile, i.e. the + * best result among 10% worst measurements. + */ + struct histogram *dump_bandwidth_hist; + /** + * Memory watermark. Exceeding it does not result in + * throttling new transactions, but it does trigger + * background memory reclaim. + */ + size_t dump_watermark; +}; + +void +vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota, + vy_trigger_dump_f trigger_dump_cb); + +void +vy_regulator_start(struct vy_regulator *regulator); + +void +vy_regulator_destroy(struct vy_regulator *regulator); + +/** + * Check if memory usage is above the watermark and trigger + * memory dump if so. + */ +void +vy_regulator_check_dump_watermark(struct vy_regulator *regulator); + +/** + * Called when the memory limit is hit by a quota consumer. + */ +void +vy_regulator_quota_exceeded(struct vy_regulator *regulator); + +/** + * Notify the regulator about memory dump completion. + */ +void +vy_regulator_dump_complete(struct vy_regulator *regulator, + size_t mem_dumped, double dump_duration); + +/** + * Reset dump bandwidth histogram and update initial estimate. + * Called when box.cfg.snap_io_rate_limit is updated. + */ +void +vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max); + +#if defined(__cplusplus) +} /* extern "C" */ +#endif /* defined(__cplusplus) */ + +#endif /* INCLUDES_TARANTOOL_BOX_VY_REGULATOR_H */ diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result index 8badb47af300ec8f37ec8a8d7cd5140704b6f701..b4dc5b69a0309d4cfb90e1fe06ab5ea37a1f44ea 100644 --- a/test/vinyl/errinj.result +++ b/test/vinyl/errinj.result @@ -1359,7 +1359,7 @@ pad = string.rep('x', 100 * 1024) _ = fiber.create(function() for i = 1, 11 do box.space.test:replace{i, pad} end end) --- ... -repeat fiber.sleep(0.001) q = box.info.vinyl().quota until q.limit - q.used < pad:len() +repeat fiber.sleep(0.001) until box.cfg.vinyl_memory - box.stat.vinyl().memory.level0 < pad:len() --- ... test_run:cmd("restart server low_quota with args='1048576'") @@ -1376,7 +1376,7 @@ pad = string.rep('x', 100 * 1024) _ = fiber.create(function() for i = 1, 11 do box.space.test:replace{i, pad} end end) --- ... -repeat fiber.sleep(0.001) q = box.info.vinyl().quota until q.limit - q.used < pad:len() +repeat fiber.sleep(0.001) until box.cfg.vinyl_memory - box.stat.vinyl().memory.level0 < pad:len() --- ... test_run:cmd('switch default') diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua index 5a47ecc42c9b6a403c39c99a8b76fac996685045..e82b6aee8ca3bf2aa93fb07e7e71ac0b0ba9ec66 100644 --- a/test/vinyl/errinj.test.lua +++ b/test/vinyl/errinj.test.lua @@ -529,13 +529,13 @@ box.error.injection.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 0.01) fiber = require('fiber') pad = string.rep('x', 100 * 1024) _ = fiber.create(function() for i = 1, 11 do box.space.test:replace{i, pad} end end) -repeat fiber.sleep(0.001) q = box.info.vinyl().quota until q.limit - q.used < pad:len() +repeat fiber.sleep(0.001) until box.cfg.vinyl_memory - box.stat.vinyl().memory.level0 < pad:len() test_run:cmd("restart server low_quota with args='1048576'") box.error.injection.set('ERRINJ_VY_LOG_FLUSH_DELAY', true) fiber = require('fiber') pad = string.rep('x', 100 * 1024) _ = fiber.create(function() for i = 1, 11 do box.space.test:replace{i, pad} end end) -repeat fiber.sleep(0.001) q = box.info.vinyl().quota until q.limit - q.used < pad:len() +repeat fiber.sleep(0.001) until box.cfg.vinyl_memory - box.stat.vinyl().memory.level0 < pad:len() test_run:cmd('switch default') test_run:cmd("stop server low_quota") test_run:cmd("cleanup server low_quota") diff --git a/test/vinyl/info.result b/test/vinyl/info.result index 21945b9df5f71a862c5db73de69379adc54d3eb2..a47c9a1b37369faed0d940cfa87779e7c41d594d 100644 --- a/test/vinyl/info.result +++ b/test/vinyl/info.result @@ -95,13 +95,11 @@ end; ... -- Return global statistics. -- --- Note, quota watermark checking is beyond the scope of this --- test so we just filter out related statistics. +-- Note, checking correctness of the load regulator logic is beyond +-- the scope of this test so we just filter out related statistics. function gstat() local st = box.stat.vinyl() - st.quota.use_rate = nil - st.quota.dump_bandwidth = nil - st.quota.watermark = nil + st.regulator = nil return st end; --- @@ -231,9 +229,6 @@ gstat() out: 0 data: 0 index: 0 - quota: - limit: 134217728 - used: 0 memory: tuple_cache: 0 tx: 0 @@ -1075,9 +1070,6 @@ gstat() out: 0 data: 104300 index: 1190 - quota: - limit: 134217728 - used: 262583 memory: tuple_cache: 14313 tx: 0 diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua index 5912320cf995c21f7c13f025c98509755701aa6c..e5794a231c8a762cb2c82436b1de151c5bb1f44b 100644 --- a/test/vinyl/info.test.lua +++ b/test/vinyl/info.test.lua @@ -77,13 +77,11 @@ end; -- Return global statistics. -- --- Note, quota watermark checking is beyond the scope of this --- test so we just filter out related statistics. +-- Note, checking correctness of the load regulator logic is beyond +-- the scope of this test so we just filter out related statistics. function gstat() local st = box.stat.vinyl() - st.quota.use_rate = nil - st.quota.dump_bandwidth = nil - st.quota.watermark = nil + st.regulator = nil return st end; diff --git a/test/vinyl/quota.result b/test/vinyl/quota.result index 480421852174062e617ee8e5a7656543453fda4f..1a0842f96c8ab45e8fe26eda16ea03627cca5791 100644 --- a/test/vinyl/quota.result +++ b/test/vinyl/quota.result @@ -12,7 +12,7 @@ test_run:cmd('restart server default') -- -- gh-1863 add BPS tree extents to memory quota -- -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 0 ... @@ -29,7 +29,7 @@ space:insert({1, 1}) --- - [1, 1] ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 98343 ... @@ -37,7 +37,7 @@ space:insert({1, 1}) --- - error: Duplicate key exists in unique index 'pk' in space 'test' ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 98343 ... @@ -45,7 +45,7 @@ space:update({1}, {{'!', 1, 100}}) -- try to modify the primary key --- - error: Attempt to modify a tuple field which is part of index 'pk' in space 'test' ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 98343 ... @@ -61,7 +61,7 @@ space:insert({4, 4}) --- - [4, 4] ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 98460 ... @@ -69,7 +69,7 @@ box.snapshot() --- - ok ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 0 ... @@ -80,14 +80,14 @@ space:select{} - [3, 3] - [4, 4] ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 0 ... _ = space:replace{1, 1, string.rep('a', 1024 * 1024 * 5)} --- ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 5341267 ... @@ -121,14 +121,14 @@ count = 20 pad = string.rep('x', 100 * 1024) --- ... -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory --- - 1048576 ... for i = 1, count do s:replace{i, pad} end -- triggers dump --- ... -box.stat.vinyl().quota.used < count * pad:len() +box.stat.vinyl().memory.level0 < count * pad:len() --- - true ... @@ -139,14 +139,14 @@ box.snapshot() box.cfg{vinyl_memory = 8 * 1024 * 1024} --- ... -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory --- - 8388608 ... for i = 1, count do s:replace{i, pad} end -- does not trigger dump --- ... -box.stat.vinyl().quota.used > count * pad:len() +box.stat.vinyl().memory.level0 > count * pad:len() --- - true ... @@ -155,7 +155,7 @@ box.cfg{vinyl_memory = 4 * 1024 * 1024} -- error: decreasing vinyl_memory is not - error: 'Incorrect value for option ''vinyl_memory'': cannot decrease memory size at runtime' ... -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory --- - 8388608 ... diff --git a/test/vinyl/quota.test.lua b/test/vinyl/quota.test.lua index e67e54305a62abc8419b167c54e1c48f50c1eaa0..a2793a01510dfa6492b76303efd7d8b3fb860768 100644 --- a/test/vinyl/quota.test.lua +++ b/test/vinyl/quota.test.lua @@ -12,7 +12,7 @@ test_run:cmd('restart server default') -- gh-1863 add BPS tree extents to memory quota -- -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space = box.schema.space.create('test', { engine = 'vinyl' }) pk = space:create_index('pk') @@ -20,33 +20,33 @@ sec = space:create_index('sec', { parts = {2, 'unsigned'} }) space:insert({1, 1}) -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space:insert({1, 1}) -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space:update({1}, {{'!', 1, 100}}) -- try to modify the primary key -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space:insert({2, 2}) space:insert({3, 3}) space:insert({4, 4}) -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 box.snapshot() -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space:select{} -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 _ = space:replace{1, 1, string.rep('a', 1024 * 1024 * 5)} -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 space:drop() @@ -63,21 +63,21 @@ _ = s:create_index('pk') count = 20 pad = string.rep('x', 100 * 1024) -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory for i = 1, count do s:replace{i, pad} end -- triggers dump -box.stat.vinyl().quota.used < count * pad:len() +box.stat.vinyl().memory.level0 < count * pad:len() box.snapshot() box.cfg{vinyl_memory = 8 * 1024 * 1024} -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory for i = 1, count do s:replace{i, pad} end -- does not trigger dump -box.stat.vinyl().quota.used > count * pad:len() +box.stat.vinyl().memory.level0 > count * pad:len() box.cfg{vinyl_memory = 4 * 1024 * 1024} -- error: decreasing vinyl_memory is not allowed -box.stat.vinyl().quota.limit +box.cfg.vinyl_memory test_run:cmd('switch default') test_run:cmd("stop server test") diff --git a/test/vinyl/quota_timeout.result b/test/vinyl/quota_timeout.result index fd8b01966bc4318d74c4029ec3d2f87b92e1dd01..990d0a4c52c3981e1878b8001c302a16ca316e9f 100644 --- a/test/vinyl/quota_timeout.result +++ b/test/vinyl/quota_timeout.result @@ -47,7 +47,7 @@ s:count() --- - 1 ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 748241 ... @@ -61,7 +61,7 @@ s:count() --- - 1 ... -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 --- - 748241 ... @@ -148,7 +148,7 @@ box.snapshot() -- The following operation should fail instantly irrespective -- of the value of 'vinyl_timeout' (gh-3291). -- -box.stat.vinyl().quota.used == 0 +box.stat.vinyl().memory.level0 == 0 --- - true ... diff --git a/test/vinyl/quota_timeout.test.lua b/test/vinyl/quota_timeout.test.lua index 41a864bb128f9cb42059a8af278a54da3c01a9f1..6266d38b817cc0b8c20f5b7c718fad6e61ea9d3e 100644 --- a/test/vinyl/quota_timeout.test.lua +++ b/test/vinyl/quota_timeout.test.lua @@ -21,13 +21,13 @@ _ = s:create_index('pk') pad = string.rep('x', 2 * box.cfg.vinyl_memory / 3) _ = s:auto_increment{pad} s:count() -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 -- Since the following operation requires more memory than configured -- and dump is disabled, it should fail with ER_VY_QUOTA_TIMEOUT. _ = s:auto_increment{pad} s:count() -box.stat.vinyl().quota.used +box.stat.vinyl().memory.level0 -- -- Check that increasing box.cfg.vinyl_memory wakes up fibers @@ -72,7 +72,7 @@ box.snapshot() -- The following operation should fail instantly irrespective -- of the value of 'vinyl_timeout' (gh-3291). -- -box.stat.vinyl().quota.used == 0 +box.stat.vinyl().memory.level0 == 0 box.cfg{vinyl_timeout = 9000} pad = string.rep('x', box.cfg.vinyl_memory) _ = s:auto_increment{pad} diff --git a/test/vinyl/recovery_quota.result b/test/vinyl/recovery_quota.result index 323b796747b69ec5ebde2b8f31870de88b7b3b09..151f280bb9d810fdcc0f3012c72faf38a4a711ab 100644 --- a/test/vinyl/recovery_quota.result +++ b/test/vinyl/recovery_quota.result @@ -60,7 +60,7 @@ test_run:cmd('restart server test with args="2097152"') stat = box.stat.vinyl() --- ... -stat.quota.used <= stat.quota.limit or {stat.quota.used, stat.quota.limit} +stat.memory.level0 <= box.cfg.vinyl_memory or {stat.memory.level0, box.cfg.vinyl_memory} --- - true ... @@ -109,7 +109,7 @@ for i = 1, box.cfg.vinyl_memory / pad_size do box.space.test:replace{i, pad} end --- - error: Timed out waiting for Vinyl memory quota ... -box.stat.vinyl().quota.used > 1024 * 1024 +box.stat.vinyl().memory.level0 > 1024 * 1024 --- - true ... @@ -125,7 +125,7 @@ while box.space.test.index.pk:stat().disk.dump.count == 0 do fiber.sleep(0.001) stat = box.stat.vinyl() --- ... -stat.quota.used <= stat.quota.limit or {stat.quota.used, stat.quota.limit} +stat.memory.level0 <= box.cfg.vinyl_memory or {stat.memory.level0, box.cfg.vinyl_memory} --- - true ... diff --git a/test/vinyl/recovery_quota.test.lua b/test/vinyl/recovery_quota.test.lua index 44e35ba6c48fc8b6ad3a5319aac89ffa41aee37b..1e3476674bdf288308e8e4cd823fff2abb700d90 100644 --- a/test/vinyl/recovery_quota.test.lua +++ b/test/vinyl/recovery_quota.test.lua @@ -27,7 +27,7 @@ _ = var:insert{'dump', stat.disk.dump.out.rows} test_run:cmd('restart server test with args="2097152"') -- Check that we do not exceed quota. stat = box.stat.vinyl() -stat.quota.used <= stat.quota.limit or {stat.quota.used, stat.quota.limit} +stat.memory.level0 <= box.cfg.vinyl_memory or {stat.memory.level0, box.cfg.vinyl_memory} -- Check that we did not replay statements dumped before restart. stat = box.space.test.index.pk:stat() var = box.space.var @@ -43,14 +43,14 @@ box.cfg{vinyl_timeout=0.001} pad_size = 1000 pad = string.rep('x', pad_size) for i = 1, box.cfg.vinyl_memory / pad_size do box.space.test:replace{i, pad} end -box.stat.vinyl().quota.used > 1024 * 1024 +box.stat.vinyl().memory.level0 > 1024 * 1024 -- Check that tarantool can recover with a smaller memory limit. test_run:cmd('restart server test with args="1048576"') fiber = require 'fiber' -- All memory above the limit must be dumped after recovery. while box.space.test.index.pk:stat().disk.dump.count == 0 do fiber.sleep(0.001) end stat = box.stat.vinyl() -stat.quota.used <= stat.quota.limit or {stat.quota.used, stat.quota.limit} +stat.memory.level0 <= box.cfg.vinyl_memory or {stat.memory.level0, box.cfg.vinyl_memory} _ = test_run:cmd('switch default') test_run:cmd('stop server test')