From e6ebd5eb748efaae43ca0d880a11cc76dc155872 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov.dev@gmail.com> Date: Tue, 11 Dec 2018 18:15:42 +0300 Subject: [PATCH] Rate limit certain warning messages There are a few warning messages that can easily flood the log, making it more difficult to figure out what causes the problem. Those are - too long WAL write - waited for ... bytes of vinyl memory quota for too long - get/select(...) => ... took too long - readahead limit is reached - net_msg_max limit is reached Actually, it's pointless to print each and every of them, because all messages of the same kind are similar and don't convey any additional information. So this patch limits the rate at which those messages may be printed. To achieve that, it introduces say_ratelimited() helper, which works exactly like say() except it does nothing if too many messages of the same kind have already been printed in the last few seconds. The implementation is trivial - say_ratelimited() defines a static ratelimit state variable at its call site (it's a macro) and checks it before logging anything. If the ratelimit state says that an event may be emitted, it will log the message, otherwise it will skip it and eventually print the total number of skipped messages instead. The rate limit is set to 10 messages per 5 seconds for each kind of a warning message enumerated above. Here's how it looks in the log: 2018-12-11 18:07:21.830 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:26.851 [30404] iproto iproto.cc:524 W> 9635 messages suppressed Closes #2218 --- src/box/iproto.cc | 10 ++++++---- src/box/txn.c | 5 +++-- src/box/vy_point_lookup.c | 7 ++++--- src/box/vy_quota.c | 5 +++-- src/box/vy_read_iterator.c | 9 +++++---- src/say.h | 38 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/box/iproto.cc b/src/box/iproto.cc index cb1cceb5cc..223469f22c 100644 --- a/src/box/iproto.cc +++ b/src/box/iproto.cc @@ -539,8 +539,9 @@ iproto_connection_is_idle(struct iproto_connection *con) static inline void iproto_connection_stop_readahead_limit(struct iproto_connection *con) { - say_warn("stopping input on connection %s, readahead limit is reached", - sio_socketname(con->input.fd)); + say_warn_ratelimited("stopping input on connection %s, " + "readahead limit is reached", + sio_socketname(con->input.fd)); assert(rlist_empty(&con->in_stop_list)); ev_io_stop(con->loop, &con->input); } @@ -550,8 +551,9 @@ iproto_connection_stop_msg_max_limit(struct iproto_connection *con) { assert(rlist_empty(&con->in_stop_list)); - say_warn("stopping input on connection %s, net_msg_max limit is reached", - sio_socketname(con->input.fd)); + say_warn_ratelimited("stopping input on connection %s, " + "net_msg_max limit is reached", + sio_socketname(con->input.fd)); ev_io_stop(con->loop, &con->input); /* * Important to add to tail and fetch from head to ensure diff --git a/src/box/txn.c b/src/box/txn.c index e47853a8bf..7f4e85b478 100644 --- a/src/box/txn.c +++ b/src/box/txn.c @@ -296,8 +296,9 @@ txn_write_to_wal(struct txn *txn) diag_set(ClientError, ER_WAL_IO); diag_log(); } else if (stop - start > too_long_threshold) { - say_warn("too long WAL write: %d rows at LSN %lld: %.3f sec", - txn->n_rows, res - txn->n_rows + 1, stop - start); + say_warn_ratelimited("too long WAL write: %d rows at " + "LSN %lld: %.3f sec", txn->n_rows, + res - txn->n_rows + 1, stop - start); } /* * Use vclock_sum() from WAL writer as transaction signature. diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c index 7b704b8490..ddbc2d46fa 100644 --- a/src/box/vy_point_lookup.c +++ b/src/box/vy_point_lookup.c @@ -287,9 +287,10 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx, latency_collect(&lsm->stat.latency, latency); if (latency > lsm->env->too_long_threshold) { - say_warn("%s: get(%s) => %s took too long: %.3f sec", - vy_lsm_name(lsm), tuple_str(key), - vy_stmt_str(*ret), latency); + say_warn_ratelimited("%s: get(%s) => %s " + "took too long: %.3f sec", + vy_lsm_name(lsm), tuple_str(key), + vy_stmt_str(*ret), latency); } return 0; } diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c index 161ab4c7e6..07cd58563d 100644 --- a/src/box/vy_quota.c +++ b/src/box/vy_quota.c @@ -241,8 +241,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout) double wait_time = ev_monotonic_now(loop()) - wait_start; if (wait_time > q->too_long_threshold) { - say_warn("waited for %zu bytes of vinyl memory quota " - "for too long: %.3f sec", size, wait_time); + say_warn_ratelimited("waited for %zu bytes of vinyl memory " + "quota for too long: %.3f sec", size, + wait_time); } vy_quota_do_use(q, size); diff --git a/src/box/vy_read_iterator.c b/src/box/vy_read_iterator.c index 954fc0df6a..06860cbc46 100644 --- a/src/box/vy_read_iterator.c +++ b/src/box/vy_read_iterator.c @@ -887,10 +887,11 @@ vy_read_iterator_next(struct vy_read_iterator *itr, struct tuple **result) latency_collect(&lsm->stat.latency, latency); if (latency > lsm->env->too_long_threshold) { - say_warn("%s: select(%s, %s) => %s took too long: %.3f sec", - vy_lsm_name(lsm), tuple_str(itr->key), - iterator_type_strs[itr->iterator_type], - vy_stmt_str(stmt), latency); + say_warn_ratelimited("%s: select(%s, %s) => %s " + "took too long: %.3f sec", + vy_lsm_name(lsm), tuple_str(itr->key), + iterator_type_strs[itr->iterator_type], + vy_stmt_str(stmt), latency); } *result = stmt; diff --git a/src/say.h b/src/say.h index 3cff02e96e..26db95133c 100644 --- a/src/say.h +++ b/src/say.h @@ -39,6 +39,7 @@ #include <tarantool_ev.h> #include "small/rlist.h" #include "fiber_cond.h" +#include "ratelimit.h" #if defined(__cplusplus) extern "C" { @@ -327,6 +328,43 @@ CFORMAT(printf, 5, 0) extern sayfunc_t _say; #define panic(...) panic_status(EXIT_FAILURE, __VA_ARGS__) #define panic_syserror(...) ({ say(S_FATAL, strerror(errno), __VA_ARGS__); exit(EXIT_FAILURE); }) +enum { + /* 10 messages per 5 seconds. */ + SAY_RATELIMIT_INTERVAL = 5, + SAY_RATELIMIT_BURST = 10, +}; + +/** + * Wrapper around ratelimit_check() that prints a warning if some + * messages are suppressed. It uses ev_monotonic_now() as a time + * source. + */ +#define say_ratelimit_check(rl) ({ \ + int suppressed = 0; \ + bool ret = ratelimit_check((rl), ev_monotonic_now(loop()), \ + &suppressed); \ + if (suppressed > 0) \ + say_warn("%d messages suppressed", suppressed); \ + ret; \ +}) + +/** + * Same as say(), but rate limited. If this function is called more + * often than SAY_RATELIMIT_BURST times per SAY_RATELIMIT_INTERVAL + * seconds, extra messages are suppressed and a warning is printed + * to the log. + */ +#define say_ratelimited(level, error, format, ...) ({ \ + static struct ratelimit rl = \ + RATELIMIT_INITIALIZER(SAY_RATELIMIT_INTERVAL, \ + SAY_RATELIMIT_BURST); \ + if (say_ratelimit_check(&rl)) \ + say(level, error, format, ##__VA_ARGS__); \ +}) + +#define say_warn_ratelimited(format, ...) \ + say_ratelimited(S_WARN, NULL, format, ##__VA_ARGS__) + /** * Format and print a message to Tarantool log file. * -- GitLab