From f9950440a68b4adbc0e3caee7efa58f1d878eb87 Mon Sep 17 00:00:00 2001
From: Konstantin Osipov <kostja.osipov@gmail.com>
Date: Fri, 16 Aug 2019 16:13:42 +0300
Subject: [PATCH] gc: randomize the next checkpoint time also after a manual
 box.snapshot().

Before this patch, snapshot interval was set randomly within
checkpoint_interval period. However, after box.snapshot(), the next
snapshot was scheduled exactly checkpoint_interval from the current time.
Many orchestration scripts snapshot entire cluster right after deployment,
to take a backup. This kills randomness, since all instances begin to
count the next checkpoint time from the current time.

Randomize the next checkpoint time after a manual snapshot as well.

Fixes gh-4432

(cherry picked from commit 6277f48ad8a7213d73e280d5046c5b5872f74e25)
---
 src/box/gc.c | 15 +++++++++++----
 src/box/gc.h |  4 +++-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/box/gc.c b/src/box/gc.c
index a2c963e0a3..e0df924730 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -404,11 +404,18 @@ gc_checkpoint(void)
 	}
 
 	/*
-	 * Reset the schedule and wake up the checkpoint daemon
-	 * so that it can readjust.
+	 * Since a user invoked a snapshot manually, this may be
+	 * because they are not happy with the current randomized
+	 * schedule. Randomize the schedule again and wake up the
+	 * checkpoint daemon so that it can readjust.
+	 * It is also a good idea to randomize the interval, since
+	 * otherwise many instances running on the same host will
+	 * no longer run their checkpoints randomly after
+	 * a sweeping box.snapshot() (gh-4432).
 	 */
-	checkpoint_schedule_reset(&gc.checkpoint_schedule,
-				  ev_monotonic_now(loop()));
+	checkpoint_schedule_cfg(&gc.checkpoint_schedule,
+				ev_monotonic_now(loop()),
+				gc.checkpoint_schedule.interval);
 	fiber_wakeup(gc.checkpoint_fiber);
 
 	if (gc_do_checkpoint() != 0)
diff --git a/src/box/gc.h b/src/box/gc.h
index 5790ebcc6a..827a5db8eb 100644
--- a/src/box/gc.h
+++ b/src/box/gc.h
@@ -240,7 +240,9 @@ void
 gc_add_checkpoint(const struct vclock *vclock);
 
 /**
- * Make a checkpoint.
+ * Make a *manual* checkpoint.
+ * This is the entry point for box.snapshot() and the SIGUSR1
+ * signal handler.
  *
  * This function runs engine/WAL callbacks to create a checkpoint
  * on disk, then tracks the new checkpoint in the garbage collector
-- 
GitLab