From 0dc0c6358d91b302f96cbd6a8112e7bc416b06fe Mon Sep 17 00:00:00 2001
From: Konstantin Osipov <kostja@tarantool.org>
Date: Fri, 16 Jan 2015 18:58:05 +0300
Subject: [PATCH] Convert recover_xlog() to use exceptions.

---
 src/box/recovery.cc | 146 +++++++++++++++++++++++---------------------
 src/box/recovery.h  |   1 -
 src/box/xlog.cc     |  15 +++--
 3 files changed, 88 insertions(+), 74 deletions(-)

diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index 2b38313271..f7ce5d6954 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -261,6 +261,45 @@ recovery_process(struct recovery_state *r, struct xrow_header *row)
 	return r->row_handler(r, r->row_handler_param, row);
 }
 
+#define LOG_EOF 0
+
+/**
+ * @retval 0 OK, read full xlog.
+ * @retval 1 OK, read some but not all rows, or no EOF marker
+ */
+static int
+recover_xlog(struct recovery_state *r, struct xlog *l)
+{
+	struct xlog_cursor i;
+
+	xlog_cursor_open(&i, l);
+
+	auto guard = make_scoped_guard([&]{
+		xlog_cursor_close(&i);
+	});
+
+	struct xrow_header row;
+	while (xlog_cursor_next(&i, &row) == 0) {
+		try {
+			recovery_process(r, &row);
+		} catch (ClientError *e) {
+			if (l->dir->panic_if_error)
+				throw;
+			say_error("can't apply row: ");
+			e->log();
+		}
+	}
+	/*
+	 * xlog_cursor_next() returns 1 when
+	 * it can not read more rows. This doesn't mean
+	 * the file is fully read: it's fully read only
+	 * when EOF marker has been read. This is
+	 * why eof_read is used here to indicate the
+	 * end of log.
+	 */
+	return !i.eof_read;
+}
+
 void
 recovery_bootstrap(struct recovery_state *r)
 {
@@ -274,16 +313,20 @@ recovery_bootstrap(struct recovery_state *r)
 			   sizeof(bootstrap_bin), "r");
 	struct xlog *snap = xlog_open_stream(&r->snap_dir, 0, NONE, f,
 					     filename);
-	int rc = recover_xlog(r, snap);
-	xlog_close(snap);
-
-	if (rc != 0)
-		panic("failed to recover from %s", filename);
+	auto guard = make_scoped_guard([=]{
+		xlog_close(snap);
+	});
+	/** The snapshot must have a EOF marker. */
+	if (recover_xlog(r, snap) != LOG_EOF)
+		panic("can't process bootstrap snapshot");
 }
 
 /**
  * Read a snapshot and call row_handler for every snapshot row.
  * Panic in case of error.
+ *
+ * @pre there is an existing snapshot. Otherwise
+ * recovery_bootstrap() should be used instead.
  */
 void
 recover_snap(struct recovery_state *r)
@@ -291,72 +334,33 @@ recover_snap(struct recovery_state *r)
 	/* There's no current_wal during initial recover. */
 	assert(r->current_wal == NULL);
 	say_info("recovery start");
-
-	struct xlog *snap;
-	int64_t sign;
-
-	try {
-		xdir_scan(&r->snap_dir);
-	} catch (Exception *e) {
-		e->log();
-		panic("can't scan snapshot directory");
-	}
+	/**
+	 * Don't rescan the directory, it's done when
+	 * recovery is initialized.
+	 * Don't check if the directory index is empty, there must
+	 * be at least one existing snapshot, otherwise we would
+	 * have created it from a bootstrap copy.
+	 */
 	struct vclock *res = vclockset_last(&r->snap_dir.index);
-	if (res == NULL)
-		panic("can't find snapshot");
-	sign = vclock_signature(res);
-	snap = xlog_open(&r->snap_dir, sign, NONE);
+	int64_t signature = vclock_signature(res);
 
-	/* Save server uuid */
+	struct xlog *snap = xlog_open(&r->snap_dir, signature, NONE);
+	auto guard = make_scoped_guard([=]{
+		xlog_close(snap);
+	});
+	/* Save server UUID */
 	r->server_uuid = snap->server_uuid;
 
 	/* Add a surrogate server id for snapshot rows */
 	vclock_add_server(&r->vclock, 0);
 
 	say_info("recovering from `%s'", snap->filename);
-	if (recover_xlog(r, snap) != 0)
+	if (recover_xlog(r, snap) != LOG_EOF)
 		panic("can't process snapshot");
-
 	/* Replace server vclock using the data from snapshot */
 	vclock_copy(&r->vclock, &snap->vclock);
 }
 
-#define LOG_EOF 0
-
-/**
- * @retval -1 error
- * @retval 0 EOF
- * @retval 1 ok, maybe read something
- */
-int
-recover_xlog(struct recovery_state *r, struct xlog *l)
-{
-	int res = -1;
-	struct xlog_cursor i;
-
-	xlog_cursor_open(&i, l);
-
-	struct xrow_header row;
-	while (xlog_cursor_next(&i, &row) == 0) {
-		try {
-			recovery_process(r, &row);
-		} catch (SystemError *e) {
-			say_error("can't apply row: ");
-			e->log();
-			goto end;
-		} catch (Exception *e) {
-			say_error("can't apply row: ");
-			e->log();
-			if (l->dir->panic_if_error)
-				goto end;
-		}
-	}
-	res = i.eof_read ? LOG_EOF : 1;
-end:
-	xlog_cursor_close(&i);
-	/* Sic: we don't close the log here. */
-	return res;
-}
 
 /** Find out if there are new .xlog files since the current
  * LSN, and read them all up.
@@ -447,10 +451,12 @@ recover_remaining_wals(struct recovery_state *r)
 
 recover_current_wal:
 		rows_before = r->current_wal->rows;
-		result = recover_xlog(r, r->current_wal);
-		if (result < 0) {
+		try {
+			result = recover_xlog(r, r->current_wal);
+		} catch (Exception *e) {
 			say_error("failure reading from %s",
 				  r->current_wal->filename);
+			e->log();
 			break;
 		}
 
@@ -623,17 +629,19 @@ recovery_rescan_file(ev_loop * loop, ev_stat *w, int /* revents */)
 	struct recovery_state *r = (struct recovery_state *) w->data;
 	struct wal_watcher *watcher = r->watcher;
 	fiber_set_user(fiber(), &admin_credentials);
-	int result = recover_xlog(r, r->current_wal);
-	fiber_set_user(fiber(), NULL);
-	if (result < 0)
+	try {
+		if (recover_xlog(r, r->current_wal) == LOG_EOF) {
+			say_info("done `%s'", r->current_wal->filename);
+			recovery_close_log(r);
+			recovery_stop_file(watcher);
+			/* Don't wait for wal_dir_rescan_delay. */
+			recovery_rescan_dir(loop, &watcher->dir_timer, 0);
+		}
+	} catch (Exception *e) {
+		e->log();
 		panic("recover failed");
-	if (result == LOG_EOF) {
-		say_info("done `%s'", r->current_wal->filename);
-		recovery_close_log(r);
-		recovery_stop_file(watcher);
-		/* Don't wait for wal_dir_rescan_delay. */
-		recovery_rescan_dir(loop, &watcher->dir_timer, 0);
 	}
+	fiber_set_user(fiber(), NULL);
 }
 
 void
diff --git a/src/box/recovery.h b/src/box/recovery.h
index f8b4b12ffc..3ab0e93456 100644
--- a/src/box/recovery.h
+++ b/src/box/recovery.h
@@ -153,7 +153,6 @@ void recover_snap(struct recovery_state *r);
 void recovery_follow_local(struct recovery_state *r, ev_tstamp wal_dir_rescan_delay);
 void recovery_finalize(struct recovery_state *r);
 
-int recover_xlog(struct recovery_state *r, struct xlog *l);
 int wal_write(struct recovery_state *r, struct xrow_header *packet);
 
 void recovery_setup_panic(struct recovery_state *r, bool on_snap_error, bool on_wal_error);
diff --git a/src/box/xlog.cc b/src/box/xlog.cc
index 452429475a..91cdf546e5 100644
--- a/src/box/xlog.cc
+++ b/src/box/xlog.cc
@@ -494,6 +494,8 @@ xlog_cursor_close(struct xlog_cursor *i)
  *
  * @param i	iterator object, encapsulating log specifics.
  *
+ * @retval 0    OK
+ * @retval 1    EOF
  */
 int
 xlog_cursor_next(struct xlog_cursor *i, struct xrow_header *row)
@@ -543,10 +545,15 @@ xlog_cursor_next(struct xlog_cursor *i, struct xrow_header *row)
 	} catch (ClientError *e) {
 		if (l->dir->panic_if_error)
 			throw;
-		else {
-			say_warn("failed to read row");
-			goto restart;
-		}
+		/*
+		 * say_warn() is used and exception is not
+		 * logged since an error may happen in local
+		 * hot standby or replication mode, when it's
+		 * caused by an attempt to read a partially
+		 * written WAL.
+		 */
+		say_warn("failed to read row");
+		goto restart;
 	}
 
 	i->good_offset = ftello(l->f);
-- 
GitLab