From da4ca29f18b5d4b09520898a44e49bee13797ad7 Mon Sep 17 00:00:00 2001 From: Georgy Moshkin <gmoshkin@picodata.io> Date: Mon, 21 Nov 2022 15:09:42 +0300 Subject: [PATCH] fix(governor): fix error message when failing to switch replicaset master --- src/traft/node.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/traft/node.rs b/src/traft/node.rs index b4b7de542b..6523e0d4ed 100644 --- a/src/traft/node.rs +++ b/src/traft/node.rs @@ -972,6 +972,7 @@ fn raft_conf_change_loop(status: Rc<Cell<Status>>, storage: Storage) { ); let replicaset_id = &peer.replicaset_id; + // choose a new replicaset master if needed let res = (|| -> traft::Result<_> { let replicaset = storage.replicasets.get(replicaset_id)?; if replicaset @@ -987,13 +988,22 @@ fn raft_conf_change_loop(status: Rc<Cell<Status>>, storage: Storage) { let op = OpDML::update(ClusterSpace::Replicasets, &[replicaset_id], ops)?; tlog!(Info, "proposing replicaset master change"; "op" => ?op); // TODO: don't hard code the timeout - // TODO: these `?` will be processed in the wrong place node.propose_and_wait(op, Duration::from_secs(3))??; } else { tlog!(Info, "skip proposing replicaset master change"); } } + Ok(()) + })(); + if let Err(e) = res { + tlog!(Warning, "failed proposing replicaset master change: {e}"); + // TODO: don't hard code timeout + event::wait_timeout(Event::TopologyChanged, Duration::from_secs(1)).unwrap(); + continue 'governor; + } + // reconfigure vshard storages and routers + let res = (|| -> traft::Result<_> { let commit = storage.raft.commit()?.unwrap(); let reqs = maybe_responding(&peers) .filter(|peer| { @@ -1029,6 +1039,7 @@ fn raft_conf_change_loop(status: Rc<Cell<Status>>, storage: Storage) { continue 'governor; } + // update peer's CurrentGrade let req = UpdatePeerRequest::new(peer.instance_id.clone(), cluster_id.clone()) .with_current_grade(CurrentGrade::offline(peer.target_grade.incarnation)); tlog!(Info, -- GitLab