summary | refs | log | tree | commit | diff
path: root/ctdb
diff options
context:
space:
mode:
author Amitay Isaacs <amitay@gmail.com> 2014-09-25 17:17:04 +1000
committer Martin Schwenke <martins@samba.org> 2014-09-25 14:48:11 +0200
commit 57310f80c9b8146a0978d912f73b0a64fde7697e (patch)
tree 429534bda7eb2acf641321b8143a753cabd34098 /ctdb
parent f1e281cd47d9ebd79e09294606b8fa411ec0fbb4 (diff)
download samba-57310f80c9b8146a0978d912f73b0a64fde7697e.tar.gz
ctdb-recoverd: If obtaining recovery lock fails, try again
When the ctdb daemon starts up, it considers itself the recovery master and tries to do the first recovery. However, it's possible that there is already a recovery master and the current node has not yet heard from it. So do not ban ourselves immediately if ctdb_recovery_lock() fails when doing the first recovery.

Signed-off-by: Amitay Isaacs <amitay@gmail.com>
Reviewed-by: Martin Schwenke <martin@meltin.net>
Diffstat (limited to 'ctdb')
-rw-r--r-- ctdb/server/ctdb_recoverd.c 18
1 files changed, 18 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 14e6ea85ad5..945b01c4e92 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1815,6 +1815,16 @@ static int do_recovery(struct ctdb_recoverd *rec,
DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
start_time = timeval_current();
if (!ctdb_recovery_lock(ctdb, true)) {
+ if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+ /* If ctdb is trying first recovery, it's
+ * possible that current node does not know yet
+ * who the recmaster is.
+ */
+ DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
+ " - retrying recovery\n"));
+ return -1;
+ }
+
DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
"and ban ourself for %u seconds\n",
ctdb->tunable.recovery_ban_period));
@@ -3593,6 +3603,14 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
return;
}
+ /* get runstate */
+ ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &ctdb->runstate);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
+ return;
+ }
+
/* get the current recovery lock file from the server */
if (update_recovery_lock_file(ctdb) != 0) {
DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));