diff options
author | Amitay Isaacs <amitay@gmail.com> | 2014-09-25 17:17:04 +1000 |
---|---|---|
committer | Martin Schwenke <martins@samba.org> | 2014-09-25 14:48:11 +0200 |
commit | 57310f80c9b8146a0978d912f73b0a64fde7697e (patch) | |
tree | 429534bda7eb2acf641321b8143a753cabd34098 /ctdb | |
parent | f1e281cd47d9ebd79e09294606b8fa411ec0fbb4 (diff) | |
download | samba-57310f80c9b8146a0978d912f73b0a64fde7697e.tar.gz |
ctdb-recoverd: If obtaining recovery lock fails, try again
When the ctdb daemon starts up, it considers itself the recovery master
and tries to do the first recovery. However, it's possible that there is
already a recovery master and the current node has not yet heard from it.
So do not ban ourselves immediately if ctdb_recovery_lock() fails when
doing the first recovery.
Signed-off-by: Amitay Isaacs <amitay@gmail.com>
Reviewed-by: Martin Schwenke <martin@meltin.net>
Diffstat (limited to 'ctdb')
-rw-r--r-- | ctdb/server/ctdb_recoverd.c | 18 |
1 files changed, 18 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 14e6ea85ad5..945b01c4e92 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1815,6 +1815,16 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
 	start_time = timeval_current();
 	if (!ctdb_recovery_lock(ctdb, true)) {
+		if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+			/* If ctdb is trying first recovery, it's
+			 * possible that current node does not know yet
+			 * who the recmaster is.
+			 */
+			DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
+					  " - retrying recovery\n"));
+			return -1;
+		}
+
 		DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
 				 "and ban ourself for %u seconds\n",
 				 ctdb->tunable.recovery_ban_period));
@@ -3593,6 +3603,14 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 		return;
 	}
 
+	/* get runstate */
+	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
+				     CTDB_CURRENT_NODE, &ctdb->runstate);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
+		return;
+	}
+
 	/* get the current recovery lock file from the server */
 	if (update_recovery_lock_file(ctdb) != 0) {
 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));