summaryrefslogtreecommitdiff
path: root/ctdb/server/ctdb_recover.c
diff options
context:
space:
mode:
authorAndrew Tridgell <tridge@samba.org>2008-11-21 08:05:59 +1100
committerAndrew Tridgell <tridge@samba.org>2008-11-21 10:24:13 +1100
commit59b6a9a9e66bc66cb6dda6a03226626ee12506db (patch)
treebd538e28da3dd55465473545c0d53190d22f934e /ctdb/server/ctdb_recover.c
parenteeae32c8d222058acfe6e967e766f82560edea3e (diff)
downloadsamba-59b6a9a9e66bc66cb6dda6a03226626ee12506db.tar.gz
fixed problem with looping ctdb recoveries
After a node failure, GPFS can get into a state where non-blocking fcntl() locks can take a long time. This leads to the ctdb set_recmode test timing out, which causes a recovery failure and then a new recovery. The resulting recovery loop can last a long time. The fix is to treat a fcntl timeout as a success of this test. The test checks that we can't lock the shared reclock file, so a timeout is an acceptable form of success. (This used to be ctdb commit 6579a6a2a7161214adedf0f67dce62f4a4ad1afe)
Diffstat (limited to 'ctdb/server/ctdb_recover.c')
-rw-r--r--ctdb/server/ctdb_recover.c11
1 files changed, 9 insertions, 2 deletions
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index c8b0ba066ab..39b73acff56 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -477,7 +477,14 @@ static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_even
struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
struct ctdb_set_recmode_state);
- ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "timeout in ctdb_set_recmode");
+ /* we consider this a success, not a failure, as we failed to
+ set the recovery lock which is what we wanted. This can be
+ caused by the cluster filesystem being very slow to
+ arbitrate locks immediately after a node failure.
+ */
+ DEBUG(DEBUG_NOTICE,(__location__ " set_recmode timeout - allowing recmode set\n"));
+ state->ctdb->recovery_mode = state->recmode;
+ ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
talloc_free(state);
}
@@ -643,7 +650,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
talloc_set_destructor(state, set_recmode_destructor);
state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(3, 0),
- ctdb_set_recmode_timeout, state);
+ ctdb_set_recmode_timeout, state);
state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
EVENT_FD_READ|EVENT_FD_AUTOCLOSE,