summary | refs | log | tree | commit | diff
path: root/ctdb/server/ctdb_recoverd.c
diff options
context:
space:
mode:
Diffstat (limited to 'ctdb/server/ctdb_recoverd.c')
-rw-r--r-- ctdb/server/ctdb_recoverd.c 119
1 files changed, 119 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 2b5debc6c78..2633c755752 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -3091,6 +3091,112 @@ static void recd_sig_term_handler(struct tevent_context *ev,
exit(0);
}
+/*
+ * Periodically log elements of the cluster state
+ *
+ * This can be used to confirm a split brain has occurred
+ *
+ * Timer handler: private_data must be the struct ctdb_recoverd (it is
+ * checked with talloc_get_type_abort()); ev and te are not used.  Every
+ * path through the function ends by re-adding this handler as a timer
+ * on ctdb->ev with timeval_current_ofs(60, 0), so it keeps running for
+ * as long as the timer can be created.
+ *
+ * Nothing is checked or logged unless rec->recmaster is the local PNN
+ * and rec->nodemap is set.  The cluster is treated as complete when no
+ * other non-deleted node in the nodemap is flagged
+ * NODE_FLAGS_DISCONNECTED.
+ *
+ * While incomplete, a warning is logged when the condition is first
+ * seen, then on every check while fewer than 10 whole minutes have
+ * elapsed, then only when the elapsed whole minutes are a multiple of
+ * 10 (under an hour) or of 60 (an hour or more).  One "Cluster
+ * complete" warning is logged when the cluster is next seen complete.
+ */
+static void maybe_log_cluster_state(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type_abort(
+ private_data, struct ctdb_recoverd);
+ struct ctdb_context *ctdb = rec->ctdb;
+ struct tevent_timer *tt;
+
+ static struct timeval start_incomplete = {
+ .tv_sec = 0,
+ }; /* kept across calls: zero = complete at last check, else time it was first seen incomplete */
+
+ bool is_complete;
+ bool was_complete;
+ unsigned int i;
+ double seconds;
+ unsigned int minutes;
+ unsigned int num_connected; /* other nodes neither deleted nor disconnected; excludes this node */
+
+ if (rec->recmaster != ctdb_get_pnn(ctdb)) { /* not the master: skip the check, just re-arm */
+ goto done;
+ }
+
+ if (rec->nodemap == NULL) { /* no nodemap to inspect: skip the check, just re-arm */
+ goto done;
+ }
+
+ is_complete = true;
+ num_connected = 0;
+ for (i = 0; i < rec->nodemap->num; i++) {
+ struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
+
+ if (n->pnn == ctdb_get_pnn(ctdb)) { /* ignore the local node */
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DELETED) != 0) { /* deleted nodes do not affect completeness */
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
+ is_complete = false;
+ continue;
+ }
+
+ num_connected++;
+ }
+
+ was_complete = timeval_is_zero(&start_incomplete); /* state remembered from the previous check */
+
+ if (is_complete) {
+ if (! was_complete) { /* transition incomplete -> complete: log once, clear the start time */
+ D_WARNING("Cluster complete with master=%u\n",
+ rec->recmaster);
+ start_incomplete = timeval_zero();
+ }
+ goto done;
+ }
+
+ /* Cluster is newly incomplete...
+ * Remember when this was first seen and log immediately with
+ * elapsed=0.
+ */
+ if (was_complete) {
+ start_incomplete = current_time;
+ minutes = 0;
+ goto log;
+ }
+
+ /*
+ * Cluster has been incomplete since previous check, so figure
+ * out how long (in minutes) and decide whether to log anything
+ *
+ * With fewer than 10 elapsed minutes neither branch below
+ * applies, so every check logs.
+ *
+ * NOTE(review): the decision uses the whole-minute count at the
+ * moment this handler runs.  If a run is delayed enough that the
+ * count steps over a multiple of 10 (or 60), that periodic
+ * message is not logged - confirm this is acceptable.
+ *
+ * NOTE(review): start_incomplete is not touched on the early
+ * "goto done" paths above, so a start time recorded earlier is
+ * still used if this node stops being master and later becomes
+ * master again - confirm this is intended.
+ */
+ seconds = timeval_elapsed2(&start_incomplete, &current_time);
+ minutes = (unsigned int)seconds / 60;
+ if (minutes >= 60) {
+ /* Over an hour, log every hour */
+ if (minutes % 60 != 0) {
+ goto done;
+ }
+ } else if (minutes >= 10) {
+ /* Over 10 minutes, log every 10 minutes */
+ if (minutes % 10 != 0) {
+ goto done;
+ }
+ }
+
+log:
+ D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
+ "connected=%u\n",
+ rec->recmaster,
+ minutes,
+ num_connected);
+
+done:
+ tt = tevent_add_timer(ctdb->ev, /* re-arm: reached from every path above */
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) { /* failure is only logged; the handler is then not rescheduled */
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+}
/*
the main monitoring loop
@@ -3125,6 +3231,19 @@ static void monitor_cluster(struct ctdb_context *ctdb)
exit(1);
}
+ if (ctdb->recovery_lock == NULL) {
+ struct tevent_timer *tt;
+
+ tt = tevent_add_timer(ctdb->ev,
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) {
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+ }
+
/* register a message port for sending memory dumps */
ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);