summaryrefslogtreecommitdiff
path: root/ctdb/server
diff options
context:
space:
mode:
authorMartin Schwenke <martin@meltin.net>2019-07-16 08:58:33 +1000
committerMartin Schwenke <martins@samba.org>2019-07-26 03:34:16 +0000
commit6fe963c3f72680ecbed1b6f9c70a7af003115666 (patch)
tree91b8d114eba62b3257315d16c015aa68b1e97fb3 /ctdb/server
parentf2559ef8ce01a46b33a9295e67da920a327f3929 (diff)
downloadsamba-6fe963c3f72680ecbed1b6f9c70a7af003115666.tar.gz
ctdb-recoverd: Periodically log recovery master of incomplete cluster
Only do this if the recovery lock is unset. Log every minute for the first 10 minutes, then every 10 minutes, then every hour. This is useful for determining whether a split brain occurred. It is particularly useful if logging failed or was throttled at startup, so there is no evidence of the split brain when it began. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb/server')
-rw-r--r--ctdb/server/ctdb_recoverd.c119
1 files changed, 119 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 2b5debc6c78..2633c755752 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -3091,6 +3091,112 @@ static void recd_sig_term_handler(struct tevent_context *ev,
exit(0);
}
+/*
+ * Periodically log elements of the cluster state
+ *
+ * This can be used to confirm a split brain has occurred
+ */
+static void maybe_log_cluster_state(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type_abort(
+ private_data, struct ctdb_recoverd);
+ struct ctdb_context *ctdb = rec->ctdb;
+ struct tevent_timer *tt;
+
+ static struct timeval start_incomplete = {
+ .tv_sec = 0,
+ };
+
+ bool is_complete;
+ bool was_complete;
+ unsigned int i;
+ double seconds;
+ unsigned int minutes;
+ unsigned int num_connected;
+
+ if (rec->recmaster != ctdb_get_pnn(ctdb)) {
+ goto done;
+ }
+
+ if (rec->nodemap == NULL) {
+ goto done;
+ }
+
+ is_complete = true;
+ num_connected = 0;
+ for (i = 0; i < rec->nodemap->num; i++) {
+ struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
+
+ if (n->pnn == ctdb_get_pnn(ctdb)) {
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DELETED) != 0) {
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
+ is_complete = false;
+ continue;
+ }
+
+ num_connected++;
+ }
+
+ was_complete = timeval_is_zero(&start_incomplete);
+
+ if (is_complete) {
+ if (! was_complete) {
+ D_WARNING("Cluster complete with master=%u\n",
+ rec->recmaster);
+ start_incomplete = timeval_zero();
+ }
+ goto done;
+ }
+
+ /* Cluster is newly incomplete... */
+ if (was_complete) {
+ start_incomplete = current_time;
+ minutes = 0;
+ goto log;
+ }
+
+ /*
+ * Cluster has been incomplete since previous check, so figure
+ * out how long (in minutes) and decide whether to log anything
+ */
+ seconds = timeval_elapsed2(&start_incomplete, &current_time);
+ minutes = (unsigned int)seconds / 60;
+ if (minutes >= 60) {
+ /* Over an hour, log every hour */
+ if (minutes % 60 != 0) {
+ goto done;
+ }
+ } else if (minutes >= 10) {
+ /* Over 10 minutes, log every 10 minutes */
+ if (minutes % 10 != 0) {
+ goto done;
+ }
+ }
+
+log:
+ D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
+ "connected=%u\n",
+ rec->recmaster,
+ minutes,
+ num_connected);
+
+done:
+ tt = tevent_add_timer(ctdb->ev,
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) {
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+}
/*
the main monitoring loop
@@ -3125,6 +3231,19 @@ static void monitor_cluster(struct ctdb_context *ctdb)
exit(1);
}
+ if (ctdb->recovery_lock == NULL) {
+ struct tevent_timer *tt;
+
+ tt = tevent_add_timer(ctdb->ev,
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) {
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+ }
+
/* register a message port for sending memory dumps */
ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);