diff options
author | David Disseldorp <ddiss@suse.de> | 2011-08-03 10:38:27 +0200 |
---|---|---|
committer | David Disseldorp <ddiss@suse.de> | 2011-09-06 14:01:18 +0200 |
commit | 2f925f1e64acc46d994e7ee74dffeb8438128f20 (patch) | |
tree | c6a4ef8f58ad040a6706b671a5d0aac89e74c5e8 /ctdb/utils | |
parent | 9699e5750db81fe9cdd25ef24cbe1fbe8a200090 (diff) | |
download | samba-2f925f1e64acc46d994e7ee74dffeb8438128f20.tar.gz |
pmda: Attempt reconnects while ctdbd is unavailable
While ctdbd is unreachable, attempt to reconnect to it on each fetch.
We must provide our own queue callback wrapper, because
ctdb_client_read_cb() exits on transport failure.
(This used to be ctdb commit 28df6fbf1273b8d095a2bc38dca6a6c35c5c31bd)
Diffstat (limited to 'ctdb/utils')
-rw-r--r-- | ctdb/utils/pmda/pmda_ctdb.c | 191 |
1 files changed, 141 insertions, 50 deletions
diff --git a/ctdb/utils/pmda/pmda_ctdb.c b/ctdb/utils/pmda/pmda_ctdb.c index 40efb28af31..daaa8573d97 100644 --- a/ctdb/utils/pmda/pmda_ctdb.c +++ b/ctdb/utils/pmda/pmda_ctdb.c @@ -163,6 +163,115 @@ static pmdaMetric metrictab[] = { static struct ctdb_context *ctdb; static struct event_context *ev; +static void +pmda_ctdb_q_read_cb(uint8_t *data, size_t cnt, void *args) +{ + if (cnt == 0) { + fprintf(stderr, "ctdbd unreachable\n"); + /* cleanup on request timeout */ + return; + } + + ctdb_client_read_cb(data, cnt, args); +} + + +static int +pmda_ctdb_daemon_connect(void) +{ + const char *socket_name; + int ret; + struct sockaddr_un addr; + + ev = event_context_init(NULL); + if (ev == NULL) { + fprintf(stderr, "Failed to init event ctx\n"); + return -1; + } + + ctdb = ctdb_init(ev); + if (ctdb == NULL) { + fprintf(stderr, "Failed to init ctdb\n"); + goto err_ev; + } + + socket_name = getenv("CTDB_SOCKET"); + if (socket_name == NULL) { + socket_name = "/var/lib/ctdb/ctdb.socket"; + } + + ret = ctdb_set_socketname(ctdb, socket_name); + if (ret == -1) { + fprintf(stderr, "ctdb_set_socketname failed - %s\n", + ctdb_errstr(ctdb)); + goto err_ctdb; + } + + /* + * ctdb_socket_connect() sets up a default queue callback handler calls + * exit() if ctdbd is unavailable on recv, override with our own to + * handle this + */ + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)); + + ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0); + if (ctdb->daemon.sd == -1) { + fprintf(stderr, "Failed to open client socket\n"); + goto err_ctdb; + } + + set_nonblocking(ctdb->daemon.sd); + set_close_on_exec(ctdb->daemon.sd); + + if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + fprintf(stderr, "Failed to connect to ctdb daemon\n"); + goto err_sd; + } + + ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd, + CTDB_DS_ALIGNMENT, + pmda_ctdb_q_read_cb, ctdb); + if 
(ctdb->daemon.queue == NULL) { + fprintf(stderr, "Failed to setup queue\n"); + goto err_sd; + } + + ctdb->pnn = ctdb_ctrl_getpnn(ctdb, timeval_current_ofs(3, 0), + CTDB_CURRENT_NODE); + if (ctdb->pnn == (uint32_t)-1) { + fprintf(stderr, "Failed to get ctdb pnn\n"); + goto err_sd; + } + + return 0; +err_sd: + close(ctdb->daemon.sd); +err_ctdb: + talloc_free(ctdb); +err_ev: + talloc_free(ev); + ctdb = NULL; + return -1; +} + +static void +pmda_ctdb_daemon_disconnect(void) +{ + if (ctdb->methods) { + ctdb->methods->shutdown(ctdb); + } + + if (ctdb->daemon.sd != -1) + close(ctdb->daemon.sd); + + talloc_free(ctdb); + talloc_free(ev); + ctdb = NULL; +} + static int fill_node(unsigned int item, struct ctdb_statistics *stats, pmAtomValue *atom) { @@ -246,17 +355,44 @@ pmda_ctdb_fetch_cb(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom) { struct ctdb_statistics stats; int ret; + TDB_DATA data; + int32_t res; + struct timeval ctdb_timeout; + __pmID_int *id = (__pmID_int *)&(mdesc->m_desc.pmid); if (inst != PM_IN_NULL) return PM_ERR_INST; - ret = ctdb_ctrl_statistics(ctdb, ctdb->pnn, &stats); - if (ret) { + if (ctdb == NULL) { + fprintf(stderr, "ctdbd disconnected, stats not available\n"); ret = PM_ERR_VALUE; goto err_out; } + ctdb_timeout = timeval_current_ofs(1, 0); + ret = ctdb_control(ctdb, ctdb->pnn, 0, + CTDB_CONTROL_STATISTICS, 0, tdb_null, + ctdb, &data, &res, &ctdb_timeout, NULL); + + if (ret != 0 || res != 0) { + fprintf(stderr, "ctdb control for statistics failed, reconnecting\n"); + if (ctdb != NULL) + pmda_ctdb_daemon_disconnect(); + ret = PM_ERR_VALUE; + goto err_out; + } + + if (data.dsize != sizeof(struct ctdb_statistics)) { + fprintf(stderr, "incorrect statistics size %zu - not %zu\n", + data.dsize, sizeof(struct ctdb_statistics)); + ret = PM_ERR_VALUE; + goto err_out; + } + + stats = *(struct ctdb_statistics *)data.dptr; + talloc_free(data.dptr); + switch (id->cluster) { case 0: atom->ul = stats.num_clients; @@ -356,56 +492,11 @@ err_out: static int 
pmda_ctdb_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda) { - return pmdaFetch(numpmid, pmidlist, resp, pmda); -} - -static int -pmda_ctdb_daemon_connect(void) -{ - const char *socket_name; - int ret; - - ev = event_context_init(NULL); - if (ev == NULL) { - fprintf(stderr, "Failed to init event ctx\n"); - return -1; - } - - ctdb = ctdb_init(ev); if (ctdb == NULL) { - fprintf(stderr, "Failed to init ctdb\n"); - return -1; + fprintf(stderr, "attempting reconnect to ctdbd\n"); + pmda_ctdb_daemon_connect(); } - - socket_name = getenv("CTDB_SOCKET"); - if (socket_name == NULL) { - socket_name = "/var/lib/ctdb/ctdb.socket"; - } - - ret = ctdb_set_socketname(ctdb, socket_name); - if (ret == -1) { - fprintf(stderr, "ctdb_set_socketname failed - %s\n", - ctdb_errstr(ctdb)); - talloc_free(ctdb); - return -1; - } - - ret = ctdb_socket_connect(ctdb); - if (ret != 0) { - fprintf(stderr, "Failed to connect to daemon\n"); - talloc_free(ctdb); - return -1; - } - - ctdb->pnn = ctdb_ctrl_getpnn(ctdb, timeval_current_ofs(3, 0), - CTDB_CURRENT_NODE); - if (ctdb->pnn == (uint32_t)-1) { - fprintf(stderr, "Failed to get ctdb pnn\n"); - talloc_free(ctdb); - return -1; - } - - return 0; + return pmdaFetch(numpmid, pmidlist, resp, pmda); } /* |