author     David Disseldorp <ddiss@samba.org>	2016-12-01 13:33:22 +0100
committer  Amitay Isaacs <amitay@samba.org>	2016-12-09 04:10:20 +0100
commit     d8b61863ece6c5c231ac3e5b783c725864cfdad0
tree       fc68e8c35c89966c988cd397544f499154b2de60 /ctdb/utils/ceph
parent     cbc81dd78e4fe3c54e5930db0d1b89d1cdca367d
download   samba-d8b61863ece6c5c231ac3e5b783c725864cfdad0.tar.gz
ctdb: cluster mutex helper using Ceph RADOS
ctdb_mutex_ceph_rados_helper implements the cluster mutex helper API atop Ceph,
using the librados rados_lock_exclusive()/rados_unlock() functionality. Once
configured, split-brain avoidance during CTDB recovery is handled via locks
taken against an object located in a Ceph RADOS pool.

Signed-off-by: David Disseldorp <ddiss@samba.org>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
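The locking primitive itself is plain librados. As a standalone illustration of
what the helper below builds on (not part of this commit), the following sketch
takes and releases the same kind of exclusive lock. The cluster name "ceph",
user "client.admin", pool "ctdb_pool", object "ctdb_reclock" and the lock/cookie
strings are placeholder assumptions, not values mandated by the helper.

/*
 * Minimal librados exclusive-lock round trip (illustration only;
 * cluster/user/pool/object names are placeholders).
 * Build roughly as: cc -o rados_lock_demo rados_lock_demo.c -lrados
 */
#include <stdio.h>
#include <string.h>
#include <rados/librados.h>

int main(void)
{
    rados_t cluster;
    rados_ioctx_t ioctx;
    int ret;

    ret = rados_create2(&cluster, "ceph", "client.admin", 0);
    if (ret < 0) {
        fprintf(stderr, "rados_create2 failed: %s\n", strerror(-ret));
        return 1;
    }

    /* NULL path: read ceph.conf from the default search locations */
    ret = rados_conf_read_file(cluster, NULL);
    if (ret == 0) {
        ret = rados_connect(cluster);
    }
    if (ret == 0) {
        ret = rados_ioctx_create(cluster, "ctdb_pool", &ioctx);
    }
    if (ret < 0) {
        fprintf(stderr, "cluster setup failed: %s\n", strerror(-ret));
        rados_shutdown(cluster);
        return 1;
    }

    /* -EBUSY or -EEXIST here means another client holds the lock */
    ret = rados_lock_exclusive(ioctx, "ctdb_reclock", "demo_lock",
                               "demo_cookie", "demo lock", NULL, 0);
    if (ret == 0) {
        printf("lock acquired, releasing\n");
        rados_unlock(ioctx, "ctdb_reclock", "demo_lock", "demo_cookie");
    } else {
        printf("lock not acquired: %s\n", strerror(-ret));
    }

    rados_ioctx_destroy(ioctx);
    rados_shutdown(cluster);
    return ret ? 1 : 0;
}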
Diffstat (limited to 'ctdb/utils/ceph')
-rw-r--r--	ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c	328
1 file changed, 328 insertions, 0 deletions
diff --git a/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
new file mode 100644
index 00000000000..326a0b02519
--- /dev/null
+++ b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
@@ -0,0 +1,328 @@
+/*
+ CTDB mutex helper using Ceph librados locks
+
+ Copyright (C) David Disseldorp 2016
+
+ Based on ctdb_mutex_fcntl_helper.c, which is:
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "tevent.h"
+#include "talloc.h"
+#include "rados/librados.h"
+
+#define CTDB_MUTEX_CEPH_LOCK_NAME "ctdb_reclock_mutex"
+#define CTDB_MUTEX_CEPH_LOCK_COOKIE CTDB_MUTEX_CEPH_LOCK_NAME
+#define CTDB_MUTEX_CEPH_LOCK_DESC "CTDB recovery lock"
+
+#define CTDB_MUTEX_STATUS_HOLDING "0"
+#define CTDB_MUTEX_STATUS_CONTENDED "1"
+#define CTDB_MUTEX_STATUS_TIMEOUT "2"
+#define CTDB_MUTEX_STATUS_ERROR "3"
+
+static char *progname = NULL;
+
+static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name,
+ const char *ceph_auth_name,
+ const char *pool_name,
+ rados_t *_ceph_cluster,
+ rados_ioctx_t *_ioctx)
+{
+ rados_t ceph_cluster = NULL;
+ rados_ioctx_t ioctx = NULL;
+ int ret;
+
+ ret = rados_create2(&ceph_cluster, ceph_cluster_name, ceph_auth_name, 0);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to initialise Ceph cluster %s as %s"
+ " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+ strerror(-ret));
+ return ret;
+ }
+
+ /* path=NULL tells librados to use default locations */
+ ret = rados_conf_read_file(ceph_cluster, NULL);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to parse Ceph cluster config"
+ " - (%s)\n", progname, strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+ ret = rados_connect(ceph_cluster);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to connect to Ceph cluster %s as %s"
+ " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+ strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+
+ ret = rados_ioctx_create(ceph_cluster, pool_name, &ioctx);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to create Ceph ioctx for pool %s"
+ " - (%s)\n", progname, pool_name, strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+ *_ceph_cluster = ceph_cluster;
+ *_ioctx = ioctx;
+
+ return 0;
+}
+
+static void ctdb_mutex_rados_ctx_destroy(rados_t ceph_cluster,
+ rados_ioctx_t ioctx)
+{
+ rados_ioctx_destroy(ioctx);
+ rados_shutdown(ceph_cluster);
+}
+
+static int ctdb_mutex_rados_lock(rados_ioctx_t ioctx,
+ const char *oid)
+{
+ int ret;
+
+ ret = rados_lock_exclusive(ioctx, oid,
+ CTDB_MUTEX_CEPH_LOCK_NAME,
+ CTDB_MUTEX_CEPH_LOCK_COOKIE,
+ CTDB_MUTEX_CEPH_LOCK_DESC,
+ NULL, /* infinite duration */
+ 0);
+ if ((ret == -EEXIST) || (ret == -EBUSY)) {
+ /* lock contention */
+ return ret;
+ } else if (ret < 0) {
+ /* unexpected failure */
+ fprintf(stderr,
+ "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
+ progname, oid, strerror(-ret));
+ return ret;
+ }
+
+ /* lock obtained */
+ return 0;
+}
+
+static int ctdb_mutex_rados_unlock(rados_ioctx_t ioctx,
+ const char *oid)
+{
+ int ret;
+
+ ret = rados_unlock(ioctx, oid,
+ CTDB_MUTEX_CEPH_LOCK_NAME,
+ CTDB_MUTEX_CEPH_LOCK_COOKIE);
+ if (ret < 0) {
+ fprintf(stderr,
+ "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
+ progname, oid, strerror(-ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+struct ctdb_mutex_rados_state {
+ bool holding_mutex;
+ const char *ceph_cluster_name;
+ const char *ceph_auth_name;
+ const char *pool_name;
+ const char *object;
+ int ppid;
+ struct tevent_context *ev;
+ struct tevent_signal *sig_ev;
+ struct tevent_timer *timer_ev;
+ rados_t ceph_cluster;
+ rados_ioctx_t ioctx;
+};
+
+static void ctdb_mutex_rados_sigterm_cb(struct tevent_context *ev,
+ struct tevent_signal *se,
+ int signum,
+ int count,
+ void *siginfo,
+ void *private_data)
+{
+ struct ctdb_mutex_rados_state *cmr_state = private_data;
+ int ret;
+
+ if (!cmr_state->holding_mutex) {
+ fprintf(stderr, "Sigterm callback invoked without mutex!\n");
+ ret = -EINVAL;
+ goto err_ctx_cleanup;
+ }
+
+ ret = ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
+err_ctx_cleanup:
+ ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
+ cmr_state->ioctx);
+ talloc_free(cmr_state);
+ exit(ret ? 1 : 0);
+}
+
+static void ctdb_mutex_rados_timer_cb(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_mutex_rados_state *cmr_state = private_data;
+ int ret;
+
+ if (!cmr_state->holding_mutex) {
+ fprintf(stderr, "Timer callback invoked without mutex!\n");
+ ret = -EINVAL;
+ goto err_ctx_cleanup;
+ }
+
+ if ((kill(cmr_state->ppid, 0) == 0) || (errno != ESRCH)) {
+ /* parent still around, keep waiting */
+ cmr_state->timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+ tevent_timeval_current_ofs(5, 0),
+ ctdb_mutex_rados_timer_cb,
+ cmr_state);
+ if (cmr_state->timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ /* rely on signal cb */
+ }
+ return;
+ }
+
+ /* parent ended, drop lock and exit */
+ ret = ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
+err_ctx_cleanup:
+ ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
+ cmr_state->ioctx);
+ talloc_free(cmr_state);
+ exit(ret ? 1 : 0);
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+ struct ctdb_mutex_rados_state *cmr_state;
+
+ progname = argv[0];
+
+ if (argc != 5) {
+ fprintf(stderr, "Usage: %s <Ceph Cluster> <Ceph user> "
+ "<RADOS pool> <RADOS object>\n",
+ progname);
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ ret = setvbuf(stdout, NULL, _IONBF, 0);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to configure unbuffered stdout I/O\n");
+ }
+
+ cmr_state = talloc_zero(NULL, struct ctdb_mutex_rados_state);
+ if (cmr_state == NULL) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_out;
+ }
+
+ cmr_state->ceph_cluster_name = argv[1];
+ cmr_state->ceph_auth_name = argv[2];
+ cmr_state->pool_name = argv[3];
+ cmr_state->object = argv[4];
+
+ cmr_state->ppid = getppid();
+ if (cmr_state->ppid == 1) {
+ /*
+ * The original parent is gone and the process has
+ * been reparented to init. This can happen if the
+ * helper is started just as the parent is killed
+ * during shutdown. The error message doesn't need to
+ * be stellar, since there won't be anything around to
+ * capture and log it...
+ */
+ fprintf(stderr, "%s: PPID == 1\n", progname);
+ ret = -EPIPE;
+ goto err_state_free;
+ }
+
+ cmr_state->ev = tevent_context_init(cmr_state);
+ if (cmr_state->ev == NULL) {
+ fprintf(stderr, "tevent_context_init failed\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_state_free;
+ }
+
+ /* wait for sigterm */
+ cmr_state->sig_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
+ ctdb_mutex_rados_sigterm_cb,
+ cmr_state);
+ if (cmr_state->sig_ev == NULL) {
+ fprintf(stderr, "Failed to create signal event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_state_free;
+ }
+
+ /* periodically check parent */
+ cmr_state->timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+ tevent_timeval_current_ofs(5, 0),
+ ctdb_mutex_rados_timer_cb,
+ cmr_state);
+ if (cmr_state->timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_state_free;
+ }
+
+ ret = ctdb_mutex_rados_ctx_create(cmr_state->ceph_cluster_name,
+ cmr_state->ceph_auth_name,
+ cmr_state->pool_name,
+ &cmr_state->ceph_cluster,
+ &cmr_state->ioctx);
+ if (ret < 0) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ goto err_state_free;
+ }
+
+ ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object);
+ if ((ret == -EEXIST) || (ret == -EBUSY)) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_CONTENDED);
+ goto err_ctx_cleanup;
+ } else if (ret < 0) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ goto err_ctx_cleanup;
+ }
+
+ cmr_state->holding_mutex = true;
+ fprintf(stdout, CTDB_MUTEX_STATUS_HOLDING);
+
+ /* wait for the signal / timer events to do their work */
+ ret = tevent_loop_wait(cmr_state->ev);
+ if (ret < 0) {
+ fprintf(stderr, "tevent_loop_wait() failed: %d\n", ret);
+ }
+err_ctx_cleanup:
+ ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
+ cmr_state->ioctx);
+err_state_free:
+ talloc_free(cmr_state);
+err_out:
+ return ret ? 1 : 0;
+}
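
For context on how the helper's single-character status protocol is consumed:
the parent process reads one byte from the helper's stdout ('0' holding, '1'
contended, '2' timeout, '3' error, matching the CTDB_MUTEX_STATUS_* definitions
above) and later sends SIGTERM to make the helper drop the lock and exit. The
sketch below is a hypothetical driver for manual testing, not ctdbd's actual
cluster-mutex code; the install path and the cluster/user/pool/object arguments
are assumptions.

/*
 * Hypothetical driver for the helper above (not ctdbd code): spawn the
 * helper, read its status byte, then SIGTERM it to release the lock.
 * The helper path and its four arguments are placeholder assumptions.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int pipefd[2];
    pid_t pid;
    char status;

    if (pipe(pipefd) != 0) {
        perror("pipe");
        return 1;
    }

    pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        /* child: route the helper's stdout into the pipe */
        dup2(pipefd[1], STDOUT_FILENO);
        close(pipefd[0]);
        close(pipefd[1]);
        execl("/usr/libexec/ctdb/ctdb_mutex_ceph_rados_helper",
              "ctdb_mutex_ceph_rados_helper",
              "ceph", "client.admin", "ctdb_pool", "ctdb_reclock",
              (char *)NULL);
        _exit(127);	/* exec failed */
    }

    close(pipefd[1]);
    if (read(pipefd[0], &status, 1) != 1) {
        fprintf(stderr, "helper exited without reporting status\n");
        waitpid(pid, NULL, 0);
        return 1;
    }

    if (status == '0') {		/* CTDB_MUTEX_STATUS_HOLDING */
        printf("lock held; doing work, then releasing via SIGTERM\n");
        sleep(5);			/* stand-in for real work */
        kill(pid, SIGTERM);		/* helper unlocks and exits */
    } else if (status == '1') {		/* CTDB_MUTEX_STATUS_CONTENDED */
        printf("lock contended: another node holds it\n");
    } else {				/* '2' timeout or '3' error */
        printf("helper reported status '%c'\n", status);
    }

    waitpid(pid, NULL, 0);
    close(pipefd[0]);
    return 0;
}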