summaryrefslogtreecommitdiff
path: root/source3/lib
diff options
context:
space:
mode:
authorMichael Adam <obnox@samba.org>2009-12-11 14:07:28 +0100
committerKarolin Seeger <kseeger@samba.org>2010-04-01 09:39:14 +0200
commit5e6666ff391b10ed9454fc8cba3c6795f3e2e036 (patch)
treea4eb1871c56e2d3649cf12b0dc4f378718c6933c /source3/lib
parenta3726f340eb211dfbac3929ad36496de2ac836dc (diff)
downloadsamba-5e6666ff391b10ed9454fc8cba3c6795f3e2e036.tar.gz
s3:dbwrap_ctdb: maintain a database sequence number that bumps in transactions
For persistent databases, a 64bit integer is kept in a special record __db_sequence_number__. This record is incremented with each completed transaction. The retry mechanism for failing TRANS3_COMMIT controls inside the db_ctdb_transaction_commit() function now relies on a modified behaviour of ctdbd's treatment of persistent databases in recoveries. Recently, a special treatment for persistent databases had been introduced in ctdb (1.0.108) to work around the problems with the original design of persistent transactions. Now with the rewrite we need to revert to the old behaviour that ctdb always takes the newest copies of all records. This change also paves the way for a next step, which will make recovery use the db seqnum to tell which node has the newest copy of a persistent db and use that node's copy. This will greatly reduce the amount of data transferred with each recovery. Michael (cherry picked from commit 3fe7ce141d6afe3825b06c5feb90558911e4df1e) (cherry picked from commit cfdf2d76878ee1ba32b2cb89f9ed740386990530)
Diffstat (limited to 'source3/lib')
-rw-r--r--source3/lib/dbwrap_ctdb.c121
1 files changed, 116 insertions, 5 deletions
diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 0986083268f..fb99e1d9cf5 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -664,6 +664,65 @@ static NTSTATUS db_ctdb_delete_transaction(struct db_record *rec)
return status;
}
+/**
+ * Fetch the db sequence number of a persistent db directly from the db.
+ *
+ * The record is looked up under CTDB_DB_SEQNUM_KEY; the key handed to
+ * db_ctdb_ltdb_fetch() includes the terminating NUL byte.
+ *
+ * @param db      ctdb database context to read the record from
+ * @param seqnum  out: receives the sequence number. It is set to 0 when
+ *                the fetched record is not exactly sizeof(uint64_t)
+ *                bytes long, and left untouched when the fetch fails.
+ *
+ * @return NT_STATUS_INVALID_PARAMETER if seqnum is NULL, otherwise the
+ *         status returned by db_ctdb_ltdb_fetch().
+ */
+static NTSTATUS db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx *db,
+						uint64_t *seqnum)
+{
+	NTSTATUS status;
+	const char *keyname = CTDB_DB_SEQNUM_KEY;
+	TDB_DATA key;
+	TDB_DATA data;
+	struct ctdb_ltdb_header header;
+	TALLOC_CTX *mem_ctx;
+
+	/*
+	 * Validate the argument before creating the stackframe, so that
+	 * the early return does not leak it.
+	 */
+	if (seqnum == NULL) {
+		return NT_STATUS_INVALID_PARAMETER;
+	}
+
+	mem_ctx = talloc_stackframe();
+
+	key.dptr = (uint8_t *)discard_const(keyname);
+	key.dsize = strlen(keyname) + 1;
+
+	/*
+	 * NOTE(review): a db that has never been committed to presumably has
+	 * no seqnum record yet - confirm what db_ctdb_ltdb_fetch() returns
+	 * in that case and whether it should be treated as seqnum 0.
+	 */
+	status = db_ctdb_ltdb_fetch(db, key, &header, mem_ctx, &data);
+	if (!NT_STATUS_IS_OK(status)) {
+		goto done;
+	}
+
+	if (data.dsize != sizeof(uint64_t)) {
+		/* Record of unexpected size: report sequence number 0. */
+		*seqnum = 0;
+		goto done;
+	}
+
+	/*
+	 * Copy byte-wise instead of dereferencing a casted pointer:
+	 * data.dptr is a byte buffer with no alignment guarantee visible
+	 * here.
+	 */
+	memcpy(seqnum, data.dptr, sizeof(*seqnum));
+
+done:
+	TALLOC_FREE(mem_ctx);
+	return status;
+}
+
+/**
+ * Store the database sequence number inside a transaction.
+ *
+ * The value is written under CTDB_DB_SEQNUM_KEY through
+ * db_ctdb_transaction_store().
+ *
+ * @param h       handle of the open transaction to add the store to
+ * @param seqnum  sequence number to store
+ *
+ * @return the status returned by db_ctdb_transaction_store().
+ */
+static NTSTATUS db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle *h,
+					uint64_t seqnum)
+{
+	const char *keyname = CTDB_DB_SEQNUM_KEY;
+	TDB_DATA key;
+	TDB_DATA data;
+
+	/*
+	 * The key must include the terminating NUL byte so that it is the
+	 * same key db_ctdb_fetch_db_seqnum_from_db() looks up; otherwise
+	 * the stored record would never be found again.
+	 */
+	key.dptr = (uint8_t *)discard_const(keyname);
+	key.dsize = strlen(keyname) + 1;
+
+	data.dptr = (uint8_t *)&seqnum;
+	data.dsize = sizeof(uint64_t);
+
+	return db_ctdb_transaction_store(h, key, data);
+}
+
/*
commit a transaction
*/
@@ -674,6 +733,8 @@ static int db_ctdb_transaction_commit(struct db_context *db)
NTSTATUS rets;
int status;
struct db_ctdb_transaction_handle *h = ctx->transaction;
+ uint64_t old_seqnum, new_seqnum;
+ int ret;
if (h == NULL) {
DEBUG(0,(__location__ " transaction commit with no open transaction on db 0x%08x\n", ctx->db_id));
@@ -693,6 +754,30 @@ static int db_ctdb_transaction_commit(struct db_context *db)
DEBUG(5,(__location__ " Commit transaction on db 0x%08x\n", ctx->db_id));
+ /*
+ * As the last db action before committing, bump the database sequence
+ * number. Note that this undoes all changes to the seqnum records
+ * performed under the transaction. This record is not meant to be
+ * modified by user interaction. It is for internal use only...
+ */
+ rets = db_ctdb_fetch_db_seqnum_from_db(ctx, &old_seqnum);
+ if (!NT_STATUS_IS_OK(rets)) {
+ DEBUG(1, (__location__ " failed to fetch the db sequence number "
+ "in transaction commit on db 0x%08x\n", ctx->db_id));
+ ret = -1;
+ goto done;
+ }
+
+ new_seqnum = old_seqnum + 1;
+
+ rets = db_ctdb_store_db_seqnum(h, new_seqnum);
+ if (!NT_STATUS_IS_OK(rets)) {
+ DEBUG(1, (__location__ "failed to store the db sequence number "
+ " in transaction commit on db 0x%08x\n", ctx->db_id));
+ ret = -1;
+ goto done;
+ }
+
again:
if (h->m_write == NULL) {
/* no changes were made, potentially after a retry */
@@ -707,14 +792,40 @@ again:
NULL, NULL, &status);
if (!NT_STATUS_IS_OK(rets) || status != 0) {
/*
- * TODO:
- * check the database sequence number and
- * compare it to the seqnum after applying the
- * marshall buffer. If it is the same: return success.
+ * The TRANS3_COMMIT control should only possibly fail when a
+ * recovery has been running concurrently. In any case, the db
+ * will be the same on all nodes, either the new copy or the
+ * old copy. This can be detected by comparing the old and new
+ * local sequence numbers.
+ */
+ rets = db_ctdb_fetch_db_seqnum_from_db(ctx, &new_seqnum);
+ if (!NT_STATUS_IS_OK(rets)) {
+ DEBUG(1, (__location__ " failed to refetch db sequence "
+ "number after failed TRANS3_COMMIT\n"));
+ ret = -1;
+ goto done;
+ }
+
+ if (new_seqnum == old_seqnum) {
+ /* Recovery prevented all our changes: retry. */
+ goto again;
+ } else if (new_seqnum != (old_seqnum + 1)) {
+ DEBUG(0, (__location__ " ERROR: new_seqnum[%lu] != "
+ "old_seqnum[%lu] + (0 or 1) after failed "
+ "TRANS3_COMMIT - this should not happen!\n",
+ (unsigned long)new_seqnum,
+ (unsigned long)old_seqnum));
+ ret = -1;
+ goto done;
+ }
+ /*
+ * Recovery propagated our changes to all nodes, completing
+ * our commit for us - succeed.
*/
- goto again;
}
+ ret = 0;
+
done:
h->ctx->transaction = NULL;
talloc_free(h);