From 00a263982cdf666c1c9c409e4f07df8f5d38ab98 Mon Sep 17 00:00:00 2001
From: Amitay Isaacs
Date: Wed, 14 Feb 2018 15:18:17 +1100
Subject: ctdb-vacuum: Fix the incorrect counting of remote errors

If a node fails to delete a record in TRY_DELETE_RECORDS control during
vacuuming, then it's possible that other nodes also may fail to delete a
record. So instead of deleting the record from RB tree on first failure,
keep track of the remote failures.

Update delete_list.remote_error and delete_list.left statistics only once
per record during the delete_record_traverse.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=13641

Signed-off-by: Amitay Isaacs
Reviewed-by: Martin Schwenke
(cherry picked from commit ef052397173522ac2dd0d0bd9660a18a13a3e4fc)
---
 ctdb/server/ctdb_vacuum.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'ctdb')

diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c
index 5aa0ca7dcc0..8faf803efb9 100644
--- a/ctdb/server/ctdb_vacuum.c
+++ b/ctdb/server/ctdb_vacuum.c
@@ -107,6 +107,7 @@ struct delete_record_data {
 	struct ctdb_context *ctdb;
 	struct ctdb_db_context *ctdb_db;
 	struct ctdb_ltdb_header hdr;
+	uint32_t remote_fail_count;
 	TDB_DATA key;
 	uint8_t keydata[1];
 };
@@ -149,6 +150,7 @@ static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb,
 	memcpy(dd->keydata, key.dptr, key.dsize);
 
 	dd->hdr = *hdr;
+	dd->remote_fail_count = 0;
 
 	hash = ctdb_hash(&key);
 
@@ -451,6 +453,13 @@ static int delete_record_traverse(void *param, void *data)
 	uint32_t lmaster;
 	uint32_t hash = ctdb_hash(&(dd->key));
 
+	if (dd->remote_fail_count > 0) {
+		vdata->count.delete_list.remote_error++;
+		vdata->count.delete_list.left--;
+		talloc_free(dd);
+		return 0;
+	}
+
 	res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
 	if (res != 0) {
 		DEBUG(DEBUG_ERR,
@@ -828,22 +837,17 @@ static void ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
 						ctdb_hash(&reckey));
 				if (dd != NULL) {
 					/*
-					 * The other node could not delete the
-					 * record and it is the first node that
-					 * failed. So we should remove it from
-					 * the tree and update statistics.
+					 * The remote node could not delete the
+					 * record. Since other remote nodes can
+					 * also fail, we just mark the record.
 					 */
-					talloc_free(dd);
-					vdata->count.delete_list.remote_error++;
-					vdata->count.delete_list.left--;
+					dd->remote_fail_count++;
 				} else {
 					DEBUG(DEBUG_ERR, (__location__ " Failed to "
 					      "find record with hash 0x%08x coming "
 					      "back from TRY_DELETE_RECORDS "
 					      "control in delete list.\n",
 					      ctdb_hash(&reckey)));
-					vdata->count.delete_list.local_error++;
-					vdata->count.delete_list.left--;
 				}
 
 				rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
-- 
cgit v1.2.1