summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlya Maximets <i.maximets@ovn.org>2021-05-06 14:47:31 +0200
committerIlya Maximets <i.maximets@ovn.org>2021-05-14 16:01:07 +0200
commit058702e3dcc61700bd587621d750d80985660a54 (patch)
tree9582ca3e082362ca5d44f3eab9ce26afbc3fba3d
parent3bc41e2b6a170dcd6b5c728a4ed3410a6ce0865d (diff)
downloadopenvswitch-058702e3dcc61700bd587621d750d80985660a54.tar.gz
raft: Transfer leadership before creating snapshots.
With a big database, writing a snapshot can take a lot of time; for example, on one of the systems, compaction of a 300MB database takes about 10 seconds to complete. For the clustered database, 40% of this time is taken by conversion of the database to the file transaction json format; the rest of the time is spent formatting a string and writing to disk. Of course, this highly depends on the disk and CPU speeds. 300MB is a very possible database size for the OVN Southbound DB, and it might be even bigger than that. During compaction the database is not available and the ovsdb-server doesn't do any other tasks. If the leader spends 10-15 seconds writing a snapshot, the cluster is not functional for that time period. The leader also likely has some monitors to serve, so one poll interval may end up being 15-20 seconds long. Systems with such big databases typically have very high election timers configured (16 seconds), so followers will start an election only after this significant amount of time. Once the leader is back to the operational state, it will re-connect and try to join the cluster back. In some cases, this might also trigger 'connected' state flapping on the old leader, triggering a re-connection of clients. This issue has been observed with large-scale OVN deployments. One of the methods to improve the situation is to transfer leadership before compacting. This allows the cluster to stay functional while one of the servers writes a snapshot. Additionally, the time spent on compaction is now logged if it was longer than 1 second. This adds a bit of visibility into 'unreasonably long poll interval' warnings. Reported-at: https://bugzilla.redhat.com/1960391 Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Dumitru Ceara <dceara@redhat.com>
-rw-r--r--ovsdb/ovsdb.c8
-rw-r--r--ovsdb/raft.c15
-rw-r--r--ovsdb/raft.h1
-rw-r--r--ovsdb/storage.c24
4 files changed, 41 insertions, 7 deletions
diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
index 9042658fa..e019631e9 100644
--- a/ovsdb/ovsdb.c
+++ b/ovsdb/ovsdb.c
@@ -31,6 +31,7 @@
#include "simap.h"
#include "storage.h"
#include "table.h"
+#include "timeval.h"
#include "transaction.h"
#include "trigger.h"
@@ -525,6 +526,7 @@ ovsdb_snapshot(struct ovsdb *db, bool trim_memory OVS_UNUSED)
return NULL;
}
+ uint64_t elapsed, start_time = time_msec();
struct json *schema = ovsdb_schema_to_json(db->schema);
struct json *data = ovsdb_to_txn_json(db, "compacting database online");
struct ovsdb_error *error = ovsdb_storage_store_snapshot(db->storage,
@@ -537,6 +539,12 @@ ovsdb_snapshot(struct ovsdb *db, bool trim_memory OVS_UNUSED)
malloc_trim(0);
}
#endif
+
+ elapsed = time_msec() - start_time;
+ if (elapsed > 1000) {
+ VLOG_INFO("%s: Database compaction took %"PRIu64"ms",
+ db->name, elapsed);
+ }
return error;
}
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index d08a7bcb6..ec4c24476 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -4139,9 +4139,24 @@ raft_may_snapshot(const struct raft *raft)
&& !raft->leaving
&& !raft->left
&& !raft->failed
+ && raft->role != RAFT_LEADER
&& raft->last_applied >= raft->log_start);
}
+/* Prepares for soon snapshotting. */
+void
+raft_notify_snapshot_recommended(struct raft *raft)
+{
+ if (raft->role == RAFT_LEADER) {
+ /* Leader is about to write database snapshot to the disk and this
+ * might take significant amount of time. Stepping back from the
+ * leadership to keep the cluster functional during this process. */
+ VLOG_INFO("Transferring leadership to write a snapshot.");
+ raft_transfer_leadership(raft, "preparing to write snapshot");
+ raft_become_follower(raft);
+ }
+}
+
/* Replaces the log for 'raft', up to the last log entry read, by
* 'new_snapshot_data'. Returns NULL if successful, otherwise an error that
* the caller must eventually free.
diff --git a/ovsdb/raft.h b/ovsdb/raft.h
index 99d5307e5..59902fe82 100644
--- a/ovsdb/raft.h
+++ b/ovsdb/raft.h
@@ -174,6 +174,7 @@ void raft_command_wait(const struct raft_command *);
bool raft_grew_lots(const struct raft *);
uint64_t raft_get_log_length(const struct raft *);
bool raft_may_snapshot(const struct raft *);
+void raft_notify_snapshot_recommended(struct raft *);
struct ovsdb_error *raft_store_snapshot(struct raft *,
const struct json *new_snapshot)
OVS_WARN_UNUSED_RESULT;
diff --git a/ovsdb/storage.c b/ovsdb/storage.c
index 7b4ad16f6..bd1fe0a33 100644
--- a/ovsdb/storage.c
+++ b/ovsdb/storage.c
@@ -509,14 +509,11 @@ ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
return false;
}
- /* If we can't snapshot right now, don't. */
- if (storage->raft && !raft_may_snapshot(storage->raft)) {
- return false;
- }
-
uint64_t log_len = (storage->raft
? raft_get_log_length(storage->raft)
: storage->n_read + storage->n_written);
+ bool snapshot_recommended = false;
+
if (now < storage->next_snapshot_max) {
/* Maximum snapshot time not yet reached. Take a snapshot if there
* have been at least 100 log entries and the log file size has
@@ -524,12 +521,25 @@ ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
bool grew_lots = (storage->raft
? raft_grew_lots(storage->raft)
: ovsdb_log_grew_lots(storage->log));
- return log_len >= 100 && grew_lots;
+ snapshot_recommended = (log_len >= 100 && grew_lots);
} else {
/* We have reached the maximum snapshot time. Take a snapshot if
* there have been any log entries at all. */
- return log_len > 0;
+ snapshot_recommended = (log_len > 0);
}
+
+ if (!snapshot_recommended) {
+ return false;
+ }
+
+ /* If we can't snapshot right now, don't. */
+ if (storage->raft && !raft_may_snapshot(storage->raft)) {
+ /* Notifying the storage that it needs to make a snapshot soon. */
+ raft_notify_snapshot_recommended(storage->raft);
+ return false;
+ }
+
+ return true;
}
return false;