summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJudah Schvimer <judah.schvimer@10gen.com>2019-10-11 17:39:49 +0000
committerevergreen <evergreen@mongodb.com>2019-10-11 17:39:49 +0000
commit9b3801e457c4952e36f2a13d45387d647c301e03 (patch)
treea617e2650e6a812355c859ebe5998e0294b38b42
parent546e411b72cf6f75d24b304ce9219d1f3d3a4e4f (diff)
downloadmongo-9b3801e457c4952e36f2a13d45387d647c301e03.tar.gz
SERVER-34722 Add new server status metrics about oplog application
-rw-r--r--jstests/replsets/server_status_metrics.js75
-rw-r--r--src/mongo/db/repl/bgsync.cpp32
-rw-r--r--src/mongo/db/repl/reporter.cpp9
3 files changed, 110 insertions, 6 deletions
diff --git a/jstests/replsets/server_status_metrics.js b/jstests/replsets/server_status_metrics.js
index e7784ba0c76..50e58380299 100644
--- a/jstests/replsets/server_status_metrics.js
+++ b/jstests/replsets/server_status_metrics.js
@@ -1,9 +1,16 @@
+
+(function() {
+"use strict";
+
+load("jstests/libs/check_log.js");
+load("jstests/libs/write_concern_util.js");
+
/**
* Test replication metrics
*/
function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceived) {
var ss = secondary.getDB("test").serverStatus();
- printjson(ss.metrics);
+ jsTestLog(`Secondary ${secondary.host} metrics: ${tojson(ss.metrics)}`);
assert(ss.metrics.repl.network.readersCreated > 0, "no (oplog) readers created");
assert(ss.metrics.repl.network.getmores.num > 0, "no getmores");
@@ -18,6 +25,11 @@ function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceive
ss.metrics.repl.network.ops, opCount + baseOpsApplied, "wrong number of ops retrieved");
assert(ss.metrics.repl.network.bytes > 0, "zero or missing network bytes");
+ assert.gt(
+ ss.metrics.repl.network.replSetUpdatePosition.num, 0, "no update position commands sent");
+ assert.gt(ss.metrics.repl.syncSource.numSelections, 0, "num selections not incremented");
+ assert.gt(ss.metrics.repl.syncSource.numTimesChoseDifferent, 0, "no new sync source chosen");
+
assert(ss.metrics.repl.buffer.count >= 0, "buffer count missing");
assert(ss.metrics.repl.buffer.sizeBytes >= 0, "size (bytes)] missing");
assert(ss.metrics.repl.buffer.maxSizeBytes >= 0, "maxSize (bytes) missing");
@@ -30,9 +42,16 @@ function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceive
assert.eq(ss.metrics.repl.apply.ops, opCount + baseOpsApplied, "wrong number of applied ops");
}
-var rt = new ReplSetTest({name: "server_status_metrics", nodes: 2, oplogSize: 100});
+var rt = new ReplSetTest({
+ name: "server_status_metrics",
+ nodes: 2,
+ oplogSize: 100,
+ // Write periodic noops to aid sync source selection. ReplSetTest.initiate() requires at least a
+ // 2 second noop writer interval to converge on a lastApplied optime.
+ nodeOptions: {setParameter: {writePeriodicNoops: true, periodicNoopIntervalSecs: 2}}
+});
rt.startSet();
-rt.initiate();
+rt.initiateWithHighElectionTimeout();
rt.awaitSecondaryNodes();
@@ -52,7 +71,7 @@ var secondaryBaseOplogOpsReceived = ss.metrics.repl.apply.batchSize;
// add test docs
var bulk = testDB.a.initializeUnorderedBulkOp();
-for (x = 0; x < 1000; x++) {
+for (let x = 0; x < 1000; x++) {
bulk.insert({});
}
assert.commandWorked(bulk.execute({w: 2}));
@@ -68,7 +87,8 @@ testSecondaryMetrics(secondary, 2000, secondaryBaseOplogOpsApplied, secondaryBas
var startMillis = testDB.serverStatus().metrics.getLastError.wtime.totalMillis;
var startNum = testDB.serverStatus().metrics.getLastError.wtime.num;
-printjson(primary.getDB("test").serverStatus().metrics);
+jsTestLog(
+ `Primary ${primary.host} metrics #1: ${tojson(primary.getDB("test").serverStatus().metrics)}`);
assert.commandWorked(testDB.a.insert({x: 1}, {writeConcern: {w: 1, wtimeout: 5000}}));
assert.eq(testDB.serverStatus().metrics.getLastError.wtime.totalMillis, startMillis);
@@ -86,6 +106,49 @@ assert.eq(testDB.serverStatus().metrics.getLastError.wtime.num, startNum + 1);
assert.writeError(testDB.a.insert({x: 1}, {writeConcern: {w: 3, wtimeout: 50}}));
assert.eq(testDB.serverStatus().metrics.getLastError.wtime.num, startNum + 2);
-printjson(primary.getDB("test").serverStatus().metrics);
+jsTestLog(
+ `Primary ${primary.host} metrics #2: ${tojson(primary.getDB("test").serverStatus().metrics)}`);
+
+let ssOld = secondary.getDB("test").serverStatus().metrics.repl.syncSource;
+jsTestLog(`Secondary ${secondary.host} metrics before restarting replication: ${tojson(ssOld)}`);
+
+// Repeatedly restart replication and wait for the sync source to be rechosen. If the sync source
+// gets set to empty between stopping and restarting replication, then the secondary won't
+// increment numTimesChoseSame, so we do this in a loop.
+let ssNew;
+assert.soon(
+ function() {
+ stopServerReplication(secondary);
+ restartServerReplication(secondary);
+
+ // Do a dummy write to choose a new sync source and replicate the write to block on that.
+ assert.commandWorked(
+ primary.getDB("test").bar.insert({"dummy_write": 3}, {writeConcern: {w: 2}}));
+ ssNew = secondary.getDB("test").serverStatus().metrics.repl.syncSource;
+ jsTestLog(
+ `Secondary ${secondary.host} metrics after restarting replication: ${tojson(ssNew)}`);
+ return ssNew.numTimesChoseSame > ssOld.numTimesChoseSame;
+ },
+ "timed out waiting to re-choose same sync source",
+ null,
+ 3 * 1000 /* 3sec interval to wait for noop */);
+
+assert.gt(ssNew.numSelections, ssOld.numSelections, "num selections not incremented");
+assert.gt(ssNew.numTimesChoseSame, ssOld.numTimesChoseSame, "same sync source not chosen");
+
+// Stop the primary so the secondary cannot choose a sync source.
+ssOld = ssNew;
+rt.stop(primary);
+stopServerReplication(secondary);
+restartServerReplication(secondary);
+assert.soon(function() {
+ ssNew = secondary.getDB("test").serverStatus().metrics.repl.syncSource;
+ jsTestLog(`Secondary ${secondary.host} metrics after stopping primary: ${tojson(ssNew)}`);
+ return ssNew.numTimesCouldNotFind > ssOld.numTimesCouldNotFind;
+});
+
+assert.gt(ssNew.numSelections, ssOld.numSelections, "num selections not incremented");
+assert.gt(ssNew.numTimesCouldNotFind, ssOld.numTimesCouldNotFind, "found new sync source");
rt.stopSet();
+})();
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 8b6a0d4221c..497a0fefca9 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -74,6 +74,31 @@ const int kSleepToAllowBatchingMillis = 2;
const int kSmallBatchLimitBytes = 40000;
const Milliseconds kRollbackOplogSocketTimeout(10 * 60 * 1000);
+// The number of times a node attempted to choose a node to sync from among the available sync
+// source options. This occurs if we re-evaluate our sync source, receive an error from the source,
+// or step down.
+Counter64 numSyncSourceSelections;
+ServerStatusMetricField<Counter64> displayNumSyncSourceSelections("repl.syncSource.numSelections",
+ &numSyncSourceSelections);
+
+// The number of times a node kept it's original sync source after re-evaluating if its current sync
+// source was optimal.
+Counter64 numTimesChoseSameSyncSource;
+ServerStatusMetricField<Counter64> displayNumTimesChoseSameSyncSource(
+ "repl.syncSource.numTimesChoseSame", &numTimesChoseSameSyncSource);
+
+// The number of times a node chose a new sync source after re-evaluating if its current sync source
+// was optimal.
+Counter64 numTimesChoseDifferentSyncSource;
+ServerStatusMetricField<Counter64> displayNumTimesChoseDifferentSyncSource(
+ "repl.syncSource.numTimesChoseDifferent", &numTimesChoseDifferentSyncSource);
+
+// The number of times a node could not find a sync source when choosing a node to sync from among
+// the available options.
+Counter64 numTimesCouldNotFindSyncSource;
+ServerStatusMetricField<Counter64> displayNumTimesCouldNotFindSyncSource(
+ "repl.syncSource.numTimesCouldNotFind", &numTimesCouldNotFindSyncSource);
+
/**
* Extends DataReplicatorExternalStateImpl to be member state aware.
*/
@@ -298,6 +323,8 @@ void BackgroundSync::_produce() {
_syncSourceResolver.reset();
}
+ numSyncSourceSelections.increment(1);
+
if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
// All (accessible) sync sources are too far ahead of us.
if (_replCoord->getMemberState().primary()) {
@@ -351,11 +378,13 @@ void BackgroundSync::_produce() {
log() << "Chose same sync source candidate as last time, " << source
<< ". Sleeping for 1 second to avoid immediately choosing a new sync source for "
"the same reason as last time.";
+ numTimesChoseSameSyncSource.increment(1);
sleepsecs(1);
} else {
log() << "Changed sync source from "
<< (oldSource.empty() ? std::string("empty") : oldSource.toString()) << " to "
<< source;
+ numTimesChoseDifferentSyncSource.increment(1);
}
} else {
if (!syncSourceResp.isOK()) {
@@ -363,6 +392,9 @@ void BackgroundSync::_produce() {
<< syncSourceResp.syncSourceStatus.getStatus();
}
// No sync source found.
+ LOG(1) << "Could not find a sync source. Sleeping for 1 second before trying again.";
+ numTimesCouldNotFindSyncSource.increment(1);
+
sleepsecs(1);
return;
}
diff --git a/src/mongo/db/repl/reporter.cpp b/src/mongo/db/repl/reporter.cpp
index d659ee83965..f66eb930893 100644
--- a/src/mongo/db/repl/reporter.cpp
+++ b/src/mongo/db/repl/reporter.cpp
@@ -33,7 +33,9 @@
#include "mongo/db/repl/reporter.h"
+#include "mongo/base/counter.h"
#include "mongo/bson/util/bson_extract.h"
+#include "mongo/db/commands/server_status_metric.h"
#include "mongo/db/repl/update_position_args.h"
#include "mongo/rpc/get_status_from_command_result.h"
#include "mongo/util/assert_util.h"
@@ -47,6 +49,11 @@ namespace {
const char kConfigVersionFieldName[] = "configVersion";
+// The number of replSetUpdatePosition commands a node sent to its sync source.
+Counter64 numUpdatePosition;
+ServerStatusMetricField<Counter64> displayNumUpdatePosition(
+ "repl.network.replSetUpdatePosition.num", &numUpdatePosition);
+
/**
* Returns configuration version in update command object.
* Returns -1 on failure.
@@ -233,6 +240,8 @@ void Reporter::_sendCommand_inlock(BSONObj commandRequest, Milliseconds netTimeo
return;
}
+ numUpdatePosition.increment(1);
+
_remoteCommandCallbackHandle = scheduleResult.getValue();
}