summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Schwenke <martin@meltin.net>2015-08-03 17:22:08 +1000
committerAmitay Isaacs <amitay@samba.org>2015-08-29 17:06:25 +0200
commitb6a0e4b85699241ba90f25f4c605cbb7a6fc2146 (patch)
treebc2309f36fc8ca36dc69e44483908a6c9ee09f52
parent02fa6c3d106e8fbf0e685afafa5e6a9bc0c3d22d (diff)
downloadsamba-b6a0e4b85699241ba90f25f4c605cbb7a6fc2146.tar.gz
ctdb-scripts: New consistent system memory and swap monitoring
New variables CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE. Both take a pair of <warn_threshold>:<unhealthy_threshold> where each threshold is specified as a percentage. This adds a callout to check_thresholds() that is run when the unhealthy threshold is reached. Add some combination tests. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
-rw-r--r--ctdb/config/events.d/05.system57
-rw-r--r--ctdb/doc/ctdbd.conf.5.xml38
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.011.sh5
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.012.sh5
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.013.sh13
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.014.sh7
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.015.sh9
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.016.sh16
-rwxr-xr-xctdb/tests/eventscripts/05.system.monitor.017.sh42
-rw-r--r--ctdb/tests/eventscripts/scripts/local.sh5
10 files changed, 119 insertions, 78 deletions
diff --git a/ctdb/config/events.d/05.system b/ctdb/config/events.d/05.system
index 770c0dc6055..48946cc36f5 100644
--- a/ctdb/config/events.d/05.system
+++ b/ctdb/config/events.d/05.system
@@ -22,6 +22,7 @@ check_thresholds ()
_thing="$1"
_thresholds="$2"
_usage="$3"
+ _unhealthy_callout="$4"
case "$_thresholds" in
*:*)
@@ -35,7 +36,9 @@ check_thresholds ()
if validate_percentage "$_unhealthy_threshold" "$_thing" ; then
if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then
- die "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
+ echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
+ eval "$_unhealthy_callout"
+ exit 1
fi
fi
@@ -73,11 +76,21 @@ monitor_filesystem_usage ()
done
}
+dump_memory_info ()
+{
+ echo "CRITICAL: Shutting down CTDB!!!"
+ get_proc "meminfo"
+ ps auxfww
+ set_proc "sysrq-trigger" "m"
+ ctdb disable
+ sleep 3
+ ctdb shutdown
+}
+
monitor_memory_usage ()
{
- if [ -z "$CTDB_MONITOR_FREE_MEMORY_WARN" -a \
- -z "$CTDB_MONITOR_FREE_MEMORY" -a \
- "$CTDB_CHECK_SWAP_IS_NOT_USED" != "yes" ] ; then
+ if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \
+ -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then
return
fi
@@ -98,35 +111,15 @@ END {
_mem_usage="$1"
_swap_usage="$2"
- # Shutdown CTDB when memory is below the configured limit
- if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
- if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY ] ; then
- echo "CRITICAL: OOM - ${_mem_usage}% usage >= ${CTDB_MONITOR_FREE_MEMORY}% (CTDB threshold)"
- echo "CRITICAL: Shutting down CTDB!!!"
- echo "$_meminfo"
- ps auxfww
- set_proc "sysrq-trigger" "m"
- ctdb disable
- sleep 3
- ctdb shutdown
- fi
- fi
+ check_thresholds "System memory" \
+ "$CTDB_MONITOR_MEMORY_USAGE" \
+ "$_mem_usage" \
+ dump_memory_info
- # Warn when low on memory
- if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then
- if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then
- echo "WARNING: memory usage is excessive - ${_mem_usage}% >= ${CTDB_MONITOR_FREE_MEMORY_WARN}% (CTDB threshold)"
- fi
- fi
-
- # We should never enter swap, so SwapTotal == SwapFree.
- if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then
- if [ $_swap_usage -gt 0 ] ; then
- echo We are swapping:
- echo "$_meminfo"
- ps auxfww
- fi
- fi
+ check_thresholds "System swap" \
+ "$CTDB_MONITOR_SWAP_USAGE" \
+ "$_swap_usage" \
+ dump_memory_info
}
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml
index 63c84aa5ac3..0e38d6acf41 100644
--- a/ctdb/doc/ctdbd.conf.5.xml
+++ b/ctdb/doc/ctdbd.conf.5.xml
@@ -1321,26 +1321,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
</varlistentry>
<varlistentry>
- <term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term>
+ <term>CTDB_MONITOR_MEMORY_USAGE=<parameter>MEM-LIMITS</parameter></term>
<listitem>
<para>
- Should a warning be logged if swap space is in use.
- </para>
- <para>
- Default is no.
- </para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
- <term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term>
- <listitem>
- <para>
- NUM is threshold of acceptable memory usage, expressed
- as a percentage. If this is set and memory usage
- reaches this limit then some debug information will be
- logged, the node will be disabled and then CTDB will be
- shut down.
+ MEM-LIMITS takes the form
+ <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
+ indicating that warnings should be logged if memory
+ usage reaches WARN_LIMIT%. If usage reaches
+ UNHEALTHY_LIMIT then the node should be flagged
+ unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
+ left blank, meaning that check will be omitted.
</para>
<para>
No default.
@@ -1349,12 +1339,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
</varlistentry>
<varlistentry>
- <term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term>
+ <term>CTDB_MONITOR_SWAP_USAGE=<parameter>SWAP-LIMITS</parameter></term>
<listitem>
<para>
- NUM is threshold of acceptable memory usage, expressed
- as a percentage. If this is set and memory usage
- reaches this limit then a warning will be logged.
+ SWAP-LIMITS takes the form
+ <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
+ indicating that warnings should be logged if
+ swap usage reaches WARN_LIMIT%. If usage reaches
+ UNHEALTHY_LIMIT then the node should be flagged
+ unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
+ left blank, meaning that check will be omitted.
</para>
<para>
No default.
diff --git a/ctdb/tests/eventscripts/05.system.monitor.011.sh b/ctdb/tests/eventscripts/05.system.monitor.011.sh
index 79f55f06a91..79ceb90b768 100755
--- a/ctdb/tests/eventscripts/05.system.monitor.011.sh
+++ b/ctdb/tests/eventscripts/05.system.monitor.011.sh
@@ -6,9 +6,8 @@ define_test "Memory check, bad situation, no checks enabled"
setup_memcheck 100 100
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE=""
+CTDB_MONITOR_SWAP_USAGE=""
ok_null
diff --git a/ctdb/tests/eventscripts/05.system.monitor.012.sh b/ctdb/tests/eventscripts/05.system.monitor.012.sh
index 6c06480824a..bb2c7b57811 100755
--- a/ctdb/tests/eventscripts/05.system.monitor.012.sh
+++ b/ctdb/tests/eventscripts/05.system.monitor.012.sh
@@ -6,9 +6,8 @@ define_test "Memory check, good situation, all enabled"
setup_memcheck
-CTDB_MONITOR_FREE_MEMORY="90"
-CTDB_MONITOR_FREE_MEMORY_WARN="80"
-CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+CTDB_MONITOR_MEMORY_USAGE="80:90"
+CTDB_MONITOR_SWAP_USAGE="1:50"
ok_null
diff --git a/ctdb/tests/eventscripts/05.system.monitor.013.sh b/ctdb/tests/eventscripts/05.system.monitor.013.sh
index dc3d40d0fa9..25fa780d89e 100755
--- a/ctdb/tests/eventscripts/05.system.monitor.013.sh
+++ b/ctdb/tests/eventscripts/05.system.monitor.013.sh
@@ -4,16 +4,17 @@
define_test "Memory check, bad situation, only swap check"
-setup_memcheck 100 10
+setup_memcheck 100 90
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+CTDB_MONITOR_MEMORY_USAGE=""
+CTDB_MONITOR_SWAP_USAGE=":50"
-ok <<EOF
-We are swapping:
+required_result 1 <<EOF
+ERROR: System swap utilization 90% >= threshold 50%
+CRITICAL: Shutting down CTDB!!!
$FAKE_PROC_MEMINFO
$(ps foobar)
+CTDB says BYE!
EOF
simple_test
diff --git a/ctdb/tests/eventscripts/05.system.monitor.014.sh b/ctdb/tests/eventscripts/05.system.monitor.014.sh
index 64c07416445..46955f34020 100755
--- a/ctdb/tests/eventscripts/05.system.monitor.014.sh
+++ b/ctdb/tests/eventscripts/05.system.monitor.014.sh
@@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory warning"
setup_memcheck 90 10
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN="85"
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE="85:"
+CTDB_MONITOR_SWAP_USAGE=""
ok <<EOF
-WARNING: memory usage is excessive - 90% >= 85% (CTDB threshold)
+WARNING: System memory utilization 90% >= threshold 85%
EOF
simple_test
diff --git a/ctdb/tests/eventscripts/05.system.monitor.015.sh b/ctdb/tests/eventscripts/05.system.monitor.015.sh
index e950bbd276e..3beac4cc91d 100755
--- a/ctdb/tests/eventscripts/05.system.monitor.015.sh
+++ b/ctdb/tests/eventscripts/05.system.monitor.015.sh
@@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory critical"
setup_memcheck 90 0
-CTDB_MONITOR_FREE_MEMORY="85"
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE=":85"
+CTDB_MONITOR_SWAP_USAGE=""
-ok <<EOF
-CRITICAL: OOM - 90% usage >= 85% (CTDB threshold)
+required_result 1 <<EOF
+ERROR: System memory utilization 90% >= threshold 85%
CRITICAL: Shutting down CTDB!!!
$FAKE_PROC_MEMINFO
$(ps foobar)
diff --git a/ctdb/tests/eventscripts/05.system.monitor.016.sh b/ctdb/tests/eventscripts/05.system.monitor.016.sh
new file mode 100755
index 00000000000..44dddc688a9
--- /dev/null
+++ b/ctdb/tests/eventscripts/05.system.monitor.016.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, both memory checks, causes warning"
+
+setup_memcheck 87 0
+
+CTDB_MONITOR_MEMORY_USAGE="80:90"
+CTDB_MONITOR_SWAP_USAGE=""
+
+ok <<EOF
+WARNING: System memory utilization 87% >= threshold 80%
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/05.system.monitor.017.sh b/ctdb/tests/eventscripts/05.system.monitor.017.sh
new file mode 100755
index 00000000000..f1b6a26ec2e
--- /dev/null
+++ b/ctdb/tests/eventscripts/05.system.monitor.017.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, both memory checks, causes unhealthy"
+
+setup_memcheck 87 0
+
+CTDB_MONITOR_MEMORY_USAGE="70:80"
+CTDB_MONITOR_SWAP_USAGE=""
+
+required_result 1 <<EOF
+ERROR: System memory utilization 87% >= threshold 80%
+CRITICAL: Shutting down CTDB!!!
+MemTotal: 3940712 kB
+MemFree: 225268 kB
+Buffers: 146120 kB
+Cached: 140904 kB
+SwapCached: 56016 kB
+Active: 2422104 kB
+Inactive: 1019928 kB
+Active(anon): 1917580 kB
+Inactive(anon): 523080 kB
+Active(file): 504524 kB
+Inactive(file): 496848 kB
+Unevictable: 4844 kB
+Mlocked: 4844 kB
+SwapTotal: 5857276 kB
+SwapFree: 5857276 kB
+...
+USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
+root 2 0.0 0.0 0 0 ? S Aug28 0:00 [kthreadd]
+root 3 0.0 0.0 0 0 ? S Aug28 0:43 \_ [ksoftirqd/0]
+...
+root 1 0.0 0.0 2976 624 ? Ss Aug28 0:07 init [2]
+root 495 0.0 0.0 3888 1640 ? Ss Aug28 0:00 udevd --daemon
+...
+[MORE FAKE ps OUTPUT]
+CTDB says BYE!
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/scripts/local.sh b/ctdb/tests/eventscripts/scripts/local.sh
index ce1c2510bd0..57e022536cd 100644
--- a/ctdb/tests/eventscripts/scripts/local.sh
+++ b/ctdb/tests/eventscripts/scripts/local.sh
@@ -369,9 +369,8 @@ SwapTotal: ${_swap_total} kB
SwapFree: ${_swap_free} kB
..."
- export CTDB_MONITOR_FREE_MEMORY
- export CTDB_MONITOR_FREE_MEMORY_WARN
- export CTDB_CHECK_SWAP_IS_NOT_USED
+ export CTDB_MONITOR_MEMORY_USAGE
+ export CTDB_MONITOR_SWAP_USAGE
}
setup_fscheck ()