diff options
author | Martin Schwenke <martin@meltin.net> | 2015-08-03 17:22:08 +1000 |
---|---|---|
committer | Amitay Isaacs <amitay@samba.org> | 2015-08-29 17:06:25 +0200 |
commit | b6a0e4b85699241ba90f25f4c605cbb7a6fc2146 (patch) | |
tree | bc2309f36fc8ca36dc69e44483908a6c9ee09f52 | |
parent | 02fa6c3d106e8fbf0e685afafa5e6a9bc0c3d22d (diff) | |
download | samba-b6a0e4b85699241ba90f25f4c605cbb7a6fc2146.tar.gz |
ctdb-scripts: New consistent system memory and swap monitoring
New variables CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE.
Both take a pair of <warn_threshold>:<unhealthy_threshold> where each
threshold is specified as a percentage.
This adds a callout to check_thresholds() that is run when the
unhealthy threshold is reached.
Add some combination tests.
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
-rw-r--r-- | ctdb/config/events.d/05.system | 57 | ||||
-rw-r--r-- | ctdb/doc/ctdbd.conf.5.xml | 38 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.011.sh | 5 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.012.sh | 5 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.013.sh | 13 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.014.sh | 7 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.015.sh | 9 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.016.sh | 16 | ||||
-rwxr-xr-x | ctdb/tests/eventscripts/05.system.monitor.017.sh | 42 | ||||
-rw-r--r-- | ctdb/tests/eventscripts/scripts/local.sh | 5 |
10 files changed, 119 insertions, 78 deletions
diff --git a/ctdb/config/events.d/05.system b/ctdb/config/events.d/05.system index 770c0dc6055..48946cc36f5 100644 --- a/ctdb/config/events.d/05.system +++ b/ctdb/config/events.d/05.system @@ -22,6 +22,7 @@ check_thresholds () _thing="$1" _thresholds="$2" _usage="$3" + _unhealthy_callout="$4" case "$_thresholds" in *:*) @@ -35,7 +36,9 @@ check_thresholds () if validate_percentage "$_unhealthy_threshold" "$_thing" ; then if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then - die "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%" + echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%" + eval "$_unhealthy_callout" + exit 1 fi fi @@ -73,11 +76,21 @@ monitor_filesystem_usage () done } +dump_memory_info () +{ + echo "CRITICAL: Shutting down CTDB!!!" + get_proc "meminfo" + ps auxfww + set_proc "sysrq-trigger" "m" + ctdb disable + sleep 3 + ctdb shutdown +} + monitor_memory_usage () { - if [ -z "$CTDB_MONITOR_FREE_MEMORY_WARN" -a \ - -z "$CTDB_MONITOR_FREE_MEMORY" -a \ - "$CTDB_CHECK_SWAP_IS_NOT_USED" != "yes" ] ; then + if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \ + -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then return fi @@ -98,35 +111,15 @@ END { _mem_usage="$1" _swap_usage="$2" - # Shutdown CTDB when memory is below the configured limit - if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then - if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY ] ; then - echo "CRITICAL: OOM - ${_mem_usage}% usage >= ${CTDB_MONITOR_FREE_MEMORY}% (CTDB threshold)" - echo "CRITICAL: Shutting down CTDB!!!" 
- echo "$_meminfo" - ps auxfww - set_proc "sysrq-trigger" "m" - ctdb disable - sleep 3 - ctdb shutdown - fi - fi + check_thresholds "System memory" \ + "$CTDB_MONITOR_MEMORY_USAGE" \ + "$_mem_usage" \ + dump_memory_info - # Warn when low on memory - if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then - if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then - echo "WARNING: memory usage is excessive - ${_mem_usage}% >= ${CTDB_MONITOR_FREE_MEMORY_WARN}% (CTDB threshold)" - fi - fi - - # We should never enter swap, so SwapTotal == SwapFree. - if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then - if [ $_swap_usage -gt 0 ] ; then - echo We are swapping: - echo "$_meminfo" - ps auxfww - fi - fi + check_thresholds "System swap" \ + "$CTDB_MONITOR_SWAP_USAGE" \ + "$_swap_usage" \ + dump_memory_info } diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml index 63c84aa5ac3..0e38d6acf41 100644 --- a/ctdb/doc/ctdbd.conf.5.xml +++ b/ctdb/doc/ctdbd.conf.5.xml @@ -1321,26 +1321,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000 </varlistentry> <varlistentry> - <term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term> + <term>CTDB_MONITOR_MEMORY_USAGE=<parameter>MEM-LIMITS</parameter></term> <listitem> <para> - Should a warning be logged if swap space is in use. - </para> - <para> - Default is no. - </para> - </listitem> - </varlistentry> - - <varlistentry> - <term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term> - <listitem> - <para> - NUM is threshold of acceptable memory usage, expressed - as a percentage. If this is set and memory usage - reaches this limit then some debug information will be - logged, the node will be disabled and then CTDB will be - shut down. + MEM-LIMITS takes the form + <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional> + indicating that warnings should be logged if memory + usage reaches WARN_LIMIT%. If usage reaches + UNHEALTHY_LIMIT then the node should be flagged + unhealthy. 
Either WARN_LIMIT or UNHEALTHY_LIMIT may be + left blank, meaning that check will be omitted. </para> <para> No default. @@ -1349,12 +1339,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000 </varlistentry> <varlistentry> - <term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term> + <term>CTDB_MONITOR_SWAP_USAGE=<parameter>SWAP-LIMITS</parameter></term> <listitem> <para> - NUM is threshold of acceptable memory usage, expressed - as a percentage. If this is set and memory usage - reaches this limit then a warning will be logged. + SWAP-LIMITS takes the form + <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional> + indicating that warnings should be logged if + swap usage reaches WARN_LIMIT%. If usage reaches + UNHEALTHY_LIMIT then the node should be flagged + unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be + left blank, meaning that check will be omitted. </para> <para> No default. diff --git a/ctdb/tests/eventscripts/05.system.monitor.011.sh b/ctdb/tests/eventscripts/05.system.monitor.011.sh index 79f55f06a91..79ceb90b768 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.011.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.011.sh @@ -6,9 +6,8 @@ define_test "Memory check, bad situation, no checks enabled" setup_memcheck 100 100 -CTDB_MONITOR_FREE_MEMORY="" -CTDB_MONITOR_FREE_MEMORY_WARN="" -CTDB_CHECK_SWAP_IS_NOT_USED="no" +CTDB_MONITOR_MEMORY_USAGE="" +CTDB_MONITOR_SWAP_USAGE="" ok_null diff --git a/ctdb/tests/eventscripts/05.system.monitor.012.sh b/ctdb/tests/eventscripts/05.system.monitor.012.sh index 6c06480824a..bb2c7b57811 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.012.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.012.sh @@ -6,9 +6,8 @@ define_test "Memory check, good situation, all enabled" setup_memcheck -CTDB_MONITOR_FREE_MEMORY="90" -CTDB_MONITOR_FREE_MEMORY_WARN="80" -CTDB_CHECK_SWAP_IS_NOT_USED="yes" +CTDB_MONITOR_MEMORY_USAGE="80:90" +CTDB_MONITOR_SWAP_USAGE="1:50" ok_null diff 
--git a/ctdb/tests/eventscripts/05.system.monitor.013.sh b/ctdb/tests/eventscripts/05.system.monitor.013.sh index dc3d40d0fa9..25fa780d89e 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.013.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.013.sh @@ -4,16 +4,17 @@ define_test "Memory check, bad situation, only swap check" -setup_memcheck 100 10 +setup_memcheck 100 90 -CTDB_MONITOR_FREE_MEMORY="" -CTDB_MONITOR_FREE_MEMORY_WARN="" -CTDB_CHECK_SWAP_IS_NOT_USED="yes" +CTDB_MONITOR_MEMORY_USAGE="" +CTDB_MONITOR_SWAP_USAGE=":50" -ok <<EOF -We are swapping: +required_result 1 <<EOF +ERROR: System swap utilization 90% >= threshold 50% +CRITICAL: Shutting down CTDB!!! $FAKE_PROC_MEMINFO $(ps foobar) +CTDB says BYE! EOF simple_test diff --git a/ctdb/tests/eventscripts/05.system.monitor.014.sh b/ctdb/tests/eventscripts/05.system.monitor.014.sh index 64c07416445..46955f34020 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.014.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.014.sh @@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory warning" setup_memcheck 90 10 -CTDB_MONITOR_FREE_MEMORY="" -CTDB_MONITOR_FREE_MEMORY_WARN="85" -CTDB_CHECK_SWAP_IS_NOT_USED="no" +CTDB_MONITOR_MEMORY_USAGE="85:" +CTDB_MONITOR_SWAP_USAGE="" ok <<EOF -WARNING: memory usage is excessive - 90% >= 85% (CTDB threshold) +WARNING: System memory utilization 90% >= threshold 85% EOF simple_test diff --git a/ctdb/tests/eventscripts/05.system.monitor.015.sh b/ctdb/tests/eventscripts/05.system.monitor.015.sh index e950bbd276e..3beac4cc91d 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.015.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.015.sh @@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory critical" setup_memcheck 90 0 -CTDB_MONITOR_FREE_MEMORY="85" -CTDB_MONITOR_FREE_MEMORY_WARN="" -CTDB_CHECK_SWAP_IS_NOT_USED="no" +CTDB_MONITOR_MEMORY_USAGE=":85" +CTDB_MONITOR_SWAP_USAGE="" -ok <<EOF -CRITICAL: OOM - 90% usage >= 85% (CTDB threshold) 
+required_result 1 <<EOF +ERROR: System memory utilization 90% >= threshold 85% CRITICAL: Shutting down CTDB!!! $FAKE_PROC_MEMINFO $(ps foobar) diff --git a/ctdb/tests/eventscripts/05.system.monitor.016.sh b/ctdb/tests/eventscripts/05.system.monitor.016.sh new file mode 100755 index 00000000000..44dddc688a9 --- /dev/null +++ b/ctdb/tests/eventscripts/05.system.monitor.016.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "Memory check, bad situation, both memory checks, causes warning" + +setup_memcheck 87 0 + +CTDB_MONITOR_MEMORY_USAGE="80:90" +CTDB_MONITOR_SWAP_USAGE="" + +ok <<EOF +WARNING: System memory utilization 87% >= threshold 80% +EOF + +simple_test diff --git a/ctdb/tests/eventscripts/05.system.monitor.017.sh b/ctdb/tests/eventscripts/05.system.monitor.017.sh new file mode 100755 index 00000000000..f1b6a26ec2e --- /dev/null +++ b/ctdb/tests/eventscripts/05.system.monitor.017.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "Memory check, bad situation, both memory checks, causes unhealthy" + +setup_memcheck 87 0 + +CTDB_MONITOR_MEMORY_USAGE="70:80" +CTDB_MONITOR_SWAP_USAGE="" + +required_result 1 <<EOF +ERROR: System memory utilization 87% >= threshold 80% +CRITICAL: Shutting down CTDB!!! +MemTotal: 3940712 kB +MemFree: 225268 kB +Buffers: 146120 kB +Cached: 140904 kB +SwapCached: 56016 kB +Active: 2422104 kB +Inactive: 1019928 kB +Active(anon): 1917580 kB +Inactive(anon): 523080 kB +Active(file): 504524 kB +Inactive(file): 496848 kB +Unevictable: 4844 kB +Mlocked: 4844 kB +SwapTotal: 5857276 kB +SwapFree: 5857276 kB +... +USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND +root 2 0.0 0.0 0 0 ? S Aug28 0:00 [kthreadd] +root 3 0.0 0.0 0 0 ? S Aug28 0:43 \_ [ksoftirqd/0] +... +root 1 0.0 0.0 2976 624 ? Ss Aug28 0:07 init [2] +root 495 0.0 0.0 3888 1640 ? Ss Aug28 0:00 udevd --daemon +... +[MORE FAKE ps OUTPUT] +CTDB says BYE! 
+EOF + +simple_test diff --git a/ctdb/tests/eventscripts/scripts/local.sh b/ctdb/tests/eventscripts/scripts/local.sh index ce1c2510bd0..57e022536cd 100644 --- a/ctdb/tests/eventscripts/scripts/local.sh +++ b/ctdb/tests/eventscripts/scripts/local.sh @@ -369,9 +369,8 @@ SwapTotal: ${_swap_total} kB SwapFree: ${_swap_free} kB ..." - export CTDB_MONITOR_FREE_MEMORY - export CTDB_MONITOR_FREE_MEMORY_WARN - export CTDB_CHECK_SWAP_IS_NOT_USED + export CTDB_MONITOR_MEMORY_USAGE + export CTDB_MONITOR_SWAP_USAGE } setup_fscheck () |