diff options
author | Martin Schwenke <martin@meltin.net> | 2015-06-19 16:35:12 +1000 |
---|---|---|
committer | Amitay Isaacs <amitay@samba.org> | 2015-07-14 09:57:18 +0200 |
commit | 74428e5c1463d6c60880ba593c880bc36a8f1ff4 (patch) | |
tree | 0a47561e9e5bc42c73721323647957771bbd91e2 /ctdb | |
parent | 3161d611bb2931019a7d40c7795c12f0b70a903c (diff) | |
download | samba-74428e5c1463d6c60880ba593c880bc36a8f1ff4.tar.gz |
ctdb-scripts: Switch NFS checks to new style
Note that the 60.ganesha RPC checks need to be identical to those in
the nfs-checks.d/ directory. This is because the NFS unit test
infrastructure checks output against what should be produced by the
checks in nfs-checks.d/. This is a minor issue, since one of the aims
of this work is to remove the need for a separate 60.ganesha.
In most cases the configuration variable CTDB_NFS_DUMP_STUCK_THREADS is
now ignored. Instead, the desired number of threads is passed to the
command specified in the service_debug_cmd variable in a .check file.
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb')
-rwxr-xr-x | ctdb/config/events.d/60.ganesha | 29 | ||||
-rwxr-xr-x | ctdb/config/events.d/60.nfs | 2 | ||||
-rwxr-xr-x | ctdb/config/functions | 11 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/10.status.check | 7 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/20.nfs.check | 7 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/30.nlockmgr.check | 6 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/40.mountd.check | 7 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/50.rquotad.check | 7 | ||||
-rw-r--r-- | ctdb/config/nfs-checks.d/README | 28 | ||||
-rw-r--r-- | ctdb/packaging/RPM/ctdb.spec.in | 12 | ||||
l--------- | ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d | 1 | ||||
-rw-r--r-- | ctdb/tests/eventscripts/scripts/local.sh | 152 | ||||
-rwxr-xr-x | ctdb/wscript | 4 |
13 files changed, 172 insertions, 101 deletions
diff --git a/ctdb/config/events.d/60.ganesha b/ctdb/config/events.d/60.ganesha index 43c70df1c6d..2524fd472fb 100755 --- a/ctdb/config/events.d/60.ganesha +++ b/ctdb/config/events.d/60.ganesha @@ -222,25 +222,28 @@ case "$1" in update_tickles 2049 nfs_update_lock_info - # check that statd responds to rpc requests - # if statd is not running we try to restart it - # we only do this IF we have a rpc.statd command. - # For platforms where rpc.statd does not exist, we skip - # the check completely - p="rpc.statd" - type $p >/dev/null 2>/dev/null && \ - nfs_check_rpc_service "statd" \ - -ge 6 "verbose restart:b unhealthy" \ - % 2 "verbose restart:b" + nfs_check_service "status" <<EOF +version="1" # could drop this and use any version? +restart_every=2 +unhealthy_after=6 +service_stop_cmd="killall -q -9 rpc.statd" +service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT" +service_debug_cmd="program_stack_traces rpc.statd 5" +EOF if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then monitor_ganesha_nfsd fi # rquotad is sometimes not started correctly on RHEL5 - nfs_check_rpc_service "rquotad" \ - -ge 6 "verbose restart:b unhealthy" \ - % 2 "verbose restart:b" + nfs_check_service "rquotad" <<EOF +version="1" # could drop this and use any version? 
+restart_every=2 +unhealthy_after=6 +service_stop_cmd="killall -q -9 rpc.rquotad" +service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT" +service_debug_cmd="program_stack_traces rpc.rquotad 5" +EOF ;; *) diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index babff1e5330..997d676091b 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -92,7 +92,7 @@ case "$1" in update_tickles 2049 nfs_update_lock_info - nfs_check_rpc_services + nfs_check_services nfs_check_thread_count ;; diff --git a/ctdb/config/functions b/ctdb/config/functions index 0b0021c79b4..4290bfa841f 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -943,6 +943,11 @@ startstop_nfs() { nfs_dump_some_threads service nfsserver start ;; + restart-stop) + set_proc "fs/nfsd/threads" 0 + service nfsserver stop > /dev/null 2>&1 + pkill -9 nfsd + ;; esac ;; rhel) @@ -964,6 +969,12 @@ startstop_nfs() { service nfslock start service nfs start ;; + restart-stop) + set_proc "fs/nfsd/threads" 0 + service nfs stop > /dev/null 2>&1 + service nfslock stop > /dev/null 2>&1 + pkill -9 nfsd + ;; esac ;; *) diff --git a/ctdb/config/nfs-checks.d/10.status.check b/ctdb/config/nfs-checks.d/10.status.check new file mode 100644 index 00000000000..dfa5c59117e --- /dev/null +++ b/ctdb/config/nfs-checks.d/10.status.check @@ -0,0 +1,7 @@ +# status +version="1" +restart_every=2 +unhealthy_after=6 +service_stop_cmd="killall -q -9 rpc.statd" +service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT" +service_debug_cmd="program_stack_traces rpc.statd 5" diff --git a/ctdb/config/nfs-checks.d/20.nfs.check b/ctdb/config/nfs-checks.d/20.nfs.check new file mode 100644 index 00000000000..7229f7d9fe0 --- /dev/null +++ b/ctdb/config/nfs-checks.d/20.nfs.check @@ -0,0 +1,7 @@ +# nfs +version="3" +restart_every=10 +unhealthy_after=2 
+service_stop_cmd="startstop_nfs restart-stop" +service_start_cmd="startstop_nfs start" +service_debug_cmd="program_stack_traces nfsd 5" diff --git a/ctdb/config/nfs-checks.d/30.nlockmgr.check b/ctdb/config/nfs-checks.d/30.nlockmgr.check new file mode 100644 index 00000000000..c2e723e1051 --- /dev/null +++ b/ctdb/config/nfs-checks.d/30.nlockmgr.check @@ -0,0 +1,6 @@ +# nlockmgr +version="4" +restart_every=2 +unhealthy_after=6 +service_stop_cmd="startstop_nfslock stop" +service_start_cmd="startstop_nfslock start" diff --git a/ctdb/config/nfs-checks.d/40.mountd.check b/ctdb/config/nfs-checks.d/40.mountd.check new file mode 100644 index 00000000000..56b3fd29512 --- /dev/null +++ b/ctdb/config/nfs-checks.d/40.mountd.check @@ -0,0 +1,7 @@ +# mountd +version="1" +restart_every=2 +unhealthy_after=6 +service_stop_cmd="killall -q -9 rpc.mountd" +service_start_cmd="rpc.mountd $RPCMOUNTDOPTS ${MOUNTD_PORT:+-p} $MOUNTD_PORT" +service_debug_cmd="program_stack_traces rpc.mountd 5" diff --git a/ctdb/config/nfs-checks.d/50.rquotad.check b/ctdb/config/nfs-checks.d/50.rquotad.check new file mode 100644 index 00000000000..b7bd9d2c757 --- /dev/null +++ b/ctdb/config/nfs-checks.d/50.rquotad.check @@ -0,0 +1,7 @@ +# rquotad +version="1" +restart_every=2 +unhealthy_after=6 +service_stop_cmd="killall -q -9 rpc.rquotad" +service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT" +service_debug_cmd="program_stack_traces rpc.rquotad 5" diff --git a/ctdb/config/nfs-checks.d/README b/ctdb/config/nfs-checks.d/README new file mode 100644 index 00000000000..51ba54b7373 --- /dev/null +++ b/ctdb/config/nfs-checks.d/README @@ -0,0 +1,28 @@ +NFS check configuration files. + +Files are named NN.RPCSERVICE.check. Files without a .check suffix +are ignored. 
+ +Supported variables are: + +* family - "tcp" or "udp" or space separated list + default: tcp +* version - optional, RPC service version number + default is to omit to check for any version +* unhealthy_after - number of check fails before unhealthy + default: 1 +* restart_every - number of check fails before restart + default: 0, meaning no restart +* service_stop_cmd - command to stop service + default: no default, must be provided if + restart_every > 0 +* service_start_cmd - command to start service + default: no default, must be provided if + restart_every > 0 +* service_debug_cmd - command to debug a service after trying to stop it; + for example, it can be useful to print stack + traces of threads that have not exited, since + they may be stuck doing I/O; + no default, see also function program_stack_traces() + +Quoting inside values is not preserved. diff --git a/ctdb/packaging/RPM/ctdb.spec.in b/ctdb/packaging/RPM/ctdb.spec.in index ce7d8a629e9..503670023b8 100644 --- a/ctdb/packaging/RPM/ctdb.spec.in +++ b/ctdb/packaging/RPM/ctdb.spec.in @@ -125,6 +125,8 @@ install -m755 config/ctdb.init $RPM_BUILD_ROOT%{initdir}/ctdb # This is a hack. All documents should be installed in /usr/share/doc. 
rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/events.d/README cp config/events.d/README README.eventscripts +rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/nfs-checks.d/README +cp config/nfs-checks.d/README README.nfs-checks.d cp config/notify.d.README README.notify.d # Remove "*.old" files @@ -183,11 +185,11 @@ rm -rf $RPM_BUILD_ROOT %{_sysconfdir}/ctdb/events.d/70.iscsi %{_sysconfdir}/ctdb/events.d/91.lvs %{_sysconfdir}/ctdb/events.d/99.timeout -%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/10.statd.check -%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/20.nfsd.check -%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/30.lockd.check -%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/40.mountd.check -%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/50.rquotad.check +%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/10.status.check +%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/20.nfs.check +%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/30.nlockmgr.check +%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/40.mountd.check +%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/50.rquotad.check %{_sysconfdir}/ctdb/statd-callout %{_sbindir}/ctdbd %{_sbindir}/ctdbd_wrapper diff --git a/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d b/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d new file mode 120000 index 00000000000..3dc2161efd0 --- /dev/null +++ b/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d @@ -0,0 +1 @@ +../../../config/nfs-checks.d
\ No newline at end of file diff --git a/ctdb/tests/eventscripts/scripts/local.sh b/ctdb/tests/eventscripts/scripts/local.sh index 69b0d477570..b13399e97c3 100644 --- a/ctdb/tests/eventscripts/scripts/local.sh +++ b/ctdb/tests/eventscripts/scripts/local.sh @@ -893,22 +893,23 @@ EOF done } -mark_background () -{ - sed -e 's@^@\&@' -} - -convert_progname () +guess_output () { case "$1" in - nfs) echo "nfsd" ;; - nlockmgr) echo "lockd" ;; - status) echo "statd" ;; - *) echo "$1" ;; + startstop_nfslock\ start) + echo "&Starting nfslock: OK" + ;; + startstop_nfs\ start) + cat <<EOF +&Starting nfslock: OK +&Starting nfs: OK +EOF + ;; + *) + : # Nothing esac } - # Set the required result for a particular RPC program having failed # for a certain number of iterations. This is probably still a work # in progress. Note that we could hook aggressively @@ -920,12 +921,13 @@ convert_progname () rpc_set_service_failure_response () { _rpc_service="$1" - # The number of failures defaults to the iteration number. This - # will be true when we fail from the 1st iteration... but we need - # the flexibility to set the number of failures. 
- _numfails="${2:-${iteration:-1}}" + _numfails="${2:-1}" # default 1 - _progname=$(convert_progname "$_rpc_service") + # Default + ok_null + if [ $_numfails -eq 0 ] ; then + return + fi nfs_load_config @@ -933,79 +935,69 @@ rpc_set_service_failure_response () _nl=" " - # Default - ok_null + _dir="${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}" - _file=$(ls "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9]."${_progname}.check") + _file=$(ls "$_dir"/[0-9][0-9]."${_rpc_service}.check") [ -r "$_file" ] || die "RPC check file \"$_file\" does not exist or is not unique" - while read _op _li _actions ; do - # Skip comments - case "$_op" in - \#*) continue ;; - esac + _out=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR") + _rc_file=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR") - _hit=false - if [ "$_op" != "%" ] ; then - if [ $_numfails $_op $_li ] ; then - _hit=true - fi + ( + # Subshell to restrict scope variables... + + # Defaults + family="tcp" + version="" + unhealthy_after=1 + restart_every=0 + service_stop_cmd="" + service_start_cmd="" + service_debug_cmd="" + + # Don't bother syntax checking, eventscript does that... + . "$_file" + + # Just use the first version, default to 1. This is dumb but + # handles all the cases that we care about now... 
+ if [ -n "$version" ] ; then + _ver="${version%% *}" else - if [ $_numfails -gt 0 -a $(($_numfails $_op $_li)) -eq 0 ] ; then - _hit=true - fi + _ver=1 fi - if $_hit ; then - _out="" - _rc=0 - for _action in $_actions ; do - case "$_action" in - verbose) - _ver=1 - case "$_rpc_service" in - nfs) _ver=3 ;; - nlockmgr) _ver=4 ;; - esac - _out="\ -ERROR: $_rpc_service failed RPC check: + _rpc_check_out="\ +$_rpc_service failed RPC check: rpcinfo: RPC: Program not registered program $_rpc_service version $_ver is not available" - ;; - restart*) - _p="rpc.${_progname}" - case "$_action" in - *:b) _bg=mark_background ;; - *) _bg=cat ;; - esac - case "$_progname" in - nfsd) - _t=$(program_stack_traces "nfsd" 5) - _t="${_t}${_t:+${_nl}}Starting nfslock: OK -Starting nfs: OK" - _t=$(echo "$_t" | $_bg) - _t="\ -Trying to restart NFS service -${_t}" - ;; - lockd) - _t=$(echo "Starting nfslock: OK" | $_bg) - _t="Trying to restart lock manager service${_t:+${_nl}}${_t}" - ;; - *) - _t="Trying to restart $_progname [${_p}]" - _stacks=$(program_stack_traces "$_p" 5) - _t="${_t}${_stacks:+${_nl}}${_stacks}" - esac - _out="${_out}${_out:+${_nl}}${_t}" - ;; - unhealthy) - _rc=1 - esac - done - required_result $_rc "$_out" - return + + if [ $unhealthy_after -gt 0 -a $_numfails -ge $unhealthy_after ] ; then + _unhealthy=true + echo 1 >"$_rc_file" + echo "ERROR: ${_rpc_check_out}" >>"$_out" + else + _unhealthy=false + echo 0 >"$_rc_file" fi - done <"$_file" + + if [ $restart_every -gt 0 -a $(($_numfails % $restart_every)) -eq 0 ] ; then + if ! $_unhealthy ; then + echo "WARNING: ${_rpc_check_out}" >>"$_out" + fi + + echo "Trying to restart service \"${_rpc_service}\"..." 
>>"$_out" + + if [ -n "$service_debug_cmd" ] ; then + $service_debug_cmd 2>&1 >>"$_out" + fi + + guess_output "$service_start_cmd" >>"$_out" + fi + ) + + read _rc <"$_rc_file" + required_result $_rc <"$_out" + + rm -f "$_out" "$_rc_file" } ###################################################################### diff --git a/ctdb/wscript b/ctdb/wscript index add10ec0e78..7b3304b10a8 100755 --- a/ctdb/wscript +++ b/ctdb/wscript @@ -464,7 +464,7 @@ def build(bld): etc_subdirs = [ 'events.d', - 'nfs-rpc-checks.d' + 'nfs-checks.d' ] if bld.env.standalone_ctdb: @@ -627,7 +627,7 @@ def build(bld): test_eventscript_links = [ 'events.d', 'functions', - 'nfs-rpc-checks.d', + 'nfs-checks.d', 'statd-callout' ] |