summary | refs | log | tree | commit | diff
path: root/ctdb
diff options
context:
space:
mode:
author Martin Schwenke <martin@meltin.net> 2015-06-19 16:35:12 +1000
committer Amitay Isaacs <amitay@samba.org> 2015-07-14 09:57:18 +0200
commit 74428e5c1463d6c60880ba593c880bc36a8f1ff4 (patch)
tree 0a47561e9e5bc42c73721323647957771bbd91e2 /ctdb
parent 3161d611bb2931019a7d40c7795c12f0b70a903c (diff)
download samba-74428e5c1463d6c60880ba593c880bc36a8f1ff4.tar.gz
ctdb-scripts: Switch NFS checks to new style
Note that the 60.ganesha RPC checks need to be identical to those in the nfs-checks.d/ directory. This is because the NFS unit test infrastructure checks output against what should be produced by the checks in nfs-checks.d/. This is a minor issue, since one of the aims of this work is to remove the need for a separate 60.ganesha. In most cases configuration variable CTDB_NFS_DUMP_STUCK_THREADS is now ignored. This is now handled by passing the desired number of threads to the command specified in the service_debug_cmd variable in a .check file. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb')
-rwxr-xr-x ctdb/config/events.d/60.ganesha 29
-rwxr-xr-x ctdb/config/events.d/60.nfs 2
-rwxr-xr-x ctdb/config/functions 11
-rw-r--r-- ctdb/config/nfs-checks.d/10.status.check 7
-rw-r--r-- ctdb/config/nfs-checks.d/20.nfs.check 7
-rw-r--r-- ctdb/config/nfs-checks.d/30.nlockmgr.check 6
-rw-r--r-- ctdb/config/nfs-checks.d/40.mountd.check 7
-rw-r--r-- ctdb/config/nfs-checks.d/50.rquotad.check 7
-rw-r--r-- ctdb/config/nfs-checks.d/README 28
-rw-r--r-- ctdb/packaging/RPM/ctdb.spec.in 12
l--------- ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d 1
-rw-r--r-- ctdb/tests/eventscripts/scripts/local.sh 152
-rwxr-xr-x ctdb/wscript 4
13 files changed, 172 insertions, 101 deletions
diff --git a/ctdb/config/events.d/60.ganesha b/ctdb/config/events.d/60.ganesha
index 43c70df1c6d..2524fd472fb 100755
--- a/ctdb/config/events.d/60.ganesha
+++ b/ctdb/config/events.d/60.ganesha
@@ -222,25 +222,28 @@ case "$1" in
update_tickles 2049
nfs_update_lock_info
- # check that statd responds to rpc requests
- # if statd is not running we try to restart it
- # we only do this IF we have a rpc.statd command.
- # For platforms where rpc.statd does not exist, we skip
- # the check completely
- p="rpc.statd"
- type $p >/dev/null 2>/dev/null && \
- nfs_check_rpc_service "statd" \
- -ge 6 "verbose restart:b unhealthy" \
- % 2 "verbose restart:b"
+ nfs_check_service "status" <<EOF
+version="1" # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
+EOF
if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then
monitor_ganesha_nfsd
fi
# rquotad is sometimes not started correctly on RHEL5
- nfs_check_rpc_service "rquotad" \
- -ge 6 "verbose restart:b unhealthy" \
- % 2 "verbose restart:b"
+ nfs_check_service "rquotad" <<EOF
+version="1" # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
+EOF
;;
*)
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
index babff1e5330..997d676091b 100755
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -92,7 +92,7 @@ case "$1" in
update_tickles 2049
nfs_update_lock_info
- nfs_check_rpc_services
+ nfs_check_services
nfs_check_thread_count
;;
diff --git a/ctdb/config/functions b/ctdb/config/functions
index 0b0021c79b4..4290bfa841f 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -943,6 +943,11 @@ startstop_nfs() {
nfs_dump_some_threads
service nfsserver start
;;
+ restart-stop)
+ set_proc "fs/nfsd/threads" 0
+ service nfsserver stop > /dev/null 2>&1
+ pkill -9 nfsd
+ ;;
esac
;;
rhel)
@@ -964,6 +969,12 @@ startstop_nfs() {
service nfslock start
service nfs start
;;
+ restart-stop)
+ set_proc "fs/nfsd/threads" 0
+ service nfs stop > /dev/null 2>&1
+ service nfslock stop > /dev/null 2>&1
+ pkill -9 nfsd
+ ;;
esac
;;
*)
diff --git a/ctdb/config/nfs-checks.d/10.status.check b/ctdb/config/nfs-checks.d/10.status.check
new file mode 100644
index 00000000000..dfa5c59117e
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/10.status.check
@@ -0,0 +1,7 @@
+# status
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
diff --git a/ctdb/config/nfs-checks.d/20.nfs.check b/ctdb/config/nfs-checks.d/20.nfs.check
new file mode 100644
index 00000000000..7229f7d9fe0
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/20.nfs.check
@@ -0,0 +1,7 @@
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="startstop_nfs restart-stop"
+service_start_cmd="startstop_nfs start"
+service_debug_cmd="program_stack_traces nfsd 5"
diff --git a/ctdb/config/nfs-checks.d/30.nlockmgr.check b/ctdb/config/nfs-checks.d/30.nlockmgr.check
new file mode 100644
index 00000000000..c2e723e1051
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/30.nlockmgr.check
@@ -0,0 +1,6 @@
+# nlockmgr
+version="4"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="startstop_nfslock stop"
+service_start_cmd="startstop_nfslock start"
diff --git a/ctdb/config/nfs-checks.d/40.mountd.check b/ctdb/config/nfs-checks.d/40.mountd.check
new file mode 100644
index 00000000000..56b3fd29512
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/40.mountd.check
@@ -0,0 +1,7 @@
+# mountd
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.mountd"
+service_start_cmd="rpc.mountd $RPCMOUNTDOPTS ${MOUNTD_PORT:+-p} $MOUNTD_PORT"
+service_debug_cmd="program_stack_traces rpc.mountd 5"
diff --git a/ctdb/config/nfs-checks.d/50.rquotad.check b/ctdb/config/nfs-checks.d/50.rquotad.check
new file mode 100644
index 00000000000..b7bd9d2c757
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/50.rquotad.check
@@ -0,0 +1,7 @@
+# rquotad
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
diff --git a/ctdb/config/nfs-checks.d/README b/ctdb/config/nfs-checks.d/README
new file mode 100644
index 00000000000..51ba54b7373
--- /dev/null
+++ b/ctdb/config/nfs-checks.d/README
@@ -0,0 +1,28 @@
+NFS check configuration files.
+
+Files are named NN.RPCSERVICE.check. Files without a .check suffix
+are ignored.
+
+Supported variables are:
+
+* family - "tcp" or "udp" or space separated list
+ default: tcp
+* version - optional, RPC service version number
+ default is to omit to check for any version
+* unhealthy_after - number of check fails before unhealthy
+ default: 1
+* restart_every - number of check fails before restart
+ default: 0, meaning no restart
+* service_stop_cmd - command to stop service
+ default: no default, must be provided if
+ restart_every > 0
+* service_start_cmd - command to start service
+ default: no default, must be provided if
+ restart_every > 0
+* service_debug_cmd - command to debug a service after trying to stop it;
+ for example, it can be useful to print stack
+ traces of threads that have not exited, since
+ they may be stuck doing I/O;
+ no default, see also function program_stack_traces()
+
+Quoting inside values is not preserved.
diff --git a/ctdb/packaging/RPM/ctdb.spec.in b/ctdb/packaging/RPM/ctdb.spec.in
index ce7d8a629e9..503670023b8 100644
--- a/ctdb/packaging/RPM/ctdb.spec.in
+++ b/ctdb/packaging/RPM/ctdb.spec.in
@@ -125,6 +125,8 @@ install -m755 config/ctdb.init $RPM_BUILD_ROOT%{initdir}/ctdb
# This is a hack. All documents should be installed in /usr/share/doc.
rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/events.d/README
cp config/events.d/README README.eventscripts
+rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/nfs-checks.d/README
+cp config/nfs-checks.d/README README.nfs-checks.d
cp config/notify.d.README README.notify.d
# Remove "*.old" files
@@ -183,11 +185,11 @@ rm -rf $RPM_BUILD_ROOT
%{_sysconfdir}/ctdb/events.d/70.iscsi
%{_sysconfdir}/ctdb/events.d/91.lvs
%{_sysconfdir}/ctdb/events.d/99.timeout
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/10.statd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/20.nfsd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/30.lockd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/40.mountd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/50.rquotad.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/10.status.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/20.nfs.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/30.nlockmgr.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/40.mountd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/50.rquotad.check
%{_sysconfdir}/ctdb/statd-callout
%{_sbindir}/ctdbd
%{_sbindir}/ctdbd_wrapper
diff --git a/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d b/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d
new file mode 120000
index 00000000000..3dc2161efd0
--- /dev/null
+++ b/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d
@@ -0,0 +1 @@
+../../../config/nfs-checks.d
\ No newline at end of file
diff --git a/ctdb/tests/eventscripts/scripts/local.sh b/ctdb/tests/eventscripts/scripts/local.sh
index 69b0d477570..b13399e97c3 100644
--- a/ctdb/tests/eventscripts/scripts/local.sh
+++ b/ctdb/tests/eventscripts/scripts/local.sh
@@ -893,22 +893,23 @@ EOF
done
}
-mark_background ()
-{
- sed -e 's@^@\&@'
-}
-
-convert_progname ()
+guess_output ()
{
case "$1" in
- nfs) echo "nfsd" ;;
- nlockmgr) echo "lockd" ;;
- status) echo "statd" ;;
- *) echo "$1" ;;
+ startstop_nfslock\ start)
+ echo "&Starting nfslock: OK"
+ ;;
+ startstop_nfs\ start)
+ cat <<EOF
+&Starting nfslock: OK
+&Starting nfs: OK
+EOF
+ ;;
+ *)
+ : # Nothing
esac
}
-
# Set the required result for a particular RPC program having failed
# for a certain number of iterations. This is probably still a work
# in progress. Note that we could hook aggressively
@@ -920,12 +921,13 @@ convert_progname ()
rpc_set_service_failure_response ()
{
_rpc_service="$1"
- # The number of failures defaults to the iteration number. This
- # will be true when we fail from the 1st iteration... but we need
- # the flexibility to set the number of failures.
- _numfails="${2:-${iteration:-1}}"
+ _numfails="${2:-1}" # default 1
- _progname=$(convert_progname "$_rpc_service")
+ # Default
+ ok_null
+ if [ $_numfails -eq 0 ] ; then
+ return
+ fi
nfs_load_config
@@ -933,79 +935,69 @@ rpc_set_service_failure_response ()
_nl="
"
- # Default
- ok_null
+ _dir="${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}"
- _file=$(ls "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9]."${_progname}.check")
+ _file=$(ls "$_dir"/[0-9][0-9]."${_rpc_service}.check")
[ -r "$_file" ] || die "RPC check file \"$_file\" does not exist or is not unique"
- while read _op _li _actions ; do
- # Skip comments
- case "$_op" in
- \#*) continue ;;
- esac
+ _out=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
+ _rc_file=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
- _hit=false
- if [ "$_op" != "%" ] ; then
- if [ $_numfails $_op $_li ] ; then
- _hit=true
- fi
+ (
+ # Subshell to restrict scope variables...
+
+ # Defaults
+ family="tcp"
+ version=""
+ unhealthy_after=1
+ restart_every=0
+ service_stop_cmd=""
+ service_start_cmd=""
+ service_debug_cmd=""
+
+ # Don't bother syntax checking, eventscript does that...
+ . "$_file"
+
+ # Just use the first version, default to 1. This is dumb but
+ # handles all the cases that we care about now...
+ if [ -n "$version" ] ; then
+ _ver="${version%% *}"
else
- if [ $_numfails -gt 0 -a $(($_numfails $_op $_li)) -eq 0 ] ; then
- _hit=true
- fi
+ _ver=1
fi
- if $_hit ; then
- _out=""
- _rc=0
- for _action in $_actions ; do
- case "$_action" in
- verbose)
- _ver=1
- case "$_rpc_service" in
- nfs) _ver=3 ;;
- nlockmgr) _ver=4 ;;
- esac
- _out="\
-ERROR: $_rpc_service failed RPC check:
+ _rpc_check_out="\
+$_rpc_service failed RPC check:
rpcinfo: RPC: Program not registered
program $_rpc_service version $_ver is not available"
- ;;
- restart*)
- _p="rpc.${_progname}"
- case "$_action" in
- *:b) _bg=mark_background ;;
- *) _bg=cat ;;
- esac
- case "$_progname" in
- nfsd)
- _t=$(program_stack_traces "nfsd" 5)
- _t="${_t}${_t:+${_nl}}Starting nfslock: OK
-Starting nfs: OK"
- _t=$(echo "$_t" | $_bg)
- _t="\
-Trying to restart NFS service
-${_t}"
- ;;
- lockd)
- _t=$(echo "Starting nfslock: OK" | $_bg)
- _t="Trying to restart lock manager service${_t:+${_nl}}${_t}"
- ;;
- *)
- _t="Trying to restart $_progname [${_p}]"
- _stacks=$(program_stack_traces "$_p" 5)
- _t="${_t}${_stacks:+${_nl}}${_stacks}"
- esac
- _out="${_out}${_out:+${_nl}}${_t}"
- ;;
- unhealthy)
- _rc=1
- esac
- done
- required_result $_rc "$_out"
- return
+
+ if [ $unhealthy_after -gt 0 -a $_numfails -ge $unhealthy_after ] ; then
+ _unhealthy=true
+ echo 1 >"$_rc_file"
+ echo "ERROR: ${_rpc_check_out}" >>"$_out"
+ else
+ _unhealthy=false
+ echo 0 >"$_rc_file"
fi
- done <"$_file"
+
+ if [ $restart_every -gt 0 -a $(($_numfails % $restart_every)) -eq 0 ] ; then
+ if ! $_unhealthy ; then
+ echo "WARNING: ${_rpc_check_out}" >>"$_out"
+ fi
+
+ echo "Trying to restart service \"${_rpc_service}\"..." >>"$_out"
+
+ if [ -n "$service_debug_cmd" ] ; then
+ $service_debug_cmd 2>&1 >>"$_out"
+ fi
+
+ guess_output "$service_start_cmd" >>"$_out"
+ fi
+ )
+
+ read _rc <"$_rc_file"
+ required_result $_rc <"$_out"
+
+ rm -f "$_out" "$_rc_file"
}
######################################################################
diff --git a/ctdb/wscript b/ctdb/wscript
index add10ec0e78..7b3304b10a8 100755
--- a/ctdb/wscript
+++ b/ctdb/wscript
@@ -464,7 +464,7 @@ def build(bld):
etc_subdirs = [
'events.d',
- 'nfs-rpc-checks.d'
+ 'nfs-checks.d'
]
if bld.env.standalone_ctdb:
@@ -627,7 +627,7 @@ def build(bld):
test_eventscript_links = [
'events.d',
'functions',
- 'nfs-rpc-checks.d',
+ 'nfs-checks.d',
'statd-callout'
]