ctdb/config/debug-hung-script.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

#!/bin/sh

# This script only works on Linux.  Please modify (and submit patches)
# for other operating systems.

# Work out the CTDB base directory unless the caller already supplied
# one: it is the physical (symlink-resolved) directory containing this
# script.  The dirname result is quoted so that a path containing
# whitespace or glob characters is not split, and the assignment is
# kept separate from the export because some /bin/sh implementations
# field-split an unquoted command substitution in "export VAR=$(...)".
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(d=$(dirname "$0") ; cd -P "$d" ; echo "$PWD")
    export CTDB_BASE
fi

# Pull in the shared helper library; loadconfig is presumably defined
# there (not visible in this file).
. "$CTDB_BASE/functions"

loadconfig ctdb

# Testing hook: when CTDB_DEBUG_HUNG_SCRIPT_LOGFILE is set, append all
# stdout and stderr of this script to "<logfile>.part".  The path is
# kept in $tmp because the main section below renames it to the final
# log file name once the report is complete.
if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
    tmp="${CTDB_DEBUG_HUNG_SCRIPT_LOGFILE}.part"
    exec >>"$tmp" 2>&1
fi

# Main report.  Positional arguments as used below:
#   $1 - PID of the hung script (root of the process tree to dump)
#   $2 - name of the event being run (e.g. "releaseip", "init")
# Everything runs in a subshell holding file descriptor 9 open on a
# lock file so that concurrent invocations can be serialised.
(
    # No use running several of these in parallel if, say, "releaseip"
    # event hangs for multiple IPs.  In that case the output would be
    # interleaved in the log and would just be confusing.  Give up if
    # the lock cannot be obtained within 2 seconds.
    flock --wait 2 9 || exit 1

    echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="

    echo "pstree -p -a ${1}:"
    out=$(pstree -p -a "$1")
    # printf rather than echo: the tree contains arbitrary command
    # lines and some echo implementations interpret backslashes.
    printf '%s\n' "$out"

    # Check for processes matching a regular expression and print
    # stack traces.  This could help confirm that certain processes
    # are stuck in certain places such as the cluster filesystem.  The
    # regexp must separate items with "|" and must not contain
    # parentheses.  The default pattern can be replaced for testing.
    default_pat='exportfs|rpcinfo'
    pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}"
    # The sed expression rewrites each matching pstree line to
    # "<pid> <command text>", which the loop then reads back.
    printf '%s\n' "$out" |
    sed -r -n "s@.*-(.*(${pat}).*),([0-9]*).*@\3 \1@p" |
    while read -r pid name ; do
	# Silently skip processes whose stack cannot be read.
	if trace=$(cat "/proc/${pid}/stack" 2>/dev/null) ; then
	    echo "---- Stack trace of interesting process ${pid}[${name}] ----"
	    printf '%s\n' "$trace"
	fi
    done

    # Script status is not requested for the "init" event.
    if [ "$2" != "init" ] ; then
	echo "---- ctdb scriptstatus ${2}: ----"
	ctdb scriptstatus "$2"
    fi

    echo "===== End of hung script debug for PID=\"$1\", event=\"$2\" ====="

    # Testing hook: the report is complete, so move the partial log
    # (see $tmp, set near the top of the script) to its final name.
    if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
	mv "$tmp" "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE"
    fi

) 9>"${CTDB_VARDIR}/debug-hung-script.lock"