ctdb/tests/simple/77_ctdb_db_recovery.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

#!/bin/bash

# Print the human-readable description of this test case: the recovery
# bug being exercised, the steps the test performs and the expected
# result.  The text is emitted verbatim on stdout.
test_info() {
    # Quoted delimiter: the description is literal text, nothing in it
    # is meant to be expanded by the shell.
    cat <<'EOF'
Recovery can under certain circumstances lead to old record copies
resurrecting: Recovery selects the newest record copy purely by RSN. At
the end of the recovery, the recovery master is the dmaster for all
records in all (non-persistent) databases. And the other nodes locally
hold the complete copy of the databases. The bug is that the recovery
process does not increment the RSN on the recovery master at the end of
the recovery. Now clients acting directly on the Recovery master will
directly change a record's content on the recmaster without migration
and hence without RSN bump.  So a subsequent recovery can not tell that
the recmaster's copy is newer than the copies on the other nodes, since
their RSN is the same. Hence, if the recmaster is not node 0 (or more
precisely not the active node with the lowest node number), the recovery
will choose copies from nodes with lower number and stick to these.

Steps:

1. Create a test database
2. Add a record with value value1 on recovery master
3. Force a recovery
4. Update the record with value value2 on recovery master
5. Force a recovery
6. Fetch the record

Expected results:

* The record should have value value2 and not value1

EOF
}

# Load the shared integration-test helpers.  The functions used in this
# script (ctdb_test_init, cluster_is_healthy, try_command_on_node,
# wait_until_node_has_status) are not defined in this file.
# NOTE(review): presumably they all come from integration.bash - confirm.
. "${TEST_SCRIPTS_DIR}/integration.bash"

ctdb_test_init

# From here on, abort the test as soon as any command fails.
set -e

cluster_is_healthy

#
# Main test
#
# Name of the database the test creates, writes to and reads from.
TESTDB="rec_test.tdb"

# Exit status of the test; set to 1 further down if the final record
# check does not see the expected value.
status=0

# The stale-record problem only shows when the recovery master is not
# the lowest-numbered active node, so the test needs a recmaster other
# than node 0.
echo "find out which node is recmaster"
try_command_on_node any $CTDB recmaster
recmaster="$out"

case "$recmaster" in
0)
    echo "node 0 is recmaster, disable recmasterrole on node 0"
    #
    # Note:
    # Running "ctdb setrecmasterrole off" on node 0 and waiting for
    # the election and recovery to finish ought to be enough.
    # That approach caused problems in this automated test, so for
    # now node 0 is bounced with "ctdb stop" / "ctdb continue".
    #
    echo "stop node 0"
    try_command_on_node 0 $CTDB stop
    wait_until_node_has_status 0 stopped

    echo "continue node 0"
    try_command_on_node 0 $CTDB continue
    wait_until_node_has_status 0 notstopped

    # Ask again: the role must have moved away from node 0 by now.
    try_command_on_node any $CTDB recmaster
    recmaster="$out"
    [ "$recmaster" != "0" ] || {
        echo "failed to move recmaster to different node"
        exit 1
    }
    ;;
esac

echo "Recmaster:$recmaster"

# Main test body: write a record on the recovery master, force a
# recovery, overwrite the record, force a second recovery and check that
# the newer value survived.
#
# $recmaster comes from command output, so it is always quoted below: an
# empty or oddly formatted value must reach the helpers as one argument
# instead of silently shifting the remaining ones.
# $CTDB is left unquoted, as everywhere else in this script.
# NOTE(review): presumably it may carry extra arguments besides the
# command name - confirm before quoting it.

# Create a temporary non-persistent database to test with
echo "create test database $TESTDB"
try_command_on_node "$recmaster" $CTDB attach "$TESTDB"

# Wipe Test database
echo "wipe test database"
try_command_on_node "$recmaster" $CTDB wipedb "$TESTDB"

# Add a record   key=test1 data=value1
echo "store key(test1) data(value1)"
try_command_on_node "$recmaster" $CTDB writekey "$TESTDB" test1 value1

# Fetch a record   key=test1
echo "read key(test1)"
try_command_on_node "$recmaster" $CTDB readkey "$TESTDB" test1
cat "$outfile"

# Do a recovery
echo "force recovery"
try_command_on_node "$recmaster" $CTDB recover

wait_until_node_has_status "$recmaster" recovered

# Update the record   key=test1 data=value2
# This write happens directly on the recmaster, which is the case the
# test description says is not accompanied by an RSN bump.
echo "store key(test1) data(value2)"
try_command_on_node "$recmaster" $CTDB writekey "$TESTDB" test1 value2

# Fetch a record   key=test1
echo "read key(test1)"
try_command_on_node "$recmaster" $CTDB readkey "$TESTDB" test1
cat "$outfile"

# Do a recovery
echo "force recovery"
try_command_on_node "$recmaster" $CTDB recover

wait_until_node_has_status "$recmaster" recovered

# Verify record   key=test1
# The comparison is against the exact readkey output for "value2".
echo "read key(test1)"
try_command_on_node "$recmaster" $CTDB readkey "$TESTDB" test1
cat "$outfile"
if [ "$out" = "Data: size:6 ptr:[value2]" ]; then
    echo "GOOD: Recovery did not corrupt database"
else
    echo "BAD: Recovery corrupted database"
    status=1
fi

exit "$status"