summaryrefslogtreecommitdiff
path: root/qpid/cpp/src/posix/QpiddBroker.cpp
diff options
context:
space:
mode:
author Alan Conway <aconway@apache.org> 2014-07-31 13:55:11 +0000
committer Alan Conway <aconway@apache.org> 2014-07-31 13:55:11 +0000
commit c9276b03da088b3f4d3f4b527f2e02703e2729eb (patch)
tree b3f0553221917ffeb27f9562d9df7a5d9f8000d2 /qpid/cpp/src/posix/QpiddBroker.cpp
parent 5b6f651d3f2c5b33fa510e120dc0e98f6a95409a (diff)
download qpid-python-c9276b03da088b3f4d3f4b527f2e02703e2729eb.tar.gz
QPID-5942: qpid HA cluster may end up in joining state after HA primary is killed
There are two issues here, both related to the fact that rgmanager sees qpidd and qpidd-primary as two separate services. 1. The service start/stop scripts can be called concurrently. This can lead to running a qpidd process whose pid is not in the pidfile. rgmanager cannot detect or kill this qpidd and cannot start another qpidd because of the lock on the qpidd data directory. 2. rgmanager sees a primary failure as two failures: qpidd and qpidd-primary, and will then try to stop and start both services. The order of these actions is not defined and can lead to rgmanager killing a service it has just started. This patch makes two major changes to the init scripts: 1. Uses flock to lock the sensitive stop/start part of the scripts to ensure they are not executed concurrently. 2. On "stop" the scripts check if a running qpidd is primary or not. "qpidd stop" is a no-op if the running broker is primary, "qpidd-primary stop" is a no-op if it is not. This ensures that a broker will be stopped by the same stream of service actions that started it. Minor changes in this patch: - better logging of broker start-up and shut-down sequence. - qpid-ha heartbeat uses half of the timeout option. - add missing timeouts in qpid-ha. Notes: This changes the behavior of 'clusvcadm -d <qpidd-service>' on the primary node. Previously this would have stopped the qpidd service on that node, killed the qpidd process and relocated the primary service. Now this will stop the qpidd service (as far as rgmanager is concerned) but will not kill qpidd or relocate the primary service. When the primary is relocated the qpidd service will not be able to re-start on that node until it is re-enabled with 'clusvcadm -e'. git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@1614895 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'qpid/cpp/src/posix/QpiddBroker.cpp')
-rw-r--r-- qpid/cpp/src/posix/QpiddBroker.cpp 26
1 files changed, 17 insertions, 9 deletions
diff --git a/qpid/cpp/src/posix/QpiddBroker.cpp b/qpid/cpp/src/posix/QpiddBroker.cpp
index 831b2e0641..9228c2d18d 100644
--- a/qpid/cpp/src/posix/QpiddBroker.cpp
+++ b/qpid/cpp/src/posix/QpiddBroker.cpp
@@ -59,12 +59,14 @@ const std::string TCP = "tcp";
struct DaemonOptions : public qpid::Options {
bool daemon;
bool quit;
+ bool kill;
bool check;
+ std::vector<int> closeFd;
int wait;
std::string piddir;
std::string transport;
- DaemonOptions() : qpid::Options("Daemon options"), daemon(false), quit(false), check(false), wait(600), transport(TCP)
+ DaemonOptions() : qpid::Options("Daemon options"), daemon(false), quit(false), kill(false), check(false), wait(600), transport(TCP)
{
char *home = ::getenv("HOME");
@@ -78,9 +80,11 @@ struct DaemonOptions : public qpid::Options {
("daemon,d", pure_switch(daemon), "Run as a daemon. Logs to syslog by default in this mode.")
("transport", optValue(transport, "TRANSPORT"), "The transport for which to return the port")
("pid-dir", optValue(piddir, "DIR"), "Directory where port-specific PID file is stored")
+ ("close-fd", optValue(closeFd, "FD"), "File descriptors that the daemon should close")
("wait,w", optValue(wait, "SECONDS"), "Sets the maximum wait time to initialize or shutdown the daemon. If the daemon fails to initialize/shutdown, prints an error and returns 1")
("check,c", pure_switch(check), "Prints the daemon's process ID to stdout and returns 0 if the daemon is running, otherwise returns 1")
- ("quit,q", pure_switch(quit), "Tells the daemon to shut down");
+ ("quit,q", pure_switch(quit), "Tells the daemon to shut down with an INT signal")
+ ("kill,k", pure_switch(kill), "Kill the daemon with a KILL signal.");
}
};
@@ -132,12 +136,15 @@ struct QpiddDaemon : public Daemon {
/** Code for parent process */
void parent() {
uint16_t port = wait(options->daemon.wait);
- if (options->parent->broker.port == 0
- ) cout << port << endl;
+ if (options->parent->broker.port == 0)
+ cout << port << endl;
}
/** Code for forked child process */
void child() {
+ // Close extra FDs requested in options.
+ for (size_t i = 0; i < options->daemon.closeFd.size(); ++i)
+ ::close(options->daemon.closeFd[i]);
boost::intrusive_ptr<Broker> brokerPtr(new Broker(options->parent->broker));
ScopedSetBroker ssb(brokerPtr);
brokerPtr->accept();
@@ -157,21 +164,22 @@ int QpiddBroker::execute (QpiddOptions *options) {
if (myOptions == 0)
throw Exception("Internal error obtaining platform options");
- if (myOptions->daemon.check || myOptions->daemon.quit) {
+ if (myOptions->daemon.check || myOptions->daemon.quit || myOptions->daemon.kill) {
pid_t pid;
try {
pid = Daemon::getPid(myOptions->daemon.piddir, options->broker.port);
- } catch (const ErrnoException& e) {
+ } catch (const Exception& e) {
// This is not a critical error, usually means broker is not running
- QPID_LOG(notice, "Cannot stop broker: " << e.what());
+ QPID_LOG(notice, "Broker is not running: " << e.what());
return 1;
}
if (pid < 0)
return 1;
if (myOptions->daemon.check)
cout << pid << endl;
- if (myOptions->daemon.quit) {
- if (kill(pid, SIGINT) < 0)
+ if (myOptions->daemon.quit || myOptions->daemon.kill) {
+ int signal = myOptions->daemon.kill ? SIGKILL : SIGINT;
+ if (kill(pid, signal) < 0)
throw Exception("Failed to stop daemon: " + qpid::sys::strError(errno));
// Wait for the process to die before returning
int retry=myOptions->daemon.wait*1000; // Try up to "--wait N" seconds, do retry every millisecond