summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHervé Beraud <hberaud@redhat.com>2019-05-03 00:55:56 +0200
committerHervé Beraud <hberaud@redhat.com>2019-05-21 15:24:23 +0200
commit7b8fd6370c8b04c3836ee4f9d06eaa90c7be5197 (patch)
tree4149631dc95a0f698ebe8c9794152e517c7e28d9
parent0f91b0a2115f3c0a817abb624d65f8f0dcad40b2 (diff)
downloadoslo-messaging-7b8fd6370c8b04c3836ee4f9d06eaa90c7be5197.tar.gz
Fix switching connection destination when a rabbitmq cluster node disappears
In a clustered rabbitmq, when a node disappears we get a ConnectionRefusedError because the socket gets disconnected. The socket access yields an OSError because the heartbeat tries to reach an unreachable host (No route to host). Catch these exceptions to ensure that we call ensure_connection for switching the connection destination. POC is available at github.com:4383/rabbitmq-oslo_messging-error-poc Example: $ git clone git@github.com:4383/rabbitmq-oslo_messging-error-poc $ cd rabbitmq-oslo_messging-error-poc $ python -m virtualenv . $ source bin/activate $ pip install -r requirements.txt $ sudo podman run -d --hostname my-rabbit --name rabbit rabbitmq:3 $ python poc.py $(sudo podman inspect rabbit | niet '.[0].NetworkSettings.IPAddress') And in parallel, in another shell|tmux $ podman stop rabbit $ # observe the output of the poc.py script: we now call ensure_connection Now you can observe output related to the connection being switched, an error that was not caught before these changes. Related to: https://bugzilla.redhat.com/show_bug.cgi?id=1665399 Closes-Bug: #1828841 Change-Id: I9dc1644cac0e39eb11bf05f57bde77dcf6d42ed3 (cherry picked from commit 9d8b1430e5c081b081c0e3c0b5f12f744dc7809d)
-rw-r--r--oslo_messaging/_drivers/impl_rabbit.py18
1 files changed, 18 insertions, 0 deletions
diff --git a/oslo_messaging/_drivers/impl_rabbit.py b/oslo_messaging/_drivers/impl_rabbit.py
index 5e07593..68f3543 100644
--- a/oslo_messaging/_drivers/impl_rabbit.py
+++ b/oslo_messaging/_drivers/impl_rabbit.py
@@ -897,6 +897,14 @@ class Connection(object):
def _heartbeat_thread_job(self):
"""Thread that maintains inactive connections
"""
+ # NOTE(hberaud): Python 2 doesn't define ConnectionRefusedError,
+ # so to switch the connection destination on failure under both
+ # Python 2 and Python 3 we need to alias the connection-refused error
+ try:
+ ConnectRefuseError = ConnectionRefusedError
+ except NameError:
+ ConnectRefuseError = socket.error
+
while not self._heartbeat_exit_event.is_set():
with self._connection_lock.for_heartbeat():
@@ -913,7 +921,17 @@ class Connection(object):
self.connection.drain_events(timeout=0.001)
except socket.timeout:
pass
+ # NOTE(hberaud): In a clustered rabbitmq when
+ # a node disappears, we get a ConnectionRefusedError
+ # because the socket gets disconnected.
+ # The socket access yields an OSError because the heartbeat
+ # tries to reach an unreachable host (No route to host).
+ # Catch these exceptions to ensure that we call
+ # ensure_connection for switching the
+ # connection destination.
except (socket.timeout,
+ ConnectRefuseError,
+ OSError,
kombu.exceptions.OperationalError) as exc:
LOG.info(_LI("A recoverable connection/channel error "
"occurred, trying to reconnect: %s"), exc)