diff options
author | Igor Skokov <igor.skokov@cinarra.com> | 2019-08-06 12:52:07 +0200 |
---|---|---|
committer | Andor Molnar <andor@apache.org> | 2019-08-06 12:52:07 +0200 |
commit | 6692d7a5b4bc3f0dbd36677c06e782ef5240153a (patch) | |
tree | eb19b5d0247ae40a792ecae65982a49d8a618504 | |
parent | 4de5f012a56e26d72bcddf782995849d7a9e7cfd (diff) | |
download | zookeeper-6692d7a5b4bc3f0dbd36677c06e782ef5240153a.tar.gz |
ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time
Improvements and fixes of #863
Author: Igor Skokov <igor.skokov@cinarra.com>
Author: Igor Skokov <lagrang09@gmail.com>
Reviewers: nkalmar@apache.org, andor@apache.org
Closes #1033 from Lagrang/ZOOKEEPER-3320 and squashes the following commits:
50d64659e [Igor Skokov] ZOOKEEPER-3320: doc fix, rename config property 'zookeeper.electionPortBindRetry' to 'electionPortBindRetry'
fb9cdc57c [Igor Skokov] Merge remote-tracking branch 'lagrang/ZOOKEEPER-3320' into ZOOKEEPER-3320
f95ee187b [Igor Skokov] ZOOKEEPER-3320: CnxManagerTest.testCnxManagerListenerThreadConfigurableRetry fix
1af098d33 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnxManager.Listener
7b222efbe [Igor Skokov] ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest. testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing
5051b4cdf [Igor Skokov] ZOOKEEPER-3320: fix of test compilation
eeb5c4155 [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxManager.Listener fails to bind to election port
587fd95a0 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalThread, add test to CnxManagerTest to check configurable retries of leader election port bind
0888a2953 [Igor Skokov] ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry property
a9a934254 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortBindRetry value
da33c1d3a [Igor Skokov] ZOOKEEPER-3320: configurable retry count for election port bind in QuorumCnxManager.Listener
e25b44551 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnxManager.Listener
b4abdc7f2 [Igor Skokov] ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest. testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing
c1afdf933 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320
e9db1e445 [Igor Skokov] ZOOKEEPER-3320: fix of test compilation
a541ee902 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320
bb0c77f7a [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxManager.Listener fails to bind to election port
914295895 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalThread, add test to CnxManagerTest to check configurable retries of leader election port bind
883d35eb0 [Igor Skokov] ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry property
b448f3603 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortBindRetry value
706e1f058 [Igor Skokov] ZOOKEEPER-3320: configurable retry count for election port bind in QuorumCnxManager.Listener
3 files changed, 87 insertions, 14 deletions
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md index 6154eb798..dbf9a0841 100644 --- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md +++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md @@ -1088,6 +1088,19 @@ As an example, this will enable all four letter word commands: properly, check your operating system's options regarding TCP keepalive for more information. Defaults to **false**. + +* *electionPortBindRetry* : + (Java system property only: **zookeeper.electionPortBindRetry**) + Sets the maximum number of retries when the ZooKeeper server fails to bind + the leader election port. Such errors can be temporary and recoverable, + such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320), + or non-retryable, such as the port already being in use. + In case of transient errors, this property can improve the availability + of the ZooKeeper server and help it recover on its own. + The default value is 3. In container environments, especially Kubernetes, + this value should be increased or set to 0 (infinite retries) to overcome issues + related to DNS name resolution. 
+ * *observer.reconnectDelayMs* : (Java system property: **zookeeper.observer.reconnectDelayMs**) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java index 3b6133a77..5039d83cd 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java @@ -18,6 +18,8 @@ package org.apache.zookeeper.server.quorum; +import static org.apache.zookeeper.common.NetUtils.formatInetAddr; + import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; @@ -36,6 +38,7 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashSet; import java.util.Map; +import java.util.NoSuchElementException; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ConcurrentHashMap; @@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; - +import javax.net.ssl.SSLSocket; import org.apache.zookeeper.common.X509Exception; import org.apache.zookeeper.server.ExitCode; -import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException; -import org.apache.zookeeper.server.util.ConfigUtils; import org.apache.zookeeper.server.ZooKeeperThread; +import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException; import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner; import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer; import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier; +import 
org.apache.zookeeper.server.util.ConfigUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.net.ssl.SSLSocket; -import static org.apache.zookeeper.common.NetUtils.formatInetAddr; - /** * This class implements a connection manager for leader election using TCP. It * maintains one connection for every pair of servers. The tricky part is to @@ -848,12 +847,39 @@ public class QuorumCnxManager { */ public class Listener extends ZooKeeperThread { + private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry"; + private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3; + + private final int portBindMaxRetry; + private Runnable socketBindErrorHandler = () -> System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue()); volatile ServerSocket ss = null; public Listener() { // During startup of thread, thread name will be overridden to // specific election address super("ListenerThread"); + + // maximum retry count while trying to bind to election port + // see ZOOKEEPER-3320 for more details + final Integer maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY, + DEFAULT_PORT_BIND_MAX_RETRY); + if (maxRetry >= 0) { + LOG.info("Election port bind maximum retries is {}", + maxRetry == 0 ? "infinite" : maxRetry); + portBindMaxRetry = maxRetry; + } else { + LOG.info("'{}' contains invalid value: {}(must be >= 0). " + + "Use default value of {} instead.", + ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY); + portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY; + } + } + + /** + * Change socket bind error handler. Used for testing. 
+ */ + void setSocketBindErrorHandler(Runnable errorHandler) { + this.socketBindErrorHandler = errorHandler; } /** @@ -865,7 +891,7 @@ public class QuorumCnxManager { InetSocketAddress addr; Socket client = null; Exception exitException = null; - while((!shutdown) && (numRetries < 3)){ + while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) { try { if (self.shouldUsePortUnification()) { LOG.info("Creating TLS-enabled quorum server socket"); @@ -935,15 +961,18 @@ public class QuorumCnxManager { } LOG.info("Leaving listener"); if (!shutdown) { - LOG.error("As I'm leaving the listener thread, " - + "I won't be able to participate in leader " - + "election any longer: " - + formatInetAddr(self.getElectionAddress())); - if (exitException instanceof BindException) { + LOG.error("As I'm leaving the listener thread after " + + numRetries + " errors. " + + "I won't be able to participate in leader " + + "election any longer: " + + formatInetAddr(self.getElectionAddress()) + + ". Use " + ELECTION_PORT_BIND_RETRY + " property to " + + "increase retry count."); + if (exitException instanceof SocketException) { // After leaving listener thread, the host cannot join the // quorum anymore, this is a severe error that we cannot // recover from, so we need to exit - System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue()); + socketBindErrorHandler.run(); } } else if (ss != null) { // Clean up for shutdown. 
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java index 878e41b4c..276f35f47 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java @@ -36,6 +36,7 @@ import java.util.Random; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.net.Socket; +import java.util.concurrent.atomic.AtomicBoolean; import javax.net.ssl.SSLSession; import javax.net.ssl.SSLSocket; import javax.net.ssl.HandshakeCompletedListener; @@ -291,6 +292,36 @@ public class CnxManagerTest extends ZKTestCase { } /** + * Test for bug described in {@link https://issues.apache.org/jira/browse/ZOOKEEPER-3320}. + * Test create peer with address which contains unresolvable DNS name, + * leader election listener thread should stop after N errors. + * + * @throws Exception + */ + @Test + public void testCnxManagerListenerThreadConfigurableRetry() throws Exception { + final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>(); + final long myid = 1L; + unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181")); + final QuorumPeer peer = new QuorumPeer(unresolvablePeers, + ClientBase.createTmpDir(), + ClientBase.createTmpDir(), + 2181, 3, myid, 1000, 2, 2, 2); + final QuorumCnxManager cnxManager = peer.createCnxnManager(); + final QuorumCnxManager.Listener listener = cnxManager.listener; + final AtomicBoolean errorHappend = new AtomicBoolean(); + listener.setSocketBindErrorHandler(() -> errorHappend.set(true)); + listener.start(); + // listener thread should stop and throws error which notify QuorumPeer about error. + // QuorumPeer should start shutdown process + listener.join(15000); // set wait time, if listener contains bug and thread not stops. 
+ Assert.assertFalse(listener.isAlive()); + Assert.assertTrue(errorHappend.get()); + Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped after " + + "listener thread death", listener.isAlive()); + } + + /** * Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6 * observer connects to a 3.5.0 server. * see https://issues.apache.org/jira/browse/ZOOKEEPER-1789 |