summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIgor Skokov <igor.skokov@cinarra.com>2019-08-06 12:52:07 +0200
committerAndor Molnar <andor@apache.org>2019-08-06 12:52:07 +0200
commit6692d7a5b4bc3f0dbd36677c06e782ef5240153a (patch)
treeeb19b5d0247ae40a792ecae65982a49d8a618504
parent4de5f012a56e26d72bcddf782995849d7a9e7cfd (diff)
downloadzookeeper-6692d7a5b4bc3f0dbd36677c06e782ef5240153a.tar.gz
ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time
Improvements and fixes of #863 Author: Igor Skokov <igor.skokov@cinarra.com> Author: Igor Skokov <lagrang09@gmail.com> Reviewers: nkalmar@apache.org, andor@apache.org Closes #1033 from Lagrang/ZOOKEEPER-3320 and squashes the following commits: 50d64659e [Igor Skokov] ZOOKEEPER-3320: doc fix, rename config property 'zookeeper.electionPortBindRetry' to 'electionPortBindRetry' fb9cdc57c [Igor Skokov] Merge remote-tracking branch 'lagrang/ZOOKEEPER-3320' into ZOOKEEPER-3320 f95ee187b [Igor Skokov] ZOOKEEPER-3320: CnxManagerTest.testCnxManagerListenerThreadConfigurableRetry fix 1af098d33 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnxManager.Listener 7b222efbe [Igor Skokov] ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest. testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing 5051b4cdf [Igor Skokov] ZOOKEEPER-3320: fix of test compilation eeb5c4155 [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxManager.Listener fails to bind to election port 587fd95a0 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalThread, add test to CnxManagerTest to check configurable retries of leader election port bind 0888a2953 [Igor Skokov] ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry property a9a934254 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortBindRetry value da33c1d3a [Igor Skokov] ZOOKEEPER-3320: configurable retry count for election port bind in QuorumCnxManager.Listener e25b44551 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnxManager.Listener b4abdc7f2 [Igor Skokov] ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest. 
testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing c1afdf933 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320 e9db1e445 [Igor Skokov] ZOOKEEPER-3320: fix of test compilation a541ee902 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320 bb0c77f7a [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxManager.Listener fails to bind to election port 914295895 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalThread, add test to CnxManagerTest to check configurable retries of leader election port bind 883d35eb0 [Igor Skokov] ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry property b448f3603 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortBindRetry value 706e1f058 [Igor Skokov] ZOOKEEPER-3320: configurable retry count for election port bind in QuorumCnxManager.Listener
-rw-r--r--zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md13
-rw-r--r--zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java57
-rw-r--r--zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java31
3 files changed, 87 insertions, 14 deletions
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index 6154eb798..dbf9a0841 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -1088,6 +1088,19 @@ As an example, this will enable all four letter word commands:
properly, check your operating system's options regarding TCP
keepalive for more information. Defaults to
**false**.
+
+* *electionPortBindRetry* :
+ (Java system property only: **zookeeper.electionPortBindRetry**)
+ Sets the maximum number of retries when the ZooKeeper server fails to bind
+ the leader election port. Such errors can be temporary and recoverable,
+ such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320),
+ or non-retryable, such as the port already being in use.
+ In case of transient errors, this property can improve the availability
+ of the ZooKeeper server and help it recover on its own.
+ The default value is 3. In container environments, especially Kubernetes,
+ this value should be increased or set to 0 (infinite retries) to overcome issues
+ related to DNS name resolution.
+
* *observer.reconnectDelayMs* :
(Java system property: **zookeeper.observer.reconnectDelayMs**)
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
index 3b6133a77..5039d83cd 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
@@ -18,6 +18,8 @@
package org.apache.zookeeper.server.quorum;
+import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
+
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
@@ -36,6 +38,7 @@ import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Map;
+import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
-import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
-
+import javax.net.ssl.SSLSocket;
import org.apache.zookeeper.common.X509Exception;
import org.apache.zookeeper.server.ExitCode;
-import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
-import org.apache.zookeeper.server.util.ConfigUtils;
import org.apache.zookeeper.server.ZooKeeperThread;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer;
import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.util.ConfigUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import javax.net.ssl.SSLSocket;
-import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
-
/**
* This class implements a connection manager for leader election using TCP. It
* maintains one connection for every pair of servers. The tricky part is to
@@ -848,12 +847,39 @@ public class QuorumCnxManager {
*/
public class Listener extends ZooKeeperThread {
+ private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry";
+ private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3;
+
+ private final int portBindMaxRetry;
+ private Runnable socketBindErrorHandler = () -> System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
volatile ServerSocket ss = null;
public Listener() {
// During startup of thread, thread name will be overridden to
// specific election address
super("ListenerThread");
+
+ // maximum retry count while trying to bind to election port
+ // see ZOOKEEPER-3320 for more details
+ final Integer maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY,
+ DEFAULT_PORT_BIND_MAX_RETRY);
+ if (maxRetry >= 0) {
+ LOG.info("Election port bind maximum retries is {}",
+ maxRetry == 0 ? "infinite" : maxRetry);
+ portBindMaxRetry = maxRetry;
+ } else {
+ LOG.info("'{}' contains invalid value: {}(must be >= 0). "
+ + "Use default value of {} instead.",
+ ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY);
+ portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY;
+ }
+ }
+
+ /**
+ * Change socket bind error handler. Used for testing.
+ */
+ void setSocketBindErrorHandler(Runnable errorHandler) {
+ this.socketBindErrorHandler = errorHandler;
}
/**
@@ -865,7 +891,7 @@ public class QuorumCnxManager {
InetSocketAddress addr;
Socket client = null;
Exception exitException = null;
- while((!shutdown) && (numRetries < 3)){
+ while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
try {
if (self.shouldUsePortUnification()) {
LOG.info("Creating TLS-enabled quorum server socket");
@@ -935,15 +961,18 @@ public class QuorumCnxManager {
}
LOG.info("Leaving listener");
if (!shutdown) {
- LOG.error("As I'm leaving the listener thread, "
- + "I won't be able to participate in leader "
- + "election any longer: "
- + formatInetAddr(self.getElectionAddress()));
- if (exitException instanceof BindException) {
+ LOG.error("As I'm leaving the listener thread after "
+ + numRetries + " errors. "
+ + "I won't be able to participate in leader "
+ + "election any longer: "
+ + formatInetAddr(self.getElectionAddress())
+ + ". Use " + ELECTION_PORT_BIND_RETRY + " property to "
+ + "increase retry count.");
+ if (exitException instanceof SocketException) {
// After leaving listener thread, the host cannot join the
// quorum anymore, this is a severe error that we cannot
// recover from, so we need to exit
- System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
+ socketBindErrorHandler.run();
}
} else if (ss != null) {
// Clean up for shutdown.
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
index 878e41b4c..276f35f47 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
@@ -36,6 +36,7 @@ import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.net.Socket;
+import java.util.concurrent.atomic.AtomicBoolean;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.HandshakeCompletedListener;
@@ -291,6 +292,36 @@ public class CnxManagerTest extends ZKTestCase {
}
/**
+ * Test for the bug described in <a href="https://issues.apache.org/jira/browse/ZOOKEEPER-3320">ZOOKEEPER-3320</a>.
+ * Creates a peer whose address contains an unresolvable DNS name;
+ * the leader election listener thread should stop after N errors.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testCnxManagerListenerThreadConfigurableRetry() throws Exception {
+ final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>();
+ final long myid = 1L;
+ unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181"));
+ final QuorumPeer peer = new QuorumPeer(unresolvablePeers,
+ ClientBase.createTmpDir(),
+ ClientBase.createTmpDir(),
+ 2181, 3, myid, 1000, 2, 2, 2);
+ final QuorumCnxManager cnxManager = peer.createCnxnManager();
+ final QuorumCnxManager.Listener listener = cnxManager.listener;
+ final AtomicBoolean errorHappend = new AtomicBoolean();
+ listener.setSocketBindErrorHandler(() -> errorHappend.set(true));
+ listener.start();
+ // The listener thread should give up after the configured number of bind failures
+ // and invoke the socket bind error handler (replaced above so that the JVM does not exit).
+ listener.join(15000); // set wait time, if listener contains bug and thread not stops.
+ Assert.assertFalse(listener.isAlive());
+ Assert.assertTrue(errorHappend.get());
+ Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped after "
+ + "listener thread death", listener.isAlive());
+ }
+
+ /**
* Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6
* observer connects to a 3.5.0 server.
* see https://issues.apache.org/jira/browse/ZOOKEEPER-1789