lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler()

There were some reports that strace output an LDAP server socket is in CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to epoll() each time). In the tstream_bsd code the problem happens when we have a pending writev_send, while there's no readv_send pending. In that case we still ask for TEVENT_FD_READ in order to notice connection errors early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE. And there are situations where we do that over and over again. It happens like this with a Linux kernel: tcp_fin() has this: struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk_enter_pingpong_mode(sk); break; It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but sk->sk_err is not changed to indicate an error. tcp_sendmsg_locked has this: ... err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; ... wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns -EAGAIN. tcp_poll has this: if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN. So we need to always clear TEVENT_FD_READ if we don't have readable handler in order to avoid burning cpu. But we turn it on again after a timeout of 1 second in order to monitor the error state of the connection. And now that our tsocket_bsd_error() helper checks for POLLRDHUP, we can check if the socket is in an error state before calling the writable handler when TEVENT_FD_READ was reported. Only on error we'll call the writable handler, which will pick the error without calling writev(). BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202 Signed-off-by: Stefan Metzmacher <metze@samba.org> Reviewed-by: Ralph Boehme <slow@samba.org>
author: Stefan Metzmacher <metze@samba.org> 2022-10-12 17:26:16 +0200
committer: Stefan Metzmacher <metze@samba.org> 2022-10-19 16:14:36 +0000
commit: e232ba946f00aac39d67197d9939bc923814479c (patch)
tree: ecb15837e1f55caa251bdbe5d6abc4a0e5f5fc3b /lib/tsocket
parent: 4c7e2b9b60de5d02bb3f69effe7eddbf466a6155 (diff)
download: samba-e232ba946f00aac39d67197d9939bc923814479c.tar.gz
1 files changed, 116 insertions, 5 deletions
diff --git a/lib/tsocket/tsocket_bsd.c b/lib/tsocket/tsocket_bsd.c
index 72499561f2d..daba33eb9d7 100644
--- a/lib/tsocket/tsocket_bsd.c
+++ b/lib/tsocket/tsocket_bsd.c
@@ -1754,6 +1754,9 @@ struct tstream_bsd {
 	void (*readable_handler)(void *private_data);
 	void *writeable_private;
 	void (*writeable_handler)(void *private_data);
+
+	struct tevent_context *error_ctx;
+	struct tevent_timer *error_timer;
 };
 
 bool tstream_bsd_optimize_readv(struct tstream_context *stream,
@@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream,
 	return old;
 }
 
+static void tstream_bsd_error_timer(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval current_time,
+				    void *private_data)
+{
+	struct tstream_bsd *bsds =
+		talloc_get_type(private_data,
+		struct tstream_bsd);
+
+	TALLOC_FREE(bsds->error_timer);
+
+	/*
+	 * Turn on TEVENT_FD_READABLE() again
+	 * if we have a writeable_handler that
+	 * wants to monitor the connection
+	 * for errors.
+	 */
+	if (bsds->writeable_handler != NULL) {
+		TEVENT_FD_READABLE(bsds->fde);
+	}
+}
+
 static void tstream_bsd_fde_handler(struct tevent_context *ev,
 				    struct tevent_fd *fde,
 				    uint16_t flags,
@@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev,
 	}
 	if (flags & TEVENT_FD_READ) {
 		if (!bsds->readable_handler) {
-			if (bsds->writeable_handler) {
+			struct timeval recheck_time;
+
+			/*
+			 * In order to avoid cpu-spinning
+			 * we no longer want to get TEVENT_FD_READ
+			 */
+			TEVENT_FD_NOT_READABLE(bsds->fde);
+
+			if (!bsds->writeable_handler) {
+				return;
+			}
+
+			/*
+			 * If we have a writeable handler we
+			 * want that to report connection errors
+			 * early.
+			 *
+			 * So we check if the socket is in an
+			 * error state.
+			 */
+			if (bsds->error == 0) {
+				int ret = tsocket_bsd_error(bsds->fd);
+
+				if (ret == -1) {
+					bsds->error = errno;
+				}
+			}
+
+			if (bsds->error != 0) {
+				/*
+				 * Let the writeable handler report the error
+				 */
+				bsds->writeable_handler(bsds->writeable_private);
+				return;
+			}
+
+			/*
+			 * Here we called TEVENT_FD_NOT_READABLE() without
+			 * calling into the writeable handler.
+			 *
+			 * So we may have to wait for the kernels tcp stack
+			 * to report TEVENT_FD_WRITE in order to let
+			 * make progress and turn on TEVENT_FD_READABLE()
+			 * again.
+			 *
+			 * As a fallback we use a timer that turns on
+			 * TEVENT_FD_READABLE() again after a timeout of
+			 * 1 second.
+			 */
+
+			if (bsds->error_timer != NULL) {
+				return;
+			}
+
+			recheck_time = timeval_current_ofs(1, 0);
+			bsds->error_timer = tevent_add_timer(bsds->error_ctx,
+							     bsds,
+							     recheck_time,
+							     tstream_bsd_error_timer,
+							     bsds);
+			if (bsds->error_timer == NULL) {
+				bsds->error = ENOMEM;
+				/*
+				 * Let the writeable handler report the error
+				 */
 				bsds->writeable_handler(bsds->writeable_private);
 				return;
 			}
-			TEVENT_FD_NOT_READABLE(bsds->fde);
 			return;
 		}
 		bsds->readable_handler(bsds->readable_private);
@@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds,
 		TEVENT_FD_READABLE(bsds->fde);
 	}
 
+	TALLOC_FREE(bsds->error_timer);
+
 	bsds->readable_handler = handler;
 	bsds->readable_private = private_data;
 
@@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
 		bsds->writeable_handler = NULL;
 		bsds->writeable_private = NULL;
 		TEVENT_FD_NOT_WRITEABLE(bsds->fde);
-
+		TALLOC_FREE(bsds->error_timer);
+		bsds->error_ctx = NULL;
 		return 0;
 	}
 
@@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
 		}
 		bsds->event_ptr = NULL;
 		TALLOC_FREE(bsds->fde);
+		TALLOC_FREE(bsds->error_timer);
+		bsds->error_ctx = NULL;
 	}
 
 	if (tevent_fd_get_flags(bsds->fde) == 0) {
@@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
 
 	bsds->writeable_handler = handler;
 	bsds->writeable_private = private_data;
+	bsds->error_ctx = ev;
 
 	return 0;
 }
@@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data)
 	}
 	err = tsocket_bsd_error_from_errno(ret, errno, &retry);
 	if (retry) {
-		/* retry later */
+		/*
+		 * retry later...
+		 *
+		 * make sure we also wait readable again
+		 * in order to notice errors early
+		 */
+		TEVENT_FD_READABLE(bsds->fde);
+		TALLOC_FREE(bsds->error_timer);
 		return;
 	}
 	if (err != 0) {
@@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data)
 	}
 
 	if (state->count > 0) {
-		/* we have more to read */
+		/*
+		 * we have more to write
+		 *
+		 * make sure we also wait readable again
+		 * in order to notice errors early
+		 */
+		TEVENT_FD_READABLE(bsds->fde);
 		return;
 	}
 
@@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx,
 		goto post;
 	}
 
+	TALLOC_FREE(bsds->error_timer);
+	bsds->error_ctx = NULL;
 	TALLOC_FREE(bsds->fde);
 	ret = close(bsds->fd);
 	bsds->fd = -1;
@@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = {
 
 static int tstream_bsd_destructor(struct tstream_bsd *bsds)
 {
+	TALLOC_FREE(bsds->error_timer);
+	bsds->error_ctx = NULL;
 	TALLOC_FREE(bsds->fde);
 	if (bsds->fd != -1) {
 		close(bsds->fd);
author	Stefan Metzmacher <metze@samba.org>	2022-10-12 17:26:16 +0200
committer	Stefan Metzmacher <metze@samba.org>	2022-10-19 16:14:36 +0000
commit	e232ba946f00aac39d67197d9939bc923814479c (patch)
tree	ecb15837e1f55caa251bdbe5d6abc4a0e5f5fc3b /lib/tsocket
parent	4c7e2b9b60de5d02bb3f69effe7eddbf466a6155 (diff)
download	samba-e232ba946f00aac39d67197d9939bc923814479c.tar.gz