summaryrefslogtreecommitdiff
path: root/lib/tsocket
diff options
context:
space:
mode:
authorStefan Metzmacher <metze@samba.org>2022-10-12 17:26:16 +0200
committerStefan Metzmacher <metze@samba.org>2022-10-19 16:14:36 +0000
commite232ba946f00aac39d67197d9939bc923814479c (patch)
treeecb15837e1f55caa251bdbe5d6abc4a0e5f5fc3b /lib/tsocket
parent4c7e2b9b60de5d02bb3f69effe7eddbf466a6155 (diff)
downloadsamba-e232ba946f00aac39d67197d9939bc923814479c.tar.gz
lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler()
There were some reports that strace output an LDAP server socket is in CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to epoll() each time). In the tstream_bsd code the problem happens when we have a pending writev_send, while there's no readv_send pending. In that case we still ask for TEVENT_FD_READ in order to notice connection errors early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE. And there are situations where we do that over and over again. It happens like this with a Linux kernel: tcp_fin() has this: struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk_enter_pingpong_mode(sk); break; It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but sk->sk_err is not changed to indicate an error. tcp_sendmsg_locked has this: ... err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; ... wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns -EAGAIN. tcp_poll has this: if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN. So we need to always clear TEVENT_FD_READ if we don't have readable handler in order to avoid burning cpu. But we turn it on again after a timeout of 1 second in order to monitor the error state of the connection. And now that our tsocket_bsd_error() helper checks for POLLRDHUP, we can check if the socket is in an error state before calling the writable handler when TEVENT_FD_READ was reported. Only on error we'll call the writable handler, which will pick the error without calling writev(). BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202 Signed-off-by: Stefan Metzmacher <metze@samba.org> Reviewed-by: Ralph Boehme <slow@samba.org>
Diffstat (limited to 'lib/tsocket')
-rw-r--r--lib/tsocket/tsocket_bsd.c121
1 files changed, 116 insertions, 5 deletions
diff --git a/lib/tsocket/tsocket_bsd.c b/lib/tsocket/tsocket_bsd.c
index 72499561f2d..daba33eb9d7 100644
--- a/lib/tsocket/tsocket_bsd.c
+++ b/lib/tsocket/tsocket_bsd.c
@@ -1754,6 +1754,9 @@ struct tstream_bsd {
void (*readable_handler)(void *private_data);
void *writeable_private;
void (*writeable_handler)(void *private_data);
+
+ struct tevent_context *error_ctx;
+ struct tevent_timer *error_timer;
};
bool tstream_bsd_optimize_readv(struct tstream_context *stream,
@@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream,
return old;
}
+static void tstream_bsd_error_timer(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct tstream_bsd *bsds =
+ talloc_get_type(private_data,
+ struct tstream_bsd);
+
+ TALLOC_FREE(bsds->error_timer);
+
+ /*
+ * Turn on TEVENT_FD_READABLE() again
+ * if we have a writeable_handler that
+ * wants to monitor the connection
+ * for errors.
+ */
+ if (bsds->writeable_handler != NULL) {
+ TEVENT_FD_READABLE(bsds->fde);
+ }
+}
+
static void tstream_bsd_fde_handler(struct tevent_context *ev,
struct tevent_fd *fde,
uint16_t flags,
@@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev,
}
if (flags & TEVENT_FD_READ) {
if (!bsds->readable_handler) {
- if (bsds->writeable_handler) {
+ struct timeval recheck_time;
+
+ /*
+ * In order to avoid cpu-spinning
+ * we no longer want to get TEVENT_FD_READ
+ */
+ TEVENT_FD_NOT_READABLE(bsds->fde);
+
+ if (!bsds->writeable_handler) {
+ return;
+ }
+
+ /*
+ * If we have a writeable handler we
+ * want that to report connection errors
+ * early.
+ *
+ * So we check if the socket is in an
+ * error state.
+ */
+ if (bsds->error == 0) {
+ int ret = tsocket_bsd_error(bsds->fd);
+
+ if (ret == -1) {
+ bsds->error = errno;
+ }
+ }
+
+ if (bsds->error != 0) {
+ /*
+ * Let the writeable handler report the error
+ */
+ bsds->writeable_handler(bsds->writeable_private);
+ return;
+ }
+
+ /*
+ * Here we called TEVENT_FD_NOT_READABLE() without
+ * calling into the writeable handler.
+ *
+ * So we may have to wait for the kernels tcp stack
+ * to report TEVENT_FD_WRITE in order to let
+ * make progress and turn on TEVENT_FD_READABLE()
+ * again.
+ *
+ * As a fallback we use a timer that turns on
+ * TEVENT_FD_READABLE() again after a timeout of
+ * 1 second.
+ */
+
+ if (bsds->error_timer != NULL) {
+ return;
+ }
+
+ recheck_time = timeval_current_ofs(1, 0);
+ bsds->error_timer = tevent_add_timer(bsds->error_ctx,
+ bsds,
+ recheck_time,
+ tstream_bsd_error_timer,
+ bsds);
+ if (bsds->error_timer == NULL) {
+ bsds->error = ENOMEM;
+ /*
+ * Let the writeable handler report the error
+ */
bsds->writeable_handler(bsds->writeable_private);
return;
}
- TEVENT_FD_NOT_READABLE(bsds->fde);
return;
}
bsds->readable_handler(bsds->readable_private);
@@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds,
TEVENT_FD_READABLE(bsds->fde);
}
+ TALLOC_FREE(bsds->error_timer);
+
bsds->readable_handler = handler;
bsds->readable_private = private_data;
@@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
bsds->writeable_handler = NULL;
bsds->writeable_private = NULL;
TEVENT_FD_NOT_WRITEABLE(bsds->fde);
-
+ TALLOC_FREE(bsds->error_timer);
+ bsds->error_ctx = NULL;
return 0;
}
@@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
}
bsds->event_ptr = NULL;
TALLOC_FREE(bsds->fde);
+ TALLOC_FREE(bsds->error_timer);
+ bsds->error_ctx = NULL;
}
if (tevent_fd_get_flags(bsds->fde) == 0) {
@@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
bsds->writeable_handler = handler;
bsds->writeable_private = private_data;
+ bsds->error_ctx = ev;
return 0;
}
@@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data)
}
err = tsocket_bsd_error_from_errno(ret, errno, &retry);
if (retry) {
- /* retry later */
+ /*
+ * retry later...
+ *
+ * make sure we also wait readable again
+ * in order to notice errors early
+ */
+ TEVENT_FD_READABLE(bsds->fde);
+ TALLOC_FREE(bsds->error_timer);
return;
}
if (err != 0) {
@@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data)
}
if (state->count > 0) {
- /* we have more to read */
+ /*
+ * we have more to write
+ *
+ * make sure we also wait readable again
+ * in order to notice errors early
+ */
+ TEVENT_FD_READABLE(bsds->fde);
return;
}
@@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx,
goto post;
}
+ TALLOC_FREE(bsds->error_timer);
+ bsds->error_ctx = NULL;
TALLOC_FREE(bsds->fde);
ret = close(bsds->fd);
bsds->fd = -1;
@@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = {
static int tstream_bsd_destructor(struct tstream_bsd *bsds)
{
+ TALLOC_FREE(bsds->error_timer);
+ bsds->error_ctx = NULL;
TALLOC_FREE(bsds->fde);
if (bsds->fd != -1) {
close(bsds->fd);