diff options
author | Stefan Metzmacher <metze@samba.org> | 2022-10-12 17:26:16 +0200 |
---|---|---|
committer | Stefan Metzmacher <metze@samba.org> | 2022-10-19 16:14:36 +0000 |
commit | e232ba946f00aac39d67197d9939bc923814479c (patch) | |
tree | ecb15837e1f55caa251bdbe5d6abc4a0e5f5fc3b /lib/tsocket | |
parent | 4c7e2b9b60de5d02bb3f69effe7eddbf466a6155 (diff) | |
download | samba-e232ba946f00aac39d67197d9939bc923814479c.tar.gz |
lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler()
There were some reports that strace output an LDAP server socket is in
CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to
epoll() each time).
In the tstream_bsd code the problem happens when we have a pending
writev_send, while there's no readv_send pending. In that case
we still ask for TEVENT_FD_READ in order to notice connection errors
early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE.
And there are situations where we do that over and over again.
It happens like this with a Linux kernel:
tcp_fin() has this:
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk_enter_pingpong_mode(sk);
break;
It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but
sk->sk_err is not changed to indicate an error.
tcp_sendmsg_locked has this:
...
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
while (msg_data_left(msg)) {
int copy = 0;
skb = tcp_write_queue_tail(sk);
if (skb)
copy = size_goal - skb->len;
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_space;
...
wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
err = sk_stream_wait_memory(sk, &timeo);
if (err != 0)
goto do_error;
It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't
hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns
-EAGAIN.
tcp_poll has this:
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering
TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN.
So we need to always clear TEVENT_FD_READ if we don't
have readable handler in order to avoid burning cpu.
But we turn it on again after a timeout of 1 second
in order to monitor the error state of the connection.
And now that our tsocket_bsd_error() helper checks for POLLRDHUP,
we can check if the socket is in an error state before calling the
writable handler when TEVENT_FD_READ was reported.
Only on error we'll call the writable handler, which will pick
the error without calling writev().
BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Ralph Boehme <slow@samba.org>
Diffstat (limited to 'lib/tsocket')
-rw-r--r-- | lib/tsocket/tsocket_bsd.c | 121 |
1 files changed, 116 insertions, 5 deletions
diff --git a/lib/tsocket/tsocket_bsd.c b/lib/tsocket/tsocket_bsd.c index 72499561f2d..daba33eb9d7 100644 --- a/lib/tsocket/tsocket_bsd.c +++ b/lib/tsocket/tsocket_bsd.c @@ -1754,6 +1754,9 @@ struct tstream_bsd { void (*readable_handler)(void *private_data); void *writeable_private; void (*writeable_handler)(void *private_data); + + struct tevent_context *error_ctx; + struct tevent_timer *error_timer; }; bool tstream_bsd_optimize_readv(struct tstream_context *stream, @@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream, return old; } +static void tstream_bsd_error_timer(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void *private_data) +{ + struct tstream_bsd *bsds = + talloc_get_type(private_data, + struct tstream_bsd); + + TALLOC_FREE(bsds->error_timer); + + /* + * Turn on TEVENT_FD_READABLE() again + * if we have a writeable_handler that + * wants to monitor the connection + * for errors. + */ + if (bsds->writeable_handler != NULL) { + TEVENT_FD_READABLE(bsds->fde); + } +} + static void tstream_bsd_fde_handler(struct tevent_context *ev, struct tevent_fd *fde, uint16_t flags, @@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev, } if (flags & TEVENT_FD_READ) { if (!bsds->readable_handler) { - if (bsds->writeable_handler) { + struct timeval recheck_time; + + /* + * In order to avoid cpu-spinning + * we no longer want to get TEVENT_FD_READ + */ + TEVENT_FD_NOT_READABLE(bsds->fde); + + if (!bsds->writeable_handler) { + return; + } + + /* + * If we have a writeable handler we + * want that to report connection errors + * early. + * + * So we check if the socket is in an + * error state. + */ + if (bsds->error == 0) { + int ret = tsocket_bsd_error(bsds->fd); + + if (ret == -1) { + bsds->error = errno; + } + } + + if (bsds->error != 0) { + /* + * Let the writeable handler report the error + */ + bsds->writeable_handler(bsds->writeable_private); + return; + } + + /* + * Here we called TEVENT_FD_NOT_READABLE() without + * calling into the writeable handler. + * + * So we may have to wait for the kernels tcp stack + * to report TEVENT_FD_WRITE in order to let + * make progress and turn on TEVENT_FD_READABLE() + * again. + * + * As a fallback we use a timer that turns on + * TEVENT_FD_READABLE() again after a timeout of + * 1 second. + */ + + if (bsds->error_timer != NULL) { + return; + } + + recheck_time = timeval_current_ofs(1, 0); + bsds->error_timer = tevent_add_timer(bsds->error_ctx, + bsds, + recheck_time, + tstream_bsd_error_timer, + bsds); + if (bsds->error_timer == NULL) { + bsds->error = ENOMEM; + /* + * Let the writeable handler report the error + */ bsds->writeable_handler(bsds->writeable_private); return; } - TEVENT_FD_NOT_READABLE(bsds->fde); return; } bsds->readable_handler(bsds->readable_private); @@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds, TEVENT_FD_READABLE(bsds->fde); } + TALLOC_FREE(bsds->error_timer); + bsds->readable_handler = handler; bsds->readable_private = private_data; @@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, bsds->writeable_handler = NULL; bsds->writeable_private = NULL; TEVENT_FD_NOT_WRITEABLE(bsds->fde); - + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; return 0; } @@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, } bsds->event_ptr = NULL; TALLOC_FREE(bsds->fde); + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; } if (tevent_fd_get_flags(bsds->fde) == 0) { @@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, bsds->writeable_handler = handler; bsds->writeable_private = private_data; + bsds->error_ctx = ev; return 0; } @@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data) } err = tsocket_bsd_error_from_errno(ret, errno, &retry); if (retry) { - /* retry later */ + /* + * retry later... + * + * make sure we also wait readable again + * in order to notice errors early + */ + TEVENT_FD_READABLE(bsds->fde); + TALLOC_FREE(bsds->error_timer); return; } if (err != 0) { @@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data) } if (state->count > 0) { - /* we have more to read */ + /* + * we have more to write + * + * make sure we also wait readable again + * in order to notice errors early + */ + TEVENT_FD_READABLE(bsds->fde); return; } @@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx, goto post; } + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; TALLOC_FREE(bsds->fde); ret = close(bsds->fd); bsds->fd = -1; @@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = { static int tstream_bsd_destructor(struct tstream_bsd *bsds) { + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; TALLOC_FREE(bsds->fde); if (bsds->fd != -1) { close(bsds->fd); |