From mboxrd@z Thu Jan 1 00:00:00 1970 From: Stefano Brivio To: passt-dev@passt.top Subject: [PATCH 15/24] tcp: Rework timers to use timerfd instead of periodic bitmap scan Date: Fri, 25 Mar 2022 23:52:51 +0100 Message-ID: <20220325225300.2803584-16-sbrivio@redhat.com> In-Reply-To: <20220325225300.2803584-1-sbrivio@redhat.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============2945062684121323069==" --===============2945062684121323069== Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable With a lot of concurrent connections, the bitmap scan approach is not really sustainable. Switch to per-connection timerfd timers, set based on events and on two new flags, ACK_FROM_TAP_DUE and ACK_TO_TAP_DUE. Timers are added to the common epoll list, and implement the existing timeouts. While at it, drop the CONN_ prefix from flag names, otherwise they get quite long, and fix the logic to decide if a connection has a local, possibly unreachable endpoint: we shouldn't go through the rest of tcp_conn_from_tap() if we reset the connection due to a successful bind(2), and we'll get EACCES if the port number is low. Suggested by: Stefan Hajnoczi Signed-off-by: Stefano Brivio --- README.md | 4 +- passt.c | 12 +- tap.c | 2 +- tcp.c | 498 +++++++++++++++++++++++++++++------------------------- tcp.h | 8 +- 5 files changed, 283 insertions(+), 241 deletions(-) diff --git a/README.md b/README.md index cd4caa3..8e07fb1 100644 --- a/README.md +++ b/README.md @@ -287,11 +287,9 @@ speeding up local connections, and usually requiring NAT= . 
_pasta_: * =E2=9C=85 all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if = granted) * =E2=9C=85 with default options, user, mount, IPC, UTS, PID namespaces are = detached * =E2=9C=85 no external dependencies (other than a standard C library) -* =E2=9C=85 restrictive seccomp profiles (22 syscalls allowed for _passt_, 3= 4 for +* =E2=9C=85 restrictive seccomp profiles (24 syscalls allowed for _passt_, 3= 6 for _pasta_ on x86_64) * =E2=9C=85 static checkers in continuous integration (clang-tidy, cppcheck) -* =F0=9F=9B=A0=EF=B8=8F rework of TCP state machine (flags instead of states= ), TCP timers, and code - de-duplication * =F0=9F=9B=A0=EF=B8=8F clearly defined packet abstraction * =F0=9F=9B=A0=EF=B8=8F ~5 000 LoC target * =E2=8C=9A [fuzzing](https://bugs.passt.top/show_bug.cgi?id=3D9), _packetdr= ill_ tests diff --git a/passt.c b/passt.c index 6c04266..6550a22 100644 --- a/passt.c +++ b/passt.c @@ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec= *now) #define CALL_PROTO_HANDLER(c, now, lc, uc) \ do { \ extern void \ - lc ## _defer_handler (struct ctx *, struct timespec *) \ + lc ## _defer_handler (struct ctx *c) \ __attribute__ ((weak)); \ \ if (!c->no_ ## lc) { \ if (lc ## _defer_handler) \ - lc ## _defer_handler(c, now); \ + lc ## _defer_handler(c); \ \ if (timespec_diff_ms((now), &c->lc.timer_run) \ >=3D uc ## _TIMER_INTERVAL) { \ @@ -134,8 +134,11 @@ static void post_handler(struct ctx *c, struct timespec = *now) } \ } while (0) =20 + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, tcp, TCP); + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, udp, UDP); + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); =20 #undef CALL_PROTO_HANDLER @@ -380,8 +383,8 @@ int main(int argc, char **argv) =20 clock_gettime(CLOCK_MONOTONIC, &now); =20 - if ((!c.no_udp && udp_sock_init(&c, 
&now)) || - (!c.no_tcp && tcp_sock_init(&c, &now))) + if ((!c.no_udp && udp_sock_init(&c)) || + (!c.no_tcp && tcp_sock_init(&c))) exit(EXIT_FAILURE); =20 proto_update_l2_buf(c.mac_guest, c.mac, &c.addr4); @@ -425,6 +428,7 @@ int main(int argc, char **argv) timer_init(&c, &now); =20 loop: + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ nfds =3D epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); if (nfds =3D=3D -1 && errno !=3D EINTR) { perror("epoll_wait"); diff --git a/tap.c b/tap.c index a1ccfc1..59a87f9 100644 --- a/tap.c +++ b/tap.c @@ -939,7 +939,7 @@ void tap_sock_init(struct ctx *c) * @c: Execution context * @fd: File descriptor where event occurred * @events: epoll events - * @now: Current timestamp + * @now: Current timestamp, can be NULL on EPOLLERR */ void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *no= w) { diff --git a/tcp.c b/tcp.c index f03c929..384e7a6 100644 --- a/tcp.c +++ b/tcp.c @@ -177,32 +177,32 @@ * Aging and timeout * ----------------- * - * Open connections are checked periodically against a number of timeouts. T= hose - * are: + * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake with= in - * this time, reset the connection - * - * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on - * either side, the connection is reset - * - * - ACK_INTERVAL, or zero-sized window advertised to tap/guest: forcibly ch= eck - * if an ACK segment can be sent + * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag + * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the + * connection * * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sendi= ng - * data, re-send data from the socket and reset sequence to what was - * acknowledged. 
If this persists for longer than LAST_ACK_TIMEOUT, reset = the - * connection + * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from = the + * socket and reset sequence to what was acknowledged. If this persists for + * more than TCP_MAX_RETRANS times in a row, reset the connection * - * - FIN_TIMEOUT, on TAP_FIN_SENT: if no ACK is received for the FIN segment - * within this time, the connection is reset + * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_= DUE + * with TAP_FIN_SENT event), and no ACK is received within this time, reset + * the connection * - * - FIN_TIMEOUT, on SOCK_FIN_SENT: if no activity is detected on the socket - * after sending a FIN segment (write shutdown), reset the connection + * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN + * segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and + * TAP_FIN_ACKED), but no activity is detected from the socket with= in + * this time, reset the connection * - * - LAST_ACK_TIMEOUT on SOCK_FIN_SENT *and* SOCK_FIN_RCVD: reset the connec= tion - * if no activity was detected on any of the two sides after sending a FIN - * segment + * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on + * either side, the connection is reset + * + * - ACK_INTERVAL elapsed after data segment received from tap without having + * sent an ACK segment, or zero-sized window advertised to tap/guest (flag + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent * * * Summary of data flows (with ESTABLISHED event) @@ -237,11 +237,6 @@ * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and * resend with steps listed above * - set TCP_WINDOW_CLAMP from TCP header from tap - * - periodically: - * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer - * (TODO: implement requirements from RFC 6298, currently 3s fixed) fr= om - * @ts_ack_from_tap elapsed, reset @seq_to_tap to 
@seq_ack_from_tap, a= nd - * resend data with the steps listed above * * - from tap/guest to socket: * - on packet from tap/guest: @@ -287,6 +282,7 @@ #include #endif #include +#include #include #include #include @@ -328,17 +324,13 @@ # define KERNEL_REPORTS_SND_WND(c) (0 && (c)) #endif =20 -#define SYN_TIMEOUT 240000 /* ms */ -#define ACK_TIMEOUT 2000 -#define ACK_INTERVAL 50 -#define ACT_TIMEOUT 7200000 -#define FIN_TIMEOUT 240000 -#define LAST_ACK_TIMEOUT 240000 +#define ACK_INTERVAL 50 /* ms */ +#define SYN_TIMEOUT 10 /* s */ +#define ACK_TIMEOUT 2 +#define FIN_TIMEOUT 60 +#define ACT_TIMEOUT 7200 =20 #define TCP_SOCK_POOL_TSH 16 /* Refill in ns if > x used */ -#define REFILL_INTERVAL 1000 - -#define PORT_DETECT_INTERVAL 1000 =20 #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ @@ -407,7 +399,11 @@ struct tcp_conn; */ struct tcp_conn { struct tcp_conn *next; - int sock; + int32_t sock:SOCKET_REF_BITS; +#define TCP_RETRANS_BITS 3 + unsigned int retrans:TCP_RETRANS_BITS; +#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) + int timer; int hash_bucket; =20 union { @@ -440,11 +436,13 @@ struct tcp_conn { (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) =20 uint8_t flags; -#define CONN_STALLED BIT(0) -#define CONN_LOCAL BIT(1) -#define CONN_WND_CLAMPED BIT(2) -#define CONN_IN_EPOLL BIT(3) -#define CONN_ACTIVE_CLOSE BIT(4) +#define STALLED BIT(0) +#define LOCAL BIT(1) +#define WND_CLAMPED BIT(2) +#define IN_EPOLL BIT(3) +#define ACTIVE_CLOSE BIT(4) +#define ACK_TO_TAP_DUE BIT(5) +#define ACK_FROM_TAP_DUE BIT(6) =20 uint16_t tap_mss; =20 @@ -463,12 +461,6 @@ struct tcp_conn { uint32_t wnd_to_tap; =20 int snd_buf; - - struct timespec ts_sock_act; - struct timespec ts_tap_act; - struct timespec ts_ack_from_tap; - struct timespec ts_ack_to_tap; - struct timespec tap_data_noack; }; =20 #define CONN_IS_CLOSED(conn) (conn->events =3D=3D CLOSED) @@ -498,6 +490,7 @@ static const char *tcp_state_str[] __attribute((__unused_= _)) =3D { =20 static const char 
*tcp_flag_str[] __attribute((__unused__)) =3D { "STALLED", "LOCAL", "WND_CLAMPED", "IN_EPOLL", "ACTIVE_CLOSE", + "ACK_TO_TAP_DUE", "ACK_FROM_TAP_DUE", }; =20 /* Port re-mappings as delta, indexed by original destination port */ @@ -686,7 +679,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uin= t8_t conn_flags) if (events & TAP_FIN_SENT) return EPOLLET; =20 - if (conn_flags & CONN_STALLED) + if (conn_flags & STALLED) return EPOLLIN | EPOLLRDHUP | EPOLLET; =20 return EPOLLIN | EPOLLRDHUP; @@ -715,7 +708,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *= conn, */ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) { - int m =3D (conn->flags & CONN_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m =3D (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref =3D { .r.proto =3D IPPROTO_TCP, .r.s =3D conn->sock, .r.p.tcp.tcp.index =3D conn - tc, .r.p.tcp.tcp.v6 =3D CONN_V6(conn) }; @@ -731,13 +724,69 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn= *conn) if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) return -errno; =20 - conn->flags |=3D CONN_IN_EPOLL; /* No need to log this */ + conn->flags |=3D IN_EPOLL; /* No need to log this */ =20 return 0; } =20 /** - * conn_flag_do() - Set/unset given flag, log, update epoll on CONN_STALLED + * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if ne= eded + * @c: Execution context + * @conn: Connection pointer + * + * #syscalls timerfd_create timerfd_settime + */ +static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) +{ + struct itimerspec it =3D { { 0 }, { 0 } }; + + if (conn->timer =3D=3D -1) { + union epoll_ref ref =3D { .r.proto =3D IPPROTO_TCP, + .r.s =3D conn->sock, + .r.p.tcp.tcp.timer =3D 1, + .r.p.tcp.tcp.index =3D conn - tc }; + struct epoll_event ev =3D { .data.u64 =3D ref.u64, + .events =3D EPOLLIN | EPOLLET }; + + conn->timer =3D timerfd_create(CLOCK_MONOTONIC, 0); + if (conn->timer =3D=3D -1) { + debug("TCP: failed to get 
timer: %s", strerror(errno)); + return; + } + + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + debug("TCP: failed to add timer: %s", strerror(errno)); + close(conn->timer); + conn->timer =3D -1; + return; + } + } + + if (conn->events =3D=3D CLOSED) { + it.it_value.tv_nsec =3D 1; + } else if (conn->flags & ACK_TO_TAP_DUE) { + it.it_value.tv_nsec =3D (long)ACK_INTERVAL * 1000 * 1000; + } else if (conn->flags & ACK_FROM_TAP_DUE) { + if (!(conn->events & ESTABLISHED)) + it.it_value.tv_sec =3D SYN_TIMEOUT; + else if (conn->events & TAP_FIN_SENT) + it.it_value.tv_sec =3D FIN_TIMEOUT; + else + it.it_value.tv_sec =3D ACK_TIMEOUT; + } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { + it.it_value.tv_sec =3D FIN_TIMEOUT; + } else { + it.it_value.tv_sec =3D ACT_TIMEOUT; + } + + debug("TCP: index %i, timer expires in %u.%03us", conn - tc, + it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000); + + timerfd_settime(conn->timer, 0, &it, NULL); +} + +/** + * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset @@ -761,8 +810,11 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn = *conn, tcp_flag_str[fls(flag)]); } =20 - if (flag =3D=3D CONN_STALLED || flag =3D=3D ~CONN_STALLED) + if (flag =3D=3D STALLED || flag =3D=3D ~STALLED) tcp_epoll_ctl(c, conn); + + if (flag =3D=3D ACK_FROM_TAP_DUE || flag =3D=3D ACK_TO_TAP_DUE) + tcp_timer_ctl(c, conn); } =20 /** @@ -780,7 +832,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn = *conn, return; =20 prev =3D fls(conn->events); - if (conn->flags & CONN_ACTIVE_CLOSE) + if (conn->flags & ACTIVE_CLOSE) prev +=3D 5; =20 if ((conn->events & ESTABLISHED) && (conn->events !=3D ESTABLISHED)) @@ -791,18 +843,13 @@ static void conn_event_do(struct ctx *c, struct tcp_con= n *conn, else conn->events |=3D event; =20 - if ((event =3D=3D TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) - conn_flag(c, 
conn, CONN_ACTIVE_CLOSE); - else - tcp_epoll_ctl(c, conn); - new =3D fls(conn->events); =20 if ((conn->events & ESTABLISHED) && (conn->events !=3D ESTABLISHED)) { num++; new++; } - if (conn->flags & CONN_ACTIVE_CLOSE) + if (conn->flags & ACTIVE_CLOSE) new +=3D 5; =20 if (prev !=3D new) { @@ -814,6 +861,14 @@ static void conn_event_do(struct ctx *c, struct tcp_conn= *conn, debug("TCP: index %i, %s", (conn) - tc, num =3D=3D -1 ? "CLOSED" : tcp_event_str[num]); } + + if ((event =3D=3D TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) + conn_flag(c, conn, ACTIVE_CLOSE); + else + tcp_epoll_ctl(c, conn); + + if (event =3D=3D CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) + tcp_timer_ctl(c, conn); } =20 #define conn_event(c, conn, event) \ @@ -1388,13 +1443,12 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn= *conn); * * Return: 0 on success, negative error code on failure (tap reset possible) */ -static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov, - struct timespec *ts) +static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov) { if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) { debug("tap write: %s", strerror(errno)); if (errno !=3D EAGAIN && errno !=3D EWOULDBLOCK) - tap_handler(c, c->fd_tap, EPOLLERR, ts); + tap_handler(c, c->fd_tap, EPOLLERR, NULL); return -errno; } =20 @@ -1431,11 +1485,9 @@ static void tcp_l2_buf_flush_part(struct ctx *c, struc= t msghdr *mh, size_t sent) * @mh: Message header pointing to buffers, msg_iovlen not set * @buf_used: Pointer to count of used buffers, set to 0 on return * @buf_bytes: Pointer to count of buffer bytes, set to 0 on return - * @ts: Current timestamp */ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, - unsigned int *buf_used, size_t *buf_bytes, - struct timespec *ts) + unsigned int *buf_used, size_t *buf_bytes) { if (!(mh->msg_iovlen =3D *buf_used)) return; @@ -1450,7 +1502,7 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msgh= dr *mh, for (i 
=3D 0; i < mh->msg_iovlen; i++) { struct iovec *iov =3D &mh->msg_iov[i]; =20 - if (tcp_l2_buf_write_one(c, iov, ts)) + if (tcp_l2_buf_write_one(c, iov)) i--; } } @@ -1461,9 +1513,8 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msgh= dr *mh, /** * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (fl= ags) * @c: Execution context - * @ts: Current timestamp (not packet timestamp) */ -static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts) +static void tcp_l2_flags_buf_flush(struct ctx *c) { struct msghdr mh =3D { 0 }; unsigned int *buf_used; @@ -1472,20 +1523,19 @@ static void tcp_l2_flags_buf_flush(struct ctx *c, str= uct timespec *ts) mh.msg_iov =3D tcp6_l2_flags_iov; buf_used =3D &tcp6_l2_flags_buf_used; buf_bytes =3D &tcp6_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); =20 mh.msg_iov =3D tcp4_l2_flags_iov; buf_used =3D &tcp4_l2_flags_buf_used; buf_bytes =3D &tcp4_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); } =20 /** * tcp_l2_data_buf_flush() - Send out buffers for segments with data * @c: Execution context - * @ts: Current timestamp (not packet timestamp) */ -static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts) +static void tcp_l2_data_buf_flush(struct ctx *c) { struct msghdr mh =3D { 0 }; unsigned int *buf_used; @@ -1494,23 +1544,22 @@ static void tcp_l2_data_buf_flush(struct ctx *c, stru= ct timespec *ts) mh.msg_iov =3D tcp6_l2_iov; buf_used =3D &tcp6_l2_buf_used; buf_bytes =3D &tcp6_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); =20 mh.msg_iov =3D tcp4_l2_iov; buf_used =3D &tcp4_l2_buf_used; buf_bytes =3D &tcp4_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); } =20 /** * tcp_defer_handler() - Handler for TCP deferred 
tasks * @c: Execution context - * @now: Current timestamp */ -void tcp_defer_handler(struct ctx *c, struct timespec *now) +void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c, now); - tcp_l2_data_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); + tcp_l2_data_buf_flush(c); } =20 /** @@ -1627,7 +1676,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct = tcp_conn *conn, conn->seq_ack_to_tap =3D prev_ack_to_tap; #else if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || conn->flags & CONN_LOCAL || force_seq) { + || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { conn->seq_ack_to_tap =3D conn->seq_from_tap; } else if (conn->seq_ack_to_tap !=3D conn->seq_from_tap) { if (!tinfo) { @@ -1660,7 +1709,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct = tcp_conn *conn, } =20 #ifdef HAS_SND_WND - if ((conn->flags & CONN_LOCAL) || tcp_rtt_dst_low(conn)) { + if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { conn->wnd_to_tap =3D tinfo->tcpi_snd_wnd; } else { tcp_get_sndbuf(conn); @@ -1670,6 +1719,8 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct = tcp_conn *conn, =20 conn->wnd_to_tap =3D MIN(conn->wnd_to_tap, MAX_WINDOW); =20 + if (!conn->wnd_to_tap) + conn_flag(c, conn, ACK_TO_TAP_DUE); out: return conn->wnd_to_tap !=3D prev_wnd_to_tap || conn->seq_ack_to_tap !=3D prev_ack_to_tap; @@ -1680,12 +1731,10 @@ out: * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due - * @now: Current timestamp * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, - struct timespec *now) +static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) { uint32_t prev_ack_to_tap =3D conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap =3D conn->wnd_to_tap; @@ -1709,7 +1758,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn= *conn, 
int flags, return -ECONNRESET; } =20 - if (!(conn->flags & CONN_LOCAL)) + if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); =20 if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) @@ -1748,8 +1797,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn= *conn, int flags, mss -=3D sizeof(struct ipv6hdr); =20 if (c->low_wmem && - !(conn->flags & CONN_LOCAL) && - !tcp_rtt_dst_low(conn)) + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) mss =3D MIN(mss, PAGE_SIZE); else if (mss > PAGE_SIZE) mss =3D ROUND_DOWN(mss, PAGE_SIZE); @@ -1795,11 +1843,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_co= nn *conn, int flags, else tcp6_l2_flags_buf_bytes +=3D iov->iov_len; =20 - if (th->ack && now) - conn->ts_ack_to_tap =3D *now; + if (th->ack) + conn_flag(c, conn, ~ACK_TO_TAP_DUE); =20 - if (th->fin && now) - conn->tap_data_noack =3D *now; + if (th->fin) + conn_flag(c, conn, ACK_FROM_TAP_DUE); =20 /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */ if (th->fin || th->syn) @@ -1814,7 +1862,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn= *conn, int flags, } =20 if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); } else { if (flags & DUP_ACK) { memcpy(b6 + 1, b6, sizeof(*b6)); @@ -1824,7 +1872,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn= *conn, int flags, } =20 if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); } =20 return 0; @@ -1840,7 +1888,7 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn *= conn) if (CONN_IS_CLOSED(conn)) return; =20 - if (!tcp_send_flag(c, conn, RST, NULL)) + if (!tcp_send_flag(c, conn, RST)) tcp_conn_destroy(c, conn); } =20 @@ -1874,7 +1922,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_= conn *conn, =20 window =3D MIN(MAX_WINDOW, window); =20 - if (conn->flags & CONN_WND_CLAMPED) { + if (conn->flags & 
WND_CLAMPED) { if (conn->wnd_from_tap =3D=3D window) return; =20 @@ -1893,7 +1941,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_= conn *conn, window =3D 256; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &window, sizeof(window)); - conn_flag(c, conn, CONN_WND_CLAMPED); + conn_flag(c, conn, WND_CLAMPED); } } =20 @@ -2070,6 +2118,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, vo= id *addr, =20 conn =3D CONN(c->tcp.conn_count++); conn->sock =3D s; + conn->timer =3D -1; conn_event(c, conn, TAP_SYN_RCVD); =20 conn->wnd_to_tap =3D WINDOW_DEFAULT; @@ -2098,9 +2147,6 @@ static void tcp_conn_from_tap(struct ctx *c, int af, vo= id *addr, conn->sock_port =3D ntohs(th->dest); conn->tap_port =3D ntohs(th->source); =20 - conn->ts_sock_act =3D conn->ts_tap_act =3D *now; - conn->ts_ack_to_tap =3D conn->ts_ack_from_tap =3D *now; - conn->seq_init_from_tap =3D ntohl(th->seq); conn->seq_from_tap =3D conn->seq_init_from_tap + 1; conn->seq_ack_to_tap =3D conn->seq_from_tap; @@ -2111,10 +2157,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, = void *addr, =20 tcp_hash_insert(c, conn, af, addr); =20 - if (!bind(s, sa, sl)) + if (!bind(s, sa, sl)) { tcp_rst(c, conn); /* Nobody is listening then */ - if (errno !=3D EADDRNOTAVAIL) - conn_flag(c, conn, CONN_LOCAL); + return; + } + if (errno !=3D EADDRNOTAVAIL && errno !=3D EACCES) + conn_flag(c, conn, LOCAL); =20 if (connect(s, sa, sl)) { if (errno !=3D EINPROGRESS) { @@ -2126,7 +2174,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, vo= id *addr, } else { tcp_get_sndbuf(conn); =20 - if (tcp_send_flag(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; =20 conn_event(c, conn, TAP_SYN_ACK_SENT); @@ -2169,7 +2217,7 @@ static int tcp_sock_consume(struct tcp_conn *conn, uint= 32_t ack_seq) * @now: Current timestamp */ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t pl= en, - int no_csum, uint32_t seq, struct timespec *now) + int no_csum, uint32_t seq) { 
struct iovec *iov; size_t len; @@ -2183,7 +2231,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_c= onn *conn, ssize_t plen, iov =3D tcp4_l2_iov + tcp4_l2_buf_used++; tcp4_l2_buf_bytes +=3D iov->iov_len =3D len + sizeof(b->vnet_len); if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c, now); + tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b =3D &tcp6_l2_buf[tcp6_l2_buf_used]; =20 @@ -2192,7 +2240,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_c= onn *conn, ssize_t plen, iov =3D tcp6_l2_iov + tcp6_l2_buf_used++; tcp6_l2_buf_bytes +=3D iov->iov_len =3D len + sizeof(b->vnet_len); if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c, now); + tcp_l2_data_buf_flush(c); } } =20 @@ -2200,14 +2248,12 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp= _conn *conn, ssize_t plen, * tcp_data_from_sock() - Handle new data from socket, queue to tap, in wind= ow * @c: Execution context * @conn: Connection pointer - * @now: Current timestamp * * Return: negative on connection reset, 0 otherwise * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn, - struct timespec *now) +static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) { int fill_bufs, send_bufs =3D 0, last_len, iov_rem =3D 0; int sendlen, len, plen, v4 =3D CONN_V4(conn); @@ -2225,8 +2271,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp= _conn *conn, } =20 if (!conn->wnd_from_tap || already_sent >=3D conn->wnd_from_tap) { - conn_flag(c, conn, CONN_STALLED); - conn->tap_data_noack =3D *now; + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } =20 @@ -2248,7 +2294,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp= _conn *conn, =20 if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) - tcp_l2_data_buf_flush(c, now); + 
tcp_l2_data_buf_flush(c); =20 for (i =3D 0, iov =3D iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) @@ -2274,11 +2320,11 @@ recvmsg: =20 sendlen =3D len - already_sent; if (sendlen <=3D 0) { - conn_flag(c, conn, CONN_STALLED); + conn_flag(c, conn, STALLED); return 0; } =20 - conn_flag(c, conn, ~CONN_STALLED); + conn_flag(c, conn, ~STALLED); =20 send_bufs =3D DIV_ROUND_UP(sendlen, conn->tap_mss); last_len =3D sendlen - (send_bufs - 1) * conn->tap_mss; @@ -2294,11 +2340,11 @@ recvmsg: if (i =3D=3D send_bufs - 1) plen =3D last_len; =20 - tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap, now); + tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap); conn->seq_to_tap +=3D plen; } =20 - conn->tap_data_noack =3D conn->ts_ack_to_tap =3D *now; + conn_flag(c, conn, ACK_FROM_TAP_DUE); =20 return 0; =20 @@ -2312,7 +2358,7 @@ err: =20 zero_len: if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) =3D=3D SOCK_FIN_RCVD) { - if ((ret =3D tcp_send_flag(c, conn, FIN | ACK, now))) { + if ((ret =3D tcp_send_flag(c, conn, FIN | ACK))) { tcp_rst(c, conn); return ret; } @@ -2329,13 +2375,11 @@ zero_len: * @conn: Connection pointer * @msg: Array of messages from tap * @count: Count of messages - * @now: Current timestamp * * #syscalls sendmsg */ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, - struct tap_l4_msg *msg, int count, - struct timespec *now) + struct tap_l4_msg *msg, int count) { int i, iov_i, ack =3D 0, fin =3D 0, retr =3D 0, keep =3D -1; uint32_t max_ack_seq =3D conn->seq_ack_from_tap; @@ -2445,16 +2489,18 @@ static void tcp_data_from_tap(struct ctx *c, struct t= cp_conn *conn, tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); =20 if (ack) { - conn->ts_ack_from_tap =3D *now; - if (max_ack_seq =3D=3D conn->seq_to_tap) - conn->tap_data_noack =3D ((struct timespec) { 0, 0 }); + if (max_ack_seq =3D=3D conn->seq_to_tap) { + conn_flag(c, conn, ~ACK_FROM_TAP_DUE); + conn->retrans =3D 0; + } + tcp_sock_consume(conn, max_ack_seq); } =20 if 
(retr) { conn->seq_ack_from_tap =3D max_ack_seq; conn->seq_to_tap =3D max_ack_seq; - tcp_data_from_sock(c, conn, now); + tcp_data_from_sock(c, conn); } =20 if (!iov_i) @@ -2470,14 +2516,14 @@ eintr: * Then swiftly looked away and left. */ conn->seq_from_tap =3D seq_from_tap; - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); } =20 if (errno =3D=3D EINTR) goto eintr; =20 if (errno =3D=3D EAGAIN || errno =3D=3D EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); return; } tcp_rst(c, conn); @@ -2487,7 +2533,7 @@ eintr: if (n < (int)(seq_from_tap - conn->seq_from_tap)) { partial_send =3D 1; conn->seq_from_tap +=3D n; - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } else { conn->seq_from_tap +=3D n; } @@ -2496,7 +2542,7 @@ out: if (keep !=3D -1) { if (conn->seq_dup_ack !=3D conn->seq_from_tap) { conn->seq_dup_ack =3D conn->seq_from_tap; - tcp_send_flag(c, conn, DUP_ACK, now); + tcp_send_flag(c, conn, DUP_ACK); } return; } @@ -2510,7 +2556,7 @@ out: =20 conn_event(c, conn, TAP_FIN_RCVD); } else { - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } } =20 @@ -2520,11 +2566,9 @@ out: * @conn: Connection pointer * @th: TCP header of SYN, ACK segment from tap/guest * @len: Packet length of SYN, ACK segment at L4, host order - * @now: Current timestamp */ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len, - struct timespec *now) + struct tcphdr *th, size_t len) { tcp_clamp_window(c, conn, th, len, 0, 1); conn->tap_mss =3D tcp_conn_tap_mss(c, conn, th, len); @@ -2538,8 +2582,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, st= ruct tcp_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
*/ - tcp_data_from_sock(c, conn, now); - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_data_from_sock(c, conn); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } =20 /** @@ -2559,6 +2603,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tcphdr *th =3D (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset); uint16_t len =3D msg[0].l4_len; struct tcp_conn *conn; + int ack_due =3D 0; =20 conn =3D tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); =20 @@ -2574,13 +2619,17 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, return count; } =20 - conn->ts_tap_act =3D *now; - conn_flag(c, conn, ~CONN_STALLED); + if (th->ack) { + conn_flag(c, conn, ~ACK_FROM_TAP_DUE); + conn->retrans =3D 0; + } + + conn_flag(c, conn, ~STALLED); =20 /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { if (th->syn && th->ack && !th->fin) - tcp_conn_from_sock_finish(c, conn, th, len, now); + tcp_conn_from_sock_finish(c, conn, th, len); else tcp_rst(c, conn); =20 @@ -2600,7 +2649,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, conn->seq_from_tap++; =20 shutdown(conn->sock, SHUT_WR); - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); =20 return count; @@ -2621,11 +2670,6 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, =20 /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { - if (th->ack) { - conn->tap_data_noack =3D ((struct timespec) { 0, 0 }); - conn->ts_ack_from_tap =3D *now; - } - if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap =3D=3D conn->seq_to_tap) tcp_conn_destroy(c, conn); @@ -2634,14 +2678,20 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, } =20 /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, msg, count, now); + tcp_data_from_tap(c, conn, msg, count); + if (conn->seq_ack_to_tap !=3D conn->seq_from_tap) + ack_due =3D 1; =20 if ((conn->events & TAP_FIN_RCVD) && 
!(conn->events & SOCK_FIN_SENT)) { shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); + ack_due =3D 0; } =20 + if (ack_due) + conn_flag(c, conn, ACK_TO_TAP_DUE); + return count; } =20 @@ -2649,10 +2699,8 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context * @conn: Connection pointer - * @now: Current timestamp */ -static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn, - struct timespec *now) +static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn) { socklen_t sl; int so; @@ -2663,10 +2711,11 @@ static void tcp_connect_finish(struct ctx *c, struct = tcp_conn *conn, return; } =20 - if (tcp_send_flag(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; =20 conn_event(c, conn, TAP_SYN_ACK_SENT); + conn_flag(c, conn, ACK_FROM_TAP_DUE); } =20 /** @@ -2693,7 +2742,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epo= ll_ref ref, =20 conn =3D CONN(c->tcp.conn_count++); conn->sock =3D s; - + conn->timer =3D -1; conn_event(c, conn, SOCK_ACCEPTED); =20 if (ref.r.p.tcp.tcp.v6) { @@ -2759,16 +2808,70 @@ static void tcp_conn_from_sock(struct ctx *c, union e= poll_ref ref, =20 conn->wnd_from_tap =3D WINDOW_DEFAULT; =20 - conn->ts_sock_act =3D conn->ts_tap_act =3D *now; - conn->ts_ack_from_tap =3D conn->ts_ack_to_tap =3D *now; - - tcp_send_flag(c, conn, SYN, now); + tcp_send_flag(c, conn, SYN); + conn_flag(c, conn, ACK_FROM_TAP_DUE); =20 tcp_get_sndbuf(conn); } =20 /** - * tcp_sock_handler() - Handle new data from socket + * tcp_timer_handler() - timerfd events: close, send ACK, retransmit, or res= et + * @c: Execution context + * @ref: epoll reference of timer (not connection) + */ +static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +{ + struct tcp_conn *conn =3D CONN(ref.r.p.tcp.tcp.index); + struct epoll_event ev =3D { 0 
}; + + if (CONN_IS_CLOSED(conn)) { + tcp_hash_remove(conn); + tcp_table_compact(c, conn); + if (conn->timer !=3D -1) { + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); + close(conn->timer); + conn->timer =3D -1; + } + } else if (conn->flags & ACK_TO_TAP_DUE) { + tcp_send_flag(c, conn, ACK_IF_NEEDED); + conn_flag(c, conn, ~ACK_TO_TAP_DUE); + } else if (conn->flags & ACK_FROM_TAP_DUE) { + if (!(conn->events & ESTABLISHED)) { + debug("TCP: index %i, handshake timeout", conn - tc); + tcp_rst(c, conn); + } else if (conn->events & TAP_FIN_SENT) { + debug("TCP: index %i, FIN timeout", conn - tc); + tcp_rst(c, conn); + } else if (conn->retrans =3D=3D TCP_MAX_RETRANS) { + debug("TCP: index %i, maximum retransmissions exceeded", + conn - tc); + tcp_rst(c, conn); + } else { + debug("TCP: index %i, ACK timeout, retry", conn - tc); + conn->retrans++; + conn->seq_to_tap =3D conn->seq_ack_from_tap; + tcp_data_from_sock(c, conn); + } + } else { + struct itimerspec new =3D { { 0 }, { ACT_TIMEOUT, 0 } }; + struct itimerspec old =3D { { 0 }, { 0 } }; + + /* Activity timeout: if it was already set, reset the + * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE + * or ACK_FROM_TAP_DUE, so just set the long timeout in that + * case. This avoids having to preemptively reset the timer on + * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. 
+ */ + timerfd_settime(conn->timer, 0, &new, &old); + if (old.it_value.tv_sec =3D=3D ACT_TIMEOUT) { + debug("TCP: index %i, activity timeout", conn - tc); + tcp_rst(c, conn); + } + } +} + +/** + * tcp_sock_handler() - Handle new data from socket, or timerfd event * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -2779,6 +2882,11 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref r= ef, uint32_t events, { struct tcp_conn *conn; =20 + if (ref.r.p.tcp.tcp.timer) { + tcp_timer_handler(c, ref); + return; + } + if (ref.r.p.tcp.tcp.splice) { tcp_sock_handler_splice(c, ref, events); return; @@ -2792,8 +2900,6 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref re= f, uint32_t events, if (!(conn =3D CONN(ref.r.p.tcp.tcp.index))) return; =20 - conn->ts_sock_act =3D *now; - if (events & EPOLLERR) { tcp_rst(c, conn); return; @@ -2812,7 +2918,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref re= f, uint32_t events, conn_event(c, conn, SOCK_FIN_RCVD); =20 if (events & EPOLLIN) - tcp_data_from_sock(c, conn, now); + tcp_data_from_sock(c, conn); =20 if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); @@ -2832,7 +2938,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref re= f, uint32_t events, =20 if (conn->events =3D=3D TAP_SYN_RCVD) { if (events & EPOLLOUT) - tcp_connect_finish(c, conn, now); + tcp_connect_finish(c, conn); /* Data? 
Check later */ } } @@ -2981,9 +3087,9 @@ static int tcp_sock_refill(void *arg) } =20 for (i =3D 0; a->c->v4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) { - if (*p4 >=3D 0) { + if (*p4 >=3D 0) break; - } + *p4 =3D socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (*p4 > SOCKET_MAX) { close(*p4); @@ -2995,9 +3101,9 @@ static int tcp_sock_refill(void *arg) } =20 for (i =3D 0; a->c->v6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) { - if (*p6 >=3D 0) { + if (*p6 >=3D 0) break; - } + *p6 =3D socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (*p6 > SOCKET_MAX) { @@ -3091,72 +3197,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) return 0; } =20 -/** - * tcp_timer_one() - Handler for timed events on one socket - * @c: Execution context - * @conn: Connection pointer - * @ts: Timestamp from caller - */ -static void tcp_timer_one(struct ctx *c, struct tcp_conn *conn, - struct timespec *ts) -{ - int ack_from_tap =3D timespec_diff_ms(ts, &conn->ts_ack_from_tap); - int ack_to_tap =3D timespec_diff_ms(ts, &conn->ts_ack_to_tap); - int sock_act =3D timespec_diff_ms(ts, &conn->ts_sock_act); - int tap_act =3D timespec_diff_ms(ts, &conn->ts_tap_act); - int tap_data_noack; - - if (!memcmp(&conn->tap_data_noack, &((struct timespec){ 0, 0 }), - sizeof(struct timespec))) - tap_data_noack =3D 0; - else - tap_data_noack =3D timespec_diff_ms(ts, &conn->tap_data_noack); - - if (CONN_IS_CLOSED(conn)) { - tcp_hash_remove(conn); - tcp_table_compact(c, conn); - return; - } - - if (!(conn->events & ESTABLISHED)) { - if (ack_from_tap > SYN_TIMEOUT) - tcp_rst(c, conn); - return; - } - - if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT) - goto rst; - - if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL) - tcp_send_flag(c, conn, ACK_IF_NEEDED, ts); - - if (tap_data_noack > ACK_TIMEOUT) { - if (conn->seq_ack_from_tap < conn->seq_to_tap) { - if (tap_data_noack > LAST_ACK_TIMEOUT) - goto rst; - - conn->seq_to_tap =3D conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn, ts); - } 
- return; - } - - if (conn->events & TAP_FIN_SENT && tap_data_noack > FIN_TIMEOUT) - goto rst; - - if (conn->events & SOCK_FIN_SENT && sock_act > FIN_TIMEOUT) - goto rst; - - if (conn->events & SOCK_FIN_SENT && conn->events & SOCK_FIN_RCVD) { - if (sock_act > LAST_ACK_TIMEOUT || tap_act > LAST_ACK_TIMEOUT) - goto rst; - } - - return; -rst: - tcp_rst(c, conn); -} - /** * struct tcp_port_detect_arg - Arguments for tcp_port_detect() * @c: Execution context @@ -3281,7 +3321,6 @@ static int tcp_port_rebind(void *arg) void tcp_timer(struct ctx *c, struct timespec *now) { struct tcp_sock_refill_arg refill_arg =3D { c, 0 }; - int i; =20 if (c->mode =3D=3D MODE_PASTA) { if (timespec_diff_ms(now, &c->tcp.port_detect_ts) > @@ -3318,7 +3357,4 @@ void tcp_timer(struct ctx *c, struct timespec *now) NS_CALL(tcp_sock_refill, &refill_arg); } } - - for (i =3D c->tcp.conn_count - 1; i >=3D 0; i--) - tcp_timer_one(c, CONN(i), now); } diff --git a/tcp.h b/tcp.h index b4e3fde..3154b4b 100644 --- a/tcp.h +++ b/tcp.h @@ -6,7 +6,9 @@ #ifndef TCP_H #define TCP_H =20 -#define TCP_TIMER_INTERVAL 20 /* ms */ +#define REFILL_INTERVAL 1000 /* ms */ +#define PORT_DETECT_INTERVAL 1000 +#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) =20 #define TCP_MAX_CONNS (128 * 1024) #define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) @@ -21,7 +23,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_l4_msg *msg, int count, struct timespec *now); int tcp_sock_init(struct ctx *c, struct timespec *now); void tcp_timer(struct ctx *c, struct timespec *now); -void tcp_defer_handler(struct ctx *c, struct timespec *now); +void tcp_defer_handler(struct ctx *c); =20 void tcp_sock_set_bufsize(struct ctx *c, int s); void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, @@ -34,6 +36,7 @@ void tcp_remap_to_init(in_port_t port, in_port_t delta); * @listen: Set if this file descriptor is a listening socket * @splice: Set if descriptor is associated to a spliced connection 
* @v6: Set for IPv6 sockets or connections + * @timer: Reference is a timerfd descriptor for connection * @index: Index of connection in table, or port for bound sockets * @u32: Opaque u32 value of reference */ @@ -42,6 +45,7 @@ union tcp_epoll_ref { uint32_t listen:1, splice:1, v6:1, + timer:1, index:20; } tcp; uint32_t u32; --=20 2.35.1 --===============2945062684121323069==--