From mboxrd@z Thu Jan 1 00:00:00 1970 From: Stefano Brivio To: passt-dev@passt.top Subject: [PATCH 20/24] tcp: Fit struct tcp_conn into a single 64-byte cacheline Date: Fri, 25 Mar 2022 23:52:56 +0100 Message-ID: <20220325225300.2803584-21-sbrivio@redhat.com> In-Reply-To: <20220325225300.2803584-1-sbrivio@redhat.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============8745259835765916349==" --===============8745259835765916349== Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable ...by: - storing the chained-hash next connection pointer as a numeric reference rather than as a pointer - storing the MSS as a 14-bit value, and rounding it - using only the effective number of bits needed to store the hash bucket number - explicitly limiting window scaling factors to 4-bit values (maximum factor is 14, from RFC 7323) - scaling SO_SNDBUF values, and using an 8-bit representation for the duplicate ACK sequence - keeping window values unscaled, as received and sent Signed-off-by: Stefano Brivio --- tcp.c | 303 ++++++++++++++++++++++++++++++++-------------------------- tcp.h | 5 +- 2 files changed, 170 insertions(+), 138 deletions(-) diff --git a/tcp.c b/tcp.c index 2a5bf6e..3f61e6a 100644 --- a/tcp.c +++ b/tcp.c @@ -66,7 +66,7 @@ * ------ * * To avoid the need for dynamic memory allocation, a maximum, reasonable am= ount - * of connections is defined by MAX_TAP_CONNS below (currently 128k). + * of connections is defined by TCP_MAX_CONNS (currently 128k). 
* * Data needs to linger on sockets as long as it's not acknowledged by the * guest, and is read using MSG_PEEK into preallocated static buffers sized @@ -216,8 +216,8 @@ * @seq_init_from_tap: initial sequence number from tap/guest * @seq_init_to_tap: initial sequence number from tap/guest * - * @wnd_from_tap: last window size received from tap, scaled - * @wnd_from_tap: last window size advertised from tap, scaled + * @wnd_from_tap: last window size received from tap, never scaled + * @wnd_to_tap: last window size advertised to tap, never scaled * * - from socket to tap/guest: * - on new data from socket: @@ -299,23 +299,26 @@ #include "conf.h" #include "tcp_splice.h" =20 -#define MAX_TAP_CONNS (128 * 1024) - #define TCP_FRAMES_MEM 256 #define TCP_FRAMES \ (c->mode =3D=3D MODE_PASST ? TCP_FRAMES_MEM : 1) =20 +#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \ +#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ TCP_HASH_TABLE_LOAD) =20 #define MAX_WS 10 #define MAX_WINDOW (1 << (16 + (MAX_WS))) + +/* MSS rounding: see MSS_SET() */ #define MSS_DEFAULT 536 -#define MSS4 (USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) - \ - sizeof(struct iphdr) - sizeof(struct tcphdr)) -#define MSS6 (USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) - \ - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)) +#define MSS4 ROUND_DOWN(USHRT_MAX - \ + sizeof(uint32_t) - sizeof(struct ethhdr) - \ + sizeof(struct iphdr) - sizeof(struct tcphdr), 4) +#define MSS6 ROUND_DOWN(USHRT_MAX - \ + sizeof(uint32_t) - sizeof(struct ethhdr) - \ + sizeof(struct ipv6hdr) - sizeof(struct tcphdr), 4) =20 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -363,64 +366,46 @@ #define OPT_SACK 5 #define OPT_TS 8 =20 -struct tcp_conn; - /** * struct tcp_conn - Descriptor for a TCP connection (not spliced) - * @next: Pointer to next item in hash chain, if any + * @next_index: Connection index 
of next item in hash chain, -1 for none + * @tap_mss: MSS advertised by tap/guest, rounded down to 2 ^ (16 - TCP_MSS_BITS) * @sock: Socket descriptor number + * @events: Connection events, implying connection states + * @timer: timerfd descriptor for timeout events + * @flags: Connection flags representing internal attributes * @hash_bucket: Bucket index in connection lookup hash table + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @sndbuf: Sending buffer in kernel, rounded down to 2 ^ (32 - SNDBUF_BITS) + * @seq_dup_ack_approx: Last duplicate ACK number sent to tap, low 8 bits only * @a.a6: IPv6 remote address, can be IPv4-mapped * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 * @a.a4.one: Ones prefix for IPv4-mapped * @a.a4.a: IPv4 address * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port - * @events: Connection events, implying connection states - * @flags: Connection flags representing internal attributes - * @tap_mss: Maximum segment size advertised by guest + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap - * @seq_dup_ack: Last duplicate ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap - * @seq_init_from_tap: Initial sequence number to tap - * @ws_tap: Window scaling factor from tap - * @ws: Window scaling factor - * @wnd_from_tap: Last window size received from tap, scaled - * @wnd_to_tap: Socket-side sending window, advertised to tap - * @snd_buf: Socket sending buffer reported by kernel, in bytes - * @ts_sock_act: Last activity timestamp from socket for timeout 
purposes - * @ts_tap_act: Last activity timestamp from tap for timeout purposes - * @ts_ack_from_tap: Last ACK segment timestamp from tap - * @ts_ack_to_tap: Last ACK segment timestamp to tap - * @tap_data_noack: Last unacked data to tap, set to { 0, 0 } on ACK */ struct tcp_conn { - struct tcp_conn *next; - int32_t sock:SOCKET_REF_BITS; -#define TCP_RETRANS_BITS 3 - unsigned int retrans:TCP_RETRANS_BITS; -#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) - int timer; - int hash_bucket; + int32_t next_index :TCP_CONN_INDEX_BITS + 1; + +#define TCP_MSS_BITS 14 + uint16_t tap_mss :TCP_MSS_BITS; +#define MSS_SET(conn, mss) (conn->tap_mss =3D (mss >> (16 - TCP_MSS_BITS))) +#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) =20 - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - struct in_addr a; - } a4; - } a; -#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) -#define CONN_V6(conn) (!CONN_V4(conn)) =20 - in_port_t tap_port; - in_port_t sock_port; + int32_t sock :SOCKET_REF_BITS; =20 - uint8_t events; + uint8_t events; #define CLOSED 0 #define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */ #define TAP_SYN_RCVD BIT(1) /* implies socket connecting */ @@ -435,7 +420,10 @@ struct tcp_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) =20 - uint8_t flags; + + int32_t timer :SOCKET_REF_BITS; + + uint8_t flags; #define STALLED BIT(0) #define LOCAL BIT(1) #define WND_CLAMPED BIT(2) @@ -444,23 +432,48 @@ struct tcp_conn { #define ACK_TO_TAP_DUE BIT(5) #define ACK_FROM_TAP_DUE BIT(6) =20 - uint16_t tap_mss; =20 - uint32_t seq_to_tap; - uint32_t seq_ack_from_tap; - uint32_t seq_from_tap; - uint32_t seq_ack_to_tap; - uint32_t seq_dup_ack; - uint32_t seq_init_from_tap; - uint32_t seq_init_to_tap; + uint32_t hash_bucket :TCP_HASH_BUCKET_BITS; + +#define TCP_RETRANS_BITS 3 + unsigned int retrans :TCP_RETRANS_BITS; +#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) 
- 1) =20 - uint16_t ws_tap; - uint16_t ws; +#define TCP_WS_BITS 4 /* RFC 7323 */ + uint8_t ws_from_tap :TCP_WS_BITS; + uint8_t ws_to_tap :TCP_WS_BITS; =20 - uint32_t wnd_from_tap; - uint32_t wnd_to_tap; =20 - int snd_buf; +#define SNDBUF_BITS 24 + uint32_t sndbuf :SNDBUF_BITS; +#define SNDBUF_SET(conn, bytes) (conn->sndbuf =3D ((bytes) >> (32 - SNDBUF_B= ITS))) +#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS)) + + uint8_t seq_dup_ack_approx; + + + union { + struct in6_addr a6; + struct { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a; + } a4; + } a; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) +#define CONN_V6(conn) (!CONN_V4(conn)) + + in_port_t tap_port; + in_port_t sock_port; + + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; }; =20 #define CONN_IS_CLOSED(conn) (conn->events =3D=3D CLOSED) @@ -471,6 +484,12 @@ struct tcp_conn { =20 #define CONN(index) (tc + (index)) =20 +/* We probably don't want to use gcc statement expressions (for portability)= , so + * use this only after well-defined sequence points (no pre-/post-increments= ). + */ +#define CONN_OR_NULL(index) \ + (((index) >=3D 0 && (index) < TCP_MAX_CONNS) ? 
(tc + (index)) : NULL) + static const char *tcp_event_str[] __attribute((__unused__)) =3D { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", =20 @@ -652,7 +671,7 @@ static unsigned int tcp6_l2_flags_buf_used; static size_t tcp6_l2_flags_buf_bytes; =20 /* TCP connections */ -static struct tcp_conn tc[MAX_TAP_CONNS]; +static struct tcp_conn tc[TCP_MAX_CONNS]; =20 /* Table for lookup from remote address, local port, remote port */ static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE]; @@ -747,12 +766,14 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_con= n *conn) .r.p.tcp.tcp.index =3D conn - tc }; struct epoll_event ev =3D { .data.u64 =3D ref.u64, .events =3D EPOLLIN | EPOLLET }; + int fd; =20 - conn->timer =3D timerfd_create(CLOCK_MONOTONIC, 0); - if (conn->timer =3D=3D -1) { + fd =3D timerfd_create(CLOCK_MONOTONIC, 0); + if (fd =3D=3D -1 || fd > SOCKET_MAX) { debug("TCP: failed to get timer: %s", strerror(errno)); return; } + conn->timer =3D fd; =20 if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { debug("TCP: failed to add timer: %s", strerror(errno)); @@ -957,7 +978,7 @@ static void tcp_get_sndbuf(struct tcp_conn *conn) =20 sl =3D sizeof(sndbuf); if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, &sl)) { - conn->snd_buf =3D WINDOW_DEFAULT; + SNDBUF_SET(conn, WINDOW_DEFAULT); return; } =20 @@ -967,7 +988,7 @@ static void tcp_get_sndbuf(struct tcp_conn *conn) else if (v > SNDBUF_SMALL) v -=3D v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; =20 - conn->snd_buf =3D MIN(INT_MAX, v); + SNDBUF_SET(conn, MIN(INT_MAX, v)); } =20 /** @@ -1299,12 +1320,12 @@ static void tcp_hash_insert(struct ctx *c, struct tcp= _conn *conn, int b; =20 b =3D tcp_hash(c, af, addr, conn->tap_port, conn->sock_port); - conn->next =3D tc_hash[b]; + conn->next_index =3D tc_hash[b] ? 
tc_hash[b] - tc : -1; tc_hash[b] =3D conn; conn->hash_bucket =3D b; =20 debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p", - conn - tc, conn->sock, b, conn->next); + conn - tc, conn->sock, b, CONN_OR_NULL(conn->next_index)); } =20 /** @@ -1316,18 +1337,20 @@ static void tcp_hash_remove(struct tcp_conn *conn) struct tcp_conn *entry, *prev =3D NULL; int b =3D conn->hash_bucket; =20 - for (entry =3D tc_hash[b]; entry; prev =3D entry, entry =3D entry->next) { + for (entry =3D tc_hash[b]; entry; + prev =3D entry, entry =3D CONN_OR_NULL(entry->next_index)) { if (entry =3D=3D conn) { if (prev) - prev->next =3D conn->next; + prev->next_index =3D conn->next_index; else - tc_hash[b] =3D conn->next; + tc_hash[b] =3D CONN_OR_NULL(conn->next_index); break; } } =20 debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p", - conn - tc, conn->sock, b, prev ? prev->next : tc_hash[b]); + conn - tc, conn->sock, b, + prev ? CONN_OR_NULL(prev->next_index) : tc_hash[b]); } =20 /** @@ -1340,10 +1363,11 @@ static void tcp_hash_update(struct tcp_conn *old, str= uct tcp_conn *new) struct tcp_conn *entry, *prev =3D NULL; int b =3D old->hash_bucket; =20 - for (entry =3D tc_hash[b]; entry; prev =3D entry, entry =3D entry->next) { + for (entry =3D tc_hash[b]; entry; + prev =3D entry, entry =3D CONN_OR_NULL(entry->next_index)) { if (entry =3D=3D old) { if (prev) - prev->next =3D new; + prev->next_index =3D new - tc; else tc_hash[b] =3D new; break; @@ -1371,7 +1395,7 @@ static struct tcp_conn *tcp_hash_lookup(struct ctx *c, = int af, void *addr, int b =3D tcp_hash(c, af, addr, tap_port, sock_port); struct tcp_conn *conn; =20 - for (conn =3D tc_hash[b]; conn; conn =3D conn->next) { + for (conn =3D tc_hash[b]; conn; conn =3D CONN_OR_NULL(conn->next_index)) { if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) return conn; } @@ -1586,21 +1610,11 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, = struct tcp_conn *conn, b->th.dest =3D 
htons(conn->tap_port); \ b->th.seq =3D htonl(seq); \ b->th.ack_seq =3D htonl(conn->seq_ack_to_tap); \ - \ - /* First value sent by receiver is not scaled */ \ - if (b->th.syn) { \ - b->th.window =3D htons(MIN(conn->wnd_to_tap, \ - USHRT_MAX)); \ - } else { \ - b->th.window =3D htons(MIN(conn->wnd_to_tap >> \ - conn->ws, \ - USHRT_MAX)); \ - } \ + b->th.window =3D htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \ } while (0) =20 if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b =3D (struct tcp6_l2_buf_t *)p; - uint32_t flow =3D conn->seq_init_to_tap; =20 ip_len =3D plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); =20 @@ -1617,9 +1631,9 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, st= ruct tcp_conn *conn, =20 tcp_update_check_tcp6(b); =20 - b->ip6h.flow_lbl[0] =3D (flow >> 16) & 0xf; - b->ip6h.flow_lbl[1] =3D (flow >> 8) & 0xff; - b->ip6h.flow_lbl[2] =3D (flow >> 0) & 0xff; + b->ip6h.flow_lbl[0] =3D (conn->sock >> 16) & 0xf; + b->ip6h.flow_lbl[1] =3D (conn->sock >> 8) & 0xff; + b->ip6h.flow_lbl[2] =3D (conn->sock >> 0) & 0xff; =20 eth_len =3D ip_len + sizeof(struct ethhdr); if (c->mode =3D=3D MODE_PASST) @@ -1663,10 +1677,11 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, = struct tcp_conn *conn, static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, int force_seq, struct tcp_info *tinfo) { + uint32_t prev_wnd_to_tap =3D conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap =3D conn->seq_ack_to_tap; - uint32_t prev_wnd_to_tap =3D conn->wnd_to_tap; socklen_t sl =3D sizeof(*tinfo); struct tcp_info tinfo_new; + uint32_t new_wnd_to_tap =3D prev_wnd_to_tap; int s =3D conn->sock; =20 #ifndef HAS_BYTES_ACKED @@ -1676,7 +1691,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct = tcp_conn *conn, if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap =3D prev_ack_to_tap; #else - if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn) + if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || 
tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { conn->seq_ack_to_tap =3D conn->seq_from_tap; } else if (conn->seq_ack_to_tap !=3D conn->seq_from_tap) { @@ -1696,12 +1711,13 @@ static int tcp_update_seqack_wnd(struct ctx *c, struc= t tcp_conn *conn, =20 if (!KERNEL_REPORTS_SND_WND(c)) { tcp_get_sndbuf(conn); - conn->wnd_to_tap =3D MIN(conn->snd_buf, MAX_WINDOW); + new_wnd_to_tap =3D MIN(SNDBUF_GET(conn), MAX_WINDOW); + conn->wnd_to_tap =3D new_wnd_to_tap >> conn->ws_to_tap; goto out; } =20 if (!tinfo) { - if (conn->wnd_to_tap > WINDOW_DEFAULT) + if (prev_wnd_to_tap > WINDOW_DEFAULT) goto out; =20 tinfo =3D &tinfo_new; @@ -1711,19 +1727,20 @@ static int tcp_update_seqack_wnd(struct ctx *c, struc= t tcp_conn *conn, =20 #ifdef HAS_SND_WND if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { - conn->wnd_to_tap =3D tinfo->tcpi_snd_wnd; + new_wnd_to_tap =3D tinfo->tcpi_snd_wnd; } else { tcp_get_sndbuf(conn); - conn->wnd_to_tap =3D MIN((int)tinfo->tcpi_snd_wnd, conn->snd_buf); + new_wnd_to_tap =3D MIN((int)tinfo->tcpi_snd_wnd, + SNDBUF_GET(conn)); } #endif =20 - conn->wnd_to_tap =3D MIN(conn->wnd_to_tap, MAX_WINDOW); + conn->wnd_to_tap =3D MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap; =20 if (!conn->wnd_to_tap) conn_flag(c, conn, ACK_TO_TAP_DUE); out: - return conn->wnd_to_tap !=3D prev_wnd_to_tap || + return new_wnd_to_tap !=3D prev_wnd_to_tap || conn->seq_ack_to_tap !=3D prev_ack_to_tap; } =20 @@ -1813,16 +1830,14 @@ static int tcp_send_flag(struct ctx *c, struct tcp_co= nn *conn, int flags) c->tcp.kernel_snd_wnd =3D 1; #endif =20 - conn->ws =3D MIN(MAX_WS, tinfo.tcpi_snd_wscale); + conn->ws_to_tap =3D MIN(MAX_WS, tinfo.tcpi_snd_wscale); =20 *data++ =3D OPT_NOP; *data++ =3D OPT_WS; *data++ =3D OPT_WS_LEN; - *data++ =3D conn->ws; + *data++ =3D conn->ws_to_tap; =20 th->ack =3D !!(flags & ACK); - - conn->wnd_to_tap =3D WINDOW_DEFAULT; } else { th->ack =3D !!(flags & (ACK | DUP_ACK)) || conn->seq_ack_to_tap !=3D prev_ack_to_tap || 
@@ -1839,6 +1854,10 @@ static int tcp_send_flag(struct ctx *c, struct tcp_con= n *conn, int flags) NULL, conn->seq_to_tap); iov->iov_len =3D eth_len + sizeof(uint32_t); =20 + /* First value is not scaled: scale now */ + if (flags & SYN) + conn->wnd_to_tap >>=3D conn->ws_to_tap; + if (CONN_V4(conn)) tcp4_l2_flags_buf_bytes +=3D iov->iov_len; else @@ -1908,7 +1927,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_= conn *conn, if (init && th) { int ws =3D tcp_opt_get(th, len, OPT_WS, NULL, NULL); =20 - conn->ws_tap =3D ws; + conn->ws_from_tap =3D ws & 0xf; =20 /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp * yet, to avoid getting a zero scale just because we set a @@ -1916,30 +1935,34 @@ static void tcp_clamp_window(struct ctx *c, struct tc= p_conn *conn, */ conn->wnd_from_tap =3D ntohs(th->window); } else { + uint32_t prev_scaled =3D conn->wnd_from_tap << conn->ws_from_tap; + if (th) - window =3D ntohs(th->window) << conn->ws_tap; + window =3D ntohs(th->window) << conn->ws_from_tap; else - window <<=3D conn->ws_tap; + window <<=3D conn->ws_from_tap; =20 window =3D MIN(MAX_WINDOW, window); =20 if (conn->flags & WND_CLAMPED) { - if (conn->wnd_from_tap =3D=3D window) + if (prev_scaled =3D=3D window) return; =20 /* Discard +/- 1% updates to spare some syscalls. 
*/ - if ((window > conn->wnd_from_tap && - window * 99 / 100 < conn->wnd_from_tap) || - (window < conn->wnd_from_tap && - window * 101 / 100 > conn->wnd_from_tap)) { - conn->wnd_from_tap =3D window; + if ((window > prev_scaled && + window * 99 / 100 < prev_scaled) || + (window < prev_scaled && + window * 101 / 100 > prev_scaled)) { + conn->wnd_from_tap =3D window >> + conn->ws_from_tap; return; } } =20 - conn->wnd_from_tap =3D window; if (window < 256) window =3D 256; + + conn->wnd_from_tap =3D window >> conn->ws_from_tap; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &window, sizeof(window)); conn_flag(c, conn, WND_CLAMPED); @@ -2090,7 +2113,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, vo= id *addr, const struct sockaddr *sa; struct tcp_conn *conn; socklen_t sl; - int s; + int s, mss; =20 if (c->tcp.conn_count >=3D TCP_MAX_CONNS) return; @@ -2120,14 +2143,14 @@ static void tcp_conn_from_tap(struct ctx *c, int af, = void *addr, conn =3D CONN(c->tcp.conn_count++); conn->sock =3D s; conn->timer =3D -1; + conn->ws_to_tap =3D conn->ws_from_tap =3D 0; conn_event(c, conn, TAP_SYN_RCVD); =20 conn->wnd_to_tap =3D WINDOW_DEFAULT; =20 - conn->tap_mss =3D tcp_conn_tap_mss(c, conn, th, len); - - sl =3D sizeof(conn->tap_mss); - setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->tap_mss, sl); + mss =3D tcp_conn_tap_mss(c, conn, th, len); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)); + MSS_SET(conn, mss); =20 tcp_clamp_window(c, conn, th, len, 0, 1); =20 @@ -2153,7 +2176,6 @@ static void tcp_conn_from_tap(struct ctx *c, int af, vo= id *addr, conn->seq_ack_to_tap =3D conn->seq_from_tap; =20 conn->seq_to_tap =3D tcp_seq_init(c, af, addr, th->dest, th->source, now); - conn->seq_init_to_tap =3D conn->seq_to_tap; conn->seq_ack_from_tap =3D conn->seq_to_tap + 1; =20 tcp_hash_insert(c, conn, af, addr); @@ -2256,10 +2278,12 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp= _conn *conn, ssize_t plen, */ static int tcp_data_from_sock(struct ctx *c, struct 
tcp_conn *conn) { + uint32_t wnd_scaled =3D conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs =3D 0, last_len, iov_rem =3D 0; int sendlen, len, plen, v4 =3D CONN_V4(conn); int s =3D conn->sock, i, ret =3D 0; struct msghdr mh_sock =3D { 0 }; + uint16_t mss =3D MSS_GET(conn); uint32_t already_sent; struct iovec *iov; =20 @@ -2271,20 +2295,19 @@ static int tcp_data_from_sock(struct ctx *c, struct t= cp_conn *conn) already_sent =3D 0; } =20 - if (!conn->wnd_from_tap || already_sent >=3D conn->wnd_from_tap) { + if (!wnd_scaled || already_sent >=3D wnd_scaled) { conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } =20 /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs =3D DIV_ROUND_UP(conn->wnd_from_tap - already_sent, - conn->tap_mss); + fill_bufs =3D DIV_ROUND_UP(wnd_scaled - already_sent, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs =3D TCP_FRAMES; iov_rem =3D 0; } else { - iov_rem =3D (conn->wnd_from_tap - already_sent) % conn->tap_mss; + iov_rem =3D (wnd_scaled - already_sent) % mss; } =20 mh_sock.msg_iov =3D iov_sock; @@ -2302,7 +2325,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp= _conn *conn) iov->iov_base =3D &tcp4_l2_buf[tcp4_l2_buf_used + i].data; else iov->iov_base =3D &tcp6_l2_buf[tcp6_l2_buf_used + i].data; - iov->iov_len =3D conn->tap_mss; + iov->iov_len =3D mss; } if (iov_rem) iov_sock[fill_bufs].iov_len =3D iov_rem; @@ -2327,14 +2350,14 @@ recvmsg: =20 conn_flag(c, conn, ~STALLED); =20 - send_bufs =3D DIV_ROUND_UP(sendlen, conn->tap_mss); - last_len =3D sendlen - (send_bufs - 1) * conn->tap_mss; + send_bufs =3D DIV_ROUND_UP(sendlen, mss); + last_len =3D sendlen - (send_bufs - 1) * mss; =20 /* Likely, some new data was acked too. 
*/ tcp_update_seqack_wnd(c, conn, 0, NULL); =20 /* Finally, queue to tap */ - plen =3D conn->tap_mss; + plen =3D mss; for (i =3D 0; i < send_bufs; i++) { int no_csum =3D i && i !=3D send_bufs - 1 && tcp4_l2_buf_used; =20 @@ -2383,8 +2406,8 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp= _conn *conn, struct tap_l4_msg *msg, int count) { int i, iov_i, ack =3D 0, fin =3D 0, retr =3D 0, keep =3D -1; - uint32_t max_ack_seq =3D conn->seq_ack_from_tap; uint16_t max_ack_seq_wnd =3D conn->wnd_from_tap; + uint32_t max_ack_seq =3D conn->seq_ack_from_tap; uint32_t seq_from_tap =3D conn->seq_from_tap; struct msghdr mh =3D { .msg_iov =3D tcp_iov }; int partial_send =3D 0; @@ -2541,8 +2564,12 @@ eintr: =20 out: if (keep !=3D -1) { - if (conn->seq_dup_ack !=3D conn->seq_from_tap) { - conn->seq_dup_ack =3D conn->seq_from_tap; + /* We use an 8-bit approximation here: the associated risk is + * that we skip a duplicate ACK on 8-bit sequence number + * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. 
+ */ + if (conn->seq_dup_ack_approx !=3D (conn->seq_from_tap & 0xff)) { + conn->seq_dup_ack_approx =3D conn->seq_from_tap & 0xff; tcp_send_flag(c, conn, DUP_ACK); } return; @@ -2572,7 +2599,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, st= ruct tcp_conn *conn, struct tcphdr *th, size_t len) { tcp_clamp_window(c, conn, th, len, 0, 1); - conn->tap_mss =3D tcp_conn_tap_mss(c, conn, th, len); + MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len)); =20 conn->seq_init_from_tap =3D ntohl(th->seq) + 1; conn->seq_from_tap =3D conn->seq_init_from_tap; @@ -2744,6 +2771,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epo= ll_ref ref, conn =3D CONN(c->tcp.conn_count++); conn->sock =3D s; conn->timer =3D -1; + conn->ws_to_tap =3D conn->ws_from_tap =3D 0; conn_event(c, conn, SOCK_ACCEPTED); =20 if (ref.r.p.tcp.tcp.v6) { @@ -2773,7 +2801,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epo= ll_ref ref, conn->sock_port, conn->tap_port, now); - conn->seq_init_to_tap =3D conn->seq_to_tap; =20 tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr); } else { @@ -2800,7 +2827,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epo= ll_ref ref, conn->sock_port, conn->tap_port, now); - conn->seq_init_to_tap =3D conn->seq_to_tap; =20 tcp_hash_insert(c, conn, AF_INET, &s_addr); } @@ -2822,9 +2848,12 @@ static void tcp_conn_from_sock(struct ctx *c, union ep= oll_ref ref, */ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { - struct tcp_conn *conn =3D CONN(ref.r.p.tcp.tcp.index); + struct tcp_conn *conn =3D CONN_OR_NULL(ref.r.p.tcp.tcp.index); struct epoll_event ev =3D { 0 }; =20 + if (!conn) + return; + if (CONN_IS_CLOSED(conn)) { tcp_hash_remove(conn); tcp_table_compact(c, conn); @@ -2898,7 +2927,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref re= f, uint32_t events, return; } =20 - if (!(conn =3D CONN(ref.r.p.tcp.tcp.index))) + if (!(conn =3D CONN_OR_NULL(ref.r.p.tcp.tcp.index))) return; =20 if (events & EPOLLERR) { @@ -3098,7 +3127,8 
@@ static int tcp_sock_refill(void *arg) return -EIO; } =20 - tcp_sock_set_bufsize(a->c, *p4); + if (*p4 >=3D 0) + tcp_sock_set_bufsize(a->c, *p4); } =20 for (i =3D 0; a->c->v6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) { @@ -3113,7 +3143,8 @@ static int tcp_sock_refill(void *arg) return -EIO; } =20 - tcp_sock_set_bufsize(a->c, *p6); + if (*p6 >=3D 0) + tcp_sock_set_bufsize(a->c, *p6); } =20 return 0; diff --git a/tcp.h b/tcp.h index 3154b4b..109516d 100644 --- a/tcp.h +++ b/tcp.h @@ -6,11 +6,12 @@ #ifndef TCP_H #define TCP_H =20 -#define REFILL_INTERVAL 1000 /* ms */ +#define REFILL_INTERVAL 1000 /* ms */ #define PORT_DETECT_INTERVAL 1000 #define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) =20 -#define TCP_MAX_CONNS (128 * 1024) +#define TCP_CONN_INDEX_BITS 17 /* 128k */ +#define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS) #define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) =20 #define TCP_SOCK_POOL_SIZE 32 --=20 2.35.1 --===============8745259835765916349==--