On Fri, May 31, 2024 at 04:23:37PM +0200, Laurent Vivier wrote: This patch needs a commit message. > Signed-off-by: Laurent Vivier > --- > Makefile | 5 +- > tcp.c | 575 ++----------------------------------------------- > tcp_buf.c | 526 ++++++++++++++++++++++++++++++++++++++++++++ > tcp_buf.h | 16 ++ > tcp_internal.h | 87 ++++++++ > 5 files changed, 652 insertions(+), 557 deletions(-) > create mode 100644 tcp_buf.c > create mode 100644 tcp_buf.h > create mode 100644 tcp_internal.h > > diff --git a/Makefile b/Makefile > index 8ea175762e36..1ac2e5e0053f 100644 > --- a/Makefile > +++ b/Makefile > @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) > PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ > icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ > ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ > - tcp_splice.c udp.c util.c > + tcp_buf.c tcp_splice.c udp.c util.c > QRAP_SRCS = qrap.c > SRCS = $(PASST_SRCS) $(QRAP_SRCS) > > @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 > PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ > flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ > lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ > - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h > + siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ > + udp.h util.h > HEADERS = $(PASST_HEADERS) seccomp.h > > C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; > diff --git a/tcp.c b/tcp.c > index a6f43010f58f..48d8f7c6d696 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -302,28 +302,14 @@ > #include "flow.h" > > #include "flow_table.h" > - > -#define TCP_FRAMES_MEM 128 > -#define TCP_FRAMES \ > - (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) > +#include "tcp_internal.h" > +#include "tcp_buf.h" > > #define TCP_HASH_TABLE_LOAD 70 /* % */ > #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) > > -#define MAX_WS 8 > -#define MAX_WINDOW (1 << (16 + (MAX_WS))) > - > /* MSS rounding: see SET_MSS() */ > #define MSS_DEFAULT 536 > -#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ > - sizeof(struct tcphdr) - \ > - sizeof(struct iphdr), \ > - sizeof(uint32_t)) > -#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ > - sizeof(struct tcphdr) - \ > - sizeof(struct ipv6hdr), \ > - sizeof(uint32_t)) > - > #define WINDOW_DEFAULT 14600 /* RFC 6928 */ > #ifdef HAS_SND_WND > # define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) > @@ -345,33 +331,10 @@ > */ > #define SOL_TCP IPPROTO_TCP > > -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) > -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) > -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) > -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) > - > -#define FIN (1 << 0) > -#define SYN (1 << 1) > -#define RST (1 << 2) > -#define ACK (1 << 4) > -/* Flags for internal usage */ > -#define DUP_ACK (1 << 5) > #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ > > -#define OPT_EOL 0 > -#define OPT_NOP 1 > -#define OPT_MSS 2 > -#define OPT_MSS_LEN 4 > -#define OPT_WS 3 > -#define OPT_WS_LEN 3 > -#define OPT_SACKP 4 > -#define OPT_SACK 5 > -#define OPT_TS 8 > - > #define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) > > -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) > -#define CONN_V6(conn) (!CONN_V4(conn)) > #define CONN_IS_CLOSING(conn) \ > ((conn->events & ESTABLISHED) && \ > (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) > @@ -408,114 +371,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; > */ > static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; > > -/** > - * tcp_buf_seq_update - Sequences to update with length of frames once sent > - * @seq: Pointer to sequence number sent to tap-side, to be updated > - * @len: TCP payload length > - */ > -struct 
tcp_buf_seq_update { > - uint32_t *seq; > - uint16_t len; > -}; This will conflict with Jon's upcoming changes, and I think it will be simpler if his go first (although they have taken rather longer to land than I was expecting). > -/* Static buffers */ > -/** > - * struct tcp_payload_t - TCP header and data to send segments with payload > - * @th: TCP header > - * @data: TCP data > - */ > -struct tcp_payload_t { > - struct tcphdr th; > - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; > -#ifdef __AVX2__ > -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ > -#else > -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); > -#endif > - > -/** > - * struct tcp_flags_t - TCP header and data to send zero-length > - * segments (flags) > - * @th: TCP header > - * @opts TCP options > - */ > -struct tcp_flags_t { > - struct tcphdr th; > - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; > -#ifdef __AVX2__ > -} __attribute__ ((packed, aligned(32))); > -#else > -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); > -#endif > - > -/* Ethernet header for IPv4 frames */ > -static struct ethhdr tcp4_eth_src; > - > -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; > -/* IPv4 headers */ > -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; > -/* TCP segments with payload for IPv4 frames */ > -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; > - > -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); > - > -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; > -static unsigned int tcp4_payload_used; > - > -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; > -/* IPv4 headers for TCP segment without payload */ > -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; > -/* TCP segments without payload for IPv4 frames */ > -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; > - > -static unsigned int tcp4_flags_used; > - > -/* Ethernet header for IPv6 frames */ > -static 
struct ethhdr tcp6_eth_src; > - > -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; > -/* IPv6 headers */ > -static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; > -/* TCP headers and data for IPv6 frames */ > -static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; > - > -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); > - > -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; > -static unsigned int tcp6_payload_used; > - > -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; > -/* IPv6 headers for TCP segment without payload */ > -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; > -/* TCP segment without payload for IPv6 frames */ > -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; > - > -static unsigned int tcp6_flags_used; > - > -/* recvmsg()/sendmsg() data for tap */ > -static char tcp_buf_discard [MAX_WINDOW]; > -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; > - > -/* > - * enum tcp_iov_parts - I/O vector parts for one TCP frame > - * @TCP_IOV_TAP tap backend specific header > - * @TCP_IOV_ETH Ethernet header > - * @TCP_IOV_IP IP (v4/v6) header > - * @TCP_IOV_PAYLOAD IP payload (TCP header + data) > - * @TCP_NUM_IOVS the number of entries in the iovec array > - */ > -enum tcp_iov_parts { > - TCP_IOV_TAP = 0, > - TCP_IOV_ETH = 1, > - TCP_IOV_IP = 2, > - TCP_IOV_PAYLOAD = 3, > - TCP_NUM_IOVS > -}; > - > -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > +char tcp_buf_discard [MAX_WINDOW]; > > /* sendmsg() to socket */ > static struct iovec tcp_iov [UIO_MAXIOV]; > @@ -560,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) > return EPOLLRDHUP; > } > > -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn 
*conn, > - unsigned long flag); > -#define conn_flag(c, conn, flag) \ > - do { \ > - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ > - conn_flag_do(c, conn, flag); \ > - } while (0) > - > /** > * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events > * @c: Execution context > @@ -679,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > * @conn: Connection pointer > * @flag: Flag to set, or ~flag to unset > */ > -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, > - unsigned long flag) > +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, > + unsigned long flag) > { > if (flag & (flag - 1)) { > int flag_index = fls(~flag); > @@ -730,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c, > * @conn: Connection pointer > * @event: Connection event > */ > -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > - unsigned long event) > +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > + unsigned long event) > { > int prev, new, num = fls(event); > > @@ -779,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > tcp_timer_ctl(c, conn); > } > > -#define conn_event(c, conn, event) \ > - do { \ > - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ > - conn_event_do(c, conn, event); \ > - } while (0) > - > /** > * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint > * @conn: Connection pointer > @@ -914,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) > th->check = csum(th, l4len, sum); > } > > -/** > - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses > - * @eth_d: Ethernet destination address, NULL if unchanged > - * @eth_s: Ethernet source address, NULL if unchanged > - */ > -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) > -{ > - eth_update_mac(&tcp4_eth_src, eth_d, eth_s); > 
- eth_update_mac(&tcp6_eth_src, eth_d, eth_s); > -} > - > -/** > - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets > - * @c: Execution context > - */ > -static void tcp_sock4_iov_init(const struct ctx *c) > -{ > - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); > - struct iovec *iov; > - int i; > - > - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); > - > - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { > - tcp4_payload_ip[i] = iph; > - tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; > - tcp4_payload[i].th.ack = 1; > - } > - > - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { > - tcp4_flags_ip[i] = iph; > - tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; > - tcp4_flags[i].th.ack = 1; > - } > - > - for (i = 0; i < TCP_FRAMES_MEM; i++) { > - iov = tcp4_l2_iov[i]; > - > - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); > - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); > - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); > - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; > - } > - > - for (i = 0; i < TCP_FRAMES_MEM; i++) { > - iov = tcp4_l2_flags_iov[i]; > - > - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); > - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; > - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); > - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); > - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; > - } > -} > - > -/** > - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets > - * @c: Execution context > - */ > -static void tcp_sock6_iov_init(const struct ctx *c) > -{ > - struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); > - struct iovec *iov; > - int i; > - > - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); > - > - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { > - tcp6_payload_ip[i] = ip6; > - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; > - tcp6_payload[i].th.ack = 1; > - } > - > - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { > - 
tcp6_flags_ip[i] = ip6; > - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; > - tcp6_flags[i].th .ack = 1; > - } > - > - for (i = 0; i < TCP_FRAMES_MEM; i++) { > - iov = tcp6_l2_iov[i]; > - > - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); > - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); > - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); > - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; > - } > - > - for (i = 0; i < TCP_FRAMES_MEM; i++) { > - iov = tcp6_l2_flags_iov[i]; > - > - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); > - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); > - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); > - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; > - } > -} > - > /** > * tcp_opt_get() - Get option, and value if any, from TCP header > * @opts: Pointer to start of TCP options in header > @@ -1235,50 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn) > return true; > } > > -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); > -#define tcp_rst(c, conn) \ > - do { \ > - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ > - tcp_rst_do(c, conn); \ > - } while (0) > - > -/** > - * tcp_flags_flush() - Send out buffers for segments with no data (flags) > - * @c: Execution context > - */ > -static void tcp_flags_flush(const struct ctx *c) > -{ > - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, > - tcp6_flags_used); > - tcp6_flags_used = 0; > - > - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, > - tcp4_flags_used); > - tcp4_flags_used = 0; > -} > - > -/** > - * tcp_payload_flush() - Send out buffers for segments with data > - * @c: Execution context > - */ > -static void tcp_payload_flush(const struct ctx *c) > -{ > - unsigned i; > - size_t m; > - > - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, > - tcp6_payload_used); > - for (i = 0; i < m; i++) > - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; > - tcp6_payload_used = 0; 
> - > - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, > - tcp4_payload_used); > - for (i = 0; i < m; i++) > - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; > - tcp4_payload_used = 0; > -} > - > /** > * tcp_defer_handler() - Handler for TCP deferred tasks > * @c: Execution context > @@ -1326,7 +1026,7 @@ static void tcp_fill_header(struct tcphdr *th, > * > * Return: The IPv4 payload length, host order > */ > -static size_t tcp_fill_headers4(const struct ctx *c, > +size_t tcp_fill_headers4(const struct ctx *c, > const struct tcp_tap_conn *conn, > struct tap_hdr *taph, > struct iphdr *iph, struct tcphdr *th, > @@ -1369,11 +1069,11 @@ static size_t tcp_fill_headers4(const struct ctx *c, > * > * Return: The IPv6 payload length, host order > */ > -static size_t tcp_fill_headers6(const struct ctx *c, > - const struct tcp_tap_conn *conn, > - struct tap_hdr *taph, > - struct ipv6hdr *ip6h, struct tcphdr *th, > - size_t dlen, uint32_t seq) > +size_t tcp_fill_headers6(const struct ctx *c, > + const struct tcp_tap_conn *conn, > + struct tap_hdr *taph, > + struct ipv6hdr *ip6h, struct tcphdr *th, > + size_t dlen, uint32_t seq) > { > size_t l4len = dlen + sizeof(*th); > > @@ -1410,8 +1110,8 @@ static size_t tcp_fill_headers6(const struct ctx *c, > * > * Return: 1 if sequence or window were updated, 0 otherwise > */ > -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > - int force_seq, struct tcp_info *tinfo) > +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > + int force_seq, struct tcp_info *tinfo) > { > uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; > uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; > @@ -1530,7 +1230,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, > * 0 if there is no flag to send > * 1 otherwise > */ > -static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, > +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, > int 
flags, struct tcphdr *th, char *data, > size_t *optlen) > { > @@ -1620,69 +1320,9 @@ static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, > return 1; > } > > -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > { > - struct tcp_flags_t *payload; > - size_t optlen = 0; > - struct iovec *iov; > - size_t l4len; > - int ret; > - > - if (CONN_V4(conn)) { > - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; > - > - payload = iov[TCP_IOV_PAYLOAD].iov_base; > - > - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, > - payload->opts, &optlen); > - if (ret <= 0) > - return ret; > - > - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, > - iov[TCP_IOV_IP].iov_base, > - iov[TCP_IOV_PAYLOAD].iov_base, optlen, > - NULL, conn->seq_to_tap); > - } else { > - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; > - > - payload = iov[TCP_IOV_PAYLOAD].iov_base; > - > - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, > - payload->opts, &optlen); > - if (ret <= 0) > - return ret; > - > - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, > - iov[TCP_IOV_IP].iov_base, > - iov[TCP_IOV_PAYLOAD].iov_base, optlen, > - conn->seq_to_tap); > - } > - iov[TCP_IOV_PAYLOAD].iov_len = l4len; > - > - if (flags & DUP_ACK) { > - struct iovec *dup_iov; > - int i; > - > - if (CONN_V4(conn)) > - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; > - else > - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; > - > - for (i = 0; i < TCP_NUM_IOVS; i++) > - memcpy(dup_iov[i].iov_base, iov[i].iov_base, > - iov[i].iov_len); > - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; > - } > - > - if (CONN_V4(conn)) { > - if (tcp4_flags_used > TCP_FRAMES_MEM - 2) > - tcp_flags_flush(c); > - } else { > - if (tcp6_flags_used > TCP_FRAMES_MEM - 2) > - tcp_flags_flush(c); > - } > - > - return 0; > + return tcp_buf_send_flag(c, conn, flags); > } > > /** > @@ -1690,7 
+1330,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > * @c: Execution context > * @conn: Connection pointer > */ > -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) > +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) > { > if (conn->events == CLOSED) > return; > @@ -2117,184 +1757,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) > return 0; > } > > -/** > - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer > - * @c: Execution context > - * @conn: Connection pointer > - * @dlen: TCP payload length > - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer > - * @seq: Sequence number to be sent > - */ > -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > - ssize_t dlen, int no_csum, uint32_t seq) > -{ > - uint32_t *seq_update = &conn->seq_to_tap; > - struct iovec *iov; > - size_t l4len; > - > - if (CONN_V4(conn)) { > - struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; > - const uint16_t *check = NULL; > - > - if (no_csum) { > - struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; > - check = &iph->check; > - } > - > - tcp4_seq_update[tcp4_payload_used].seq = seq_update; > - tcp4_seq_update[tcp4_payload_used].len = dlen; > - > - iov = tcp4_l2_iov[tcp4_payload_used++]; > - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, > - iov[TCP_IOV_IP].iov_base, > - iov[TCP_IOV_PAYLOAD].iov_base, dlen, > - check, seq); > - iov[TCP_IOV_PAYLOAD].iov_len = l4len; > - if (tcp4_payload_used > TCP_FRAMES_MEM - 1) > - tcp_payload_flush(c); > - } else if (CONN_V6(conn)) { > - tcp6_seq_update[tcp6_payload_used].seq = seq_update; > - tcp6_seq_update[tcp6_payload_used].len = dlen; > - > - iov = tcp6_l2_iov[tcp6_payload_used++]; > - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, > - iov[TCP_IOV_IP].iov_base, > - iov[TCP_IOV_PAYLOAD].iov_base, dlen, > - seq); > - 
iov[TCP_IOV_PAYLOAD].iov_len = l4len; > - if (tcp6_payload_used > TCP_FRAMES_MEM - 1) > - tcp_payload_flush(c); > - } > -} > - > -/** > - * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window > - * @c: Execution context > - * @conn: Connection pointer > - * > - * Return: negative on connection reset, 0 otherwise > - * > - * #syscalls recvmsg > - */ > static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > { > - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > - int sendlen, len, dlen, v4 = CONN_V4(conn); > - int s = conn->sock, i, ret = 0; > - struct msghdr mh_sock = { 0 }; > - uint16_t mss = MSS_GET(conn); > - uint32_t already_sent, seq; > - struct iovec *iov; > - > - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > - > - if (SEQ_LT(already_sent, 0)) { > - /* RFC 761, section 2.1. */ > - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", > - conn->seq_ack_from_tap, conn->seq_to_tap); > - conn->seq_to_tap = conn->seq_ack_from_tap; > - already_sent = 0; > - } > - > - if (!wnd_scaled || already_sent >= wnd_scaled) { > - conn_flag(c, conn, STALLED); > - conn_flag(c, conn, ACK_FROM_TAP_DUE); > - return 0; > - } > - > - /* Set up buffer descriptors we'll fill completely and partially. 
*/ > - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); > - if (fill_bufs > TCP_FRAMES) { > - fill_bufs = TCP_FRAMES; > - iov_rem = 0; > - } else { > - iov_rem = (wnd_scaled - already_sent) % mss; > - } > - > - mh_sock.msg_iov = iov_sock; > - mh_sock.msg_iovlen = fill_bufs + 1; > - > - iov_sock[0].iov_base = tcp_buf_discard; > - iov_sock[0].iov_len = already_sent; > - > - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || > - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { > - tcp_payload_flush(c); > - > - /* Silence Coverity CWE-125 false positive */ > - tcp4_payload_used = tcp6_payload_used = 0; > - } > - > - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { > - if (v4) > - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; > - else > - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; > - iov->iov_len = mss; > - } > - if (iov_rem) > - iov_sock[fill_bufs].iov_len = iov_rem; > - > - /* Receive into buffers, don't dequeue until acknowledged by guest. */ > - do > - len = recvmsg(s, &mh_sock, MSG_PEEK); > - while (len < 0 && errno == EINTR); > - > - if (len < 0) > - goto err; > - > - if (!len) { > - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { > - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { > - tcp_rst(c, conn); > - return ret; > - } > - > - conn_event(c, conn, TAP_FIN_SENT); > - } > - > - return 0; > - } > - > - sendlen = len - already_sent; > - if (sendlen <= 0) { > - conn_flag(c, conn, STALLED); > - return 0; > - } > - > - conn_flag(c, conn, ~STALLED); > - > - send_bufs = DIV_ROUND_UP(sendlen, mss); > - last_len = sendlen - (send_bufs - 1) * mss; > - > - /* Likely, some new data was acked too. 
*/ > - tcp_update_seqack_wnd(c, conn, 0, NULL); > - > - /* Finally, queue to tap */ > - dlen = mss; > - seq = conn->seq_to_tap; > - for (i = 0; i < send_bufs; i++) { > - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; > - > - if (i == send_bufs - 1) > - dlen = last_len; > - > - tcp_data_to_tap(c, conn, dlen, no_csum, seq); > - seq += dlen; > - } > - > - conn_flag(c, conn, ACK_FROM_TAP_DUE); > - > - return 0; > - > -err: > - if (errno != EAGAIN && errno != EWOULDBLOCK) { > - ret = -errno; > - tcp_rst(c, conn); > - } > - > - return ret; > + return tcp_buf_data_from_sock(c, conn); > } > > /** > diff --git a/tcp_buf.c b/tcp_buf.c > new file mode 100644 > index 000000000000..87923029a958 > --- /dev/null > +++ b/tcp_buf.c > @@ -0,0 +1,526 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > + > +/* PASST - Plug A Simple Socket Transport > + * for qemu/UNIX domain socket mode > + * > + * PASTA - Pack A Subtle Tap Abstraction > + * for network namespace/tap device mode > + * > + * tcp_buf.c - TCP L2-L4 translation state machine This description doesn't appear correct, or at least not complete, for the new file. > + * > + * Copyright (c) 2020-2022 Red Hat GmbH And this should probably be updated since you're touching it too. Maybe go with the plain "Copyright Red Hat" that Red Hat legal seems to recommend. > + * Author: Stefano Brivio > + */ > + > +#include > +#include > +#include > +#include > +#include > + > +#include > + > +#include > + > +#include "util.h" > +#include "ip.h" > +#include "iov.h" > +#include "passt.h" > +#include "tap.h" > +#include "siphash.h" > +#include "inany.h" > +#include "tcp_conn.h" > +#include "tcp_internal.h" > +#include "tcp_buf.h" > + > +#define TCP_FRAMES_MEM 128 > +#define TCP_FRAMES \ > + (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) > + > +/** > + * tcp_buf_seq_update - Sequences to update with length of frames once sent > + * @seq: Pointer to sequence number sent to tap-side, to be updated > + * @len: TCP payload length > + */ > +struct tcp_buf_seq_update { > + uint32_t *seq; > + uint16_t len; > +}; > + > +/* Static buffers */ > +/** > + * struct tcp_payload_t - TCP header and data to send segments with payload > + * @th: TCP header > + * @data: TCP data > + */ > +struct tcp_payload_t { > + struct tcphdr th; > + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; > +#ifdef __AVX2__ > +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ > +#else > +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); > +#endif > + > +/** > + * struct tcp_flags_t - TCP header and data to send zero-length > + * segments (flags) > + * @th: TCP header > + * @opts TCP options > + */ > +struct tcp_flags_t { > + struct tcphdr th; > + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; > +#ifdef __AVX2__ > +} __attribute__ ((packed, aligned(32))); > +#else > +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); > +#endif > + > +/* Ethernet header for IPv4 frames */ > +static struct ethhdr tcp4_eth_src; > + > +static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; > +/* IPv4 headers */ > +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; > +/* TCP segments with payload for IPv4 frames */ > +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; > + > +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); > + > +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; > +static unsigned int tcp4_payload_used; > + > +static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; > +/* IPv4 headers for TCP segment without payload */ > +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; > +/* TCP segments without payload for IPv4 frames */ > +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; > + > +static unsigned int 
tcp4_flags_used; > + > +/* Ethernet header for IPv6 frames */ > +static struct ethhdr tcp6_eth_src; > + > +static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; > +/* IPv6 headers */ > +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; > +/* TCP headers and data for IPv6 frames */ > +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; > + > +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); > + > +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; > +static unsigned int tcp6_payload_used; > + > +static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; > +/* IPv6 headers for TCP segment without payload */ > +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; > +/* TCP segment without payload for IPv6 frames */ > +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; > + > +static unsigned int tcp6_flags_used; > + > +/* recvmsg()/sendmsg() data for tap */ > +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; > + > +/* > + * enum tcp_iov_parts - I/O vector parts for one TCP frame > + * @TCP_IOV_TAP tap backend specific header > + * @TCP_IOV_ETH Ethernet header > + * @TCP_IOV_IP IP (v4/v6) header > + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) > + * @TCP_NUM_IOVS the number of entries in the iovec array > + */ > +enum tcp_iov_parts { > + TCP_IOV_TAP = 0, > + TCP_IOV_ETH = 1, > + TCP_IOV_IP = 2, > + TCP_IOV_PAYLOAD = 3, > + TCP_NUM_IOVS > +}; > + > +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > + > +/** > + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses > + * @eth_d: Ethernet destination address, NULL if unchanged > + * @eth_s: Ethernet source address, NULL if unchanged > + */ > +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned 
char *eth_s) > +{ > + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); > + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); > +} > + > +/** > + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets > + * @c: Execution context > + */ > +void tcp_sock4_iov_init(const struct ctx *c) > +{ > + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); > + struct iovec *iov; > + int i; > + > + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); > + > + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { > + tcp4_payload_ip[i] = iph; > + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; > + tcp4_payload[i].th.ack = 1; > + } > + > + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { > + tcp4_flags_ip[i] = iph; > + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; > + tcp4_flags[i].th.ack = 1; > + } > + > + for (i = 0; i < TCP_FRAMES_MEM; i++) { > + iov = tcp4_l2_iov[i]; > + > + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); > + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); > + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); > + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; > + } > + > + for (i = 0; i < TCP_FRAMES_MEM; i++) { > + iov = tcp4_l2_flags_iov[i]; > + > + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); > + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; > + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); > + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); > + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; > + } > +} > + > +/** > + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets > + * @c: Execution context > + */ > +void tcp_sock6_iov_init(const struct ctx *c) > +{ > + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); > + struct iovec *iov; > + int i; > + > + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); > + > + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { > + tcp6_payload_ip[i] = ip6; > + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; > + tcp6_payload[i].th.ack = 1; > + } > + > + 
for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { > + tcp6_flags_ip[i] = ip6; > + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; > + tcp6_flags[i].th .ack = 1; > + } > + > + for (i = 0; i < TCP_FRAMES_MEM; i++) { > + iov = tcp6_l2_iov[i]; > + > + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); > + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); > + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); > + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; > + } > + > + for (i = 0; i < TCP_FRAMES_MEM; i++) { > + iov = tcp6_l2_flags_iov[i]; > + > + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); > + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); > + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); > + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; > + } > +} > + > +/** > + * tcp_flags_flush() - Send out buffers for segments with no data (flags) > + * @c: Execution context > + */ > +void tcp_flags_flush(const struct ctx *c) > +{ > + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, > + tcp6_flags_used); > + tcp6_flags_used = 0; > + > + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, > + tcp4_flags_used); > + tcp4_flags_used = 0; > +} > + > +/** > + * tcp_payload_flush() - Send out buffers for segments with data > + * @c: Execution context > + */ > +void tcp_payload_flush(const struct ctx *c) > +{ > + unsigned i; > + size_t m; > + > + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, > + tcp6_payload_used); > + for (i = 0; i < m; i++) > + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; > + tcp6_payload_used = 0; > + > + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, > + tcp4_payload_used); > + for (i = 0; i < m; i++) > + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; > + tcp4_payload_used = 0; > +} > + > +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > +{ > + struct tcp_flags_t *payload; > + size_t optlen = 0; > + struct iovec *iov; > + size_t l4len; > + int ret; > + > + 
if (CONN_V4(conn)) { > + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; > + > + payload = iov[TCP_IOV_PAYLOAD].iov_base; > + > + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, > + payload->opts, &optlen); > + if (ret <= 0) > + return ret; > + > + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, > + iov[TCP_IOV_IP].iov_base, > + iov[TCP_IOV_PAYLOAD].iov_base, optlen, > + NULL, conn->seq_to_tap); > + } else { > + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; > + > + payload = iov[TCP_IOV_PAYLOAD].iov_base; > + > + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, > + payload->opts, &optlen); > + if (ret <= 0) > + return ret; > + > + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, > + iov[TCP_IOV_IP].iov_base, > + iov[TCP_IOV_PAYLOAD].iov_base, optlen, > + conn->seq_to_tap); > + } > + iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + > + if (flags & DUP_ACK) { > + struct iovec *dup_iov; > + int i; > + > + if (CONN_V4(conn)) > + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; > + else > + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; > + > + for (i = 0; i < TCP_NUM_IOVS; i++) > + memcpy(dup_iov[i].iov_base, iov[i].iov_base, > + iov[i].iov_len); > + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; > + } > + > + if (CONN_V4(conn)) { > + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) > + tcp_flags_flush(c); > + } else { > + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) > + tcp_flags_flush(c); > + } > + > + return 0; > +} > + > +/** > + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer > + * @c: Execution context > + * @conn: Connection pointer > + * @dlen: TCP payload length > + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer > + * @seq: Sequence number to be sent > + */ > +void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > + ssize_t dlen, int no_csum, uint32_t seq) > +{ > + uint32_t *seq_update = &conn->seq_to_tap; > + struct iovec *iov; > + size_t l4len; > + 
> + if (CONN_V4(conn)) { > + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; > + const uint16_t *check = NULL; > + > + if (no_csum) { > + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; > + check = &iph->check; > + } > + > + tcp4_seq_update[tcp4_payload_used].seq = seq_update; > + tcp4_seq_update[tcp4_payload_used].len = dlen; > + > + iov = tcp4_l2_iov[tcp4_payload_used++]; > + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, > + iov[TCP_IOV_IP].iov_base, > + iov[TCP_IOV_PAYLOAD].iov_base, dlen, > + check, seq); > + iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) > + tcp_payload_flush(c); > + } else if (CONN_V6(conn)) { > + tcp6_seq_update[tcp6_payload_used].seq = seq_update; > + tcp6_seq_update[tcp6_payload_used].len = dlen; > + > + iov = tcp6_l2_iov[tcp6_payload_used++]; > + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, > + iov[TCP_IOV_IP].iov_base, > + iov[TCP_IOV_PAYLOAD].iov_base, dlen, > + seq); > + iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) > + tcp_payload_flush(c); > + } > +} > + > +/** > + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window > + * @c: Execution context > + * @conn: Connection pointer > + * > + * Return: negative on connection reset, 0 otherwise > + * > + * #syscalls recvmsg > + */ > +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > +{ > + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > + int sendlen, len, dlen, v4 = CONN_V4(conn); > + int s = conn->sock, i, ret = 0; > + struct msghdr mh_sock = { 0 }; > + uint16_t mss = MSS_GET(conn); > + uint32_t already_sent, seq; > + struct iovec *iov; > + > + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > + > + if (SEQ_LT(already_sent, 0)) { > + /* RFC 761, section 2.1. 
*/ > + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", > + conn->seq_ack_from_tap, conn->seq_to_tap); > + conn->seq_to_tap = conn->seq_ack_from_tap; > + already_sent = 0; > + } > + > + if (!wnd_scaled || already_sent >= wnd_scaled) { > + conn_flag(c, conn, STALLED); > + conn_flag(c, conn, ACK_FROM_TAP_DUE); > + return 0; > + } > + > + /* Set up buffer descriptors we'll fill completely and partially. */ > + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); > + if (fill_bufs > TCP_FRAMES) { > + fill_bufs = TCP_FRAMES; > + iov_rem = 0; > + } else { > + iov_rem = (wnd_scaled - already_sent) % mss; > + } > + > + mh_sock.msg_iov = iov_sock; > + mh_sock.msg_iovlen = fill_bufs + 1; > + > + iov_sock[0].iov_base = tcp_buf_discard; > + iov_sock[0].iov_len = already_sent; > + > + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || > + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { > + tcp_payload_flush(c); > + > + /* Silence Coverity CWE-125 false positive */ > + tcp4_payload_used = tcp6_payload_used = 0; > + } > + > + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { > + if (v4) > + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; > + else > + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; > + iov->iov_len = mss; > + } > + if (iov_rem) > + iov_sock[fill_bufs].iov_len = iov_rem; > + > + /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ > + do > + len = recvmsg(s, &mh_sock, MSG_PEEK); > + while (len < 0 && errno == EINTR); > + > + if (len < 0) > + goto err; > + > + if (!len) { > + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { > + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { > + tcp_rst(c, conn); > + return ret; > + } > + > + conn_event(c, conn, TAP_FIN_SENT); > + } > + > + return 0; > + } > + > + sendlen = len - already_sent; > + if (sendlen <= 0) { > + conn_flag(c, conn, STALLED); > + return 0; > + } > + > + conn_flag(c, conn, ~STALLED); > + > + send_bufs = DIV_ROUND_UP(sendlen, mss); > + last_len = sendlen - (send_bufs - 1) * mss; > + > + /* Likely, some new data was acked too. */ > + tcp_update_seqack_wnd(c, conn, 0, NULL); > + > + /* Finally, queue to tap */ > + dlen = mss; > + seq = conn->seq_to_tap; > + for (i = 0; i < send_bufs; i++) { > + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; > + > + if (i == send_bufs - 1) > + dlen = last_len; > + > + tcp_data_to_tap(c, conn, dlen, no_csum, seq); > + seq += dlen; > + } > + > + conn_flag(c, conn, ACK_FROM_TAP_DUE); > + > + return 0; > + > +err: > + if (errno != EAGAIN && errno != EWOULDBLOCK) { > + ret = -errno; > + tcp_rst(c, conn); > + } > + > + return ret; > +} > diff --git a/tcp_buf.h b/tcp_buf.h > new file mode 100644 > index 000000000000..14be7b945285 > --- /dev/null > +++ b/tcp_buf.h > @@ -0,0 +1,16 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright (c) 2021 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#ifndef TCP_BUF_H > +#define TCP_BUF_H > + > +void tcp_sock4_iov_init(const struct ctx *c); > +void tcp_sock6_iov_init(const struct ctx *c); > +void tcp_flags_flush(const struct ctx *c); > +void tcp_payload_flush(const struct ctx *c); > +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); > +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); > + > +#endif /*TCP_BUF_H */ > diff --git a/tcp_internal.h b/tcp_internal.h > new 
file mode 100644 > index 000000000000..e47b64a68afd > --- /dev/null > +++ b/tcp_internal.h > @@ -0,0 +1,87 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright (c) 2021 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#ifndef TCP_INTERNAL_H > +#define TCP_INTERNAL_H > + > +#define MAX_WS 8 > +#define MAX_WINDOW (1 << (16 + (MAX_WS))) > + > +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ > + sizeof(struct tcphdr) - \ > + sizeof(struct iphdr), \ > + sizeof(uint32_t)) > +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ > + sizeof(struct tcphdr) - \ > + sizeof(struct ipv6hdr), \ > + sizeof(uint32_t)) > + > +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) > +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) > +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) > +#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) > + > +#define FIN (1 << 0) > +#define SYN (1 << 1) > +#define RST (1 << 2) > +#define ACK (1 << 4) > + > +/* Flags for internal usage */ > +#define DUP_ACK (1 << 5) > +#define OPT_EOL 0 > +#define OPT_NOP 1 > +#define OPT_MSS 2 > +#define OPT_MSS_LEN 4 > +#define OPT_WS 3 > +#define OPT_WS_LEN 3 > +#define OPT_SACKP 4 > +#define OPT_SACK 5 > +#define OPT_TS 8 > +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) > +#define CONN_V6(conn) (!CONN_V4(conn)) > + > +extern char tcp_buf_discard [MAX_WINDOW]; > + > +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, > + unsigned long flag); > +#define conn_flag(c, conn, flag) \ > + do { \ > + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ > + conn_flag_do(c, conn, flag); \ > + } while (0) > + > + > +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > + unsigned long event); > +#define conn_event(c, conn, event) \ > + do { \ > + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ > + conn_event_do(c, conn, event); \ > + } while (0) > + > +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); > +#define tcp_rst(c, conn) \ > + do { \ > + flow_dbg((conn), "TCP reset at 
%s:%i", __func__, __LINE__); \ > + tcp_rst_do(c, conn); \ > + } while (0) > + > +size_t tcp_fill_headers4(const struct ctx *c, > + const struct tcp_tap_conn *conn, > + struct tap_hdr *taph, > + struct iphdr *iph, struct tcphdr *th, > + size_t dlen, const uint16_t *check, > + uint32_t seq); > +size_t tcp_fill_headers6(const struct ctx *c, > + const struct tcp_tap_conn *conn, > + struct tap_hdr *taph, > + struct ipv6hdr *ip6h, struct tcphdr *th, > + size_t dlen, uint32_t seq); > +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > + int force_seq, struct tcp_info *tinfo); > +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags, > + struct tcphdr *th, char *data, size_t *optlen); > + > +#endif /* TCP_INTERNAL_H */ -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson