public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Laurent Vivier <lvivier@redhat.com>
Cc: passt-dev@passt.top
Subject: Re: [PATCH v4 03/10] tcp: move buffers management functions to their own file
Date: Mon, 3 Jun 2024 11:27:24 +1000	[thread overview]
Message-ID: <Zl0b_EuPVF3c1SfI@zatzit> (raw)
In-Reply-To: <20240531142344.1420034-4-lvivier@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 45779 bytes --]

On Fri, May 31, 2024 at 04:23:37PM +0200, Laurent Vivier wrote:

Commit message.

> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  Makefile       |   5 +-
>  tcp.c          | 575 ++-----------------------------------------------
>  tcp_buf.c      | 526 ++++++++++++++++++++++++++++++++++++++++++++
>  tcp_buf.h      |  16 ++
>  tcp_internal.h |  87 ++++++++
>  5 files changed, 652 insertions(+), 557 deletions(-)
>  create mode 100644 tcp_buf.c
>  create mode 100644 tcp_buf.h
>  create mode 100644 tcp_internal.h
> 
> diff --git a/Makefile b/Makefile
> index 8ea175762e36..1ac2e5e0053f 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
>  PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
>  	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
>  	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
> -	tcp_splice.c udp.c util.c
> +	tcp_buf.c tcp_splice.c udp.c util.c
>  QRAP_SRCS = qrap.c
>  SRCS = $(PASST_SRCS) $(QRAP_SRCS)
>  
> @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
>  PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
>  	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
>  	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
> -	siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h
> +	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
> +	udp.h util.h
>  HEADERS = $(PASST_HEADERS) seccomp.h
>  
>  C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
> diff --git a/tcp.c b/tcp.c
> index a6f43010f58f..48d8f7c6d696 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -302,28 +302,14 @@
>  #include "flow.h"
>  
>  #include "flow_table.h"
> -
> -#define TCP_FRAMES_MEM			128
> -#define TCP_FRAMES							\
> -	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
> +#include "tcp_internal.h"
> +#include "tcp_buf.h"
>  
>  #define TCP_HASH_TABLE_LOAD		70		/* % */
>  #define TCP_HASH_TABLE_SIZE		(FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
>  
> -#define MAX_WS				8
> -#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
> -
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -#define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
> -						   sizeof(struct tcphdr) - \
> -						   sizeof(struct iphdr),   \
> -						   sizeof(uint32_t))
> -#define MSS6				ROUND_DOWN(IP_MAX_MTU -		   \
> -						   sizeof(struct tcphdr) - \
> -						   sizeof(struct ipv6hdr), \
> -						   sizeof(uint32_t))
> -
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
>  # define KERNEL_REPORTS_SND_WND(c)	(c->tcp.kernel_snd_wnd)
> @@ -345,33 +331,10 @@
>   */
>  #define SOL_TCP				IPPROTO_TCP
>  
> -#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
> -#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
> -#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
> -#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
> -
> -#define FIN		(1 << 0)
> -#define SYN		(1 << 1)
> -#define RST		(1 << 2)
> -#define ACK		(1 << 4)
> -/* Flags for internal usage */
> -#define DUP_ACK		(1 << 5)
>  #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
>  
> -#define OPT_EOL		0
> -#define OPT_NOP		1
> -#define OPT_MSS		2
> -#define OPT_MSS_LEN	4
> -#define OPT_WS		3
> -#define OPT_WS_LEN	3
> -#define OPT_SACKP	4
> -#define OPT_SACK	5
> -#define OPT_TS		8
> -
>  #define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
>  
> -#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> -#define CONN_V6(conn)		(!CONN_V4(conn))
>  #define CONN_IS_CLOSING(conn)						\
>  	((conn->events & ESTABLISHED) &&				\
>  	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> @@ -408,114 +371,7 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
>   */
>  static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>  
> -/**
> - * tcp_buf_seq_update - Sequences to update with length of frames once sent
> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> - * @len:	TCP payload length
> - */
> -struct tcp_buf_seq_update {
> -	uint32_t *seq;
> -	uint16_t len;
> -};

This will conflict with Jon's upcoming changes, and I think it will be
simpler if his go first (although they have taken rather longer to
land than I was expecting).

> -/* Static buffers */
> -/**
> - * struct tcp_payload_t - TCP header and data to send segments with payload
> - * @th:		TCP header
> - * @data:	TCP data
> - */
> -struct tcp_payload_t {
> -	struct tcphdr th;
> -	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -/**
> - * struct tcp_flags_t - TCP header and data to send zero-length
> - *                      segments (flags)
> - * @th:		TCP header
> - * @opts	TCP options
> - */
> -struct tcp_flags_t {
> -	struct tcphdr th;
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -/* Ethernet header for IPv4 frames */
> -static struct ethhdr		tcp4_eth_src;
> -
> -static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv4 headers */
> -static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
> -/* TCP segments with payload for IPv4 frames */
> -static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
> -
> -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> -
> -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> -static unsigned int tcp4_payload_used;
> -
> -static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv4 headers for TCP segment without payload */
> -static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
> -/* TCP segments without payload for IPv4 frames */
> -static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_flags_used;
> -
> -/* Ethernet header for IPv6 frames */
> -static struct ethhdr		tcp6_eth_src;
> -
> -static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv6 headers */
> -static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
> -/* TCP headers and data for IPv6 frames */
> -static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
> -
> -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> -
> -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> -static unsigned int tcp6_payload_used;
> -
> -static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv6 headers for TCP segment without payload */
> -static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
> -/* TCP segment without payload for IPv6 frames */
> -static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_flags_used;
> -
> -/* recvmsg()/sendmsg() data for tap */
> -static char 		tcp_buf_discard		[MAX_WINDOW];
> -static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
> -
> -/*
> - * enum tcp_iov_parts - I/O vector parts for one TCP frame
> - * @TCP_IOV_TAP		tap backend specific header
> - * @TCP_IOV_ETH		Ethernet header
> - * @TCP_IOV_IP		IP (v4/v6) header
> - * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> - * @TCP_NUM_IOVS 	the number of entries in the iovec array
> - */
> -enum tcp_iov_parts {
> -	TCP_IOV_TAP	= 0,
> -	TCP_IOV_ETH	= 1,
> -	TCP_IOV_IP	= 2,
> -	TCP_IOV_PAYLOAD	= 3,
> -	TCP_NUM_IOVS
> -};
> -
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +char		tcp_buf_discard		[MAX_WINDOW];
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
> @@ -560,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
>  	return EPOLLRDHUP;
>  }
>  
> -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> -			 unsigned long flag);
> -#define conn_flag(c, conn, flag)					\
> -	do {								\
> -		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
> -		conn_flag_do(c, conn, flag);				\
> -	} while (0)
> -
>  /**
>   * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
>   * @c:		Execution context
> @@ -679,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>   * @conn:	Connection pointer
>   * @flag:	Flag to set, or ~flag to unset
>   */
> -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> -			 unsigned long flag)
> +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> +		  unsigned long flag)
>  {
>  	if (flag & (flag - 1)) {
>  		int flag_index = fls(~flag);
> @@ -730,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c,
>   * @conn:	Connection pointer
>   * @event:	Connection event
>   */
> -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> -			  unsigned long event)
> +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> +		   unsigned long event)
>  {
>  	int prev, new, num = fls(event);
>  
> @@ -779,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
>  		tcp_timer_ctl(c, conn);
>  }
>  
> -#define conn_event(c, conn, event)					\
> -	do {								\
> -		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
> -		conn_event_do(c, conn, event);				\
> -	} while (0)
> -
>  /**
>   * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
>   * @conn:	Connection pointer
> @@ -914,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>  	th->check = csum(th, l4len, sum);
>  }
>  
> -/**
> - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
> - * @eth_d:	Ethernet destination address, NULL if unchanged
> - * @eth_s:	Ethernet source address, NULL if unchanged
> - */
> -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> -{
> -	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> -	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
> -}
> -
> -/**
> - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
> - * @c:		Execution context
> - */
> -static void tcp_sock4_iov_init(const struct ctx *c)
> -{
> -	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
> -	int i;
> -
> -	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> -
> -	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
> -		tcp4_payload_ip[i] = iph;
> -		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> -		tcp4_payload[i].th.ack = 1;
> -	}
> -
> -	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
> -		tcp4_flags_ip[i] = iph;
> -		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> -		tcp4_flags[i].th.ack = 1;
> -	}
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		iov = tcp4_l2_iov[i];
> -
> -		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
> -		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
> -		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
> -	}
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		iov = tcp4_l2_flags_iov[i];
> -
> -		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
> -		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> -		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
> -		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> -	}
> -}
> -
> -/**
> - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
> - * @c:		Execution context
> - */
> -static void tcp_sock6_iov_init(const struct ctx *c)
> -{
> -	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
> -	int i;
> -
> -	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> -
> -	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
> -		tcp6_payload_ip[i] = ip6;
> -		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> -		tcp6_payload[i].th.ack = 1;
> -	}
> -
> -	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
> -		tcp6_flags_ip[i] = ip6;
> -		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> -		tcp6_flags[i].th .ack = 1;
> -	}
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		iov = tcp6_l2_iov[i];
> -
> -		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
> -		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
> -		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
> -	}
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		iov = tcp6_l2_flags_iov[i];
> -
> -		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
> -		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
> -		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> -	}
> -}
> -
>  /**
>   * tcp_opt_get() - Get option, and value if any, from TCP header
>   * @opts:	Pointer to start of TCP options in header
> @@ -1235,50 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
>  	return true;
>  }
>  
> -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
> -#define tcp_rst(c, conn)						\
> -	do {								\
> -		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
> -		tcp_rst_do(c, conn);					\
> -	} while (0)
> -
> -/**
> - * tcp_flags_flush() - Send out buffers for segments with no data (flags)
> - * @c:		Execution context
> - */
> -static void tcp_flags_flush(const struct ctx *c)
> -{
> -	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> -			tcp6_flags_used);
> -	tcp6_flags_used = 0;
> -
> -	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> -			tcp4_flags_used);
> -	tcp4_flags_used = 0;
> -}
> -
> -/**
> - * tcp_payload_flush() - Send out buffers for segments with data
> - * @c:		Execution context
> - */
> -static void tcp_payload_flush(const struct ctx *c)
> -{
> -	unsigned i;
> -	size_t m;
> -
> -	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> -			    tcp6_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> -	tcp6_payload_used = 0;
> -
> -	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> -			    tcp4_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> -	tcp4_payload_used = 0;
> -}
> -
>  /**
>   * tcp_defer_handler() - Handler for TCP deferred tasks
>   * @c:		Execution context
> @@ -1326,7 +1026,7 @@ static void tcp_fill_header(struct tcphdr *th,
>   *
>   * Return: The IPv4 payload length, host order
>   */
> -static size_t tcp_fill_headers4(const struct ctx *c,
> +size_t tcp_fill_headers4(const struct ctx *c,
>  				const struct tcp_tap_conn *conn,
>  				struct tap_hdr *taph,
>  				struct iphdr *iph, struct tcphdr *th,
> @@ -1369,11 +1069,11 @@ static size_t tcp_fill_headers4(const struct ctx *c,
>   *
>   * Return: The IPv6 payload length, host order
>   */
> -static size_t tcp_fill_headers6(const struct ctx *c,
> -				const struct tcp_tap_conn *conn,
> -				struct tap_hdr *taph,
> -				struct ipv6hdr *ip6h, struct tcphdr *th,
> -				size_t dlen, uint32_t seq)
> +size_t tcp_fill_headers6(const struct ctx *c,
> +			 const struct tcp_tap_conn *conn,
> +			 struct tap_hdr *taph,
> +			 struct ipv6hdr *ip6h, struct tcphdr *th,
> +			 size_t dlen, uint32_t seq)
>  {
>  	size_t l4len = dlen + sizeof(*th);
>  
> @@ -1410,8 +1110,8 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   *
>   * Return: 1 if sequence or window were updated, 0 otherwise
>   */
> -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> -				 int force_seq, struct tcp_info *tinfo)
> +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> +			  int force_seq, struct tcp_info *tinfo)
>  {
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
> @@ -1530,7 +1230,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
>   *           0 if there is no flag to send
>   *	     1 otherwise
>   */
> -static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
> +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
>  				int flags, struct tcphdr *th, char *data,
>  				size_t *optlen)
>  {
> @@ -1620,69 +1320,9 @@ static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
>  	return 1;
>  }
>  
> -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
> -	struct tcp_flags_t *payload;
> -	size_t optlen = 0;
> -	struct iovec *iov;
> -	size_t l4len;
> -	int ret;
> -
> -	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> -
> -		payload = iov[TCP_IOV_PAYLOAD].iov_base;
> -
> -		ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
> -					   payload->opts, &optlen);
> -		if (ret <= 0)
> -			return ret;
> -
> -		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
> -					  iov[TCP_IOV_IP].iov_base,
> -					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
> -					  NULL, conn->seq_to_tap);
> -	} else {
> -		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> -
> -		payload = iov[TCP_IOV_PAYLOAD].iov_base;
> -
> -		ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
> -					   payload->opts, &optlen);
> -		if (ret <= 0)
> -			return ret;
> -
> -		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
> -					  iov[TCP_IOV_IP].iov_base,
> -					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
> -					  conn->seq_to_tap);
> -	}
> -	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> -
> -	if (flags & DUP_ACK) {
> -		struct iovec *dup_iov;
> -		int i;
> -
> -		if (CONN_V4(conn))
> -			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> -		else
> -			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> -
> -		for (i = 0; i < TCP_NUM_IOVS; i++)
> -			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> -			       iov[i].iov_len);
> -		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> -	}
> -
> -	if (CONN_V4(conn)) {
> -		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> -			tcp_flags_flush(c);
> -	} else {
> -		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> -			tcp_flags_flush(c);
> -	}
> -
> -	return 0;
> +	return tcp_buf_send_flag(c, conn, flags);
>  }
>  
>  /**
> @@ -1690,7 +1330,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>   * @c:		Execution context
>   * @conn:	Connection pointer
>   */
> -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
> +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
>  {
>  	if (conn->events == CLOSED)
>  		return;
> @@ -2117,184 +1757,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>  	return 0;
>  }
>  
> -/**
> - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
> - * @c:		Execution context
> - * @conn:	Connection pointer
> - * @dlen:	TCP payload length
> - * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
> - * @seq:	Sequence number to be sent
> - */
> -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> -			    ssize_t dlen, int no_csum, uint32_t seq)
> -{
> -	uint32_t *seq_update = &conn->seq_to_tap;
> -	struct iovec *iov;
> -	size_t l4len;
> -
> -	if (CONN_V4(conn)) {
> -		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> -		const uint16_t *check = NULL;
> -
> -		if (no_csum) {
> -			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> -			check = &iph->check;
> -		}
> -
> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> -		tcp4_seq_update[tcp4_payload_used].len = dlen;
> -
> -		iov = tcp4_l2_iov[tcp4_payload_used++];
> -		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
> -					  iov[TCP_IOV_IP].iov_base,
> -					  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
> -					  check, seq);
> -		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> -		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> -			tcp_payload_flush(c);
> -	} else if (CONN_V6(conn)) {
> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> -		tcp6_seq_update[tcp6_payload_used].len = dlen;
> -
> -		iov = tcp6_l2_iov[tcp6_payload_used++];
> -		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
> -				 iov[TCP_IOV_IP].iov_base,
> -				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
> -				 seq);
> -		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> -		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> -			tcp_payload_flush(c);
> -	}
> -}
> -
> -/**
> - * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
> - * @c:		Execution context
> - * @conn:	Connection pointer
> - *
> - * Return: negative on connection reset, 0 otherwise
> - *
> - * #syscalls recvmsg
> - */
>  static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  {
> -	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> -	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
> -	int sendlen, len, dlen, v4 = CONN_V4(conn);
> -	int s = conn->sock, i, ret = 0;
> -	struct msghdr mh_sock = { 0 };
> -	uint16_t mss = MSS_GET(conn);
> -	uint32_t already_sent, seq;
> -	struct iovec *iov;
> -
> -	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> -
> -	if (SEQ_LT(already_sent, 0)) {
> -		/* RFC 761, section 2.1. */
> -		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> -			   conn->seq_ack_from_tap, conn->seq_to_tap);
> -		conn->seq_to_tap = conn->seq_ack_from_tap;
> -		already_sent = 0;
> -	}
> -
> -	if (!wnd_scaled || already_sent >= wnd_scaled) {
> -		conn_flag(c, conn, STALLED);
> -		conn_flag(c, conn, ACK_FROM_TAP_DUE);
> -		return 0;
> -	}
> -
> -	/* Set up buffer descriptors we'll fill completely and partially. */
> -	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
> -	if (fill_bufs > TCP_FRAMES) {
> -		fill_bufs = TCP_FRAMES;
> -		iov_rem = 0;
> -	} else {
> -		iov_rem = (wnd_scaled - already_sent) % mss;
> -	}
> -
> -	mh_sock.msg_iov = iov_sock;
> -	mh_sock.msg_iovlen = fill_bufs + 1;
> -
> -	iov_sock[0].iov_base = tcp_buf_discard;
> -	iov_sock[0].iov_len = already_sent;
> -
> -	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> -	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> -		tcp_payload_flush(c);
> -
> -		/* Silence Coverity CWE-125 false positive */
> -		tcp4_payload_used = tcp6_payload_used = 0;
> -	}
> -
> -	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> -		if (v4)
> -			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
> -		else
> -			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
> -		iov->iov_len = mss;
> -	}
> -	if (iov_rem)
> -		iov_sock[fill_bufs].iov_len = iov_rem;
> -
> -	/* Receive into buffers, don't dequeue until acknowledged by guest. */
> -	do
> -		len = recvmsg(s, &mh_sock, MSG_PEEK);
> -	while (len < 0 && errno == EINTR);
> -
> -	if (len < 0)
> -		goto err;
> -
> -	if (!len) {
> -		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> -			if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
> -				tcp_rst(c, conn);
> -				return ret;
> -			}
> -
> -			conn_event(c, conn, TAP_FIN_SENT);
> -		}
> -
> -		return 0;
> -	}
> -
> -	sendlen = len - already_sent;
> -	if (sendlen <= 0) {
> -		conn_flag(c, conn, STALLED);
> -		return 0;
> -	}
> -
> -	conn_flag(c, conn, ~STALLED);
> -
> -	send_bufs = DIV_ROUND_UP(sendlen, mss);
> -	last_len = sendlen - (send_bufs - 1) * mss;
> -
> -	/* Likely, some new data was acked too. */
> -	tcp_update_seqack_wnd(c, conn, 0, NULL);
> -
> -	/* Finally, queue to tap */
> -	dlen = mss;
> -	seq = conn->seq_to_tap;
> -	for (i = 0; i < send_bufs; i++) {
> -		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
> -
> -		if (i == send_bufs - 1)
> -			dlen = last_len;
> -
> -		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
> -		seq += dlen;
> -	}
> -
> -	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> -
> -	return 0;
> -
> -err:
> -	if (errno != EAGAIN && errno != EWOULDBLOCK) {
> -		ret = -errno;
> -		tcp_rst(c, conn);
> -	}
> -
> -	return ret;
> +	return tcp_buf_data_from_sock(c, conn);
>  }
>  
>  /**
> diff --git a/tcp_buf.c b/tcp_buf.c
> new file mode 100644
> index 000000000000..87923029a958
> --- /dev/null
> +++ b/tcp_buf.c
> @@ -0,0 +1,526 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/* PASST - Plug A Simple Socket Transport
> + *  for qemu/UNIX domain socket mode
> + *
> + * PASTA - Pack A Subtle Tap Abstraction
> + *  for network namespace/tap device mode
> + *
> + * tcp_buf.c - TCP L2-L4 translation state machine

This description doesn't appear correct, or at least not complete, for
the new file.

> + *
> + * Copyright (c) 2020-2022 Red Hat GmbH

And this should probably be updated since you're touching it too.
Maybe go with the plain "Copyright Red Hat" that Red Hat legal seems
to recommend.

> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <limits.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <netinet/ip.h>
> +
> +#include <linux/tcp.h>
> +
> +#include "util.h"
> +#include "ip.h"
> +#include "iov.h"
> +#include "passt.h"
> +#include "tap.h"
> +#include "siphash.h"
> +#include "inany.h"
> +#include "tcp_conn.h"
> +#include "tcp_internal.h"
> +#include "tcp_buf.h"
> +
> +#define TCP_FRAMES_MEM			128
> +#define TCP_FRAMES							   \
> +	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
> +
> +/**
> + * tcp_buf_seq_update - Sequences to update with length of frames once sent
> + * @seq:	Pointer to sequence number sent to tap-side, to be updated
> + * @len:	TCP payload length
> + */
> +struct tcp_buf_seq_update {
> +	uint32_t *seq;
> +	uint16_t len;
> +};
> +
> +/* Static buffers */
> +/**
> + * struct tcp_payload_t - TCP header and data to send segments with payload
> + * @th:		TCP header
> + * @data:	TCP data
> + */
> +struct tcp_payload_t {
> +	struct tcphdr th;
> +	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> +#ifdef __AVX2__
> +} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
> +#else
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> +#endif
> +
> +/**
> + * struct tcp_flags_t - TCP header and data to send zero-length
> + *                      segments (flags)
> + * @th:		TCP header
> + * @opts	TCP options
> + */
> +struct tcp_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +#ifdef __AVX2__
> +} __attribute__ ((packed, aligned(32)));
> +#else
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> +#endif
> +
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
> +
> +static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv4 headers */
> +static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
> +/* TCP segments with payload for IPv4 frames */
> +static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_payload_used;
> +
> +static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP segment without payload */
> +static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
> +/* TCP segments without payload for IPv4 frames */
> +static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp4_flags_used;
> +
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
> +
> +static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp6_payload_used;
> +
> +static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv6 headers for TCP segment without payload */
> +static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
> +/* TCP segment without payload for IPv6 frames */
> +static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_flags_used;
> +
> +/* recvmsg()/sendmsg() data for tap */
> +static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
> +
> +/*
> + * enum tcp_iov_parts - I/O vector parts for one TCP frame
> + * @TCP_IOV_TAP		tap backend specific header
> + * @TCP_IOV_ETH		Ethernet header
> + * @TCP_IOV_IP		IP (v4/v6) header
> + * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * @TCP_NUM_IOVS 	the number of entries in the iovec array
> + */
> +enum tcp_iov_parts {
> +	TCP_IOV_TAP	= 0,
> +	TCP_IOV_ETH	= 1,
> +	TCP_IOV_IP	= 2,
> +	TCP_IOV_PAYLOAD	= 3,
> +	TCP_NUM_IOVS
> +};
> +
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +
> +/**
> + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
> + * @eth_d:	Ethernet destination address, NULL if unchanged
> + * @eth_s:	Ethernet source address, NULL if unchanged
> + */
> +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +{
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
> +}
> +
> +/**
> + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
> + * @c:		Execution context
> + */
> +void tcp_sock4_iov_init(const struct ctx *c)
> +{
> +	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> +	struct iovec *iov;
> +	int i;
> +
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +
> +	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
> +		tcp4_payload_ip[i] = iph;
> +		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_payload[i].th.ack = 1;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
> +		tcp4_flags_ip[i] = iph;
> +		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_flags[i].th.ack = 1;
> +	}
> +
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		iov = tcp4_l2_iov[i];
> +
> +		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
> +		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
> +	}
> +
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		iov = tcp4_l2_flags_iov[i];
> +
> +		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> +	}
> +}
> +
> +/**
> + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
> + * @c:		Execution context
> + */
> +void tcp_sock6_iov_init(const struct ctx *c)
> +{
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
> +	struct iovec *iov;
> +	int i;
> +
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +
> +	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
> +		tcp6_payload_ip[i] = ip6;
> +		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_payload[i].th.ack = 1;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
> +		tcp6_flags_ip[i] = ip6;
> +		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_flags[i].th .ack = 1;
> +	}
> +
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		iov = tcp6_l2_iov[i];
> +
> +		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
> +		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
> +	}
> +
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		iov = tcp6_l2_flags_iov[i];
> +
> +		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
> +		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> +	}
> +}
> +
> +/**
> + * tcp_flags_flush() - Send out buffers for segments with no data (flags)
> + * @c:		Execution context
> + */
> +void tcp_flags_flush(const struct ctx *c)
> +{
> +	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp6_flags_used);
> +	tcp6_flags_used = 0;
> +
> +	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp4_flags_used);
> +	tcp4_flags_used = 0;
> +}
> +
> +/**
> + * tcp_payload_flush() - Send out buffers for segments with data
> + * @c:		Execution context
> + */
> +void tcp_payload_flush(const struct ctx *c)
> +{
> +	unsigned i;
> +	size_t m;
> +
> +	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp6_payload_used);
> +	for (i = 0; i < m; i++)
> +		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	tcp6_payload_used = 0;
> +
> +	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp4_payload_used);
> +	for (i = 0; i < m; i++)
> +		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	tcp4_payload_used = 0;
> +}
> +
> +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	struct tcp_flags_t *payload;
> +	size_t optlen = 0;
> +	struct iovec *iov;
> +	size_t l4len;
> +	int ret;
> +
> +	if (CONN_V4(conn)) {
> +		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +
> +		payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +
> +		ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
> +					   payload->opts, &optlen);
> +		if (ret <= 0)
> +			return ret;
> +
> +		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
> +					  iov[TCP_IOV_IP].iov_base,
> +					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
> +					  NULL, conn->seq_to_tap);
> +	} else {
> +		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +
> +		payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +
> +		ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
> +					   payload->opts, &optlen);
> +		if (ret <= 0)
> +			return ret;
> +
> +		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
> +					  iov[TCP_IOV_IP].iov_base,
> +					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
> +					  conn->seq_to_tap);
> +	}
> +	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +	if (flags & DUP_ACK) {
> +		struct iovec *dup_iov;
> +		int i;
> +
> +		if (CONN_V4(conn))
> +			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		else
> +			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +
> +		for (i = 0; i < TCP_NUM_IOVS; i++)
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> +	}
> +
> +	if (CONN_V4(conn)) {
> +		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	} else {
> +		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
> + * @c:		Execution context
> + * @conn:	Connection pointer
> + * @dlen:	TCP payload length
> + * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
> + * @seq:	Sequence number to be sent
> + */
> +void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> +		     ssize_t dlen, int no_csum, uint32_t seq)
> +{
> +	uint32_t *seq_update = &conn->seq_to_tap;
> +	struct iovec *iov;
> +	size_t l4len;
> +
> +	if (CONN_V4(conn)) {
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> +		const uint16_t *check = NULL;
> +
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
> +
> +		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> +		tcp4_seq_update[tcp4_payload_used].len = dlen;
> +
> +		iov = tcp4_l2_iov[tcp4_payload_used++];
> +		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
> +					  iov[TCP_IOV_IP].iov_base,
> +					  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
> +					  check, seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
> +	} else if (CONN_V6(conn)) {
> +		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> +		tcp6_seq_update[tcp6_payload_used].len = dlen;
> +
> +		iov = tcp6_l2_iov[tcp6_payload_used++];
> +		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
> +				 iov[TCP_IOV_IP].iov_base,
> +				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
> +				 seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
> +	}
> +}
> +
> +/**
> + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
> + * @c:		Execution context
> + * @conn:	Connection pointer
> + *
> + * Return: negative on connection reset, 0 otherwise
> + *
> + * #syscalls recvmsg
> + */
> +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> +	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
> +	int sendlen, len, dlen, v4 = CONN_V4(conn);
> +	int s = conn->sock, i, ret = 0;
> +	struct msghdr mh_sock = { 0 };
> +	uint16_t mss = MSS_GET(conn);
> +	uint32_t already_sent, seq;
> +	struct iovec *iov;
> +
> +	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> +
> +	if (SEQ_LT(already_sent, 0)) {
> +		/* RFC 761, section 2.1. */
> +		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> +			   conn->seq_ack_from_tap, conn->seq_to_tap);
> +		conn->seq_to_tap = conn->seq_ack_from_tap;
> +		already_sent = 0;
> +	}
> +
> +	if (!wnd_scaled || already_sent >= wnd_scaled) {
> +		conn_flag(c, conn, STALLED);
> +		conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +		return 0;
> +	}
> +
> +	/* Set up buffer descriptors we'll fill completely and partially. */
> +	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
> +	if (fill_bufs > TCP_FRAMES) {
> +		fill_bufs = TCP_FRAMES;
> +		iov_rem = 0;
> +	} else {
> +		iov_rem = (wnd_scaled - already_sent) % mss;
> +	}
> +
> +	mh_sock.msg_iov = iov_sock;
> +	mh_sock.msg_iovlen = fill_bufs + 1;
> +
> +	iov_sock[0].iov_base = tcp_buf_discard;
> +	iov_sock[0].iov_len = already_sent;
> +
> +	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> +		tcp_payload_flush(c);
> +
> +		/* Silence Coverity CWE-125 false positive */
> +		tcp4_payload_used = tcp6_payload_used = 0;
> +	}
> +
> +	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> +		if (v4)
> +			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
> +		else
> +			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
> +		iov->iov_len = mss;
> +	}
> +	if (iov_rem)
> +		iov_sock[fill_bufs].iov_len = iov_rem;
> +
> +	/* Receive into buffers, don't dequeue until acknowledged by guest. */
> +	do
> +		len = recvmsg(s, &mh_sock, MSG_PEEK);
> +	while (len < 0 && errno == EINTR);
> +
> +	if (len < 0)
> +		goto err;
> +
> +	if (!len) {
> +		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> +			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
> +				tcp_rst(c, conn);
> +				return ret;
> +			}
> +
> +			conn_event(c, conn, TAP_FIN_SENT);
> +		}
> +
> +		return 0;
> +	}
> +
> +	sendlen = len - already_sent;
> +	if (sendlen <= 0) {
> +		conn_flag(c, conn, STALLED);
> +		return 0;
> +	}
> +
> +	conn_flag(c, conn, ~STALLED);
> +
> +	send_bufs = DIV_ROUND_UP(sendlen, mss);
> +	last_len = sendlen - (send_bufs - 1) * mss;
> +
> +	/* Likely, some new data was acked too. */
> +	tcp_update_seqack_wnd(c, conn, 0, NULL);
> +
> +	/* Finally, queue to tap */
> +	dlen = mss;
> +	seq = conn->seq_to_tap;
> +	for (i = 0; i < send_bufs; i++) {
> +		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
> +
> +		if (i == send_bufs - 1)
> +			dlen = last_len;
> +
> +		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
> +		seq += dlen;
> +	}
> +
> +	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +
> +	return 0;
> +
> +err:
> +	if (errno != EAGAIN && errno != EWOULDBLOCK) {
> +		ret = -errno;
> +		tcp_rst(c, conn);
> +	}
> +
> +	return ret;
> +}
> diff --git a/tcp_buf.h b/tcp_buf.h
> new file mode 100644
> index 000000000000..14be7b945285
> --- /dev/null
> +++ b/tcp_buf.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + * Copyright (c) 2021 Red Hat GmbH
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#ifndef TCP_BUF_H
> +#define TCP_BUF_H
> +
> +void tcp_sock4_iov_init(const struct ctx *c);
> +void tcp_sock6_iov_init(const struct ctx *c);
> +void tcp_flags_flush(const struct ctx *c);
> +void tcp_payload_flush(const struct ctx *c);
> +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
> +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
> +
> +#endif  /*TCP_BUF_H */
> diff --git a/tcp_internal.h b/tcp_internal.h
> new file mode 100644
> index 000000000000..e47b64a68afd
> --- /dev/null
> +++ b/tcp_internal.h
> @@ -0,0 +1,87 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + * Copyright (c) 2021 Red Hat GmbH
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#ifndef TCP_INTERNAL_H
> +#define TCP_INTERNAL_H
> +
> +#define MAX_WS				8
> +#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
> +
> +#define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
> +						   sizeof(struct tcphdr) - \
> +						   sizeof(struct iphdr),   \
> +						   sizeof(uint32_t))
> +#define MSS6				ROUND_DOWN(IP_MAX_MTU -		   \
> +						   sizeof(struct tcphdr) - \
> +						   sizeof(struct ipv6hdr), \
> +						   sizeof(uint32_t))
> +
> +#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
> +#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
> +#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
> +#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
> +
> +#define FIN		(1 << 0)
> +#define SYN		(1 << 1)
> +#define RST		(1 << 2)
> +#define ACK		(1 << 4)
> +
> +/* Flags for internal usage */
> +#define DUP_ACK		(1 << 5)
> +#define OPT_EOL		0
> +#define OPT_NOP		1
> +#define OPT_MSS		2
> +#define OPT_MSS_LEN	4
> +#define OPT_WS		3
> +#define OPT_WS_LEN	3
> +#define OPT_SACKP	4
> +#define OPT_SACK	5
> +#define OPT_TS		8
> +#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> +#define CONN_V6(conn)		(!CONN_V4(conn))
> +
> +extern char tcp_buf_discard [MAX_WINDOW];
> +
> +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> +		  unsigned long flag);
> +#define conn_flag(c, conn, flag)					\
> +	do {								\
> +		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
> +		conn_flag_do(c, conn, flag);				\
> +	} while (0)
> +
> +
> +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> +		   unsigned long event);
> +#define conn_event(c, conn, event)					\
> +	do {								\
> +		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
> +		conn_event_do(c, conn, event);				\
> +	} while (0)
> +
> +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
> +#define tcp_rst(c, conn)						\
> +	do {								\
> +		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
> +		tcp_rst_do(c, conn);					\
> +	} while (0)
> +
> +size_t tcp_fill_headers4(const struct ctx *c,
> +				const struct tcp_tap_conn *conn,
> +				struct tap_hdr *taph,
> +				struct iphdr *iph, struct tcphdr *th,
> +				size_t dlen, const uint16_t *check,
> +				uint32_t seq);
> +size_t tcp_fill_headers6(const struct ctx *c,
> +			 const struct tcp_tap_conn *conn,
> +			 struct tap_hdr *taph,
> +			 struct ipv6hdr *ip6h, struct tcphdr *th,
> +			 size_t dlen, uint32_t seq);
> +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> +			  int force_seq, struct tcp_info *tinfo);
> +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
> +			 struct tcphdr *th, char *data, size_t *optlen);
> +
> +#endif /* TCP_INTERNAL_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2024-06-03  1:33 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-31 14:23 [PATCH v4 00/10] Add vhost-user support to passt (part 2) Laurent Vivier
2024-05-31 14:23 ` [PATCH v4 01/10] tcp: inline tcp_l2_buf_fill_headers() Laurent Vivier
2024-05-31 14:23 ` [PATCH v4 02/10] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
2024-06-01  5:43   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 03/10] tcp: move buffers management functions to their own file Laurent Vivier
2024-06-03  1:27   ` David Gibson [this message]
2024-05-31 14:23 ` [PATCH v4 04/10] tap: export pool_flush()/tapX_handler()/packet_add() Laurent Vivier
2024-06-03  1:32   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 05/10] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() Laurent Vivier
2024-06-03  2:54   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 06/10] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-06-03  4:02   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 07/10] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
2024-06-03  4:04   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 08/10] iov: remove iov_copy() Laurent Vivier
2024-06-03  4:05   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 09/10] tcp: remove tap_hdr parameter Laurent Vivier
2024-06-03  4:12   ` David Gibson
2024-05-31 14:23 ` [PATCH v4 10/10] tap: use in->buf_size rather than sizeof(pkt_buf) Laurent Vivier
2024-06-03  4:20   ` David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zl0b_EuPVF3c1SfI@zatzit \
    --to=david@gibson.dropbear.id.au \
    --cc=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).