public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Laurent Vivier <lvivier@redhat.com>
To: passt-dev@passt.top
Cc: Laurent Vivier <lvivier@redhat.com>
Subject: [PATCH v6 2/8] tcp: move buffers management functions to their own file
Date: Wed, 12 Jun 2024 17:47:28 +0200	[thread overview]
Message-ID: <20240612154734.1044883-3-lvivier@redhat.com> (raw)
In-Reply-To: <20240612154734.1044883-1-lvivier@redhat.com>

Move all the TCP parts using internal buffers to tcp_buf.c
and keep generic TCP management functions in tcp.c.
Add tcp_internal.h to export needed functions from tcp.c and
tcp_buf.h from tcp_buf.c

With this change we can use existing TCP functions with a
different kind of memory storage as for instance the shared
memory provided by the guest via vhost-user.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---

Notes:
    v5:
      - as we export now tcp_l2_buf_fill_headers() move
        also enum tcp_iov_part to tcp_internal.h
    
    v4:
      - rename tcp_send_flag() and tcp_data_from_sock() to
        tcp_buf_send_flag() and tcp_buf_data_from_sock()

 Makefile       |   5 +-
 tcp.c          | 562 ++-----------------------------------------------
 tcp_buf.c      | 513 ++++++++++++++++++++++++++++++++++++++++++++
 tcp_buf.h      |  16 ++
 tcp_internal.h |  96 +++++++++
 5 files changed, 648 insertions(+), 544 deletions(-)
 create mode 100644 tcp_buf.c
 create mode 100644 tcp_buf.h
 create mode 100644 tcp_internal.h

diff --git a/Makefile b/Makefile
index e2180b599bdb..09fc461d087e 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
 	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
 	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
-	tcp_splice.c udp.c util.c
+	tcp_buf.c tcp_splice.c udp.c util.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
-	siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h
+	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
+	udp.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/tcp.c b/tcp.c
index 6800209d4122..875e318c925b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -302,28 +302,14 @@
 #include "flow.h"
 
 #include "flow_table.h"
-
-#define TCP_FRAMES_MEM			128
-#define TCP_FRAMES							\
-	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+#include "tcp_internal.h"
+#include "tcp_buf.h"
 
 #define TCP_HASH_TABLE_LOAD		70		/* % */
 #define TCP_HASH_TABLE_SIZE		(FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
 
-#define MAX_WS				8
-#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
-
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
-#define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
-						   sizeof(struct tcphdr) - \
-						   sizeof(struct iphdr),   \
-						   sizeof(uint32_t))
-#define MSS6				ROUND_DOWN(IP_MAX_MTU -		   \
-						   sizeof(struct tcphdr) - \
-						   sizeof(struct ipv6hdr), \
-						   sizeof(uint32_t))
-
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 #ifdef HAS_SND_WND
 # define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
@@ -345,33 +331,10 @@
  */
 #define SOL_TCP				IPPROTO_TCP
 
-#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
-
-#define FIN		(1 << 0)
-#define SYN		(1 << 1)
-#define RST		(1 << 2)
-#define ACK		(1 << 4)
-/* Flags for internal usage */
-#define DUP_ACK		(1 << 5)
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
 
-#define OPT_EOL		0
-#define OPT_NOP		1
-#define OPT_MSS		2
-#define OPT_MSS_LEN	4
-#define OPT_WS		3
-#define OPT_WS_LEN	3
-#define OPT_SACKP	4
-#define OPT_SACK	5
-#define OPT_TS		8
-
 #define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
 
-#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
-#define CONN_V6(conn)		(!CONN_V4(conn))
 #define CONN_IS_CLOSING(conn)						\
 	(((conn)->events & ESTABLISHED) &&				\
 	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -408,106 +371,7 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/* Static buffers */
-/**
- * struct tcp_payload_t - TCP header and data to send segments with payload
- * @th:		TCP header
- * @data:	TCP data
- */
-struct tcp_payload_t {
-	struct tcphdr th;
-	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/**
- * struct tcp_flags_t - TCP header and data to send zero-length
- *                      segments (flags)
- * @th:		TCP header
- * @opts	TCP options
- */
-struct tcp_flags_t {
-	struct tcphdr th;
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/* Ethernet header for IPv4 frames */
-static struct ethhdr		tcp4_eth_src;
-
-static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers */
-static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
-/* TCP segments with payload for IPv4 frames */
-static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
-
-/* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp4_payload_used;
-
-static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers for TCP segment without payload */
-static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
-/* TCP segments without payload for IPv4 frames */
-static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_flags_used;
-
-/* Ethernet header for IPv6 frames */
-static struct ethhdr		tcp6_eth_src;
-
-static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers */
-static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
-/* TCP headers and data for IPv6 frames */
-static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
-
-/* References tracking the owner connection of frames in the tap outqueue */
-static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp6_payload_used;
-
-static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers for TCP segment without payload */
-static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
-/* TCP segment without payload for IPv6 frames */
-static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_flags_used;
-
-/* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
-static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
-
-/*
- * enum tcp_iov_parts - I/O vector parts for one TCP frame
- * @TCP_IOV_TAP		tap backend specific header
- * @TCP_IOV_ETH		Ethernet header
- * @TCP_IOV_IP		IP (v4/v6) header
- * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
- * @TCP_NUM_IOVS 	the number of entries in the iovec array
- */
-enum tcp_iov_parts {
-	TCP_IOV_TAP	= 0,
-	TCP_IOV_ETH	= 1,
-	TCP_IOV_IP	= 2,
-	TCP_IOV_PAYLOAD	= 3,
-	TCP_NUM_IOVS
-};
-
-static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+char		tcp_buf_discard		[MAX_WINDOW];
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -552,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 	return EPOLLRDHUP;
 }
 
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
-			 unsigned long flag);
-#define conn_flag(c, conn, flag)					\
-	do {								\
-		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
-		conn_flag_do(c, conn, flag);				\
-	} while (0)
-
 /**
  * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
  * @c:		Execution context
@@ -671,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
  * @conn:	Connection pointer
  * @flag:	Flag to set, or ~flag to unset
  */
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
-			 unsigned long flag)
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		  unsigned long flag)
 {
 	if (flag & (flag - 1)) {
 		int flag_index = fls(~flag);
@@ -722,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c,
  * @conn:	Connection pointer
  * @event:	Connection event
  */
-static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
-			  unsigned long event)
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		   unsigned long event)
 {
 	int prev, new, num = fls(event);
 
@@ -771,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 		tcp_timer_ctl(c, conn);
 }
 
-#define conn_event(c, conn, event)					\
-	do {								\
-		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
-		conn_event_do(c, conn, event);				\
-	} while (0)
-
 /**
  * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
  * @conn:	Connection pointer
@@ -906,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
 	th->check = csum(th, l4len, sum);
 }
 
-/**
- * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
- * @eth_d:	Ethernet destination address, NULL if unchanged
- * @eth_s:	Ethernet source address, NULL if unchanged
- */
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
-{
-	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
-	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
-}
-
-/**
- * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
- * @c:		Execution context
- */
-static void tcp_sock4_iov_init(const struct ctx *c)
-{
-	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
-	struct iovec *iov;
-	int i;
-
-	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
-		tcp4_payload_ip[i] = iph;
-		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
-		tcp4_flags_ip[i] = iph;
-		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp4_flags[i].th.ack = 1;
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp4_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
-	}
-}
-
-/**
- * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
- * @c:		Execution context
- */
-static void tcp_sock6_iov_init(const struct ctx *c)
-{
-	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
-	struct iovec *iov;
-	int i;
-
-	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
-
-	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
-		tcp6_payload_ip[i] = ip6;
-		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_payload[i].th.ack = 1;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
-		tcp6_flags_ip[i] = ip6;
-		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
-		tcp6_flags[i].th .ack = 1;
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
-	}
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		iov = tcp6_l2_flags_iov[i];
-
-		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
-		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
-		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
-	}
-}
-
 /**
  * tcp_opt_get() - Get option, and value if any, from TCP header
  * @opts:	Pointer to start of TCP options in header
@@ -1227,76 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
 	return true;
 }
 
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
-#define tcp_rst(c, conn)						\
-	do {								\
-		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
-		tcp_rst_do(c, conn);					\
-	} while (0)
-
-/**
- * tcp_flags_flush() - Send out buffers for segments with no data (flags)
- * @c:		Execution context
- */
-static void tcp_flags_flush(const struct ctx *c)
-{
-	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp6_flags_used);
-	tcp6_flags_used = 0;
-
-	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
-			tcp4_flags_used);
-	tcp4_flags_used = 0;
-}
-
-/**
- * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
- * @conns:       Array of connection pointers corresponding to queued frames
- * @frames:      Two-dimensional array containing queued frames with sub-iovs
- * @num_frames:  Number of entries in the two arrays to be compared
- */
-static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
-			   int num_frames)
-{
-	int i;
-
-	for (i = 0; i < num_frames; i++) {
-		const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
-		struct tcp_tap_conn *conn = conns[i];
-		uint32_t seq = ntohl(th->seq);
-
-		if (SEQ_LE(conn->seq_to_tap, seq))
-			continue;
-
-		conn->seq_to_tap = seq;
-	}
-}
-
-/**
- * tcp_payload_flush() - Send out buffers for segments with data
- * @c:		Execution context
- */
-static void tcp_payload_flush(const struct ctx *c)
-{
-	size_t m;
-
-	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp6_payload_used);
-	if (m != tcp6_payload_used) {
-		tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m],
-			       tcp6_payload_used - m);
-	}
-	tcp6_payload_used = 0;
-
-	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp4_payload_used);
-	if (m != tcp4_payload_used) {
-		tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m],
-			       tcp4_payload_used - m);
-	}
-	tcp4_payload_used = 0;
-}
-
 /**
  * tcp_defer_handler() - Handler for TCP deferred tasks
  * @c:		Execution context
@@ -1430,10 +1112,10 @@ static size_t tcp_fill_headers6(const struct ctx *c,
  *
  * Return: IP payload length, host order
  */
-static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
-				      const struct tcp_tap_conn *conn,
-				      struct iovec *iov, size_t dlen,
-				      const uint16_t *check, uint32_t seq)
+size_t tcp_l2_buf_fill_headers(const struct ctx *c,
+			       const struct tcp_tap_conn *conn,
+			       struct iovec *iov, size_t dlen,
+			       const uint16_t *check, uint32_t seq)
 {
 	const struct in_addr *a4 = inany_v4(&conn->faddr);
 
@@ -1459,8 +1141,8 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
  *
  * Return: 1 if sequence or window were updated, 0 otherwise
  */
-static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-				 int force_seq, struct tcp_info *tinfo)
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+			  int force_seq, struct tcp_info *tinfo)
 {
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
@@ -1579,9 +1261,9 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
  *	     0 if there is no flag to send
  *	     1 otherwise
  */
-static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
-			     int flags, struct tcphdr *th, char *data,
-			     size_t *optlen)
+int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
+		      int flags, struct tcphdr *th, char *data,
+		      size_t *optlen)
 {
 	struct tcp_info tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
@@ -1678,54 +1360,9 @@ static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
  *
  * Return: negative error code on connection reset, 0 otherwise
  */
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
-	struct tcp_flags_t *payload;
-	struct iovec *iov;
-	size_t optlen;
-	size_t l4len;
-	int ret;
-
-	if (CONN_V4(conn))
-		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-	else
-		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
-	payload = iov[TCP_IOV_PAYLOAD].iov_base;
-
-	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
-				payload->opts, &optlen);
-	if (ret <= 0)
-		return ret;
-
-	l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
-					conn->seq_to_tap);
-	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
-	if (flags & DUP_ACK) {
-		struct iovec *dup_iov;
-		int i;
-
-		if (CONN_V4(conn))
-			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-		else
-			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
-		for (i = 0; i < TCP_NUM_IOVS; i++)
-			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
-			       iov[i].iov_len);
-		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
-	}
-
-	if (CONN_V4(conn)) {
-		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	} else {
-		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
-			tcp_flags_flush(c);
-	}
-
-	return 0;
+	return tcp_buf_send_flag(c, conn, flags);
 }
 
 /**
@@ -1733,7 +1370,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
  * @c:		Execution context
  * @conn:	Connection pointer
  */
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
 {
 	if (conn->events == CLOSED)
 		return;
@@ -2160,49 +1797,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 	return 0;
 }
 
-/**
- * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
- * @c:		Execution context
- * @conn:	Connection pointer
- * @dlen:	TCP payload length
- * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
- * @seq:	Sequence number to be sent
- */
-static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t dlen, int no_csum, uint32_t seq)
-{
-	struct iovec *iov;
-	size_t l4len;
-
-	conn->seq_to_tap = seq + dlen;
-
-	if (CONN_V4(conn)) {
-		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
-		const uint16_t *check = NULL;
-
-		if (no_csum) {
-			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
-			check = &iph->check;
-		}
-
-		tcp4_frame_conns[tcp4_payload_used] = conn;
-
-		iov = tcp4_l2_iov[tcp4_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
-	} else if (CONN_V6(conn)) {
-		tcp6_frame_conns[tcp6_payload_used] = conn;
-
-		iov = tcp6_l2_iov[tcp6_payload_used++];
-		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
-			tcp_payload_flush(c);
-	}
-}
-
 /**
  * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
  * @c:		Execution context
@@ -2214,123 +1808,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
  */
 static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 {
-	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
-	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
-	int sendlen, len, dlen, v4 = CONN_V4(conn);
-	int s = conn->sock, i, ret = 0;
-	struct msghdr mh_sock = { 0 };
-	uint16_t mss = MSS_GET(conn);
-	uint32_t already_sent, seq;
-	struct iovec *iov;
-
-	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
-	if (SEQ_LT(already_sent, 0)) {
-		/* RFC 761, section 2.1. */
-		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
-			   conn->seq_ack_from_tap, conn->seq_to_tap);
-		conn->seq_to_tap = conn->seq_ack_from_tap;
-		already_sent = 0;
-	}
-
-	if (!wnd_scaled || already_sent >= wnd_scaled) {
-		conn_flag(c, conn, STALLED);
-		conn_flag(c, conn, ACK_FROM_TAP_DUE);
-		return 0;
-	}
-
-	/* Set up buffer descriptors we'll fill completely and partially. */
-	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
-	if (fill_bufs > TCP_FRAMES) {
-		fill_bufs = TCP_FRAMES;
-		iov_rem = 0;
-	} else {
-		iov_rem = (wnd_scaled - already_sent) % mss;
-	}
-
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
-
-	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
-	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
-		tcp_payload_flush(c);
-
-		/* Silence Coverity CWE-125 false positive */
-		tcp4_payload_used = tcp6_payload_used = 0;
-	}
-
-	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
-		if (v4)
-			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
-		else
-			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
-		iov->iov_len = mss;
-	}
-	if (iov_rem)
-		iov_sock[fill_bufs].iov_len = iov_rem;
-
-	/* Receive into buffers, don't dequeue until acknowledged by guest. */
-	do
-		len = recvmsg(s, &mh_sock, MSG_PEEK);
-	while (len < 0 && errno == EINTR);
-
-	if (len < 0)
-		goto err;
-
-	if (!len) {
-		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
-			if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
-				tcp_rst(c, conn);
-				return ret;
-			}
-
-			conn_event(c, conn, TAP_FIN_SENT);
-		}
-
-		return 0;
-	}
-
-	sendlen = len - already_sent;
-	if (sendlen <= 0) {
-		conn_flag(c, conn, STALLED);
-		return 0;
-	}
-
-	conn_flag(c, conn, ~STALLED);
-
-	send_bufs = DIV_ROUND_UP(sendlen, mss);
-	last_len = sendlen - (send_bufs - 1) * mss;
-
-	/* Likely, some new data was acked too. */
-	tcp_update_seqack_wnd(c, conn, 0, NULL);
-
-	/* Finally, queue to tap */
-	dlen = mss;
-	seq = conn->seq_to_tap;
-	for (i = 0; i < send_bufs; i++) {
-		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
-
-		if (i == send_bufs - 1)
-			dlen = last_len;
-
-		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
-		seq += dlen;
-	}
-
-	conn_flag(c, conn, ACK_FROM_TAP_DUE);
-
-	return 0;
-
-err:
-	if (errno != EAGAIN && errno != EWOULDBLOCK) {
-		ret = -errno;
-		tcp_rst(c, conn);
-	}
-
-	return ret;
+	return tcp_buf_data_from_sock(c, conn);
 }
 
 /**
diff --git a/tcp_buf.c b/tcp_buf.c
new file mode 100644
index 000000000000..0c7d07b8d0bd
--- /dev/null
+++ b/tcp_buf.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * tcp_buf.c - TCP L2-L4 buffer management functions
+ *
+ * Copyright Red Hat
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+
+#include <netinet/ip.h>
+
+#include <linux/tcp.h>
+
+#include "util.h"
+#include "ip.h"
+#include "iov.h"
+#include "passt.h"
+#include "tap.h"
+#include "siphash.h"
+#include "inany.h"
+#include "tcp_conn.h"
+#include "tcp_internal.h"
+#include "tcp_buf.h"
+
+#define TCP_FRAMES_MEM			128
+#define TCP_FRAMES							   \
+	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+
+/* Static buffers */
+/**
+ * struct tcp_payload_t - TCP header and data to send segments with payload
+ * @th:		TCP header
+ * @data:	TCP data
+ */
+struct tcp_payload_t {
+	struct tcphdr th;
+	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/**
+ * struct tcp_flags_t - TCP header and data to send zero-length
+ *                      segments (flags)
+ * @th:		TCP header
+ * @opts	TCP options
+ */
+struct tcp_flags_t {
+	struct tcphdr th;
+	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/* Ethernet header for IPv4 frames */
+static struct ethhdr		tcp4_eth_src;
+
+static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
+/* IPv4 headers */
+static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
+/* TCP segments with payload for IPv4 frames */
+static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
+
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
+static unsigned int tcp4_payload_used;
+
+static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
+/* IPv4 headers for TCP segment without payload */
+static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
+/* TCP segments without payload for IPv4 frames */
+static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_flags_used;
+
+/* Ethernet header for IPv6 frames */
+static struct ethhdr		tcp6_eth_src;
+
+static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
+/* IPv6 headers */
+static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv6 frames */
+static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
+
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
+static unsigned int tcp6_payload_used;
+
+static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
+/* IPv6 headers for TCP segment without payload */
+static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
+/* TCP segment without payload for IPv6 frames */
+static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_flags_used;
+
+/* recvmsg()/sendmsg() data for tap */
+static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
+
+static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+/**
+ * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
+ * @eth_d:	Ethernet destination address, NULL if unchanged
+ * @eth_s:	Ethernet source address, NULL if unchanged
+ */
+void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+{
+	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
+	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
+}
+
+/**
+ * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
+ * @c:		Execution context
+ */
+void tcp_sock4_iov_init(const struct ctx *c)
+{
+	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
+	struct iovec *iov;
+	int i;
+
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
+
+	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
+		tcp4_payload_ip[i] = iph;
+		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_payload[i].th.ack = 1;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
+		tcp4_flags_ip[i] = iph;
+		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_flags[i].th.ack = 1;
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp4_l2_iov[i];
+
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp4_l2_flags_iov[i];
+
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
+	}
+}
+
+/**
+ * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
+ * @c:		Execution context
+ */
+void tcp_sock6_iov_init(const struct ctx *c)
+{
+	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
+	struct iovec *iov;
+	int i;
+
+	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+
+	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
+		tcp6_payload_ip[i] = ip6;
+		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp6_payload[i].th.ack = 1;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
+		tcp6_flags_ip[i] = ip6;
+		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp6_flags[i].th .ack = 1;
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp6_l2_iov[i];
+
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp6_l2_flags_iov[i];
+
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
+	}
+}
+
+/**
+ * tcp_flags_flush() - Send out buffers for segments with no data (flags)
+ * @c:		Execution context
+ */
+void tcp_flags_flush(const struct ctx *c)
+{
+	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
+			tcp6_flags_used);
+	tcp6_flags_used = 0;
+
+	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
+			tcp4_flags_used);
+	tcp4_flags_used = 0;
+}
+
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
+			   int num_frames)
+{
+	int i;
+
+	for (i = 0; i < num_frames; i++) {
+		const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
+		struct tcp_tap_conn *conn = conns[i];
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
+/**
+ * tcp_payload_flush() - Send out buffers for segments with data
+ * @c:		Execution context
+ */
+void tcp_payload_flush(const struct ctx *c)
+{
+	size_t m;
+
+	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp6_payload_used);
+	if (m != tcp6_payload_used) {
+		tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m],
+			       tcp6_payload_used - m);
+	}
+	tcp6_payload_used = 0;
+
+	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp4_payload_used);
+	if (m != tcp4_payload_used) {
+		tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m],
+			       tcp4_payload_used - m);
+	}
+	tcp4_payload_used = 0;
+}
+
+/**
+ * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
+ * @c:         Execution context
+ * @conn:      Connection pointer
+ * @flags:     TCP flags: if not set, send segment only if ACK is due
+ *
+ * Return: negative error code on connection reset, 0 otherwise
+ */
+int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	struct tcp_flags_t *payload;
+	struct iovec *iov;
+	size_t optlen;
+	size_t l4len;
+	int ret;
+
+	if (CONN_V4(conn))
+		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+	else
+		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
+				payload->opts, &optlen);
+	if (ret <= 0)
+		return ret;
+
+	l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
+					conn->seq_to_tap);
+	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+
+	if (flags & DUP_ACK) {
+		struct iovec *dup_iov;
+		int i;
+
+		if (CONN_V4(conn))
+			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+		else
+			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+
+		for (i = 0; i < TCP_NUM_IOVS; i++)
+			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+			       iov[i].iov_len);
+		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+	}
+
+	if (CONN_V4(conn)) {
+		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
+			tcp_flags_flush(c);
+	} else {
+		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
+			tcp_flags_flush(c);
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @dlen:	TCP payload length
+ * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
+ * @seq:	Sequence number to be sent
+ */
+static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+			    ssize_t dlen, int no_csum, uint32_t seq)
+{
+	struct iovec *iov;
+	size_t l4len;
+
+	conn->seq_to_tap = seq + dlen;
+
+	if (CONN_V4(conn)) {
+		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
+		const uint16_t *check = NULL;
+
+		if (no_csum) {
+			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+			check = &iph->check;
+		}
+
+		tcp4_frame_conns[tcp4_payload_used] = conn;
+
+		iov = tcp4_l2_iov[tcp4_payload_used++];
+		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
+	} else if (CONN_V6(conn)) {
+		tcp6_frame_conns[tcp6_payload_used] = conn;
+
+		iov = tcp6_l2_iov[tcp6_payload_used++];
+		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
+	}
+}
+
+/**
+ * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ *
+ * Return: negative on connection reset, 0 otherwise
+ *
+ * #syscalls recvmsg
+ */
+int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
+	int sendlen, len, dlen, v4 = CONN_V4(conn);
+	int s = conn->sock, i, ret = 0;
+	struct msghdr mh_sock = { 0 };
+	uint16_t mss = MSS_GET(conn);
+	uint32_t already_sent, seq;
+	struct iovec *iov;
+
+	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+	if (SEQ_LT(already_sent, 0)) {
+		/* RFC 761, section 2.1. */
+		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+			   conn->seq_ack_from_tap, conn->seq_to_tap);
+		conn->seq_to_tap = conn->seq_ack_from_tap;
+		already_sent = 0;
+	}
+
+	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, STALLED);
+		conn_flag(c, conn, ACK_FROM_TAP_DUE);
+		return 0;
+	}
+
+	/* Set up buffer descriptors we'll fill completely and partially. */
+	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
+	if (fill_bufs > TCP_FRAMES) {
+		fill_bufs = TCP_FRAMES;
+		iov_rem = 0;
+	} else {
+		iov_rem = (wnd_scaled - already_sent) % mss;
+	}
+
+	mh_sock.msg_iov = iov_sock;
+	mh_sock.msg_iovlen = fill_bufs + 1;
+
+	iov_sock[0].iov_base = tcp_buf_discard;
+	iov_sock[0].iov_len = already_sent;
+
+	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
+	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+		tcp_payload_flush(c);
+
+		/* Silence Coverity CWE-125 false positive */
+		tcp4_payload_used = tcp6_payload_used = 0;
+	}
+
+	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+		if (v4)
+			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
+		else
+			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
+		iov->iov_len = mss;
+	}
+	if (iov_rem)
+		iov_sock[fill_bufs].iov_len = iov_rem;
+
+	/* Receive into buffers, don't dequeue until acknowledged by guest. */
+	do
+		len = recvmsg(s, &mh_sock, MSG_PEEK);
+	while (len < 0 && errno == EINTR);
+
+	if (len < 0)
+		goto err;
+
+	if (!len) {
+		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
+				tcp_rst(c, conn);
+				return ret;
+			}
+
+			conn_event(c, conn, TAP_FIN_SENT);
+		}
+
+		return 0;
+	}
+
+	sendlen = len - already_sent;
+	if (sendlen <= 0) {
+		conn_flag(c, conn, STALLED);
+		return 0;
+	}
+
+	conn_flag(c, conn, ~STALLED);
+
+	send_bufs = DIV_ROUND_UP(sendlen, mss);
+	last_len = sendlen - (send_bufs - 1) * mss;
+
+	/* Likely, some new data was acked too. */
+	tcp_update_seqack_wnd(c, conn, 0, NULL);
+
+	/* Finally, queue to tap */
+	dlen = mss;
+	seq = conn->seq_to_tap;
+	for (i = 0; i < send_bufs; i++) {
+		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
+
+		if (i == send_bufs - 1)
+			dlen = last_len;
+
+		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
+		seq += dlen;
+	}
+
+	conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+	return 0;
+
+err:
+	if (errno != EAGAIN && errno != EWOULDBLOCK) {
+		ret = -errno;
+		tcp_rst(c, conn);
+	}
+
+	return ret;
+}
diff --git a/tcp_buf.h b/tcp_buf.h
new file mode 100644
index 000000000000..14be7b945285
--- /dev/null
+++ b/tcp_buf.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_BUF_H
+#define TCP_BUF_H
+
+void tcp_sock4_iov_init(const struct ctx *c);
+void tcp_sock6_iov_init(const struct ctx *c);
+void tcp_flags_flush(const struct ctx *c);
+void tcp_payload_flush(const struct ctx *c);
+int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+
+#endif  /*TCP_BUF_H */
diff --git a/tcp_internal.h b/tcp_internal.h
new file mode 100644
index 000000000000..51aaa16918cf
--- /dev/null
+++ b/tcp_internal.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_INTERNAL_H
+#define TCP_INTERNAL_H
+
+#define MAX_WS				8
+#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
+
+#define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
+						   sizeof(struct tcphdr) - \
+						   sizeof(struct iphdr),   \
+						   sizeof(uint32_t))
+#define MSS6				ROUND_DOWN(IP_MAX_MTU -		   \
+						   sizeof(struct tcphdr) - \
+						   sizeof(struct ipv6hdr), \
+						   sizeof(uint32_t))
+
+#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
+#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
+#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
+
+#define FIN		(1 << 0)
+#define SYN		(1 << 1)
+#define RST		(1 << 2)
+#define ACK		(1 << 4)
+
+/* Flags for internal usage */
+#define DUP_ACK		(1 << 5)
+#define OPT_EOL		0
+#define OPT_NOP		1
+#define OPT_MSS		2
+#define OPT_MSS_LEN	4
+#define OPT_WS		3
+#define OPT_WS_LEN	3
+#define OPT_SACKP	4
+#define OPT_SACK	5
+#define OPT_TS		8
+#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn)		(!CONN_V4(conn))
+
+/*
+ * enum tcp_iov_parts - I/O vector parts for one TCP frame
+ * @TCP_IOV_TAP		tap backend specific header
+ * @TCP_IOV_ETH		Ethernet header
+ * @TCP_IOV_IP		IP (v4/v6) header
+ * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
+ * @TCP_NUM_IOVS 	the number of entries in the iovec array
+ */
+enum tcp_iov_parts {
+	TCP_IOV_TAP	= 0,
+	TCP_IOV_ETH	= 1,
+	TCP_IOV_IP	= 2,
+	TCP_IOV_PAYLOAD	= 3,
+	TCP_NUM_IOVS
+};
+
+extern char tcp_buf_discard [MAX_WINDOW];
+
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		  unsigned long flag);
+#define conn_flag(c, conn, flag)					\
+	do {								\
+		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
+		conn_flag_do(c, conn, flag);				\
+	} while (0)
+
+
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		   unsigned long event);
+#define conn_event(c, conn, event)					\
+	do {								\
+		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
+		conn_event_do(c, conn, event);				\
+	} while (0)
+
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+#define tcp_rst(c, conn)						\
+	do {								\
+		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
+		tcp_rst_do(c, conn);					\
+	} while (0)
+
+size_t tcp_l2_buf_fill_headers(const struct ctx *c,
+			       const struct tcp_tap_conn *conn,
+			       struct iovec *iov, size_t dlen,
+			       const uint16_t *check, uint32_t seq);
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+			  int force_seq, struct tcp_info *tinfo);
+int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+		      struct tcphdr *th, char *data, size_t *optlen);
+
+#endif /* TCP_INTERNAL_H */
-- 
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_INTERNAL_H
+#define TCP_INTERNAL_H
+
+#define MAX_WS				8
+#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
+
+#define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
+						   sizeof(struct tcphdr) - \
+						   sizeof(struct iphdr),   \
+						   sizeof(uint32_t))
+#define MSS6				ROUND_DOWN(IP_MAX_MTU -		   \
+						   sizeof(struct tcphdr) - \
+						   sizeof(struct ipv6hdr), \
+						   sizeof(uint32_t))
+
+#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
+#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
+#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
+
+#define FIN		(1 << 0)
+#define SYN		(1 << 1)
+#define RST		(1 << 2)
+#define ACK		(1 << 4)
+
+/* Flags for internal usage */
+#define DUP_ACK		(1 << 5)
+#define OPT_EOL		0
+#define OPT_NOP		1
+#define OPT_MSS		2
+#define OPT_MSS_LEN	4
+#define OPT_WS		3
+#define OPT_WS_LEN	3
+#define OPT_SACKP	4
+#define OPT_SACK	5
+#define OPT_TS		8
+#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn)		(!CONN_V4(conn))
+
+/*
+ * enum tcp_iov_parts - I/O vector parts for one TCP frame
+ * @TCP_IOV_TAP		tap backend specific header
+ * @TCP_IOV_ETH		Ethernet header
+ * @TCP_IOV_IP		IP (v4/v6) header
+ * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
+ * @TCP_NUM_IOVS 	the number of entries in the iovec array
+ */
+enum tcp_iov_parts {
+	TCP_IOV_TAP	= 0,
+	TCP_IOV_ETH	= 1,
+	TCP_IOV_IP	= 2,
+	TCP_IOV_PAYLOAD	= 3,
+	TCP_NUM_IOVS
+};
+
+extern char tcp_buf_discard [MAX_WINDOW];
+
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		  unsigned long flag);
+#define conn_flag(c, conn, flag)					\
+	do {								\
+		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
+		conn_flag_do(c, conn, flag);				\
+	} while (0)
+
+
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		   unsigned long event);
+#define conn_event(c, conn, event)					\
+	do {								\
+		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
+		conn_event_do(c, conn, event);				\
+	} while (0)
+
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+#define tcp_rst(c, conn)						\
+	do {								\
+		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
+		tcp_rst_do(c, conn);					\
+	} while (0)
+
+size_t tcp_l2_buf_fill_headers(const struct ctx *c,
+			       const struct tcp_tap_conn *conn,
+			       struct iovec *iov, size_t dlen,
+			       const uint16_t *check, uint32_t seq);
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+			  int force_seq, struct tcp_info *tinfo);
+int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+		      struct tcphdr *th, char *data, size_t *optlen);
+
+#endif /* TCP_INTERNAL_H */
-- 
2.45.2


  parent reply	other threads:[~2024-06-12 15:47 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-12 15:47 [PATCH v6 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 1/8] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
2024-06-12 21:22   ` Stefano Brivio
2024-06-13  6:07     ` Stefano Brivio
2024-06-13  8:24       ` Laurent Vivier
2024-06-13 10:14         ` Stefano Brivio
2024-06-13 10:22           ` Laurent Vivier
2024-06-13 10:49             ` Stefano Brivio
2024-06-13 10:58               ` Laurent Vivier
2024-06-13  7:31     ` Laurent Vivier
2024-06-13  9:35       ` Stefano Brivio
2024-06-13 14:36         ` David Gibson
2024-06-12 15:47 ` Laurent Vivier [this message]
2024-06-12 15:54   ` [PATCH v6 2/8] tcp: move buffers management functions to their own file Stefano Brivio
2024-06-12 15:47 ` [PATCH v6 3/8] tap: refactor packets handling functions Laurent Vivier
2024-06-12 15:52   ` Stefano Brivio
2024-06-12 16:00     ` Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 4/8] udp: refactor UDP header update functions Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 5/8] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 6/8] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 7/8] iov: remove iov_copy() Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 8/8] tap: use in->buf_size rather than sizeof(pkt_buf) Laurent Vivier
2024-06-12 17:16 ` [PATCH v6 0/8] Add vhost-user support to passt (part 2) Stefano Brivio
2024-06-12 17:37   ` Stefano Brivio
2024-06-12 21:23     ` Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240612154734.1044883-3-lvivier@redhat.com \
    --to=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).