public inbox for passt-dev@passt.top
* [RFC] tcp: Replace TCP buffer structure by an iovec array
@ 2024-03-11 13:33 Laurent Vivier
  2024-03-12 22:56 ` Stefano Brivio
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Laurent Vivier @ 2024-03-11 13:33 UTC (permalink / raw)
  To: passt-dev; +Cc: Laurent Vivier

To be able to provide pointers to TCP headers and IP headers without
worrying about alignment in the structure, split the structure into
several arrays and point to each part of the frame using an iovec array.

Using iovec also allows us to simply ignore the first entry when the
vnet length header is not needed. And as the payload buffer contains
only the TCP header and the TCP data we can increase the size of the
TCP data to USHRT_MAX - sizeof(struct tcphdr).

As a side effect, these changes improve performance by a factor of
x1.5.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 tap.c | 104 ++++++++++++++
 tap.h |  16 +++
 tcp.c | 429 ++++++++++++++++++++++++----------------------------------
 3 files changed, 296 insertions(+), 253 deletions(-)

diff --git a/tap.c b/tap.c
index c7b9372668ec..afd01cffbed6 100644
--- a/tap.c
+++ b/tap.c
@@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
 	return i;
 }
 
+/**
+ * tap_send_iov_pasta() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frames is divided in an array of iovecs.
+ *              The first entry of the iovec is ignored
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+static size_t tap_send_iov_pasta(const struct ctx *c,
+				 struct iovec iov[][TCP_IOV_NUM], size_t n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
+					   TCP_IOV_NUM - TCP_IOV_ETH))
+			break;
+	}
+
+	return i;
+
+}
+
 /**
  * tap_send_frames_passt() - Send multiple frames to the passt tap
  * @c:		Execution context
@@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
 	return i;
 }
 
+/**
+ * tap_send_iov_passt() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frames is divided in an array of iovecs.
+ *              The first entry of the iovec is updated to point to an
+ *              uint32_t storing the frame length.
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+static size_t tap_send_iov_passt(const struct ctx *c,
+				 struct iovec iov[][TCP_IOV_NUM],
+				 size_t n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		uint32_t vnet_len;
+		int j;
+
+		vnet_len = 0;
+		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
+			vnet_len += iov[i][j].iov_len;
+
+		vnet_len = htonl(vnet_len);
+		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
+		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
+
+		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
+			break;
+	}
+
+	return i;
+
+}
+
 /**
  * tap_send_frames() - Send out multiple prepared frames
  * @c:		Execution context
@@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
 	return m;
 }
 
+/**
+ * tap_send_iov() - Send out multiple prepared frames
+ * @c:		Execution context
+ * @iov:	Array of frames, each frames is divided in an array of iovecs.
+ * 		iovec array is:
+ * 		TCP_IOV_VNET	(0)	vnet length
+ * 		TCP_IOV_ETH	(1)	ethernet header
+ * 		TCP_IOV_IP	(2)	IP (v4/v6) header
+ *		TCP_IOV_PAYLOAD	(3)	IP payload (TCP header + data)
+ *		TCP_IOV_NUM (4) is the number of entries in the iovec array
+ *		TCP_IOV_VNET entry is updated with passt, ignored with pasta.
+ * @n:		Number of frames in @iov
+ *
+ * Return: number of frames actually sent
+ */
+size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
+		    size_t n)
+{
+	size_t m;
+	unsigned int i;
+
+	if (!n)
+		return 0;
+
+	switch (c->mode) {
+	case MODE_PASST:
+		m = tap_send_iov_passt(c, iov, n);
+		break;
+	case MODE_PASTA:
+		m = tap_send_iov_pasta(c, iov, n);
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	if (m < n)
+		debug("tap: failed to send %zu frames of %zu", n - m, n);
+
+	for (i = 0; i < m; i++)
+		pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
+
+	return m;
+}
+
 /**
  * eth_update_mac() - Update tap L2 header with new Ethernet addresses
  * @eh:		Ethernet headers to update
diff --git a/tap.h b/tap.h
index 437b9aa2b43f..2d8e7aa1101d 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,20 @@
 #ifndef TAP_H
 #define TAP_H
 
+/*
+ * TCP frame iovec array:
+ * TCP_IOV_VNET		vnet length
+ * TCP_IOV_ETH		ethernet header
+ * TCP_IOV_IP		IP (v4/v6) header
+ * TCP_IOV_PAYLOAD	IP payload (TCP header + data)
+ * TCP_IOV_NUM is the number of entries in the iovec array
+ */
+#define TCP_IOV_VNET	0
+#define TCP_IOV_ETH	1
+#define TCP_IOV_IP	2
+#define TCP_IOV_PAYLOAD	3
+#define TCP_IOV_NUM	4
+
 /**
  * struct tap_hdr - L2 and tap specific headers
  * @vnet_len:	Frame length (for qemu socket transport)
@@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
 		    const void *in, size_t len);
 int tap_send(const struct ctx *c, const void *data, size_t len);
 size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
+size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
+		    size_t n);
 void eth_update_mac(struct ethhdr *eh,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
diff --git a/tcp.c b/tcp.c
index d5eedf4d0138..5c8488108ef7 100644
--- a/tcp.c
+++ b/tcp.c
@@ -318,39 +318,7 @@
 
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
-
-struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
-#ifdef __AVX2__
-	uint8_t pad[26];
-#else
-	uint8_t pad[2];
-#endif
-	struct tap_hdr taph;
-	struct iphdr iph;
-	struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
-#ifdef __AVX2__
-	uint8_t pad[14];
-#else
-	uint8_t pad[2];
-#endif
-	struct tap_hdr taph;
-	struct ipv6hdr ip6h;
-	struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
-#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
+#define MSS				(USHRT_MAX - sizeof(struct tcphdr))
 
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 #ifdef HAS_SND_WND
@@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
 };
 
 /* Static buffers */
-
 /**
- * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
- * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph:	Tap-level headers (partially pre-filled)
- * @iph:	Pre-filled IP header (except for tot_len and saddr)
- * @uh:		Headroom for TCP header
- * @data:	Storage for TCP payload
+ * tcp_l2_flags_t - TCP header and data to send option flags
+ * @th:		TCP header
+ * @opts	TCP option flags
  */
-static struct tcp4_l2_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[26];	/* 0, align th to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 26				2 */
-	struct iphdr iph;	/* 44				20 */
-	struct tcphdr th;	/* 64				40 */
-	uint8_t data[MSS4];	/* 84				60 */
-				/* 65536			65532 */
+struct tcp_l2_flags_t {
+	struct tcphdr th;
+	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+};
+/**
+ * tcp_l2_payload_t - TCP header and data to send data
+ * 		32 bytes aligned to be able to use AVX2 checksum
+ * @th:		TCP header
+ * @data:	TCP data
+ */
+struct tcp_l2_payload_t {
+	struct tcphdr th;	/*    20 bytes */
+	uint8_t data[MSS];	/* 65516 bytes */
 #ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
+} __attribute__ ((packed, aligned(32)));
 #else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
-tcp4_l2_buf[TCP_FRAMES_MEM];
 
-static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
+/* Ethernet header for IPv4 frames */
+static struct ethhdr		tcp4_eth_src;
 
+/* IPv4 headers */
+static struct iphdr		tcp4_l2_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv4 frames */
+static struct tcp_l2_payload_t	tcp4_l2_payload[TCP_FRAMES_MEM];
+
+static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp4_l2_buf_used;
 
-/**
- * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
- * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph:	Tap-level headers (partially pre-filled)
- * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
- * @th:		Headroom for TCP header
- * @data:	Storage for TCP payload
- */
-struct tcp6_l2_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 14				2 */
-	struct ipv6hdr ip6h;	/* 32				20 */
-	struct tcphdr th;	/* 72				60 */
-	uint8_t data[MSS6];	/* 92				80 */
-				/* 65536			65532 */
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_buf[TCP_FRAMES_MEM];
+/* IPv4 headers for TCP option flags frames */
+static struct iphdr		tcp4_l2_flags_ip[TCP_FRAMES_MEM];
+/* TCP headers and option flags for IPv4 frames */
+static struct tcp_l2_flags_t	tcp4_l2_flags[TCP_FRAMES_MEM];
 
-static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp4_l2_flags_buf_used;
+
+/* Ethernet header for IPv6 frames */
+static struct ethhdr		tcp6_eth_src;
 
+/* IPv6 headers */
+static struct ipv6hdr		tcp6_l2_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv6 frames */
+static struct tcp_l2_payload_t	tcp6_l2_payload[TCP_FRAMES_MEM];
+
+static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
+/* IPv6 headers for TCP option flags frames */
+static struct ipv6hdr		tcp6_l2_flags_ip[TCP_FRAMES_MEM];
+/* TCP headers and option flags for IPv6 frames */
+static struct tcp_l2_flags_t	tcp6_l2_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_l2_flags_buf_used;
+
 /* recvmsg()/sendmsg() data for tap */
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
-static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
-static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
-static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
-static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
+static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
+static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
+static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
+static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
-/**
- * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
- * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph:	Tap-level headers (partially pre-filled)
- * @iph:	Pre-filled IP header (except for tot_len and saddr)
- * @th:		Headroom for TCP header
- * @opts:	Headroom for TCP options
- */
-static struct tcp4_l2_flags_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[26];	/* 0, align th to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 26				2 */
-	struct iphdr iph;	/* 44				20 */
-	struct tcphdr th;	/* 64				40 */
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp4_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_flags_buf_used;
-
-/**
- * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
- * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph:	Tap-level headers (partially pre-filled)
- * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
- * @th:		Headroom for TCP header
- * @opts:	Headroom for TCP options
- */
-static struct tcp6_l2_flags_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
-#endif
-	struct tap_hdr taph;	/* 14					   2 */
-	struct ipv6hdr ip6h;	/* 32					  20 */
-	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_l2_flags_buf_used;
-
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
 /* Table for lookup from remote address, local port, remote port */
@@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
  */
 void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 {
-	int i;
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
-		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
-		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
-		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
-
-		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
-	}
+	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
+	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
 }
 
 /**
@@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 static void tcp_sock4_iov_init(const struct ctx *c)
 {
 	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
-	struct iovec *iov;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
-		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IP),
-			.iph = iph,
-			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
-		};
-	}
+	(void)c;
 
-	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
-		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IP),
-			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
-		};
-	}
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		struct iovec *iov;
+
+		/* headers */
+		tcp4_l2_ip[i] = iph;
+		tcp4_l2_payload[i].th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
 
-	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
+		tcp4_l2_flags_ip[i] = iph;
+		tcp4_l2_flags[i].th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
 
-	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
+		/* iovecs */
+		iov = tcp4_l2_iov[i];
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
+
+		iov = tcp4_l2_flags_iov[i];
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
+	}
 }
 
 /**
@@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
  */
 static void tcp_sock6_iov_init(const struct ctx *c)
 {
-	struct iovec *iov;
+	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
-		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IPV6),
-			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
-			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
-		};
-	}
+	(void)c;
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
-		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IPV6),
-			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
-		};
-	}
+	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		struct iovec *iov;
+
+		/* headers */
+		tcp6_l2_ip[i] = ip6;
+		tcp6_l2_payload[i].th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
 
-	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
+		tcp6_l2_flags_ip[i] = ip6;
+		tcp6_l2_flags[i].th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
 
-	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
+		/* iovecs */
+		iov = tcp6_l2_iov[i];
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
+
+		iov = tcp6_l2_flags_iov[i];
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
+		iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
+	}
 }
 
 /**
@@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
  */
 static void tcp_l2_flags_buf_flush(const struct ctx *c)
 {
-	tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
+	tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
 	tcp6_l2_flags_buf_used = 0;
 
-	tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
+	tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
 	tcp4_l2_flags_buf_used = 0;
 }
 
@@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
 	unsigned i;
 	size_t m;
 
-	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
 	for (i = 0; i < m; i++)
 		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
 	tcp6_l2_buf_used = 0;
 
-	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
 	for (i = 0; i < m; i++)
 		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
 	tcp4_l2_buf_used = 0;
@@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c,
  * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
  * @c:		Execution context
  * @conn:	Connection pointer
- * @p:		Pointer to any type of TCP pre-cooked buffer
+ * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
  * @plen:	Payload length (including TCP header options)
  * @check:	Checksum, if already known
  * @seq:	Sequence number for this segment
@@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c,
  */
 static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
 				      const struct tcp_tap_conn *conn,
-				      void *p, size_t plen,
+				      struct iovec *iov, size_t plen,
 				      const uint16_t *check, uint32_t seq)
 {
 	const struct in_addr *a4 = inany_v4(&conn->faddr);
 	size_t ip_len, tlen;
 
 	if (a4) {
-		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
-
-		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
+		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
+					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
 					   check, seq);
-
-		tlen = tap_iov_len(c, &b->taph, ip_len);
+		tlen = ip_len - sizeof(struct iphdr);
 	} else {
-		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
-
-		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
+		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
+					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
 					   seq);
-
-		tlen = tap_iov_len(c, &b->taph, ip_len);
+		tlen = ip_len - sizeof(struct ipv6hdr);
 	}
 
 	return tlen;
@@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
-	struct tcp4_l2_flags_buf_t *b4 = NULL;
-	struct tcp6_l2_flags_buf_t *b6 = NULL;
+	struct tcp_l2_flags_t *payload;
 	struct tcp_info tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
 	size_t optlen = 0;
 	struct iovec *iov;
+	struct iovec *dup_iov;
 	struct tcphdr *th;
 	char *data;
-	void *p;
 
 	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
 	    !flags && conn->wnd_to_tap)
@@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return 0;
 
 	if (CONN_V4(conn)) {
-		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
-		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
-		th = &b4->th;
-
-		/* gcc 11.2 would complain on data = (char *)(th + 1); */
-		data = b4->opts;
+		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
+		dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
 	} else {
-		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
-		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
-		th = &b6->th;
-		data = b6->opts;
+		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
+		dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
 	}
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+	th = &payload->th;
+	data = payload->opts;
 
 	if (flags & SYN) {
 		int mss;
@@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 			mss = tinfo.tcpi_snd_mss;
 		} else {
 			mss = c->mtu - sizeof(struct tcphdr);
-			if (CONN_V4(conn))
-				mss -= sizeof(struct iphdr);
-			else
-				mss -= sizeof(struct ipv6hdr);
 
 			if (c->low_wmem &&
 			    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
@@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	th->syn = !!(flags & SYN);
 	th->fin = !!(flags & FIN);
 
-	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
-					       NULL, conn->seq_to_tap);
+	iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov,
+							       optlen, NULL,
+							      conn->seq_to_tap);
 
 	if (th->ack) {
 		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
@@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	if (th->fin || th->syn)
 		conn->seq_to_tap++;
 
+	if (flags & DUP_ACK) {
+		int i;
+		for (i = 0; i < TCP_IOV_NUM; i++) {
+			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+			       iov[i].iov_len);
+			dup_iov[i].iov_len = iov[i].iov_len;
+		}
+	}
+
 	if (CONN_V4(conn)) {
-		if (flags & DUP_ACK) {
-			memcpy(b4 + 1, b4, sizeof(*b4));
-			(iov + 1)->iov_len = iov->iov_len;
+		if (flags & DUP_ACK)
 			tcp4_l2_flags_buf_used++;
-		}
 
-		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
+		if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
 			tcp_l2_flags_buf_flush(c);
 	} else {
-		if (flags & DUP_ACK) {
-			memcpy(b6 + 1, b6, sizeof(*b6));
-			(iov + 1)->iov_len = iov->iov_len;
+		if (flags & DUP_ACK)
 			tcp6_l2_flags_buf_used++;
-		}
 
-		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
+		if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
 			tcp_l2_flags_buf_flush(c);
 	}
 
@@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
 	unsigned int mss;
 	int ret;
 
+	(void)conn; /* unused */
+
 	if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
 		mss = MSS_DEFAULT;
 	else
 		mss = ret;
 
-	if (CONN_V4(conn))
-		mss = MIN(MSS4, mss);
-	else
-		mss = MIN(MSS6, mss);
+	mss = MIN(MSS, mss);
 
 	return MIN(mss, USHRT_MAX);
 }
@@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	struct iovec *iov;
 
 	if (CONN_V4(conn)) {
-		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
-		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
+		struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
+		const uint16_t *check = NULL;
+
+		if (no_csum) {
+			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+			check = &iph->check;
+		}
 
 		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
 		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
 
-		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       check, seq);
-		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
+		iov = tcp4_l2_iov[tcp4_l2_buf_used++];
+		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
+							iov, plen, check, seq);
+		if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
 			tcp_l2_data_buf_flush(c);
 	} else if (CONN_V6(conn)) {
-		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
 		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
 		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
 
-		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       NULL, seq);
-		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
+		iov = tcp6_l2_iov[tcp6_l2_buf_used++];
+		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
+							iov, plen, NULL, seq);
+		if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
 			tcp_l2_data_buf_flush(c);
 	}
 }
@@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	iov_sock[0].iov_base = tcp_buf_discard;
 	iov_sock[0].iov_len = already_sent;
 
-	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
-	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
+	if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
+	    (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
 		tcp_l2_data_buf_flush(c);
 
 		/* Silence Coverity CWE-125 false positive */
@@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 
 	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
 		if (v4)
-			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
+			iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
 		else
-			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
+			iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
 		iov->iov_len = mss;
 	}
 	if (iov_rem)
-- 
2.42.0



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-11 13:33 [RFC] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
@ 2024-03-12 22:56 ` Stefano Brivio
  2024-03-13 11:37 ` Stefano Brivio
  2024-03-14  4:22 ` David Gibson
  2 siblings, 0 replies; 13+ messages in thread
From: Stefano Brivio @ 2024-03-12 22:56 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Mon, 11 Mar 2024 14:33:56 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec array.
> 
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed. And as the payload buffer contains
> only the TCP header and the TCP data we can increase the size of the
> TCP data to USHRT_MAX - sizeof(struct tcphdr).
> 
> As a side effect, these changes improve performance by a factor of
> x1.5.
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  tap.c | 104 ++++++++++++++
>  tap.h |  16 +++
>  tcp.c | 429 ++++++++++++++++++++++++----------------------------------
>  3 files changed, 296 insertions(+), 253 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index c7b9372668ec..afd01cffbed6 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
>  	return i;
>  }
>  
> +/**
> + * tap_send_iov_pasta() - Send out multiple prepared frames

...to the pasta tap.

That is, this should get the description of the existing
tap_send_frames_pasta(), and, I guess, tap_send_frames_pasta() should
now switch to writev(), as it's used to send one single message from
multiple buffers, otherwise you'll get a lot of system calls for pasta?
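
Something like this, perhaps (rough sketch, untested, ignoring short
writes, and assuming c->fd_tap is the tuntap descriptor here):

static size_t tap_send_iov_pasta(const struct ctx *c,
				 struct iovec iov[][TCP_IOV_NUM], size_t n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		/* One frame per writev(), TCP_IOV_VNET skipped for pasta */
		if (writev(c->fd_tap, &iov[i][TCP_IOV_ETH],
			   TCP_IOV_NUM - TCP_IOV_ETH) < 0) {
			debug("tap write: %s", strerror(errno));
			break;
		}
	}

	return i;
}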

I'm still in the middle of reviewing here, but I thought I would
comment on this right away because it looks quite fundamental.

-- 
Stefano



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-11 13:33 [RFC] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
  2024-03-12 22:56 ` Stefano Brivio
@ 2024-03-13 11:37 ` Stefano Brivio
  2024-03-13 14:42   ` Laurent Vivier
                     ` (2 more replies)
  2024-03-14  4:22 ` David Gibson
  2 siblings, 3 replies; 13+ messages in thread
From: Stefano Brivio @ 2024-03-13 11:37 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Mon, 11 Mar 2024 14:33:56 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec array.
> 
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed. And as the payload buffer contains
> only the TCP header and the TCP data we can increase the size of the
> TCP data to USHRT_MAX - sizeof(struct tcphdr).
> 
> As a side effect, these changes improve performance by a factor of
> x1.5.
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  tap.c | 104 ++++++++++++++
>  tap.h |  16 +++
>  tcp.c | 429 ++++++++++++++++++++++++----------------------------------
>  3 files changed, 296 insertions(+), 253 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index c7b9372668ec..afd01cffbed6 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
>  	return i;
>  }
>  
> +/**
> + * tap_send_iov_pasta() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *              The first entry of the iovec is ignored

Fits on single lines (I think it's more readable):

 * @c:		Execution context
 * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET unused
 * @n:		Number of frames in @iov

> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_pasta(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM], size_t n)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
> +					   TCP_IOV_NUM - TCP_IOV_ETH))
> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames_passt() - Send multiple frames to the passt tap
>   * @c:		Execution context
> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>  	return i;
>  }
>  
> +/**
> + * tap_send_iov_passt() - Send out multiple prepared frames

...I would argue that this function prepares frames as well. Maybe:

 * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames

> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *              The first entry of the iovec is updated to point to an
> + *              uint32_t storing the frame length.

 * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank

> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_passt(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM],
> +				 size_t n)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		uint32_t vnet_len;
> +		int j;
> +
> +		vnet_len = 0;

This could be initialised in the declaration (yes, it's "reset" at
every loop iteration).

> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> +			vnet_len += iov[i][j].iov_len;
> +
> +		vnet_len = htonl(vnet_len);
> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> +
> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))

...which would now send a single frame at a time, but actually it can
already send everything in one shot because it's using sendmsg(), if you
move it outside of the loop and do something like (untested):

	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
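
Note that the length descriptors would then need to outlive the loop,
as they must all still be valid when the single sendmsg() happens --
something like this (sketch, untested, with hypothetical per-frame
storage, and only approximate frame accounting on return):

static size_t tap_send_iov_passt(const struct ctx *c,
				 struct iovec iov[][TCP_IOV_NUM], size_t n)
{
	/* Hypothetical: assumes TCP_FRAMES_MEM (or an equivalent bound)
	 * is visible from tap.c
	 */
	static uint32_t vnet_len[TCP_FRAMES_MEM];
	unsigned int i;

	for (i = 0; i < n; i++) {
		uint32_t len = 0;
		int j;

		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
			len += iov[i][j].iov_len;

		vnet_len[i] = htonl(len);
		iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len[i]);
	}

	/* All parts of all frames in a single sendmsg() */
	return tap_send_frames_passt(c, &iov[0][0], n * TCP_IOV_NUM) /
	       TCP_IOV_NUM;
}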

> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames() - Send out multiple prepared frames
>   * @c:		Execution context
> @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
>  	return m;
>  }
>  
> +/**
> + * tap_send_iov() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + * 		iovec array is:

I think this description should be moved before the defines -- it could
even be an enum with fixed numbers, and I just realised that in
kernel-doc enums should be documented in the same way as structs (we
didn't do that so far), that is:

/**
 * enum tcp_iov_parts - I/O vector parts for one TCP frame
 * @TCP_IOV_VNET:	vnet length: host order, 32-bit frame length descriptor
 * ...
 */
enum tcp_iov_parts {
	/* vnet length (host order, 32-bit length descriptor) */
	TCP_IOV_VNET		= 0,
	[...]
};

> + * 		TCP_IOV_VNET	(0)	vnet length
> + * 		TCP_IOV_ETH	(1)	ethernet header
> + * 		TCP_IOV_IP	(2)	IP (v4/v6) header
> + *		TCP_IOV_PAYLOAD	(3)	IP payload (TCP header + data)
> + *		TCP_IOV_NUM (4) is the number of entries in the iovec array
> + *		TCP_IOV_VNET entry is updated with passt, ignored with pasta.
> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n)
> +{
> +	size_t m;
> +	unsigned int i;
> +
> +	if (!n)
> +		return 0;
> +
> +	switch (c->mode) {
> +	case MODE_PASST:
> +		m = tap_send_iov_passt(c, iov, n);
> +		break;
> +	case MODE_PASTA:
> +		m = tap_send_iov_pasta(c, iov, n);
> +		break;
> +	default:
> +		ASSERT(0);
> +	}
> +
> +	if (m < n)
> +		debug("tap: failed to send %zu frames of %zu", n - m, n);
> +
> +	for (i = 0; i < m; i++)
> +		pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
> +
> +	return m;
> +}
> +
>  /**
>   * eth_update_mac() - Update tap L2 header with new Ethernet addresses
>   * @eh:		Ethernet headers to update
> diff --git a/tap.h b/tap.h
> index 437b9aa2b43f..2d8e7aa1101d 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -6,6 +6,20 @@
>  #ifndef TAP_H
>  #define TAP_H
>  
> +/*
> + * TCP frame iovec array:
> + * TCP_IOV_VNET		vnet length
> + * TCP_IOV_ETH		ethernet header
> + * TCP_IOV_IP		IP (v4/v6) header
> + * TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * TCP_IOV_NUM is the number of entries in the iovec array
> + */
> +#define TCP_IOV_VNET	0
> +#define TCP_IOV_ETH	1
> +#define TCP_IOV_IP	2
> +#define TCP_IOV_PAYLOAD	3
> +#define TCP_IOV_NUM	4
> +
>  /**
>   * struct tap_hdr - L2 and tap specific headers
>   * @vnet_len:	Frame length (for qemu socket transport)
> @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
>  		    const void *in, size_t len);
>  int tap_send(const struct ctx *c, const void *data, size_t len);
>  size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n);
>  void eth_update_mac(struct ethhdr *eh,
>  		    const unsigned char *eth_d, const unsigned char *eth_s);
>  void tap_listen_handler(struct ctx *c, uint32_t events);
> diff --git a/tcp.c b/tcp.c
> index d5eedf4d0138..5c8488108ef7 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -318,39 +318,7 @@
>  
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -
> -struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[26];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct iphdr iph;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[14];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct ipv6hdr ip6h;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif

I'm so glad to see this going away.

> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
>
> +#define MSS				(USHRT_MAX - sizeof(struct tcphdr))

We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4
should now be:

#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -
			   	       sizeof(struct iphdr) -
				       sizeof(struct tcphdr),
			   4)

...and similar for IPv6.
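
That is, presumably (backslash continuations added here):

#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -	\
				       sizeof(struct ipv6hdr) -	\
				       sizeof(struct tcphdr),	\
			   4)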

>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
> @@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
>  };
>  
>  /* Static buffers */
> -
>  /**
> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @uh:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * tcp_l2_flags_t - TCP header and data to send option flags

'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).

> + * @th:		TCP header
> + * @opts	TCP option flags
>   */
> -static struct tcp4_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	uint8_t data[MSS4];	/* 84				60 */
> -				/* 65536			65532 */
> +struct tcp_l2_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +};

This should still be aligned (when declared below) with:

#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif

because yes, now you get the unsigned int alignment for free, but the
32-byte boundary for AVX2 (checksums) not really, so that needs to be
there, and then for clarity I would keep the explicit attribute for the
non-AVX2 case.

By the way, I guess you could still combine the definition and
declaration as it was done earlier.
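
That is, keeping the same attributes as the old buffers, something
like:

struct tcp_l2_flags_t {
	struct tcphdr th;
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif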

> +/**
> + * tcp_l2_payload_t - TCP header and data to send data
> + * 		32 bytes aligned to be able to use AVX2 checksum
> + * @th:		TCP header
> + * @data:	TCP data
> + */
> +struct tcp_l2_payload_t {
> +	struct tcphdr th;	/*    20 bytes */
> +	uint8_t data[MSS];	/* 65516 bytes */
>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp4_l2_buf[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
>  
> +/* IPv4 headers */
> +static struct iphdr		tcp4_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv4 frames */
> +static struct tcp_l2_payload_t	tcp4_l2_payload[TCP_FRAMES_MEM];
> +
> +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp4_l2_buf_used;
>  
> -/**
> - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> - */
> -struct tcp6_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14				2 */
> -	struct ipv6hdr ip6h;	/* 32				20 */
> -	struct tcphdr th;	/* 72				60 */
> -	uint8_t data[MSS6];	/* 92				80 */
> -				/* 65536			65532 */
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_buf[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP option flags frames */
> +static struct iphdr		tcp4_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv4 frames */
> +static struct tcp_l2_flags_t	tcp4_l2_flags[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_l2_flags_buf_used;
> +
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
>  
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_l2_payload_t	tcp6_l2_payload[TCP_FRAMES_MEM];
> +
> +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>  
> +/* IPv6 headers for TCP option flags frames */
> +static struct ipv6hdr		tcp6_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv6 frames */
> +static struct tcp_l2_flags_t	tcp6_l2_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_l2_flags_buf_used;
> +
>  /* recvmsg()/sendmsg() data for tap */
>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> -/**
> - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp4_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp4_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_flags_buf_used;
> -
> -/**
> - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp6_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
> -#endif
> -	struct tap_hdr taph;	/* 14					   2 */
> -	struct ipv6hdr ip6h;	/* 32					  20 */
> -	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_l2_flags_buf_used;
> -
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>  
>  /* Table for lookup from remote address, local port, remote port */
> @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>   */
>  void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)

This doesn't update "L2 buffers" anymore, just the Ethernet addresses
(function comment should be updated).
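
Something along these lines, maybe (wording is only a suggestion):

/**
 * tcp_update_l2_buf() - Update Ethernet addresses in pre-cooked headers
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 */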

>  {
> -	int i;
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
> -		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
> -		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
> -		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
> -
> -		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
> -	}
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
>  }
>  
>  /**
> @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  static void tcp_sock4_iov_init(const struct ctx *c)
>  {
>  	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
> -		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = iph,
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;

...then drop it from the parameter list?

>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
> -		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp4_l2_ip[i] = iph;
> +		tcp4_l2_payload[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
> +		tcp4_l2_flags_ip[i] = iph;
> +		tcp4_l2_flags[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp4_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
> +
> +		iov = tcp4_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>   */
>  static void tcp_sock6_iov_init(const struct ctx *c)
>  {
> -	struct iovec *iov;
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
> -		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
> -		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp6_l2_ip[i] = ip6;
> +		tcp6_l2_payload[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
> +		tcp6_l2_flags_ip[i] = ip6;
> +		tcp6_l2_flags[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp6_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
> +
> +		iov = tcp6_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
>   */
>  static void tcp_l2_flags_buf_flush(const struct ctx *c)
>  {
> -	tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
> +	tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
>  	tcp6_l2_flags_buf_used = 0;
>  
> -	tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
> +	tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
>  	tcp4_l2_flags_buf_used = 0;
>  }
>  
> @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
>  	unsigned i;
>  	size_t m;
>  
> -	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
> +	m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
>  	tcp6_l2_buf_used = 0;
>  
> -	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
> +	m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
>  	tcp4_l2_buf_used = 0;
> @@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
>   * @c:		Execution context
>   * @conn:	Connection pointer
> - * @p:		Pointer to any type of TCP pre-cooked buffer
> + * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
>   * @plen:	Payload length (including TCP header options)
>   * @check:	Checksum, if already known
>   * @seq:	Sequence number for this segment
> @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   */
>  static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
>  				      const struct tcp_tap_conn *conn,
> -				      void *p, size_t plen,
> +				      struct iovec *iov, size_t plen,
>  				      const uint16_t *check, uint32_t seq)
>  {
>  	const struct in_addr *a4 = inany_v4(&conn->faddr);
>  	size_t ip_len, tlen;
>  
>  	if (a4) {
> -		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
> +		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   check, seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct iphdr);
>  	} else {
> -		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
> +		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct ipv6hdr);
>  	}
>  
>  	return tlen;
> @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
> -	struct tcp4_l2_flags_buf_t *b4 = NULL;
> -	struct tcp6_l2_flags_buf_t *b6 = NULL;
> +	struct tcp_l2_flags_t *payload;
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
>  	size_t optlen = 0;
>  	struct iovec *iov;
> +	struct iovec *dup_iov;

This should now go before 'int s' (longest to shortest), or the
declaration could be combined with *iov on the same line (they're
really symmetric).
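
That is, simply:

	struct iovec *iov, *dup_iov;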

>  	struct tcphdr *th;
>  	char *data;
> -	void *p;
>  
>  	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
>  	    !flags && conn->wnd_to_tap)
> @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		return 0;
>  
>  	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
> -		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
> -		th = &b4->th;
> -
> -		/* gcc 11.2 would complain on data = (char *)(th + 1); */
> -		data = b4->opts;
> +		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
> +		dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
>  	} else {
> -		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
> -		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
> -		th = &b6->th;
> -		data = b6->opts;
> +		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
> +		dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];

Maybe you could move the dup_iov assignments below, and group these
with the increments of tcp[46]_l2_flags_buf_used, otherwise I'll scream
"off-by-one" every time I see this (even though it's not actually the case).

>  	}
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +	th = &payload->th;
> +	data = payload->opts;
>  
>  	if (flags & SYN) {
>  		int mss;
> @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  			mss = tinfo.tcpi_snd_mss;
>  		} else {
>  			mss = c->mtu - sizeof(struct tcphdr);
> -			if (CONN_V4(conn))
> -				mss -= sizeof(struct iphdr);
> -			else
> -				mss -= sizeof(struct ipv6hdr);
>  
>  			if (c->low_wmem &&
>  			    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
> @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	th->syn = !!(flags & SYN);
>  	th->fin = !!(flags & FIN);
>  
> -	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
> -					       NULL, conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov,
> +							       optlen, NULL,
> +							      conn->seq_to_tap);
>  
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
> @@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (th->fin || th->syn)
>  		conn->seq_to_tap++;
>  
> +	if (flags & DUP_ACK) {
> +		int i;
> +		for (i = 0; i < TCP_IOV_NUM; i++) {
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +			dup_iov[i].iov_len = iov[i].iov_len;
> +		}
> +	}
> +
>  	if (CONN_V4(conn)) {
> -		if (flags & DUP_ACK) {
> -			memcpy(b4 + 1, b4, sizeof(*b4));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp4_l2_flags_buf_used++;
> -		}
>  
> -		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
> +		if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	} else {
> -		if (flags & DUP_ACK) {
> -			memcpy(b6 + 1, b6, sizeof(*b6));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp6_l2_flags_buf_used++;
> -		}
>  
> -		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
> +		if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	}
>  
> @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
>  	unsigned int mss;
>  	int ret;
>  
> +	(void)conn; /* unused */
> +
>  	if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
>  		mss = MSS_DEFAULT;
>  	else
>  		mss = ret;
>  
> -	if (CONN_V4(conn))
> -		mss = MIN(MSS4, mss);
> -	else
> -		mss = MIN(MSS6, mss);
> +	mss = MIN(MSS, mss);
>  
>  	return MIN(mss, USHRT_MAX);
>  }
> @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	struct iovec *iov;
>  
>  	if (CONN_V4(conn)) {
> -		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
> -		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
> +		const uint16_t *check = NULL;
> +
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
>  
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
>  
> -		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       check, seq);
> -		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
> +		iov = tcp4_l2_iov[tcp4_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, check, seq);
> +		if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
> -
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
>  
> -		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       NULL, seq);
> -		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
> +		iov = tcp6_l2_iov[tcp6_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, NULL, seq);

Maybe use a temporary variable?
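
For example (the name is just a suggestion):

		size_t l4len;

		l4len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
		iov[TCP_IOV_PAYLOAD].iov_len = l4len;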

> +		if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	}
>  }
> @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	iov_sock[0].iov_base = tcp_buf_discard;
>  	iov_sock[0].iov_len = already_sent;
>  
> -	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
> -	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
> +	if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
>  		tcp_l2_data_buf_flush(c);
>  
>  		/* Silence Coverity CWE-125 false positive */
> @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
>  		if (v4)
> -			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
> +			iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
>  		else
> -			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
> +			iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)

The rest looks good to me (at the moment :)).

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-13 11:37 ` Stefano Brivio
@ 2024-03-13 14:42   ` Laurent Vivier
  2024-03-13 15:27     ` Stefano Brivio
  2024-03-13 15:20   ` Laurent Vivier
  2024-03-14 14:07   ` Laurent Vivier
  2 siblings, 1 reply; 13+ messages in thread
From: Laurent Vivier @ 2024-03-13 14:42 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

On 3/13/24 12:37, Stefano Brivio wrote:
> On Mon, 11 Mar 2024 14:33:56 +0100
> Laurent Vivier <lvivier@redhat.com> wrote:
> 
...
>> diff --git a/tcp.c b/tcp.c
>> index d5eedf4d0138..5c8488108ef7 100644
>> --- a/tcp.c
>> +++ b/tcp.c
>> @@ -318,39 +318,7 @@
...
>> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
>> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
>>
>> +#define MSS				(USHRT_MAX - sizeof(struct tcphdr))
> 
> We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4
> should now be:
> 
> #define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -
> 			   	       sizeof(struct iphdr) -
> 				       sizeof(struct tcphdr),
> 			   4)
> 
> ...and similar for IPv6.

Is there a specification that limits the MSS?
Or is it only a compatibility problem with the network stack implementation?

At the header level, the only limitation we have is the length field in the IP header,
which is 16 bits wide (that's why I put "USHRT_MAX - sizeof(struct tcphdr)").

Thanks,
Laurent



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-13 11:37 ` Stefano Brivio
  2024-03-13 14:42   ` Laurent Vivier
@ 2024-03-13 15:20   ` Laurent Vivier
  2024-03-13 16:58     ` Stefano Brivio
  2024-03-14 14:07   ` Laurent Vivier
  2 siblings, 1 reply; 13+ messages in thread
From: Laurent Vivier @ 2024-03-13 15:20 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

On 3/13/24 12:37, Stefano Brivio wrote:
> On Mon, 11 Mar 2024 14:33:56 +0100
> Laurent Vivier <lvivier@redhat.com> wrote:
> 
...
>> @@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
>>   };
>>   
>>   /* Static buffers */
>> -
>>   /**
>> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
>> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
>> - * @taph:	Tap-level headers (partially pre-filled)
>> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
>> - * @uh:		Headroom for TCP header
>> - * @data:	Storage for TCP payload
>> + * tcp_l2_flags_t - TCP header and data to send option flags
> 
> 'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).
> 
>> + * @th:		TCP header
>> + * @opts	TCP option flags
>>    */
>> -static struct tcp4_l2_buf_t {
>> -#ifdef __AVX2__
>> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
>> -#else
>> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
>> -#endif
>> -	struct tap_hdr taph;	/* 26				2 */
>> -	struct iphdr iph;	/* 44				20 */
>> -	struct tcphdr th;	/* 64				40 */
>> -	uint8_t data[MSS4];	/* 84				60 */
>> -				/* 65536			65532 */
>> +struct tcp_l2_flags_t {
>> +	struct tcphdr th;
>> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
>> +};
> 
> This should still be aligned (when declared below) with:
> 
> #ifdef __AVX2__
> } __attribute__ ((packed, aligned(32)))
> #else
> } __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> #endif
> 
> because yes, now you get the unsigned int alignment for free, but the
> 32-byte boundary for AVX2 (checksums) not really, so that needs to be
> there, and then for clarity I would keep the explicit attribute for the
> non-AVX2 case.
> 
> By the way, I guess you could still combine the definition and
> declaration as it was done earlier.

We can't combine the definition and declaration because the same definition is used for
both the IPv4 and IPv6 TCP array declarations.

I'm not sure: should the __attribute__ be used on the structure definition, or on the
declaration of the arrays of structures?

Thanks,
Laurent


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-13 14:42   ` Laurent Vivier
@ 2024-03-13 15:27     ` Stefano Brivio
  0 siblings, 0 replies; 13+ messages in thread
From: Stefano Brivio @ 2024-03-13 15:27 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Wed, 13 Mar 2024 15:42:27 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> On 3/13/24 12:37, Stefano Brivio wrote:
> > On Mon, 11 Mar 2024 14:33:56 +0100
> > Laurent Vivier <lvivier@redhat.com> wrote:
> >   
> ...
> >> diff --git a/tcp.c b/tcp.c
> >> index d5eedf4d0138..5c8488108ef7 100644
> >> --- a/tcp.c
> >> +++ b/tcp.c
> >> @@ -318,39 +318,7 @@  
> ...
> >> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> >> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
> >>
> >> +#define MSS				(USHRT_MAX - sizeof(struct tcphdr))  
> > 
> > We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4
> > should now be:
> > 
> > #define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -
> > 			   	       sizeof(struct iphdr) -
> > 				       sizeof(struct tcphdr),
> > 			   4)
> > 
> > ...and similar for IPv6.  
> 
> Is there a specification that limits the MSS?

Well, on Linux, virtual network interfaces such as virtio-net still
have their maximum configurable MTU defined as ETH_MAX_MTU (same as the
maximum IPv4 MTU, 65535), and that's a symmetric value (in standards,
drivers, and elsewhere in the network stack).

Side note: the TCP MSS doesn't need to be the same value for both
directions, instead.

But other than that, no, there are no normative references. It's an
implementation "detail" if you want.

> Or is it only a compatibility problem with the network stack implementation?

Kind of, yes. Except that:

> At the header level, the only limitation we have is the length field in the IP header,
> which is 16 bits wide (that's why I put "USHRT_MAX - sizeof(struct tcphdr)").

...in IPv4, that field also contains the length of the *IP header*, so,
ignoring for a moment Linux and virtio-net with ETH_MAX_MTU, you would
be limited to 65495 bytes (65535 minus 20 bytes of IP header, minus 20
bytes of TCP header). As RFC 791, section 3.1, states:

  Total Length:  16 bits

    Total Length is the length of the datagram, measured in octets,
    including internet header and data.  This field allows the length of
    a datagram to be up to 65,535 octets. [...]

...but 65535 is the *MTU*, not MSS.

And if it needs to fit ETH_MAX_MTU, you're now back to 65520 bytes of
MTU (it needs to match IPv4 32-bit words, if I recall correctly), hence
65480 bytes of MSS.

IPv6 is different, in that the equivalent field doesn't include the
size of the IPv6 header. But the header is 40 bytes long, so the
outcome is the same. Well, except that you could have jumbograms (RFC
2675) and exceed 65535 bytes of IPv6 MTU, but that won't work anyway
with Ethernet-like interfaces.

So, well, I haven't actually tried to send an Ethernet frame to a
virtio-net interface that's bigger than 65535 bytes. As far as
normative references are concerned, you could send 65549 (65535 bytes
maximum IPv4 MTU, plus 14 bytes of 802.3 header on top) bytes. I guess
it will be dropped by the kernel, but it's perhaps worth a try.

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-13 15:20   ` Laurent Vivier
@ 2024-03-13 16:58     ` Stefano Brivio
  0 siblings, 0 replies; 13+ messages in thread
From: Stefano Brivio @ 2024-03-13 16:58 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Wed, 13 Mar 2024 16:20:12 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> On 3/13/24 12:37, Stefano Brivio wrote:
> > On Mon, 11 Mar 2024 14:33:56 +0100
> > Laurent Vivier <lvivier@redhat.com> wrote:
> >   
> ...
> >> @@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
> >>   };
> >>   
> >>   /* Static buffers */
> >> -
> >>   /**
> >> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> >> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> >> - * @taph:	Tap-level headers (partially pre-filled)
> >> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> >> - * @uh:		Headroom for TCP header
> >> - * @data:	Storage for TCP payload
> >> + * tcp_l2_flags_t - TCP header and data to send option flags  
> > 
> > 'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).
> >   
> >> + * @th:		TCP header
> >> + * @opts	TCP option flags
> >>    */
> >> -static struct tcp4_l2_buf_t {
> >> -#ifdef __AVX2__
> >> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> >> -#else
> >> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> >> -#endif
> >> -	struct tap_hdr taph;	/* 26				2 */
> >> -	struct iphdr iph;	/* 44				20 */
> >> -	struct tcphdr th;	/* 64				40 */
> >> -	uint8_t data[MSS4];	/* 84				60 */
> >> -				/* 65536			65532 */
> >> +struct tcp_l2_flags_t {
> >> +	struct tcphdr th;
> >> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> >> +};  
> > 
> > This should still be aligned (when declared below) with:
> > 
> > #ifdef __AVX2__
> > } __attribute__ ((packed, aligned(32)))
> > #else
> > } __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> > #endif
> > 
> > because yes, now you get the unsigned int alignment for free, but the
> > 32-byte boundary for AVX2 (checksums) not really, so that needs to be
> > there, and then for clarity I would keep the explicit attribute for the
> > non-AVX2 case.
> > 
> > By the way, I guess you could still combine the definition and
> > declaration as it was done earlier.  
> 
> We can't combine the definition and declaration because the same definition is used for
> both the IPv4 and IPv6 TCP array declarations.

Ah, sorry, of course!

On that note, I wonder: would it then be possible to have a single
copy of the payload buffers, shared between IPv4 and IPv6? We
would probably need to introduce a separate counter, on top of
tcp[46]_l2_buf_used (which could become tcp[46]_l2_hdr_used).

Well, probably not in scope for this change anyway.

> I'm not sure: should the __attribute__ be used on the structure definition, or on the
> declaration of the arrays of structures?

__attribute__ can be used on definitions or declarations depending on
the type of attribute itself -- here it would be on the definition,
because it affects the definition of the data type itself.

Strictly speaking, you can use it on the declaration too, but gcc will
ignore it:

$ cat attr.c
struct a {
    int b;
};

int main()
{
    struct a c __attribute__ ((packed, aligned(32)));
    return 0;
}

$ gcc -o attr attr.c
attr.c: In function ‘main’:
attr.c:7:12: warning: ‘packed’ attribute ignored [-Wattributes]
    7 |     struct a c __attribute__ ((packed, aligned(32)));
      |            

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-11 13:33 [RFC] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
  2024-03-12 22:56 ` Stefano Brivio
  2024-03-13 11:37 ` Stefano Brivio
@ 2024-03-14  4:22 ` David Gibson
  2 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2024-03-14  4:22 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Mon, Mar 11, 2024 at 02:33:56PM +0100, Laurent Vivier wrote:
> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec array.
> 
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed. And as the payload buffer contains
> only the TCP header and the TCP data we can increase the size of the
> TCP data to USHRT_MAX - sizeof(struct tcphdr).
> 
> As a side effect, these changes improve performance by a factor of
> x1.5.
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  tap.c | 104 ++++++++++++++
>  tap.h |  16 +++
>  tcp.c | 429 ++++++++++++++++++++++++----------------------------------
>  3 files changed, 296 insertions(+), 253 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index c7b9372668ec..afd01cffbed6 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
>  	return i;
>  }
>  
> +/**
> + * tap_send_iov_pasta() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *              The first entry of the iovec is ignored
> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_pasta(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM], size_t n)

As Stefano also points out, this isn't quite right.  At the very least
the names are misleading, and I'm pretty sure it also won't work
right: the tuntap interface requires a whole frame to be given with a
single write()/writev(), you can't split it into multiple writes.

I also think it's kind of a layering violation to refer to TCP_IOV_NUM
here, so I think the number of iovecs per frame should be a parameter.

Which.. is basically what I already did in the tap rework series I
already sent.  I think it should be pretty straightforward to rebase
the TCP part of this patch on top of that, and drop the tap part.
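
Just to illustrate the whole-frame point, the pasta path would need
something along these lines instead (a sketch only: error and
partial-write handling omitted, and it keeps the layering issue
mentioned above):

	for (i = 0; i < n; i++) {
		/* One whole frame per writev(), skipping the vnet length */
		if (writev(c->fd_tap, &iov[i][TCP_IOV_ETH],
			   TCP_IOV_NUM - TCP_IOV_ETH) < 0)
			break;
	}

	return i;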

> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
> +					   TCP_IOV_NUM - TCP_IOV_ETH))
> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames_passt() - Send multiple frames to the passt tap
>   * @c:		Execution context
> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>  	return i;
>  }
>  
> +/**
> + * tap_send_iov_passt() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *              The first entry of the iovec is updated to point to an
> + *              uint32_t storing the frame length.
> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_passt(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM],
> +				 size_t n)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		uint32_t vnet_len;
> +		int j;
> +
> +		vnet_len = 0;
> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> +			vnet_len += iov[i][j].iov_len;
> +
> +		vnet_len = htonl(vnet_len);
> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> +
> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames() - Send out multiple prepared frames
>   * @c:		Execution context
> @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
>  	return m;
>  }
>  
> +/**
> + * tap_send_iov() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + * 		iovec array is:

As per Stefano, I think making these constants an enum and moving this
description there would be preferable.

> + * 		TCP_IOV_VNET	(0)	vnet length

Maybe call this TCP_IOV_TAP, to more easily generalise to any
TAP-backend-specific headers we might want in future?

> + * 		TCP_IOV_ETH	(1)	ethernet header
> + * 		TCP_IOV_IP	(2)	IP (v4/v6) header
> + *		TCP_IOV_PAYLOAD	(3)	IP payload (TCP header + data)
> + *		TCP_IOV_NUM (4) is the number of entries in the iovec array
> + *		TCP_IOV_VNET entry is updated with passt, ignored with pasta.
> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n)
> +{
> +	size_t m;
> +	unsigned int i;
> +
> +	if (!n)
> +		return 0;
> +
> +	switch (c->mode) {
> +	case MODE_PASST:
> +		m = tap_send_iov_passt(c, iov, n);
> +		break;
> +	case MODE_PASTA:
> +		m = tap_send_iov_pasta(c, iov, n);
> +		break;
> +	default:
> +		ASSERT(0);
> +	}
> +
> +	if (m < n)
> +		debug("tap: failed to send %zu frames of %zu", n - m, n);
> +
> +	for (i = 0; i < m; i++)
> +		pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
> +
> +	return m;
> +}
> +
>  /**
>   * eth_update_mac() - Update tap L2 header with new Ethernet addresses
>   * @eh:		Ethernet headers to update
> diff --git a/tap.h b/tap.h
> index 437b9aa2b43f..2d8e7aa1101d 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -6,6 +6,20 @@
>  #ifndef TAP_H
>  #define TAP_H
>  
> +/*
> + * TCP frame iovec array:
> + * TCP_IOV_VNET		vnet length
> + * TCP_IOV_ETH		ethernet header
> + * TCP_IOV_IP		IP (v4/v6) header
> + * TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * TCP_IOV_NUM is the number of entries in the iovec array
> + */
> +#define TCP_IOV_VNET	0
> +#define TCP_IOV_ETH	1
> +#define TCP_IOV_IP	2
> +#define TCP_IOV_PAYLOAD	3
> +#define TCP_IOV_NUM	4

As with using TCP_IOV_NUM etc. inside tap.c, defining these in tap.h
rather than tcp.h seems odd to me.


> +
>  /**
>   * struct tap_hdr - L2 and tap specific headers
>   * @vnet_len:	Frame length (for qemu socket transport)
> @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
>  		    const void *in, size_t len);
>  int tap_send(const struct ctx *c, const void *data, size_t len);
>  size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n);
>  void eth_update_mac(struct ethhdr *eh,
>  		    const unsigned char *eth_d, const unsigned char *eth_s);
>  void tap_listen_handler(struct ctx *c, uint32_t events);
> diff --git a/tcp.c b/tcp.c
> index d5eedf4d0138..5c8488108ef7 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -318,39 +318,7 @@
>  
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -
> -struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[26];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct iphdr iph;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[14];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct ipv6hdr ip6h;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
> +#define MSS				(USHRT_MAX - sizeof(struct tcphdr))

As Stefano points out, the buffer size isn't the only limit on the MSS
here, so this isn't quite right.  In particular there's the 16-bit
length field in the IPv4 header.  I think making the *buffers* be
exactly 64k is a good idea (page aligned on nearly everything), but
the MSS has to be clamped a bit smaller.


>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
> @@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
>  };
>  
>  /* Static buffers */
> -
>  /**
> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @uh:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * tcp_l2_flags_t - TCP header and data to send option flags
> + * @th:		TCP header
> + * @opts	TCP option flags
>   */
> -static struct tcp4_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	uint8_t data[MSS4];	/* 84				60 */
> -				/* 65536			65532 */
> +struct tcp_l2_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +};
> +/**
> + * tcp_l2_payload_t - TCP header and data to send data
> + * 		32 bytes aligned to be able to use AVX2 checksum
> + * @th:		TCP header
> + * @data:	TCP data
> + */
> +struct tcp_l2_payload_t {
> +	struct tcphdr th;	/*    20 bytes */
> +	uint8_t data[MSS];	/* 65516 bytes */

Actually 65515 bytes: USHRT_MAX is 65535, not 65536.  I think it
might be an idea to explicitly set this length to 65536 -
sizeof(struct tcphdr), then static_assert() to make sure our
calculated MSS values will fit in here.
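
A possible sketch, assuming MSS4/MSS6 come back as the clamped values
Stefano suggested (static_assert() as in C11 <assert.h>):

struct tcp_l2_payload_t {
	struct tcphdr th;				/*    20 bytes */
	uint8_t data[65536 - sizeof(struct tcphdr)];	/* 65516 bytes */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

static_assert(MSS4 <= 65536 - sizeof(struct tcphdr), "MSS4 too large");
static_assert(MSS6 <= 65536 - sizeof(struct tcphdr), "MSS6 too large");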


>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp4_l2_buf[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
>  
> +/* IPv4 headers */
> +static struct iphdr		tcp4_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv4 frames */
> +static struct tcp_l2_payload_t	tcp4_l2_payload[TCP_FRAMES_MEM];
> +
> +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp4_l2_buf_used;
>  
> -/**
> - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> - */
> -struct tcp6_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14				2 */
> -	struct ipv6hdr ip6h;	/* 32				20 */
> -	struct tcphdr th;	/* 72				60 */
> -	uint8_t data[MSS6];	/* 92				80 */
> -				/* 65536			65532 */
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_buf[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP option flags frames */
> +static struct iphdr		tcp4_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv4 frames */
> +static struct tcp_l2_flags_t	tcp4_l2_flags[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_l2_flags_buf_used;
> +
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
>  
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_l2_payload_t	tcp6_l2_payload[TCP_FRAMES_MEM];

We should be able to unify this with the IPv4 payload buffers (and I
think we'll want to, to allow v4<->v6 NAT).  Makes sense to do that as
a later follow-up, though.

> +
> +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>  
> +/* IPv6 headers for TCP option flags frames */
> +static struct ipv6hdr		tcp6_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv6 frames */
> +static struct tcp_l2_flags_t	tcp6_l2_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_l2_flags_buf_used;
> +
>  /* recvmsg()/sendmsg() data for tap */
>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> -/**
> - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp4_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp4_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_flags_buf_used;
> -
> -/**
> - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp6_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
> -#endif
> -	struct tap_hdr taph;	/* 14					   2 */
> -	struct ipv6hdr ip6h;	/* 32					  20 */
> -	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_l2_flags_buf_used;
> -
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>  
>  /* Table for lookup from remote address, local port, remote port */
> @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>   */
>  void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  {
> -	int i;
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
> -		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
> -		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
> -		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
> -
> -		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
> -	}
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
>  }
>  
>  /**
> @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  static void tcp_sock4_iov_init(const struct ctx *c)
>  {
>  	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
> -		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = iph,
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
> -		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp4_l2_ip[i] = iph;
> +		tcp4_l2_payload[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};

I find C struct literal syntax sufficiently awkward that I'd probably
prefer to just open-code setting each field.  That's only a very weak
preference, though.
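
That is:

		tcp4_l2_payload[i].th.doff = sizeof(struct tcphdr) / 4;
		tcp4_l2_payload[i].th.ack = 1;

(the buffers are static, so the remaining fields are already zeroed).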

>  
> -	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
> +		tcp4_l2_flags_ip[i] = iph;
> +		tcp4_l2_flags[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp4_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
> +
> +		iov = tcp4_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>   */
>  static void tcp_sock6_iov_init(const struct ctx *c)
>  {
> -	struct iovec *iov;
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
> -		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
> -		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp6_l2_ip[i] = ip6;
> +		tcp6_l2_payload[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
> +		tcp6_l2_flags_ip[i] = ip6;
> +		tcp6_l2_flags[i].th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
>  
> -	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp6_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
> +
> +		iov = tcp6_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
>   */
>  static void tcp_l2_flags_buf_flush(const struct ctx *c)
>  {
> -	tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
> +	tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
>  	tcp6_l2_flags_buf_used = 0;
>  
> -	tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
> +	tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
>  	tcp4_l2_flags_buf_used = 0;
>  }
>  
> @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
>  	unsigned i;
>  	size_t m;
>  
> -	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
> +	m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
>  	tcp6_l2_buf_used = 0;
>  
> -	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
> +	m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
>  	tcp4_l2_buf_used = 0;
> @@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
>   * @c:		Execution context
>   * @conn:	Connection pointer
> - * @p:		Pointer to any type of TCP pre-cooked buffer
> + * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
>   * @plen:	Payload length (including TCP header options)
>   * @check:	Checksum, if already known
>   * @seq:	Sequence number for this segment
> @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   */
>  static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
>  				      const struct tcp_tap_conn *conn,
> -				      void *p, size_t plen,
> +				      struct iovec *iov, size_t plen,
>  				      const uint16_t *check, uint32_t seq)
>  {
>  	const struct in_addr *a4 = inany_v4(&conn->faddr);
>  	size_t ip_len, tlen;
>  
>  	if (a4) {
> -		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
> +		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   check, seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct iphdr);
>  	} else {
> -		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
> +		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct ipv6hdr);
>  	}
>  
>  	return tlen;
> @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
> -	struct tcp4_l2_flags_buf_t *b4 = NULL;
> -	struct tcp6_l2_flags_buf_t *b6 = NULL;
> +	struct tcp_l2_flags_t *payload;
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
>  	size_t optlen = 0;
>  	struct iovec *iov;
> +	struct iovec *dup_iov;
>  	struct tcphdr *th;
>  	char *data;
> -	void *p;
>  
>  	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
>  	    !flags && conn->wnd_to_tap)
> @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		return 0;
>  
>  	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
> -		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
> -		th = &b4->th;
> -
> -		/* gcc 11.2 would complain on data = (char *)(th + 1); */
> -		data = b4->opts;
> +		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
> +		dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
>  	} else {
> -		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
> -		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
> -		th = &b6->th;
> -		data = b6->opts;
> +		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
> +		dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];
>  	}
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +	th = &payload->th;
> +	data = payload->opts;
>  
>  	if (flags & SYN) {
>  		int mss;
> @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  			mss = tinfo.tcpi_snd_mss;
>  		} else {
>  			mss = c->mtu - sizeof(struct tcphdr);
> -			if (CONN_V4(conn))
> -				mss -= sizeof(struct iphdr);
> -			else
> -				mss -= sizeof(struct ipv6hdr);
>  
>  			if (c->low_wmem &&
>  			    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
> @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	th->syn = !!(flags & SYN);
>  	th->fin = !!(flags & FIN);
>  
> -	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
> -					       NULL, conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov,
> +							       optlen, NULL,
> +							      conn->seq_to_tap);
>  
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
> @@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (th->fin || th->syn)
>  		conn->seq_to_tap++;
>  
> +	if (flags & DUP_ACK) {
> +		int i;
> +		for (i = 0; i < TCP_IOV_NUM; i++) {
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +			dup_iov[i].iov_len = iov[i].iov_len;
> +		}
> +	}
> +
>  	if (CONN_V4(conn)) {
> -		if (flags & DUP_ACK) {
> -			memcpy(b4 + 1, b4, sizeof(*b4));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp4_l2_flags_buf_used++;
> -		}
>  
> -		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
> +		if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	} else {
> -		if (flags & DUP_ACK) {
> -			memcpy(b6 + 1, b6, sizeof(*b6));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp6_l2_flags_buf_used++;
> -		}
>  
> -		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
> +		if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	}
>  
> @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
>  	unsigned int mss;
>  	int ret;
>  
> +	(void)conn; /* unused */
> +
>  	if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
>  		mss = MSS_DEFAULT;
>  	else
>  		mss = ret;
>  
> -	if (CONN_V4(conn))
> -		mss = MIN(MSS4, mss);
> -	else
> -		mss = MIN(MSS6, mss);
> +	mss = MIN(MSS, mss);
>  
>  	return MIN(mss, USHRT_MAX);
>  }
> @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	struct iovec *iov;
>  
>  	if (CONN_V4(conn)) {
> -		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
> -		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
> +		const uint16_t *check = NULL;
> +
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
>  
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
>  
> -		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       check, seq);
> -		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
> +		iov = tcp4_l2_iov[tcp4_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, check, seq);
> +		if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
> -
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
>  
> -		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       NULL, seq);
> -		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
> +		iov = tcp6_l2_iov[tcp6_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, NULL, seq);
> +		if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	}
>  }
> @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	iov_sock[0].iov_base = tcp_buf_discard;
>  	iov_sock[0].iov_len = already_sent;
>  
> -	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
> -	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
> +	if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
>  		tcp_l2_data_buf_flush(c);
>  
>  		/* Silence Coverity CWE-125 false positive */
> @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
>  		if (v4)
> -			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
> +			iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
>  		else
> -			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
> +			iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]


* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-13 11:37 ` Stefano Brivio
  2024-03-13 14:42   ` Laurent Vivier
  2024-03-13 15:20   ` Laurent Vivier
@ 2024-03-14 14:07   ` Laurent Vivier
  2024-03-14 15:47     ` Stefano Brivio
  2 siblings, 1 reply; 13+ messages in thread
From: Laurent Vivier @ 2024-03-14 14:07 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

On 3/13/24 12:37, Stefano Brivio wrote:
...
>> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>>   	return i;
>>   }
>>   
>> +/**
>> + * tap_send_iov_passt() - Send out multiple prepared frames
> 
> ...I would argue that this function prepares frames as well. Maybe:
> 
>   * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames
> 
>> + * @c:		Execution context
>> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
>> + *              The first entry of the iovec is updated to point to an
>> + *              uint32_t storing the frame length.
> 
>   * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank
> 
>> + * @n:		Number of frames in @iov
>> + *
>> + * Return: number of frames actually sent
>> + */
>> +static size_t tap_send_iov_passt(const struct ctx *c,
>> +				 struct iovec iov[][TCP_IOV_NUM],
>> +				 size_t n)
>> +{
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < n; i++) {
>> +		uint32_t vnet_len;
>> +		int j;
>> +
>> +		vnet_len = 0;
> 
> This could be initialised in the declaration (yes, it's "reset" at
> every loop iteration).
> 
>> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
>> +			vnet_len += iov[i][j].iov_len;
>> +
>> +		vnet_len = htonl(vnet_len);
>> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
>> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
>> +
>> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
> 
> ...which would now send a single frame at a time, but actually it can
> already send everything in one shot because it's using sendmsg(), if you
> move it outside of the loop and do something like (untested):
> 
> 	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
> 
>> +			break;
>> +	}
>> +
>> +	return i;
>> +
>> +}
>> +

I tried to do something like that but I have a performance drop:

static size_t tap_send_iov_passt(const struct ctx *c,
                                  struct iovec iov[][TCP_IOV_NUM],
                                  size_t n)
{
         unsigned int i;
         uint32_t vnet_len[n];

         for (i = 0; i < n; i++) {
                 int j;

                 vnet_len[i] = 0;
                 for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
                         vnet_len[i] += iov[i][j].iov_len;

                 vnet_len[i] = htonl(vnet_len[i]);
                 iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
                 iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t);
         }

         return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM;
}

iperf3 -c localhost -p 10001  -t 60  -4

before
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-60.00  sec  33.0 GBytes  4.72 Gbits/sec    1             sender
[  5]   0.00-60.06  sec  33.0 GBytes  4.72 Gbits/sec                  receiver

after:
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-60.00  sec  18.2 GBytes  2.60 Gbits/sec    0             sender
[  5]   0.00-60.07  sec  18.2 GBytes  2.60 Gbits/sec                  receiver

iperf3 -c localhost -p 10001  -t 60  -6

before
[  5]   0.00-60.00  sec  25.5 GBytes  3.65 Gbits/sec    0             sender
[  5]   0.00-60.06  sec  25.5 GBytes  3.65 Gbits/sec                  receiver

after:
[  5]   0.00-60.00  sec  16.1 GBytes  2.31 Gbits/sec    0             sender
[  5]   0.00-60.07  sec  16.1 GBytes  2.31 Gbits/sec                  receiver

Thanks,
Laurent



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-14 14:07   ` Laurent Vivier
@ 2024-03-14 15:47     ` Stefano Brivio
  2024-03-14 15:54       ` Laurent Vivier
  0 siblings, 1 reply; 13+ messages in thread
From: Stefano Brivio @ 2024-03-14 15:47 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Thu, 14 Mar 2024 15:07:48 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> On 3/13/24 12:37, Stefano Brivio wrote:
> ...
> >> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
> >>   	return i;
> >>   }
> >>   
> >> +/**
> >> + * tap_send_iov_passt() - Send out multiple prepared frames  
> > 
> > ...I would argue that this function prepares frames as well. Maybe:
> > 
> >   * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames
> >   
> >> + * @c:		Execution context
> >> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> >> + *              The first entry of the iovec is updated to point to an
> >> + *              uint32_t storing the frame length.  
> > 
> >   * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank
> >   
> >> + * @n:		Number of frames in @iov
> >> + *
> >> + * Return: number of frames actually sent
> >> + */
> >> +static size_t tap_send_iov_passt(const struct ctx *c,
> >> +				 struct iovec iov[][TCP_IOV_NUM],
> >> +				 size_t n)
> >> +{
> >> +	unsigned int i;
> >> +
> >> +	for (i = 0; i < n; i++) {
> >> +		uint32_t vnet_len;
> >> +		int j;
> >> +
> >> +		vnet_len = 0;  
> > 
> > This could be initialised in the declaration (yes, it's "reset" at
> > every loop iteration).
> >   
> >> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> >> +			vnet_len += iov[i][j].iov_len;
> >> +
> >> +		vnet_len = htonl(vnet_len);
> >> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> >> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> >> +
> >> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))  
> > 
> > ...which would now send a single frame at a time, but actually it can
> > already send everything in one shot because it's using sendmsg(), if you
> > move it outside of the loop and do something like (untested):
> > 
> > 	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
> >   
> >> +			break;
> >> +	}
> >> +
> >> +	return i;
> >> +
> >> +}
> >> +  
> 
> I tried to do something like that but I have a performance drop:
> 
> static size_t tap_send_iov_passt(const struct ctx *c,
>                                   struct iovec iov[][TCP_IOV_NUM],
>                                   size_t n)
> {
>          unsigned int i;
>          uint32_t vnet_len[n];
> 
>          for (i = 0; i < n; i++) {
>                  int j;
> 
>                  vnet_len[i] = 0;
>                  for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
>                          vnet_len[i] += iov[i][j].iov_len;
> 
>                  vnet_len[i] = htonl(vnet_len[i]);
>                  iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
>                  iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t);
>          }
> 
>          return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM;
> }
> 
> iperf3 -c localhost -p 10001  -t 60  -4
> 
> berfore
> [ ID] Interval           Transfer     Bitrate         Retr
> [  5]   0.00-60.00  sec  33.0 GBytes  4.72 Gbits/sec    1             sender
> [  5]   0.00-60.06  sec  33.0 GBytes  4.72 Gbits/sec                  receiver
> 
> after:
> [ ID] Interval           Transfer     Bitrate         Retr
> [  5]   0.00-60.00  sec  18.2 GBytes  2.60 Gbits/sec    0             sender
> [  5]   0.00-60.07  sec  18.2 GBytes  2.60 Gbits/sec                  receiver

Weird, it looks like doing one sendmsg() per frame results in a higher
throughput than one sendmsg() per multiple frames, which sounds rather
absurd. Perhaps we should start looking into what perf(1) reports, in
terms of both syscall overhead and cache misses.

I'll have a look later today or tomorrow -- unless you have other
ideas as to why this might happen...

-- 
Stefano



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-14 15:47     ` Stefano Brivio
@ 2024-03-14 15:54       ` Laurent Vivier
  2024-03-14 16:26         ` Stefano Brivio
  0 siblings, 1 reply; 13+ messages in thread
From: Laurent Vivier @ 2024-03-14 15:54 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

On 3/14/24 16:47, Stefano Brivio wrote:
> On Thu, 14 Mar 2024 15:07:48 +0100
> Laurent Vivier <lvivier@redhat.com> wrote:
> 
>> On 3/13/24 12:37, Stefano Brivio wrote:
>> ...
>>>> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>>>>    	return i;
>>>>    }
>>>>    
>>>> +/**
>>>> + * tap_send_iov_passt() - Send out multiple prepared frames
>>>
>>> ...I would argue that this function prepares frames as well. Maybe:
>>>
>>>    * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames
>>>    
>>>> + * @c:		Execution context
>>>> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
>>>> + *              The first entry of the iovec is updated to point to an
>>>> + *              uint32_t storing the frame length.
>>>
>>>    * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank
>>>    
>>>> + * @n:		Number of frames in @iov
>>>> + *
>>>> + * Return: number of frames actually sent
>>>> + */
>>>> +static size_t tap_send_iov_passt(const struct ctx *c,
>>>> +				 struct iovec iov[][TCP_IOV_NUM],
>>>> +				 size_t n)
>>>> +{
>>>> +	unsigned int i;
>>>> +
>>>> +	for (i = 0; i < n; i++) {
>>>> +		uint32_t vnet_len;
>>>> +		int j;
>>>> +
>>>> +		vnet_len = 0;
>>>
>>> This could be initialised in the declaration (yes, it's "reset" at
>>> every loop iteration).
>>>    
>>>> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
>>>> +			vnet_len += iov[i][j].iov_len;
>>>> +
>>>> +		vnet_len = htonl(vnet_len);
>>>> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
>>>> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
>>>> +
>>>> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))
>>>
>>> ...which would now send a single frame at a time, but actually it can
>>> already send everything in one shot because it's using sendmsg(), if you
>>> move it outside of the loop and do something like (untested):
>>>
>>> 	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
>>>    
>>>> +			break;
>>>> +	}
>>>> +
>>>> +	return i;
>>>> +
>>>> +}
>>>> +
>>
>> I tried to do something like that but I have a performance drop:
>>
>> static size_t tap_send_iov_passt(const struct ctx *c,
>>                                    struct iovec iov[][TCP_IOV_NUM],
>>                                    size_t n)
>> {
>>           unsigned int i;
>>           uint32_t vnet_len[n];
>>
>>           for (i = 0; i < n; i++) {
>>                   int j;
>>
>>                   vnet_len[i] = 0;
>>                   for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
>>                           vnet_len[i] += iov[i][j].iov_len;
>>
>>                   vnet_len[i] = htonl(vnet_len[i]);
>>                   iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
>>                   iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t);
>>           }
>>
>>           return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM;
>> }
>>
>> iperf3 -c localhost -p 10001  -t 60  -4
>>
>> before
>> [ ID] Interval           Transfer     Bitrate         Retr
>> [  5]   0.00-60.00  sec  33.0 GBytes  4.72 Gbits/sec    1             sender
>> [  5]   0.00-60.06  sec  33.0 GBytes  4.72 Gbits/sec                  receiver
>>
>> after:
>> [ ID] Interval           Transfer     Bitrate         Retr
>> [  5]   0.00-60.00  sec  18.2 GBytes  2.60 Gbits/sec    0             sender
>> [  5]   0.00-60.07  sec  18.2 GBytes  2.60 Gbits/sec                  receiver
> 
> Weird, it looks like doing one sendmsg() per frame results in a higher
> throughput than one sendmsg() per multiple frames, which sounds rather
> absurd. Perhaps we should start looking into what perf(1) reports, in
> terms of both syscall overhead and cache misses.
> 
> I'll have a look later today or tomorrow -- unless you have other
> ideas as to why this might happen...
> 

Perhaps in the first case we only update one vnet_len, while in the second case we have to
update an array of vnet_len, so more cache lines are used?
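
To put a rough number on it (for illustration only -- the batch size of
128 frames and 64-byte cache lines are assumptions, not values taken
from the patch):

/* per-frame sendmsg(): a single length word, rewritten and consumed
 * before the next frame is prepared, so it stays in one hot cache line
 * for the whole loop */
static uint32_t vnet_len_single;

/* batched sendmsg(): one length word per queued frame, all of them
 * live until the flush -- 128 * 4 = 512 bytes, i.e. eight 64-byte
 * cache lines written here and then read back by the kernel while it
 * walks the flattened iovec array */
static uint32_t vnet_len_batched[128];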

Thanks,
Laurent



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-14 15:54       ` Laurent Vivier
@ 2024-03-14 16:26         ` Stefano Brivio
  2024-03-15  0:46           ` David Gibson
  0 siblings, 1 reply; 13+ messages in thread
From: Stefano Brivio @ 2024-03-14 16:26 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

On Thu, 14 Mar 2024 16:54:02 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> On 3/14/24 16:47, Stefano Brivio wrote:
> > On Thu, 14 Mar 2024 15:07:48 +0100
> > Laurent Vivier <lvivier@redhat.com> wrote:
> >   
> >> On 3/13/24 12:37, Stefano Brivio wrote:
> >> ...  
> >>>> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
> >>>>    	return i;
> >>>>    }
> >>>>    
> >>>> +/**
> >>>> + * tap_send_iov_passt() - Send out multiple prepared frames  
> >>>
> >>> ...I would argue that this function prepares frames as well. Maybe:
> >>>
> >>>    * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames
> >>>      
> >>>> + * @c:		Execution context
> >>>> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> >>>> + *              The first entry of the iovec is updated to point to an
> >>>> + *              uint32_t storing the frame length.  
> >>>
> >>>    * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank
> >>>      
> >>>> + * @n:		Number of frames in @iov
> >>>> + *
> >>>> + * Return: number of frames actually sent
> >>>> + */
> >>>> +static size_t tap_send_iov_passt(const struct ctx *c,
> >>>> +				 struct iovec iov[][TCP_IOV_NUM],
> >>>> +				 size_t n)
> >>>> +{
> >>>> +	unsigned int i;
> >>>> +
> >>>> +	for (i = 0; i < n; i++) {
> >>>> +		uint32_t vnet_len;
> >>>> +		int j;
> >>>> +
> >>>> +		vnet_len = 0;  
> >>>
> >>> This could be initialised in the declaration (yes, it's "reset" at
> >>> every loop iteration).
> >>>      
> >>>> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> >>>> +			vnet_len += iov[i][j].iov_len;
> >>>> +
> >>>> +		vnet_len = htonl(vnet_len);
> >>>> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> >>>> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> >>>> +
> >>>> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))  
> >>>
> >>> ...which would now send a single frame at a time, but actually it can
> >>> already send everything in one shot because it's using sendmsg(), if you
> >>> move it outside of the loop and do something like (untested):
> >>>
> >>> 	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
> >>>      
> >>>> +			break;
> >>>> +	}
> >>>> +
> >>>> +	return i;
> >>>> +
> >>>> +}
> >>>> +  
> >>
> >> I tried to do something like that but I have a performance drop:
> >>
> >> static size_t tap_send_iov_passt(const struct ctx *c,
> >>                                    struct iovec iov[][TCP_IOV_NUM],
> >>                                    size_t n)
> >> {
> >>           unsigned int i;
> >>           uint32_t vnet_len[n];
> >>
> >>           for (i = 0; i < n; i++) {
> >>                   int j;
> >>
> >>                   vnet_len[i] = 0;
> >>                   for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> >>                           vnet_len[i] += iov[i][j].iov_len;
> >>
> >>                   vnet_len[i] = htonl(vnet_len[i]);
> >>                   iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
> >>                   iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t);
> >>           }
> >>
> >>           return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM;
> >> }
> >>
> >> iperf3 -c localhost -p 10001  -t 60  -4
> >>
> >> before
> >> [ ID] Interval           Transfer     Bitrate         Retr
> >> [  5]   0.00-60.00  sec  33.0 GBytes  4.72 Gbits/sec    1             sender
> >> [  5]   0.00-60.06  sec  33.0 GBytes  4.72 Gbits/sec                  receiver
> >>
> >> after:
> >> [ ID] Interval           Transfer     Bitrate         Retr
> >> [  5]   0.00-60.00  sec  18.2 GBytes  2.60 Gbits/sec    0             sender
> >> [  5]   0.00-60.07  sec  18.2 GBytes  2.60 Gbits/sec                  receiver  
> > 
> > Weird, it looks like doing one sendmsg() per frame results in a higher
> > throughput than one sendmsg() per multiple frames, which sounds rather
> > absurd. Perhaps we should start looking into what perf(1) reports, in
> > terms of both syscall overhead and cache misses.
> > 
> > I'll have a look later today or tomorrow -- unless you have other
> > ideas as to why this might happen...
> 
> Perhaps in the first case we only update one vnet_len, while in the second case we have to
> update an array of vnet_len, so more cache lines are used?

Yes, I'm wondering if for example this:

		iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];

causes a prefetch of everything pointed to by iov[i][...], so we would
prefetch (and throw away) each buffer, one by one.

Another interesting experiment to verify if this is the case could be
to "flush" a few frames at a time (say, 4), with something like this on
top of your original change (completely untested):

		[...]

		if (!((i + 1) % 4) &&
		    !tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * 4))
			break;
	}

	if ((i + 1) % 4) {
		tap_send_frames_passt(c, iov[i / 4],
				      TCP_IOV_NUM * ((i + 1) % 4));
	}

Or maybe we could set vnet_len right after we receive data in the
buffers.

-- 
Stefano



* Re: [RFC] tcp: Replace TCP buffer structure by an iovec array
  2024-03-14 16:26         ` Stefano Brivio
@ 2024-03-15  0:46           ` David Gibson
  0 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2024-03-15  0:46 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Laurent Vivier, passt-dev

[-- Attachment #1: Type: text/plain, Size: 5948 bytes --]

On Thu, Mar 14, 2024 at 05:26:17PM +0100, Stefano Brivio wrote:
> On Thu, 14 Mar 2024 16:54:02 +0100
> Laurent Vivier <lvivier@redhat.com> wrote:
> 
> > On 3/14/24 16:47, Stefano Brivio wrote:
> > > On Thu, 14 Mar 2024 15:07:48 +0100
> > > Laurent Vivier <lvivier@redhat.com> wrote:
> > >   
> > >> On 3/13/24 12:37, Stefano Brivio wrote:
> > >> ...  
> > >>>> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
> > >>>>    	return i;
> > >>>>    }
> > >>>>    
> > >>>> +/**
> > >>>> + * tap_send_iov_passt() - Send out multiple prepared frames  
> > >>>
> > >>> ...I would argue that this function prepares frames as well. Maybe:
> > >>>
> > >>>    * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames
> > >>>      
> > >>>> + * @c:		Execution context
> > >>>> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> > >>>> + *              The first entry of the iovec is updated to point to an
> > >>>> + *              uint32_t storing the frame length.  
> > >>>
> > >>>    * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank
> > >>>      
> > >>>> + * @n:		Number of frames in @iov
> > >>>> + *
> > >>>> + * Return: number of frames actually sent
> > >>>> + */
> > >>>> +static size_t tap_send_iov_passt(const struct ctx *c,
> > >>>> +				 struct iovec iov[][TCP_IOV_NUM],
> > >>>> +				 size_t n)
> > >>>> +{
> > >>>> +	unsigned int i;
> > >>>> +
> > >>>> +	for (i = 0; i < n; i++) {
> > >>>> +		uint32_t vnet_len;
> > >>>> +		int j;
> > >>>> +
> > >>>> +		vnet_len = 0;  
> > >>>
> > >>> This could be initialised in the declaration (yes, it's "reset" at
> > >>> every loop iteration).
> > >>>      
> > >>>> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> > >>>> +			vnet_len += iov[i][j].iov_len;
> > >>>> +
> > >>>> +		vnet_len = htonl(vnet_len);
> > >>>> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> > >>>> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> > >>>> +
> > >>>> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))  
> > >>>
> > >>> ...which would now send a single frame at a time, but actually it can
> > >>> already send everything in one shot because it's using sendmsg(), if you
> > >>> move it outside of the loop and do something like (untested):
> > >>>
> > >>> 	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
> > >>>      
> > >>>> +			break;
> > >>>> +	}
> > >>>> +
> > >>>> +	return i;
> > >>>> +
> > >>>> +}
> > >>>> +  
> > >>
> > >> I tried to do something like that but I have a performance drop:
> > >>
> > >> static size_t tap_send_iov_passt(const struct ctx *c,
> > >>                                    struct iovec iov[][TCP_IOV_NUM],
> > >>                                    size_t n)
> > >> {
> > >>           unsigned int i;
> > >>           uint32_t vnet_len[n];
> > >>
> > >>           for (i = 0; i < n; i++) {
> > >>                   int j;
> > >>
> > >>                   vnet_len[i] = 0;
> > >>                   for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> > >>                           vnet_len[i] += iov[i][j].iov_len;
> > >>
> > >>                   vnet_len[i] = htonl(vnet_len[i]);
> > >>                   iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
> > >>                   iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t);
> > >>           }
> > >>
> > >>           return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM;
> > >> }
> > >>
> > >> iperf3 -c localhost -p 10001  -t 60  -4
> > >>
> > >> before
> > >> [ ID] Interval           Transfer     Bitrate         Retr
> > >> [  5]   0.00-60.00  sec  33.0 GBytes  4.72 Gbits/sec    1             sender
> > >> [  5]   0.00-60.06  sec  33.0 GBytes  4.72 Gbits/sec                  receiver
> > >>
> > >> after:
> > >> [ ID] Interval           Transfer     Bitrate         Retr
> > >> [  5]   0.00-60.00  sec  18.2 GBytes  2.60 Gbits/sec    0             sender
> > >> [  5]   0.00-60.07  sec  18.2 GBytes  2.60 Gbits/sec                  receiver  
> > > 
> > > Weird, it looks like doing one sendmsg() per frame results in a higher
> > > throughput than one sendmsg() per multiple frames, which sounds rather
> > > absurd. Perhaps we should start looking into what perf(1) reports, in
> > > terms of both syscall overhead and cache misses.
> > > 
> > > I'll have a look later today or tomorrow -- unless you have other
> > > ideas as to why this might happen...
> > 
> > Perhaps in the first case we only update one vnet_len, while in the second case we have to
> > update an array of vnet_len, so more cache lines are used?

We should be able to test this relatively easily, yes?  By updating
all the vnet_len then using a single sendmsg().


> Yes, I'm wondering if for example this:
> 
> 		iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
> 
> causes a prefetch of everything pointed to by iov[i][...], so we would
> prefetch (and throw away) each buffer, one by one.
> 
> Another interesting experiment to verify if this is the case could be
> to "flush" a few frames at a time (say, 4), with something like this on
> top of your original change (completely untested):
> 
> 		[...]
> 
> 		if (!((i + 1) % 4) &&
> 		    !tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * 4))
> 			break;
> 	}
> 
> 	if ((i + 1) % 4) {
> 		tap_send_frames_passt(c, iov[i / 4],
> 				      TCP_IOV_NUM * ((i + 1) % 4));
> 	}
> 
> Or maybe we could set vnet_len right after we receive data in the
> buffers.

I really hope we can avoid this.  If we want to allow IPv4<->IPv6
translation, then we can't know the vnet_len until after we've done
our routing / translation.
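
Concretely -- back-of-the-envelope, assuming no IP options -- the same
TCP segment gives two different frame lengths depending on the family
we end up sending it with:

	ETH + 20 B (struct iphdr)   + TCP header/options + data
	ETH + 40 B (struct ipv6hdr) + TCP header/options + data

so the vnet_len of a translated frame is off by 20 bytes from anything
we could have stored at receive time.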

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]


Thread overview: 13+ messages
2024-03-11 13:33 [RFC] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
2024-03-12 22:56 ` Stefano Brivio
2024-03-13 11:37 ` Stefano Brivio
2024-03-13 14:42   ` Laurent Vivier
2024-03-13 15:27     ` Stefano Brivio
2024-03-13 15:20   ` Laurent Vivier
2024-03-13 16:58     ` Stefano Brivio
2024-03-14 14:07   ` Laurent Vivier
2024-03-14 15:47     ` Stefano Brivio
2024-03-14 15:54       ` Laurent Vivier
2024-03-14 16:26         ` Stefano Brivio
2024-03-15  0:46           ` David Gibson
2024-03-14  4:22 ` David Gibson
