public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [RFC v4] tcp: Replace TCP buffer structure by an iovec array
@ 2024-03-21 10:26 Laurent Vivier
  2024-03-22  2:40 ` David Gibson
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Laurent Vivier @ 2024-03-21 10:26 UTC (permalink / raw)
  To: passt-dev; +Cc: Laurent Vivier, David Gibson

To be able to provide pointers to TCP headers and IP headers without
worrying about alignment in the structure, split the structure into
several arrays and point to each part of the frame using an iovec array.

Using iovec also allows us to simply ignore the first entry when the
vnet length header is not needed.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---

Notes:
    v4:
      - fix static_assert()
    
    v3:
      - use (again) a 2d array of iovecs
      - fix pcap_multiple() call
      - introduce IP_MAX_MTU to define MSS4 and MSS6
      - use VAR[0].data rather than ((TYPE *)0)->data in static_assert()
      - cleanup tcp_iov_parts comments
      - use sizeof(VAR) rather than sizeof(TYPE)
      - fix tcp_l2_buf_fill_headers function comment
      - don't mix network order and host order in vnet_len
      - with dup_iov, update only TCP_IOV_PAYLOAD length
      - remove _l2_ from the buffers names
    
    v2:
      - rebased on top of David's series
        "Some improvements to the tap send path"
      - no performance improvement anymore
      - remove the iovec functions of v1 introduced in tap.c to use new
        functions from David
      - don't use an array of array of iovec
      - fix comments
      - re-introduce MSS4 and MSS6
      - address comments from David and Stefano
    
    I have tested only passt, not pasta

 tcp.c  | 524 +++++++++++++++++++++++++++------------------------------
 util.h |   3 +
 2 files changed, 253 insertions(+), 274 deletions(-)

diff --git a/tcp.c b/tcp.c
index a1860d10b15f..cc705064f059 100644
--- a/tcp.c
+++ b/tcp.c
@@ -318,39 +318,8 @@
 
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
-
-struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
-#ifdef __AVX2__
-	uint8_t pad[26];
-#else
-	uint8_t pad[2];
-#endif
-	struct tap_hdr taph;
-	struct iphdr iph;
-	struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
-#ifdef __AVX2__
-	uint8_t pad[14];
-#else
-	uint8_t pad[2];
-#endif
-	struct tap_hdr taph;
-	struct ipv6hdr ip6h;
-	struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
-#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
+#define MSS4				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct iphdr), sizeof(uint32_t))
+#define MSS6				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), sizeof(uint32_t))
 
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 #ifdef HAS_SND_WND
@@ -445,133 +414,107 @@ struct tcp_buf_seq_update {
 };
 
 /* Static buffers */
-
 /**
- * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
- * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph:	Tap-level headers (partially pre-filled)
- * @iph:	Pre-filled IP header (except for tot_len and saddr)
- * @uh:		Headroom for TCP header
- * @data:	Storage for TCP payload
+ * struct tcp_payload_t - TCP header and data to send data
+ * 		32 bytes aligned to be able to use AVX2 checksum
+ * @th:		TCP header
+ * @data:	TCP data
  */
-static struct tcp4_l2_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[26];	/* 0, align th to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 26				2 */
-	struct iphdr iph;	/* 44				20 */
-	struct tcphdr th;	/* 64				40 */
-	uint8_t data[MSS4];	/* 84				60 */
-				/* 65536			65532 */
+struct tcp_payload_t {
+	struct tcphdr th;
+	uint8_t data[65536 - sizeof(struct tcphdr)];
 #ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
+} __attribute__ ((packed, aligned(32)));
 #else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
-tcp4_l2_buf[TCP_FRAMES_MEM];
-
-static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_buf_used;
 
 /**
- * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
- * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph:	Tap-level headers (partially pre-filled)
- * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
- * @th:		Headroom for TCP header
- * @data:	Storage for TCP payload
+ * struct tcp_flags_t - TCP header and data to send option flags
+ * @th:		TCP header
+ * @opts	TCP option flags
  */
-struct tcp6_l2_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 14				2 */
-	struct ipv6hdr ip6h;	/* 32				20 */
-	struct tcphdr th;	/* 72				60 */
-	uint8_t data[MSS6];	/* 92				80 */
-				/* 65536			65532 */
+struct tcp_flags_t {
+	struct tcphdr th;
+	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
 #ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
+} __attribute__ ((packed, aligned(32)));
 #else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
-tcp6_l2_buf[TCP_FRAMES_MEM];
 
-static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+/* Ethernet header for IPv4 frames */
+static struct ethhdr		tcp4_eth_src;
+
+static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
+/* IPv4 headers */
+static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv4 frames */
+static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp4_payload_used;
+
+static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
+/* IPv4 headers for TCP option flags frames */
+static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
+/* TCP headers and option flags for IPv4 frames */
+static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_flags_used;
 
-static unsigned int tcp6_l2_buf_used;
+/* Ethernet header for IPv6 frames */
+static struct ethhdr		tcp6_eth_src;
+
+static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
+/* IPv6 headers */
+static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv6 frames */
+static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp6_payload_used;
+
+static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
+/* IPv6 headers for TCP option flags frames */
+static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
+/* TCP headers and option flags for IPv6 frames */
+static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_flags_used;
 
 /* recvmsg()/sendmsg() data for tap */
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
-static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
-static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
-static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
-static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
+/*
+ * enum tcp_iov_parts - I/O vector parts for one TCP frame
+ * @TCP_IOV_TAP		TAP header
+ * @TCP_IOV_ETH		ethernet header
+ * @TCP_IOV_IP		IP (v4/v6) header
+ * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
+ * @TCP_NUM_IOVS 	the number of entries in the iovec array
+ */
+enum tcp_iov_parts {
+	TCP_IOV_TAP	= 0,
+	TCP_IOV_ETH	= 1,
+	TCP_IOV_IP	= 2,
+	TCP_IOV_PAYLOAD	= 3,
+	TCP_NUM_IOVS
+};
+
+static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
-/**
- * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
- * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph:	Tap-level headers (partially pre-filled)
- * @iph:	Pre-filled IP header (except for tot_len and saddr)
- * @th:		Headroom for TCP header
- * @opts:	Headroom for TCP options
- */
-static struct tcp4_l2_flags_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[26];	/* 0, align th to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
-#endif
-	struct tap_hdr taph;	/* 26				2 */
-	struct iphdr iph;	/* 44				20 */
-	struct tcphdr th;	/* 64				40 */
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp4_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_flags_buf_used;
-
-/**
- * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
- * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph:	Tap-level headers (partially pre-filled)
- * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
- * @th:		Headroom for TCP header
- * @opts:	Headroom for TCP options
- */
-static struct tcp6_l2_flags_buf_t {
-#ifdef __AVX2__
-	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
-#else
-	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
-#endif
-	struct tap_hdr taph;	/* 14					   2 */
-	struct ipv6hdr ip6h;	/* 32					  20 */
-	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
-	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_l2_flags_buf_used;
-
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
 /* Table for lookup from remote address, local port, remote port */
@@ -967,25 +910,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
 }
 
 /**
- * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
+ * tcp_update_l2_buf() - Update ethernet header buffers with addresses
  * @eth_d:	Ethernet destination address, NULL if unchanged
  * @eth_s:	Ethernet source address, NULL if unchanged
  */
 void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 {
-	int i;
-
-	for (i = 0; i < TCP_FRAMES_MEM; i++) {
-		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
-		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
-		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
-		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
-
-		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
-		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
-	}
+	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
+	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
 }
 
 /**
@@ -995,29 +927,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
 static void tcp_sock4_iov_init(const struct ctx *c)
 {
 	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
-	struct iovec *iov;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
-		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IP),
-			.iph = iph,
-			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
-		};
-	}
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		struct iovec *iov;
 
-	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
-		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IP),
-			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
-		};
-	}
+		/* headers */
+		tcp4_payload_ip[i] = iph;
+		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_payload[i].th.ack = 1;
+
+		tcp4_flags_ip[i] = iph;
+		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_flags[i].th.ack = 1;
 
-	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph);
+		/* iovecs */
+		iov = tcp4_l2_iov[i];
+		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
+		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
+					   sizeof(tcp4_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
+		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
 
-	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph);
+		iov = tcp4_l2_flags_iov[i];
+		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
+		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
+					   sizeof(tcp4_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
+		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
+	}
 }
 
 /**
@@ -1026,29 +971,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
  */
 static void tcp_sock6_iov_init(const struct ctx *c)
 {
-	struct iovec *iov;
+	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
-		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IPV6),
-			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
-			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
-		};
-	}
+	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		struct iovec *iov;
 
-	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
-		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
-			.taph = TAP_HDR_INIT(ETH_P_IPV6),
-			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
-		};
-	}
+		/* headers */
+		tcp6_payload_ip[i] = ip6;
+		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp6_payload[i].th.ack = 1;
+
+		tcp6_flags_ip[i] = ip6;
+		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp6_flags[i].th .ack = 1;
 
-	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph);
+		/* iovecs */
+		iov = tcp6_l2_iov[i];
+		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
+		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
+					   sizeof(tcp6_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
+		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
 
-	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
-		iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph);
+		iov = tcp6_l2_flags_iov[i];
+		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
+		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
+					   sizeof(tcp6_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
+		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
+		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
+	}
 }
 
 /**
@@ -1284,36 +1243,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
 	} while (0)
 
 /**
- * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
+ * tcp_flags_flush() - Send out buffers for segments with no data (flags)
  * @c:		Execution context
  */
-static void tcp_l2_flags_buf_flush(const struct ctx *c)
+static void tcp_flags_flush(const struct ctx *c)
 {
-	tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used);
-	tcp6_l2_flags_buf_used = 0;
+	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
+			tcp6_flags_used);
+	tcp6_flags_used = 0;
 
-	tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used);
-	tcp4_l2_flags_buf_used = 0;
+	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
+			tcp4_flags_used);
+	tcp4_flags_used = 0;
 }
 
 /**
- * tcp_l2_data_buf_flush() - Send out buffers for segments with data
+ * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
-static void tcp_l2_data_buf_flush(const struct ctx *c)
+static void tcp_payload_flush(const struct ctx *c)
 {
 	unsigned i;
 	size_t m;
 
-	m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used);
+	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp6_payload_used);
 	for (i = 0; i < m; i++)
-		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
-	tcp6_l2_buf_used = 0;
+		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	tcp6_payload_used = 0;
 
-	m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used);
+	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
+			    tcp4_payload_used);
 	for (i = 0; i < m; i++)
-		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
-	tcp4_l2_buf_used = 0;
+		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	tcp4_payload_used = 0;
 }
 
 /**
@@ -1323,8 +1286,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
 /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
 void tcp_defer_handler(struct ctx *c)
 {
-	tcp_l2_flags_buf_flush(c);
-	tcp_l2_data_buf_flush(c);
+	tcp_flags_flush(c);
+	tcp_payload_flush(c);
 }
 
 /**
@@ -1433,35 +1396,31 @@ static size_t tcp_fill_headers6(const struct ctx *c,
  * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
  * @c:		Execution context
  * @conn:	Connection pointer
- * @p:		Pointer to any type of TCP pre-cooked buffer
+ * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
  * @plen:	Payload length (including TCP header options)
  * @check:	Checksum, if already known
  * @seq:	Sequence number for this segment
  *
- * Return: frame length including L2 headers, host order
+ * Return: IP payload length, host order
  */
 static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
 				      const struct tcp_tap_conn *conn,
-				      void *p, size_t plen,
+				      struct iovec *iov, size_t plen,
 				      const uint16_t *check, uint32_t seq)
 {
 	const struct in_addr *a4 = inany_v4(&conn->faddr);
 	size_t ip_len, tlen;
 
 	if (a4) {
-		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
-
-		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
+		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
+					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
 					   check, seq);
-
-		tlen = tap_frame_len(c, &b->taph, ip_len);
+		tlen = ip_len - sizeof(struct iphdr);
 	} else {
-		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
-
-		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
+		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
+					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
 					   seq);
-
-		tlen = tap_frame_len(c, &b->taph, ip_len);
+		tlen = ip_len - sizeof(struct ipv6hdr);
 	}
 
 	return tlen;
@@ -1595,16 +1554,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 {
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
-	struct tcp4_l2_flags_buf_t *b4 = NULL;
-	struct tcp6_l2_flags_buf_t *b6 = NULL;
+	struct tcp_flags_t *payload;
 	struct tcp_info tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
+	uint32_t vnet_len;
 	size_t optlen = 0;
-	struct iovec *iov;
 	struct tcphdr *th;
+	struct iovec *iov;
+	size_t ip_len;
 	char *data;
-	void *p;
 
 	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
 	    !flags && conn->wnd_to_tap)
@@ -1627,19 +1586,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return 0;
 
 	if (CONN_V4(conn)) {
-		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
-		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
-		th = &b4->th;
-
-		/* gcc 11.2 would complain on data = (char *)(th + 1); */
-		data = b4->opts;
+		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
 	} else {
-		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
-		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
-		th = &b6->th;
-		data = b6->opts;
+		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
 	}
 
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+	th = &payload->th;
+	data = payload->opts;
+
 	if (flags & SYN) {
 		int mss;
 
@@ -1688,8 +1645,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	th->syn = !!(flags & SYN);
 	th->fin = !!(flags & FIN);
 
-	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
-					       NULL, conn->seq_to_tap);
+	ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
+					 conn->seq_to_tap);
+	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+
+	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
 
 	if (th->ack) {
 		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
@@ -1705,24 +1665,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	if (th->fin || th->syn)
 		conn->seq_to_tap++;
 
-	if (CONN_V4(conn)) {
-		if (flags & DUP_ACK) {
-			memcpy(b4 + 1, b4, sizeof(*b4));
-			(iov + 1)->iov_len = iov->iov_len;
-			tcp4_l2_flags_buf_used++;
-		}
+	if (flags & DUP_ACK) {
+		struct iovec *dup_iov;
+		int i;
 
-		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
-			tcp_l2_flags_buf_flush(c);
-	} else {
-		if (flags & DUP_ACK) {
-			memcpy(b6 + 1, b6, sizeof(*b6));
-			(iov + 1)->iov_len = iov->iov_len;
-			tcp6_l2_flags_buf_used++;
-		}
+		if (CONN_V4(conn))
+			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+		else
+			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
 
-		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
-			tcp_l2_flags_buf_flush(c);
+		for (i = 0; i < TCP_NUM_IOVS; i++)
+			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+			       iov[i].iov_len);
+		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+	}
+
+	if (CONN_V4(conn)) {
+		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
+			tcp_flags_flush(c);
+	} else {
+		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
+			tcp_flags_flush(c);
 	}
 
 	return 0;
@@ -2169,30 +2132,43 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 {
 	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
+	size_t ip_len;
 
 	if (CONN_V4(conn)) {
-		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
-		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
+		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
+		const uint16_t *check = NULL;
 
-		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
-		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
+		if (no_csum) {
+			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+			check = &iph->check;
+		}
 
-		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       check, seq);
-		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
-			tcp_l2_data_buf_flush(c);
+		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
+		tcp4_seq_update[tcp4_payload_used].len = plen;
+
+		iov = tcp4_l2_iov[tcp4_payload_used++];
+		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
+						 seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
+						htonl(sizeof(struct ethhdr) +
+						      sizeof(struct iphdr) +
+						      ip_len);
+		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
-		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
-		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
+		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
+		tcp6_seq_update[tcp6_payload_used].len = plen;
 
-		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
-		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
-						       NULL, seq);
-		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
-			tcp_l2_data_buf_flush(c);
+		iov = tcp6_l2_iov[tcp6_payload_used++];
+		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
+						htonl(sizeof(struct ethhdr) +
+						      sizeof(struct ipv6hdr) +
+						      ip_len);
+		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
 	}
 }
 
@@ -2247,19 +2223,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	iov_sock[0].iov_base = tcp_buf_discard;
 	iov_sock[0].iov_len = already_sent;
 
-	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
-	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
-		tcp_l2_data_buf_flush(c);
+	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
+	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+		tcp_payload_flush(c);
 
 		/* Silence Coverity CWE-125 false positive */
-		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
+		tcp4_payload_used = tcp6_payload_used = 0;
 	}
 
 	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
 		if (v4)
-			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
+			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
 		else
-			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
+			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
 		iov->iov_len = mss;
 	}
 	if (iov_rem)
@@ -2304,7 +2280,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	plen = mss;
 	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
-		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
+		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
 
 		if (i == send_bufs - 1)
 			plen = last_len;
diff --git a/util.h b/util.h
index 48f35604df92..0069df34a5d2 100644
--- a/util.h
+++ b/util.h
@@ -31,6 +31,9 @@
 #ifndef ETH_MIN_MTU
 #define ETH_MIN_MTU			68
 #endif
+#ifndef IP_MAX_MTU
+#define IP_MAX_MTU			USHRT_MAX
+#endif
 
 #ifndef MIN
 #define MIN(x, y)		(((x) < (y)) ? (x) : (y))
-- 
@@ -31,6 +31,9 @@
 #ifndef ETH_MIN_MTU
 #define ETH_MIN_MTU			68
 #endif
+#ifndef IP_MAX_MTU
+#define IP_MAX_MTU			USHRT_MAX
+#endif
 
 #ifndef MIN
 #define MIN(x, y)		(((x) < (y)) ? (x) : (y))
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC v4] tcp: Replace TCP buffer structure by an iovec array
  2024-03-21 10:26 [RFC v4] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
@ 2024-03-22  2:40 ` David Gibson
  2024-03-26 10:19 ` Laurent Vivier
  2024-04-12 17:59 ` Stefano Brivio
  2 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2024-03-22  2:40 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 29257 bytes --]

On Thu, Mar 21, 2024 at 11:26:55AM +0100, Laurent Vivier wrote:
> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec array.
> 
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed.
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

I've replaced v3 with this in my next branch.

> ---
> 
> Notes:
>     v4:
>       - fix static_assert()
>     
>     v3:
>       - use (again) a 2d array of iovecs
>       - fix pcap_multiple() call
>       - introduce IP_MAX_MTU to define MSS4 and MSS6
>       - use VAR[0].data rather than ((TYPE *)0)->data in static_assert()
>       - cleanup tcp_iov_parts comments
>       - use sizeof(VAR) rather than sizeof(TYPE)
>       - fix tcp_l2_buf_fill_headers function comment
>       - don't mix network order and host order in vnet_len
>       - with dup_iov, update only TCP_IOV_PAYLOAD length
>       - remove _l2_ from the buffers names
>     
>     v2:
>       - rebased on top of David's series
>         "Some improvements to the tap send path"
>       - no performance improvement anymore
>       - remove the iovec functions of v1 introduced in tap.c to use new
>         functions from David
>       - don't use an array of array of iovec
>       - fix comments
>       - re-introduce MSS4 and MSS6
>       - address comments from David and Stefano
>     
>     I have tested only passt, not pasta
> 
>  tcp.c  | 524 +++++++++++++++++++++++++++------------------------------
>  util.h |   3 +
>  2 files changed, 253 insertions(+), 274 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index a1860d10b15f..cc705064f059 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -318,39 +318,8 @@
>  
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -
> -struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[26];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct iphdr iph;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[14];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct ipv6hdr ip6h;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
> +#define MSS4				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct iphdr), sizeof(uint32_t))
> +#define MSS6				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), sizeof(uint32_t))
>  
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
> @@ -445,133 +414,107 @@ struct tcp_buf_seq_update {
>  };
>  
>  /* Static buffers */
> -
>  /**
> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @uh:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * struct tcp_payload_t - TCP header and data to send data
> + * 		32 bytes aligned to be able to use AVX2 checksum
> + * @th:		TCP header
> + * @data:	TCP data
>   */
> -static struct tcp4_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	uint8_t data[MSS4];	/* 84				60 */
> -				/* 65536			65532 */
> +struct tcp_payload_t {
> +	struct tcphdr th;
> +	uint8_t data[65536 - sizeof(struct tcphdr)];
>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp4_l2_buf[TCP_FRAMES_MEM];
> -
> -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_buf_used;
>  
>  /**
> - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * struct tcp_flags_t - TCP header and data to send option flags
> + * @th:		TCP header
> + * @opts	TCP option flags
>   */
> -struct tcp6_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14				2 */
> -	struct ipv6hdr ip6h;	/* 32				20 */
> -	struct tcphdr th;	/* 72				60 */
> -	uint8_t data[MSS6];	/* 92				80 */
> -				/* 65536			65532 */
> +struct tcp_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp6_l2_buf[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
> +
> +static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
> +/* IPv4 headers */
> +static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv4 frames */
> +static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_payload_used;
> +
> +static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP option flags frames */
> +static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv4 frames */
> +static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp4_flags_used;
>  
> -static unsigned int tcp6_l2_buf_used;
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
> +
> +static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp6_payload_used;
> +
> +static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
> +/* IPv6 headers for TCP option flags frames */
> +static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv6 frames */
> +static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_flags_used;
>  
>  /* recvmsg()/sendmsg() data for tap */
>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
> +/*
> + * enum tcp_iov_parts - I/O vector parts for one TCP frame
> + * @TCP_IOV_TAP		TAP header
> + * @TCP_IOV_ETH		ethernet header
> + * @TCP_IOV_IP		IP (v4/v6) header
> + * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * @TCP_NUM_IOVS 	the number of entries in the iovec array
> + */
> +enum tcp_iov_parts {
> +	TCP_IOV_TAP	= 0,
> +	TCP_IOV_ETH	= 1,
> +	TCP_IOV_IP	= 2,
> +	TCP_IOV_PAYLOAD	= 3,
> +	TCP_NUM_IOVS
> +};
> +
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> -/**
> - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp4_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp4_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_flags_buf_used;
> -
> -/**
> - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp6_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
> -#endif
> -	struct tap_hdr taph;	/* 14					   2 */
> -	struct ipv6hdr ip6h;	/* 32					  20 */
> -	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_l2_flags_buf_used;
> -
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>  
>  /* Table for lookup from remote address, local port, remote port */
> @@ -967,25 +910,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>  }
>  
>  /**
> - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
> + * tcp_update_l2_buf() - Update ethernet header buffers with addresses
>   * @eth_d:	Ethernet destination address, NULL if unchanged
>   * @eth_s:	Ethernet source address, NULL if unchanged
>   */
>  void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  {
> -	int i;
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
> -		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
> -		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
> -		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
> -
> -		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
> -	}
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
>  }
>  
>  /**
> @@ -995,29 +927,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  static void tcp_sock4_iov_init(const struct ctx *c)
>  {
>  	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
> -		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = iph,
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
> -		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
> -		};
> -	}
> +		/* headers */
> +		tcp4_payload_ip[i] = iph;
> +		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_payload[i].th.ack = 1;
> +
> +		tcp4_flags_ip[i] = iph;
> +		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_flags[i].th.ack = 1;
>  
> -	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp4_l2_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp4_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
>  
> -	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph);
> +		iov = tcp4_l2_flags_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp4_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1026,29 +971,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>   */
>  static void tcp_sock6_iov_init(const struct ctx *c)
>  {
> -	struct iovec *iov;
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
> -		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
> -		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
> -		};
> -	}
> +		/* headers */
> +		tcp6_payload_ip[i] = ip6;
> +		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_payload[i].th.ack = 1;
> +
> +		tcp6_flags_ip[i] = ip6;
> +		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_flags[i].th .ack = 1;
>  
> -	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp6_l2_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp6_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
>  
> -	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph);
> +		iov = tcp6_l2_flags_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp6_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1284,36 +1243,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
>  	} while (0)
>  
>  /**
> - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
> + * tcp_flags_flush() - Send out buffers for segments with no data (flags)
>   * @c:		Execution context
>   */
> -static void tcp_l2_flags_buf_flush(const struct ctx *c)
> +static void tcp_flags_flush(const struct ctx *c)
>  {
> -	tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used);
> -	tcp6_l2_flags_buf_used = 0;
> +	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp6_flags_used);
> +	tcp6_flags_used = 0;
>  
> -	tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used);
> -	tcp4_l2_flags_buf_used = 0;
> +	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp4_flags_used);
> +	tcp4_flags_used = 0;
>  }
>  
>  /**
> - * tcp_l2_data_buf_flush() - Send out buffers for segments with data
> + * tcp_payload_flush() - Send out buffers for segments with data
>   * @c:		Execution context
>   */
> -static void tcp_l2_data_buf_flush(const struct ctx *c)
> +static void tcp_payload_flush(const struct ctx *c)
>  {
>  	unsigned i;
>  	size_t m;
>  
> -	m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used);
> +	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp6_payload_used);
>  	for (i = 0; i < m; i++)
> -		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
> -	tcp6_l2_buf_used = 0;
> +		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	tcp6_payload_used = 0;
>  
> -	m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used);
> +	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp4_payload_used);
>  	for (i = 0; i < m; i++)
> -		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
> -	tcp4_l2_buf_used = 0;
> +		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	tcp4_payload_used = 0;
>  }
>  
>  /**
> @@ -1323,8 +1286,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
>  /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
>  void tcp_defer_handler(struct ctx *c)
>  {
> -	tcp_l2_flags_buf_flush(c);
> -	tcp_l2_data_buf_flush(c);
> +	tcp_flags_flush(c);
> +	tcp_payload_flush(c);
>  }
>  
>  /**
> @@ -1433,35 +1396,31 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
>   * @c:		Execution context
>   * @conn:	Connection pointer
> - * @p:		Pointer to any type of TCP pre-cooked buffer
> + * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
>   * @plen:	Payload length (including TCP header options)
>   * @check:	Checksum, if already known
>   * @seq:	Sequence number for this segment
>   *
> - * Return: frame length including L2 headers, host order
> + * Return: IP payload length, host order
>   */
>  static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
>  				      const struct tcp_tap_conn *conn,
> -				      void *p, size_t plen,
> +				      struct iovec *iov, size_t plen,
>  				      const uint16_t *check, uint32_t seq)
>  {
>  	const struct in_addr *a4 = inany_v4(&conn->faddr);
>  	size_t ip_len, tlen;
>  
>  	if (a4) {
> -		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
> +		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   check, seq);
> -
> -		tlen = tap_frame_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct iphdr);
>  	} else {
> -		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
> +		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   seq);
> -
> -		tlen = tap_frame_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct ipv6hdr);
>  	}
>  
>  	return tlen;
> @@ -1595,16 +1554,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
> -	struct tcp4_l2_flags_buf_t *b4 = NULL;
> -	struct tcp6_l2_flags_buf_t *b6 = NULL;
> +	struct tcp_flags_t *payload;
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
> +	uint32_t vnet_len;
>  	size_t optlen = 0;
> -	struct iovec *iov;
>  	struct tcphdr *th;
> +	struct iovec *iov;
> +	size_t ip_len;
>  	char *data;
> -	void *p;
>  
>  	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
>  	    !flags && conn->wnd_to_tap)
> @@ -1627,19 +1586,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		return 0;
>  
>  	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
> -		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
> -		th = &b4->th;
> -
> -		/* gcc 11.2 would complain on data = (char *)(th + 1); */
> -		data = b4->opts;
> +		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
>  	} else {
> -		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
> -		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
> -		th = &b6->th;
> -		data = b6->opts;
> +		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
>  	}
>  
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +	th = &payload->th;
> +	data = payload->opts;
> +
>  	if (flags & SYN) {
>  		int mss;
>  
> @@ -1688,8 +1645,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	th->syn = !!(flags & SYN);
>  	th->fin = !!(flags & FIN);
>  
> -	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
> -					       NULL, conn->seq_to_tap);
> +	ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
> +					 conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +
> +	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
>  
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
> @@ -1705,24 +1665,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (th->fin || th->syn)
>  		conn->seq_to_tap++;
>  
> -	if (CONN_V4(conn)) {
> -		if (flags & DUP_ACK) {
> -			memcpy(b4 + 1, b4, sizeof(*b4));
> -			(iov + 1)->iov_len = iov->iov_len;
> -			tcp4_l2_flags_buf_used++;
> -		}
> +	if (flags & DUP_ACK) {
> +		struct iovec *dup_iov;
> +		int i;
>  
> -		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
> -			tcp_l2_flags_buf_flush(c);
> -	} else {
> -		if (flags & DUP_ACK) {
> -			memcpy(b6 + 1, b6, sizeof(*b6));
> -			(iov + 1)->iov_len = iov->iov_len;
> -			tcp6_l2_flags_buf_used++;
> -		}
> +		if (CONN_V4(conn))
> +			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		else
> +			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
>  
> -		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
> -			tcp_l2_flags_buf_flush(c);
> +		for (i = 0; i < TCP_NUM_IOVS; i++)
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> +	}
> +
> +	if (CONN_V4(conn)) {
> +		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	} else {
> +		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
>  	}
>  
>  	return 0;
> @@ -2169,30 +2132,43 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  {
>  	uint32_t *seq_update = &conn->seq_to_tap;
>  	struct iovec *iov;
> +	size_t ip_len;
>  
>  	if (CONN_V4(conn)) {
> -		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
> -		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> +		const uint16_t *check = NULL;
>  
> -		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
> -		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
>  
> -		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       check, seq);
> -		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
> -			tcp_l2_data_buf_flush(c);
> +		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> +		tcp4_seq_update[tcp4_payload_used].len = plen;
> +
> +		iov = tcp4_l2_iov[tcp4_payload_used++];
> +		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
> +						 seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> +						htonl(sizeof(struct ethhdr) +
> +						      sizeof(struct iphdr) +
> +						      ip_len);
> +		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
> -
> -		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
> -		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
> +		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> +		tcp6_seq_update[tcp6_payload_used].len = plen;
>  
> -		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       NULL, seq);
> -		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
> -			tcp_l2_data_buf_flush(c);
> +		iov = tcp6_l2_iov[tcp6_payload_used++];
> +		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> +						htonl(sizeof(struct ethhdr) +
> +						      sizeof(struct ipv6hdr) +
> +						      ip_len);
> +		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
>  	}
>  }
>  
> @@ -2247,19 +2223,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	iov_sock[0].iov_base = tcp_buf_discard;
>  	iov_sock[0].iov_len = already_sent;
>  
> -	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
> -	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
> -		tcp_l2_data_buf_flush(c);
> +	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> +		tcp_payload_flush(c);
>  
>  		/* Silence Coverity CWE-125 false positive */
> -		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
> +		tcp4_payload_used = tcp6_payload_used = 0;
>  	}
>  
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
>  		if (v4)
> -			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
> +			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
>  		else
> -			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
> +			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)
> @@ -2304,7 +2280,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	plen = mss;
>  	seq = conn->seq_to_tap;
>  	for (i = 0; i < send_bufs; i++) {
> -		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
> +		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
>  
>  		if (i == send_bufs - 1)
>  			plen = last_len;
> diff --git a/util.h b/util.h
> index 48f35604df92..0069df34a5d2 100644
> --- a/util.h
> +++ b/util.h
> @@ -31,6 +31,9 @@
>  #ifndef ETH_MIN_MTU
>  #define ETH_MIN_MTU			68
>  #endif
> +#ifndef IP_MAX_MTU
> +#define IP_MAX_MTU			USHRT_MAX
> +#endif
>  
>  #ifndef MIN
>  #define MIN(x, y)		(((x) < (y)) ? (x) : (y))

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC v4] tcp: Replace TCP buffer structure by an iovec array
  2024-03-21 10:26 [RFC v4] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
  2024-03-22  2:40 ` David Gibson
@ 2024-03-26 10:19 ` Laurent Vivier
  2024-03-27  1:35   ` David Gibson
  2024-04-12 17:59 ` Stefano Brivio
  2 siblings, 1 reply; 5+ messages in thread
From: Laurent Vivier @ 2024-03-26 10:19 UTC (permalink / raw)
  To: passt-dev; +Cc: David Gibson

[-- Attachment #1: Type: text/plain, Size: 2494 bytes --]

Hi,

I compared perf result using this patch and a patch changing tap_send_frames_passt() to:

static size_t tap_send_frames_passt(const struct ctx *c,
                                     const struct iovec *iov,
                                     size_t bufs_per_frame, size_t nframes)
{
         struct msghdr mh = {
                 .msg_iovlen = bufs_per_frame,
         };
         size_t buf_offset;
         unsigned int i;
         ssize_t sent;

         for (i = 0; i < nframes; i++) {
                 unsigned int j;

                 if (bufs_per_frame > 1) {
			/* if we have more than 1 iovec, the first one is vnet_len */
                         uint32_t *p = iov[i * bufs_per_frame].iov_base;
                         uint32_t vnet_len = 0;

                         for (j = 1; j < bufs_per_frame; j++)
                                 vnet_len += iov[i * bufs_per_frame + j].iov_len;
                         vnet_len = htonl(vnet_len);

                         *p = vnet_len;
                 }

                 mh.msg_iov = (void *)&iov[i * bufs_per_frame];

                 sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
                 if (sent < 0)
                         return i;

                 /* Check for any partial frames due to short send */
                 j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, 
&buf_offset);

                 if (buf_offset && j < bufs_per_frame) {
                         if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j],
                                             bufs_per_frame - j,
                                             buf_offset) < 0) {
                                 err("tap: partial frame send: %s",
                                     strerror(errno));
                                 return i;
                         }
                 }
         }

         return i;
}

And the result of 'perf record -e cache-misses' gives:

slow

   83.95%  passt.avx2  passt.avx2            [.] csum_avx2
    4.39%  passt.avx2  passt.avx2            [.] tap4_handler
    2.37%  passt.avx2  libc.so.6             [.] __printf_buffer
    0.84%  passt.avx2  passt.avx2            [.] udp_timer

fast

     22.15%  passt.avx2  passt.avx2            [.] csum_avx2
     14.91%  passt.avx2  passt.avx2            [.] udp_timer
      7.60%  passt.avx2  libc.so.6             [.] __printf_buffer
      5.10%  passt.avx2  passt.avx2            [.] ffsl

Thanks,
Laurent

[-- Attachment #2: 0001-tap-compute-vnet_len-inside-tap_send_frames_passt.patch --]
[-- Type: text/x-patch, Size: 8448 bytes --]

From d4b3e12132ceaf5484de215e9c84cbedcbbb8188 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 19 Mar 2024 18:20:20 +0100
Subject: [PATCH] tap: compute vnet_len inside tap_send_frames_passt()

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 tap.c | 49 +++++++++++++++++++++++++++++++++----------------
 tcp.c | 39 ++++++++++-----------------------------
 2 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/tap.c b/tap.c
index 13e4da79d690..1096272b411a 100644
--- a/tap.c
+++ b/tap.c
@@ -74,7 +74,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
  */
 void tap_send_single(const struct ctx *c, const void *data, size_t len)
 {
-	uint32_t vnet_len = htonl(len);
+	uint32_t vnet_len;
 	struct iovec iov[2];
 	size_t iovcnt = 0;
 
@@ -365,34 +365,51 @@ static size_t tap_send_frames_passt(const struct ctx *c,
 				    const struct iovec *iov,
 				    size_t bufs_per_frame, size_t nframes)
 {
-	size_t nbufs = bufs_per_frame * nframes;
 	struct msghdr mh = {
-		.msg_iov = (void *)iov,
-		.msg_iovlen = nbufs,
+		.msg_iovlen = bufs_per_frame,
 	};
 	size_t buf_offset;
 	unsigned int i;
 	ssize_t sent;
 
-	sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (sent < 0)
-		return 0;
+	for (i = 0; i < nframes; i++) {
+		unsigned int j;
+
+		if (bufs_per_frame > 1) {
+			/* if we have more than one iovec, the first one is
+			 * vnet_len
+			 */
+			uint32_t *p = iov[i * bufs_per_frame].iov_base;
+			uint32_t vnet_len = 0;
 
-	/* Check for any partial frames due to short send */
-	i = iov_skip_bytes(iov, nbufs, sent, &buf_offset);
+			for (j = 1; j < bufs_per_frame; j++)
+				vnet_len += iov[i * bufs_per_frame + j].iov_len;
+			vnet_len = htonl(vnet_len);
+
+			*p = vnet_len;
+		}
 
-	if (i < nbufs && (buf_offset || (i % bufs_per_frame))) {
-		/* Number of unsent or partially sent buffers for the frame */
-		size_t rembufs = bufs_per_frame - (i % bufs_per_frame);
+		mh.msg_iov = (void *)&iov[i * bufs_per_frame];
 
-		if (write_remainder(c->fd_tap, &iov[i], rembufs, buf_offset) < 0) {
-			err("tap: partial frame send: %s", strerror(errno));
+		sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
+		if (sent < 0)
 			return i;
+
+		/* Check for any partial frames due to short send */
+		j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, &buf_offset);
+
+		if (buf_offset && j < bufs_per_frame) {
+			if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j],
+					    bufs_per_frame - j,
+					    buf_offset) < 0) {
+				err("tap: partial frame send: %s",
+				    strerror(errno));
+				return i;
+			}
 		}
-		i += rembufs;
 	}
 
-	return i / bufs_per_frame;
+	return i;
 }
 
 /**
diff --git a/tcp.c b/tcp.c
index cc705064f059..d147e2c41648 100644
--- a/tcp.c
+++ b/tcp.c
@@ -443,10 +443,11 @@ struct tcp_flags_t {
 } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
 
+static uint32_t vnet_len;
+
 /* Ethernet header for IPv4 frames */
 static struct ethhdr		tcp4_eth_src;
 
-static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
 /* IPv4 headers */
 static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
 /* TCP headers and data for IPv4 frames */
@@ -457,7 +458,6 @@ static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"
 static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
-static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
 /* IPv4 headers for TCP option flags frames */
 static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
 /* TCP headers and option flags for IPv4 frames */
@@ -468,7 +468,6 @@ static unsigned int tcp4_flags_used;
 /* Ethernet header for IPv6 frames */
 static struct ethhdr		tcp6_eth_src;
 
-static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
 /* IPv6 headers */
 static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
 /* TCP headers and data for IPv6 frames */
@@ -479,7 +478,6 @@ static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"
 static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
-static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
 /* IPv6 headers for TCP option flags frames */
 static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
 /* TCP headers and option flags for IPv6 frames */
@@ -944,9 +942,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
 
 		/* iovecs */
 		iov = tcp4_l2_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp4_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
@@ -954,9 +951,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
 
 		iov = tcp4_l2_flags_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp4_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
@@ -989,9 +985,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
 
 		/* iovecs */
 		iov = tcp6_l2_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp6_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
@@ -999,9 +994,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
 
 		iov = tcp6_l2_flags_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp6_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
@@ -1558,7 +1552,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	struct tcp_info tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
-	uint32_t vnet_len;
 	size_t optlen = 0;
 	struct tcphdr *th;
 	struct iovec *iov;
@@ -1587,10 +1580,8 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 
 	if (CONN_V4(conn)) {
 		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
 	} else {
 		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
 	}
 
 	payload = iov[TCP_IOV_PAYLOAD].iov_base;
@@ -1649,8 +1640,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 					 conn->seq_to_tap);
 	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
 
-	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
-
 	if (th->ack) {
 		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
 			conn_flag(c, conn, ~ACK_TO_TAP_DUE);
@@ -2150,10 +2139,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
 						 seq);
 		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
-		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
-						htonl(sizeof(struct ethhdr) +
-						      sizeof(struct iphdr) +
-						      ip_len);
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
@@ -2163,10 +2148,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
 		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
-		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
-						htonl(sizeof(struct ethhdr) +
-						      sizeof(struct ipv6hdr) +
-						      ip_len);
 		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	}
-- 
@@ -443,10 +443,11 @@ struct tcp_flags_t {
 } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
 #endif
 
+static uint32_t vnet_len;
+
 /* Ethernet header for IPv4 frames */
 static struct ethhdr		tcp4_eth_src;
 
-static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
 /* IPv4 headers */
 static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
 /* TCP headers and data for IPv4 frames */
@@ -457,7 +458,6 @@ static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"
 static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
-static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
 /* IPv4 headers for TCP option flags frames */
 static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
 /* TCP headers and option flags for IPv4 frames */
@@ -468,7 +468,6 @@ static unsigned int tcp4_flags_used;
 /* Ethernet header for IPv6 frames */
 static struct ethhdr		tcp6_eth_src;
 
-static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
 /* IPv6 headers */
 static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
 /* TCP headers and data for IPv6 frames */
@@ -479,7 +478,6 @@ static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"
 static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
-static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
 /* IPv6 headers for TCP option flags frames */
 static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
 /* TCP headers and option flags for IPv6 frames */
@@ -944,9 +942,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
 
 		/* iovecs */
 		iov = tcp4_l2_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp4_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
@@ -954,9 +951,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
 
 		iov = tcp4_l2_flags_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp4_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
@@ -989,9 +985,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
 
 		/* iovecs */
 		iov = tcp6_l2_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp6_payload_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
@@ -999,9 +994,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
 		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
 
 		iov = tcp6_l2_flags_iov[i];
-		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
-		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
-					   sizeof(tcp6_flags_vnet_len[i]) : 0;
+		iov[TCP_IOV_TAP].iov_base = &vnet_len;
+		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
 		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
@@ -1558,7 +1552,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	struct tcp_info tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
-	uint32_t vnet_len;
 	size_t optlen = 0;
 	struct tcphdr *th;
 	struct iovec *iov;
@@ -1587,10 +1580,8 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 
 	if (CONN_V4(conn)) {
 		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
 	} else {
 		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
 	}
 
 	payload = iov[TCP_IOV_PAYLOAD].iov_base;
@@ -1649,8 +1640,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
 					 conn->seq_to_tap);
 	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
 
-	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
-
 	if (th->ack) {
 		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
 			conn_flag(c, conn, ~ACK_TO_TAP_DUE);
@@ -2150,10 +2139,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
 						 seq);
 		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
-		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
-						htonl(sizeof(struct ethhdr) +
-						      sizeof(struct iphdr) +
-						      ip_len);
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
@@ -2163,10 +2148,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
 		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
-		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
-						htonl(sizeof(struct ethhdr) +
-						      sizeof(struct ipv6hdr) +
-						      ip_len);
 		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	}
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC v4] tcp: Replace TCP buffer structure by an iovec array
  2024-03-26 10:19 ` Laurent Vivier
@ 2024-03-27  1:35   ` David Gibson
  0 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2024-03-27  1:35 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 12885 bytes --]

On Tue, Mar 26, 2024 at 11:19:22AM +0100, Laurent Vivier wrote:
> Hi,
> 
> I compared perf result using this patch and a patch changing tap_send_frames_passt() to:
> 
> static size_t tap_send_frames_passt(const struct ctx *c,
>                                     const struct iovec *iov,
>                                     size_t bufs_per_frame, size_t nframes)
> {
>         struct msghdr mh = {
>                 .msg_iovlen = bufs_per_frame,
>         };
>         size_t buf_offset;
>         unsigned int i;
>         ssize_t sent;
> 
>         for (i = 0; i < nframes; i++) {
>                 unsigned int j;
> 
>                 if (bufs_per_frame > 1) {
> 			/* if we have more than 1 iovec, the first one is vnet_len */
>                         uint32_t *p = iov[i * bufs_per_frame].iov_base;
>                         uint32_t vnet_len = 0;
> 
>                         for (j = 1; j < bufs_per_frame; j++)
>                                 vnet_len += iov[i * bufs_per_frame + j].iov_len;
>                         vnet_len = htonl(vnet_len);
> 
>                         *p = vnet_len;
>                 }
> 
>                 mh.msg_iov = (void *)&iov[i * bufs_per_frame];
> 
>                 sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
>                 if (sent < 0)
>                         return i;
> 
>                 /* Check for any partial frames due to short send */
>                 j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame,
> sent, &buf_offset);
> 
>                 if (buf_offset && j < bufs_per_frame) {
>                         if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j],
>                                             bufs_per_frame - j,
>                                             buf_offset) < 0) {
>                                 err("tap: partial frame send: %s",
>                                     strerror(errno));
>                                 return i;
>                         }
>                 }
>         }
> 
>         return i;
> }
> 
> And the result of 'perf record -e cache-misses' gives:
> 
> slow
> 
>   83.95%  passt.avx2  passt.avx2            [.] csum_avx2
>    4.39%  passt.avx2  passt.avx2            [.] tap4_handler
>    2.37%  passt.avx2  libc.so.6             [.] __printf_buffer
>    0.84%  passt.avx2  passt.avx2            [.] udp_timer
> 
> fast
> 
>     22.15%  passt.avx2  passt.avx2            [.] csum_avx2
>     14.91%  passt.avx2  passt.avx2            [.] udp_timer
>      7.60%  passt.avx2  libc.so.6             [.] __printf_buffer
>      5.10%  passt.avx2  passt.avx2            [.] ffsl

Well.. I *guess* that means we're getting more cache misses in the
batched version, as we suspected.  I'm a bit mystified as to how to
interpret those percentages, though.  Is that the percentage of total
cache misses that occur in that function?  The percentage of times
that function generates a cache miss (what if it generates more than
one)?  Something else..

If this does indicate many more cache misses computing the checksum,
I'm still a bit baffled as to what's going on.  It doesn't quite fit
with the theory I had: the csum_avx2() calls are in the "first loop"
in both these scenarios - my theory would suggest more cache misses in
the "second loop" instead (in the kernel inside sendmsg()).

What happens if you fill in the vnet_len field in the first loop, but
still use a sendmsg() per frame, instead of one batched one?

> From d4b3e12132ceaf5484de215e9c84cbedcbbb8188 Mon Sep 17 00:00:00 2001
> From: Laurent Vivier <lvivier@redhat.com>
> Date: Tue, 19 Mar 2024 18:20:20 +0100
> Subject: [PATCH] tap: compute vnet_len inside tap_send_frames_passt()
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  tap.c | 49 +++++++++++++++++++++++++++++++++----------------
>  tcp.c | 39 ++++++++++-----------------------------
>  2 files changed, 43 insertions(+), 45 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index 13e4da79d690..1096272b411a 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -74,7 +74,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
>   */
>  void tap_send_single(const struct ctx *c, const void *data, size_t len)
>  {
> -	uint32_t vnet_len = htonl(len);
> +	uint32_t vnet_len;
>  	struct iovec iov[2];
>  	size_t iovcnt = 0;
>  
> @@ -365,34 +365,51 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>  				    const struct iovec *iov,
>  				    size_t bufs_per_frame, size_t nframes)
>  {
> -	size_t nbufs = bufs_per_frame * nframes;
>  	struct msghdr mh = {
> -		.msg_iov = (void *)iov,
> -		.msg_iovlen = nbufs,
> +		.msg_iovlen = bufs_per_frame,
>  	};
>  	size_t buf_offset;
>  	unsigned int i;
>  	ssize_t sent;
>  
> -	sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
> -	if (sent < 0)
> -		return 0;
> +	for (i = 0; i < nframes; i++) {
> +		unsigned int j;
> +
> +		if (bufs_per_frame > 1) {
> +			/* if we have more than one iovec, the first one is
> +			 * vnet_len
> +			 */
> +			uint32_t *p = iov[i * bufs_per_frame].iov_base;
> +			uint32_t vnet_len = 0;
>  
> -	/* Check for any partial frames due to short send */
> -	i = iov_skip_bytes(iov, nbufs, sent, &buf_offset);
> +			for (j = 1; j < bufs_per_frame; j++)
> +				vnet_len += iov[i * bufs_per_frame + j].iov_len;
> +			vnet_len = htonl(vnet_len);
> +
> +			*p = vnet_len;
> +		}
>  
> -	if (i < nbufs && (buf_offset || (i % bufs_per_frame))) {
> -		/* Number of unsent or partially sent buffers for the frame */
> -		size_t rembufs = bufs_per_frame - (i % bufs_per_frame);
> +		mh.msg_iov = (void *)&iov[i * bufs_per_frame];
>  
> -		if (write_remainder(c->fd_tap, &iov[i], rembufs, buf_offset) < 0) {
> -			err("tap: partial frame send: %s", strerror(errno));
> +		sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
> +		if (sent < 0)
>  			return i;
> +
> +		/* Check for any partial frames due to short send */
> +		j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, &buf_offset);
> +
> +		if (buf_offset && j < bufs_per_frame) {
> +			if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j],
> +					    bufs_per_frame - j,
> +					    buf_offset) < 0) {
> +				err("tap: partial frame send: %s",
> +				    strerror(errno));
> +				return i;
> +			}
>  		}
> -		i += rembufs;
>  	}
>  
> -	return i / bufs_per_frame;
> +	return i;
>  }
>  
>  /**
> diff --git a/tcp.c b/tcp.c
> index cc705064f059..d147e2c41648 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -443,10 +443,11 @@ struct tcp_flags_t {
>  } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
>  
> +static uint32_t vnet_len;
> +
>  /* Ethernet header for IPv4 frames */
>  static struct ethhdr		tcp4_eth_src;
>  
> -static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
>  /* IPv4 headers */
>  static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
>  /* TCP headers and data for IPv4 frames */
> @@ -457,7 +458,6 @@ static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"
>  static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp4_payload_used;
>  
> -static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
>  /* IPv4 headers for TCP option flags frames */
>  static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
>  /* TCP headers and option flags for IPv4 frames */
> @@ -468,7 +468,6 @@ static unsigned int tcp4_flags_used;
>  /* Ethernet header for IPv6 frames */
>  static struct ethhdr		tcp6_eth_src;
>  
> -static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
>  /* IPv6 headers */
>  static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
>  /* TCP headers and data for IPv6 frames */
> @@ -479,7 +478,6 @@ static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"
>  static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_payload_used;
>  
> -static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
>  /* IPv6 headers for TCP option flags frames */
>  static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
>  /* TCP headers and option flags for IPv6 frames */
> @@ -944,9 +942,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>  
>  		/* iovecs */
>  		iov = tcp4_l2_iov[i];
> -		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
> -		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> -					   sizeof(tcp4_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_TAP].iov_base = &vnet_len;
> +		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
>  		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
> @@ -954,9 +951,8 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>  		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
>  
>  		iov = tcp4_l2_flags_iov[i];
> -		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
> -		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> -					   sizeof(tcp4_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_TAP].iov_base = &vnet_len;
> +		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
>  		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
> @@ -989,9 +985,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
>  
>  		/* iovecs */
>  		iov = tcp6_l2_iov[i];
> -		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
> -		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> -					   sizeof(tcp6_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_TAP].iov_base = &vnet_len;
> +		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
>  		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
> @@ -999,9 +994,8 @@ static void tcp_sock6_iov_init(const struct ctx *c)
>  		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
>  
>  		iov = tcp6_l2_flags_iov[i];
> -		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
> -		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> -					   sizeof(tcp6_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_TAP].iov_base = &vnet_len;
> +		iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
>  		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
> @@ -1558,7 +1552,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
> -	uint32_t vnet_len;
>  	size_t optlen = 0;
>  	struct tcphdr *th;
>  	struct iovec *iov;
> @@ -1587,10 +1580,8 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  
>  	if (CONN_V4(conn)) {
>  		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> -		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
>  	} else {
>  		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> -		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
>  	}
>  
>  	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> @@ -1649,8 +1640,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  					 conn->seq_to_tap);
>  	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
>  
> -	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
> -
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
>  			conn_flag(c, conn, ~ACK_TO_TAP_DUE);
> @@ -2150,10 +2139,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
>  						 seq);
>  		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> -		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> -						htonl(sizeof(struct ethhdr) +
> -						      sizeof(struct iphdr) +
> -						      ip_len);
>  		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>  			tcp_payload_flush(c);
>  	} else if (CONN_V6(conn)) {
> @@ -2163,10 +2148,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  		iov = tcp6_l2_iov[tcp6_payload_used++];
>  		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
>  		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> -		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> -						htonl(sizeof(struct ethhdr) +
> -						      sizeof(struct ipv6hdr) +
> -						      ip_len);
>  		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
>  			tcp_payload_flush(c);
>  	}


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC v4] tcp: Replace TCP buffer structure by an iovec array
  2024-03-21 10:26 [RFC v4] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
  2024-03-22  2:40 ` David Gibson
  2024-03-26 10:19 ` Laurent Vivier
@ 2024-04-12 17:59 ` Stefano Brivio
  2 siblings, 0 replies; 5+ messages in thread
From: Stefano Brivio @ 2024-04-12 17:59 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: passt-dev, David Gibson

On Thu, 21 Mar 2024 11:26:55 +0100
Laurent Vivier <lvivier@redhat.com> wrote:

> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec array.
> 
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed.
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

Sorry for the delay. A few comments:

> ---
> 
> Notes:
>     v4:
>       - fix static_assert()
>     
>     v3:
>       - use (again) a 2d array of iovecs
>       - fix pcap_multiple() call
>       - introduce IP_MAX_MTU to define MSS4 and MSS6
>       - use VAR[0].data rather than ((TYPE *)0)->data in static_assert()
>       - cleanup tcp_iov_parts comments
>       - use sizeof(VAR) rather than sizeof(TYPE)
>       - fix tcp_l2_buf_fill_headers function comment
>       - don't mix network order and host order in vnet_len
>       - with dup_iov, update only TCP_IOV_PAYLOAD length
>       - remove _l2_ from the buffers names
>     
>     v2:
>       - rebased on top of David's series
>         "Some improvements to the tap send path"
>       - no performance improvement anymore
>       - remove the iovec functions of v1 introduced in tap.c to use new
>         functions from David
>       - don't use an array of array of iovec
>       - fix comments
>       - re-introduce MSS4 and MSS6
>       - address comments from David and Stefano
>     
>     I have tested only passt, not pasta
> 
>  tcp.c  | 524 +++++++++++++++++++++++++++------------------------------
>  util.h |   3 +
>  2 files changed, 253 insertions(+), 274 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index a1860d10b15f..cc705064f059 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -318,39 +318,8 @@
>  
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -
> -struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[26];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct iphdr iph;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[14];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct ipv6hdr ip6h;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
> +#define MSS4				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct iphdr), sizeof(uint32_t))
> +#define MSS6				ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), sizeof(uint32_t))

Would it be possible to keep those within 80 columns? That is,

#define MSS4				ROUND_DOWN(IP_MAX_MTU -
						   sizeof(struct tcphdr) -
						   sizeof(struct iphdr),
						   sizeof(uint32_t))
#define MSS6				ROUND_DOWN(IP_MAX_MTU -
						   sizeof(struct tcphdr) -
						   sizeof(struct ipv6hdr),
						   sizeof(uint32_t))

doesn't look so bad to me.

>  
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
> @@ -445,133 +414,107 @@ struct tcp_buf_seq_update {
>  };
>  
>  /* Static buffers */
> -
>  /**
> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @uh:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * struct tcp_payload_t - TCP header and data to send data

While "data to send data" strictly speaking refers what this
represents, it's a bit obscure, especially in combination with the
tcp_payload_t name. What about:

 * struct tcp_payload_t - TCP header and data to send segments with payload

?

> + * 		32 bytes aligned to be able to use AVX2 checksum

Not true if !__AVX2__. I would just drop this line and explain below
(see below).

> + * @th:		TCP header
> + * @data:	TCP data
>   */
> -static struct tcp4_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	uint8_t data[MSS4];	/* 84				60 */
> -				/* 65536			65532 */
> +struct tcp_payload_t {
> +	struct tcphdr th;
> +	uint8_t data[65536 - sizeof(struct tcphdr)];

We pick 65536 here because it's IP_MAX_MTU, now that you defined it, we
might as well use it.

>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));

We could explain that here, e.g.:

} __attribute__ ((packed, aligned(32)));	/* For AVX2 checksum routines */

>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp4_l2_buf[TCP_FRAMES_MEM];
> -
> -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_buf_used;
>  
>  /**
> - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * struct tcp_flags_t - TCP header and data to send option flags

Actually, there's no such thing as "option flags", so, similarly to my
remark above, what about:

 * struct tcp_flags_t - TCP header and data to send zero-length segments (flags)

?

> + * @th:		TCP header
> + * @opts	TCP option flags

I'd say "TCP options".

>   */
> -struct tcp6_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14				2 */
> -	struct ipv6hdr ip6h;	/* 32				20 */
> -	struct tcphdr th;	/* 72				60 */
> -	uint8_t data[MSS6];	/* 92				80 */
> -				/* 65536			65532 */
> +struct tcp_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp6_l2_buf[TCP_FRAMES_MEM];
>  
> -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
> +
> +static uint32_t			tcp4_payload_vnet_len[TCP_FRAMES_MEM];
> +/* IPv4 headers */
> +static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv4 frames */

Well, TCP headers and data make up segments, so perhaps:

/* TCP segments with payload for IPv4 frames */

...which is easier to understand as a distinction from tcp4_flags[]
below:

> +static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_payload_used;
> +
> +static uint32_t			tcp4_flags_vnet_len[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP option flags frames */
> +static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv4 frames */

/* TCP segments without payload for IPv4 frames */

?

> +static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp4_flags_used;
>  
> -static unsigned int tcp6_l2_buf_used;
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
> +
> +static uint32_t			tcp6_payload_vnet_len[TCP_FRAMES_MEM];
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> +
> +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp6_payload_used;
> +
> +static uint32_t			tcp6_flags_vnet_len[TCP_FRAMES_MEM];
> +/* IPv6 headers for TCP option flags frames */
> +static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv6 frames */
> +static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];

...same here and above (three occurrences).

> +
> +static unsigned int tcp6_flags_used;
>  
>  /* recvmsg()/sendmsg() data for tap */
>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
> +/*
> + * enum tcp_iov_parts - I/O vector parts for one TCP frame
> + * @TCP_IOV_TAP		TAP header

It's kind of funny that TCP_IOV_TAP is only used for passt, which,
unlike pasta, does *not* use a tap device.

I haven't really looked into how complicated this is at this point,
but now that this is finally split out of the "Ethernet" part, could
we call this IOV_VLEN or suchlike, and rename all related references?

> + * @TCP_IOV_ETH		ethernet header

s/ethernet/Ethernet/

> + * @TCP_IOV_IP		IP (v4/v6) header
> + * @TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * @TCP_NUM_IOVS 	the number of entries in the iovec array
> + */
> +enum tcp_iov_parts {
> +	TCP_IOV_TAP	= 0,
> +	TCP_IOV_ETH	= 1,
> +	TCP_IOV_IP	= 2,
> +	TCP_IOV_PAYLOAD	= 3,
> +	TCP_NUM_IOVS
> +};
> +
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> -/**
> - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp4_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26				2 */
> -	struct iphdr iph;	/* 44				20 */
> -	struct tcphdr th;	/* 64				40 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp4_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_flags_buf_used;
> -
> -/**
> - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp6_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
> -#endif
> -	struct tap_hdr taph;	/* 14					   2 */
> -	struct ipv6hdr ip6h;	/* 32					  20 */
> -	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_l2_flags_buf_used;
> -
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>  
>  /* Table for lookup from remote address, local port, remote port */
> @@ -967,25 +910,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>  }
>  
>  /**
> - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
> + * tcp_update_l2_buf() - Update ethernet header buffers with addresses

s/ethernet/Ethernet/

>   * @eth_d:	Ethernet destination address, NULL if unchanged
>   * @eth_s:	Ethernet source address, NULL if unchanged
>   */
>  void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  {
> -	int i;
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
> -		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
> -		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
> -		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
> -
> -		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
> -	}
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
>  }
>  
>  /**
> @@ -995,29 +927,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  static void tcp_sock4_iov_init(const struct ctx *c)
>  {
>  	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
> -		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = iph,
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);

An extra newline here would be nice, as we're switching to Layer-3
headers now.

> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {

The existing version has two advantages over using i < TCP_FRAMES_MEM:
the array sizes don't necessarily need to match (even though they do),
but perhaps more importantly ARRAY_SIZE() clearly shows on what the
loop boundary is based on.

And two loops is probably a bit clearer than having one, because this
gives the impression that the two arrays are somewhat linked (same
amount, so it almost looks like there are header buffers first and
payload buffers next).

> +		struct iovec *iov;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
> -		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
> -		};
> -	}
> +		/* headers */
> +		tcp4_payload_ip[i] = iph;
> +		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_payload[i].th.ack = 1;
> +
> +		tcp4_flags_ip[i] = iph;
> +		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp4_flags[i].th.ack = 1;
>  
> -	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp4_l2_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp4_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
>  
> -	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph);
> +		iov = tcp4_l2_flags_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp4_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1026,29 +971,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>   */
>  static void tcp_sock6_iov_init(const struct ctx *c)
>  {
> -	struct iovec *iov;
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
>  	int i;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
> -		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {

Same here.

> +		struct iovec *iov;
>  
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
> -		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
> -		};
> -	}
> +		/* headers */
> +		tcp6_payload_ip[i] = ip6;
> +		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_payload[i].th.ack = 1;
> +
> +		tcp6_flags_ip[i] = ip6;
> +		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> +		tcp6_flags[i].th .ack = 1;
>  
> -	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp6_l2_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp6_payload_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
>  
> -	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph);
> +		iov = tcp6_l2_flags_iov[i];
> +		iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i];
> +		iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ?
> +					   sizeof(tcp6_flags_vnet_len[i]) : 0;
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> +	}
>  }
>  
>  /**
> @@ -1284,36 +1243,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
>  	} while (0)
>  
>  /**
> - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
> + * tcp_flags_flush() - Send out buffers for segments with no data (flags)
>   * @c:		Execution context
>   */
> -static void tcp_l2_flags_buf_flush(const struct ctx *c)
> +static void tcp_flags_flush(const struct ctx *c)
>  {
> -	tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used);
> -	tcp6_l2_flags_buf_used = 0;
> +	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp6_flags_used);
> +	tcp6_flags_used = 0;
>  
> -	tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used);
> -	tcp4_l2_flags_buf_used = 0;
> +	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> +			tcp4_flags_used);
> +	tcp4_flags_used = 0;
>  }
>  
>  /**
> - * tcp_l2_data_buf_flush() - Send out buffers for segments with data
> + * tcp_payload_flush() - Send out buffers for segments with data
>   * @c:		Execution context
>   */
> -static void tcp_l2_data_buf_flush(const struct ctx *c)
> +static void tcp_payload_flush(const struct ctx *c)
>  {
>  	unsigned i;
>  	size_t m;
>  
> -	m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used);
> +	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp6_payload_used);
>  	for (i = 0; i < m; i++)
> -		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
> -	tcp6_l2_buf_used = 0;
> +		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	tcp6_payload_used = 0;
>  
> -	m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used);
> +	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> +			    tcp4_payload_used);
>  	for (i = 0; i < m; i++)
> -		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
> -	tcp4_l2_buf_used = 0;
> +		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	tcp4_payload_used = 0;
>  }
>  
>  /**
> @@ -1323,8 +1286,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
>  /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
>  void tcp_defer_handler(struct ctx *c)
>  {
> -	tcp_l2_flags_buf_flush(c);
> -	tcp_l2_data_buf_flush(c);
> +	tcp_flags_flush(c);
> +	tcp_payload_flush(c);
>  }
>  
>  /**
> @@ -1433,35 +1396,31 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
>   * @c:		Execution context
>   * @conn:	Connection pointer
> - * @p:		Pointer to any type of TCP pre-cooked buffer
> + * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer

s/buffer/buffers/ now.

>   * @plen:	Payload length (including TCP header options)
>   * @check:	Checksum, if already known
>   * @seq:	Sequence number for this segment
>   *
> - * Return: frame length including L2 headers, host order
> + * Return: IP payload length, host order
>   */
>  static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
>  				      const struct tcp_tap_conn *conn,
> -				      void *p, size_t plen,
> +				      struct iovec *iov, size_t plen,
>  				      const uint16_t *check, uint32_t seq)
>  {
>  	const struct in_addr *a4 = inany_v4(&conn->faddr);
>  	size_t ip_len, tlen;
>  
>  	if (a4) {
> -		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
> +		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   check, seq);
> -
> -		tlen = tap_frame_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct iphdr);
>  	} else {
> -		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
> +		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   seq);
> -
> -		tlen = tap_frame_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct ipv6hdr);
>  	}
>  
>  	return tlen;
> @@ -1595,16 +1554,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
> -	struct tcp4_l2_flags_buf_t *b4 = NULL;
> -	struct tcp6_l2_flags_buf_t *b6 = NULL;
> +	struct tcp_flags_t *payload;
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
> +	uint32_t vnet_len;
>  	size_t optlen = 0;
> -	struct iovec *iov;
>  	struct tcphdr *th;
> +	struct iovec *iov;
> +	size_t ip_len;
>  	char *data;
> -	void *p;
>  
>  	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
>  	    !flags && conn->wnd_to_tap)
> @@ -1627,19 +1586,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		return 0;
>  
>  	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
> -		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
> -		th = &b4->th;
> -
> -		/* gcc 11.2 would complain on data = (char *)(th + 1); */
> -		data = b4->opts;
> +		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
>  	} else {
> -		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
> -		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
> -		th = &b6->th;
> -		data = b6->opts;
> +		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +		vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
>  	}
>  
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +	th = &payload->th;
> +	data = payload->opts;
> +
>  	if (flags & SYN) {
>  		int mss;
>  
> @@ -1688,8 +1645,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	th->syn = !!(flags & SYN);
>  	th->fin = !!(flags & FIN);
>  
> -	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
> -					       NULL, conn->seq_to_tap);
> +	ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
> +					 conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +
> +	*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len);
>  
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
> @@ -1705,24 +1665,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (th->fin || th->syn)
>  		conn->seq_to_tap++;
>  
> -	if (CONN_V4(conn)) {
> -		if (flags & DUP_ACK) {
> -			memcpy(b4 + 1, b4, sizeof(*b4));
> -			(iov + 1)->iov_len = iov->iov_len;
> -			tcp4_l2_flags_buf_used++;
> -		}
> +	if (flags & DUP_ACK) {
> +		struct iovec *dup_iov;
> +		int i;
>  
> -		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
> -			tcp_l2_flags_buf_flush(c);
> -	} else {
> -		if (flags & DUP_ACK) {
> -			memcpy(b6 + 1, b6, sizeof(*b6));
> -			(iov + 1)->iov_len = iov->iov_len;
> -			tcp6_l2_flags_buf_used++;
> -		}
> +		if (CONN_V4(conn))
> +			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		else
> +			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
>  
> -		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
> -			tcp_l2_flags_buf_flush(c);
> +		for (i = 0; i < TCP_NUM_IOVS; i++)
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> +	}
> +
> +	if (CONN_V4(conn)) {
> +		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	} else {
> +		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
>  	}
>  
>  	return 0;
> @@ -2169,30 +2132,43 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  {
>  	uint32_t *seq_update = &conn->seq_to_tap;
>  	struct iovec *iov;
> +	size_t ip_len;
>  
>  	if (CONN_V4(conn)) {
> -		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
> -		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> +		const uint16_t *check = NULL;
>  
> -		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
> -		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
>  
> -		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       check, seq);
> -		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
> -			tcp_l2_data_buf_flush(c);
> +		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> +		tcp4_seq_update[tcp4_payload_used].len = plen;
> +
> +		iov = tcp4_l2_iov[tcp4_payload_used++];
> +		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
> +						 seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> +						htonl(sizeof(struct ethhdr) +
> +						      sizeof(struct iphdr) +
> +						      ip_len);

A bit less cramped:

		uint32_t vnet_len;

		[...]

		vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr) +
			   ip_len;
		*(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len);

> +		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
> -
> -		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
> -		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
> +		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> +		tcp6_seq_update[tcp6_payload_used].len = plen;
>  
> -		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       NULL, seq);
> -		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
> -			tcp_l2_data_buf_flush(c);
> +		iov = tcp6_l2_iov[tcp6_payload_used++];
> +		ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
> +		*(uint32_t *)iov[TCP_IOV_TAP].iov_base =
> +						htonl(sizeof(struct ethhdr) +
> +						      sizeof(struct ipv6hdr) +
> +						      ip_len);
> +		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
>  	}
>  }
>  
> @@ -2247,19 +2223,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	iov_sock[0].iov_base = tcp_buf_discard;
>  	iov_sock[0].iov_len = already_sent;
>  
> -	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
> -	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
> -		tcp_l2_data_buf_flush(c);
> +	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> +		tcp_payload_flush(c);
>  
>  		/* Silence Coverity CWE-125 false positive */
> -		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
> +		tcp4_payload_used = tcp6_payload_used = 0;
>  	}
>  
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
>  		if (v4)
> -			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
> +			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
>  		else
> -			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
> +			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)
> @@ -2304,7 +2280,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	plen = mss;
>  	seq = conn->seq_to_tap;
>  	for (i = 0; i < send_bufs; i++) {
> -		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
> +		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
>  
>  		if (i == send_bufs - 1)
>  			plen = last_len;
> diff --git a/util.h b/util.h
> index 48f35604df92..0069df34a5d2 100644
> --- a/util.h
> +++ b/util.h
> @@ -31,6 +31,9 @@
>  #ifndef ETH_MIN_MTU
>  #define ETH_MIN_MTU			68
>  #endif
> +#ifndef IP_MAX_MTU
> +#define IP_MAX_MTU			USHRT_MAX
> +#endif
>  
>  #ifndef MIN
>  #define MIN(x, y)		(((x) < (y)) ? (x) : (y))

...everything else looks good to me!

-- 
Stefano


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-04-12 18:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-21 10:26 [RFC v4] tcp: Replace TCP buffer structure by an iovec array Laurent Vivier
2024-03-22  2:40 ` David Gibson
2024-03-26 10:19 ` Laurent Vivier
2024-03-27  1:35   ` David Gibson
2024-04-12 17:59 ` Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).