* [PATCH v4 1/4] tcp: Encode checksum computation flags in a single parameter
2026-04-01 21:55 [PATCH v4 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element Laurent Vivier
@ 2026-04-01 21:55 ` Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 2/4] tcp_vu: Build headers on the stack and write them into the iovec Laurent Vivier
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Laurent Vivier @ 2026-04-01 21:55 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
tcp_fill_headers() takes a pointer to a previously computed IPv4 header
checksum to avoid recalculating it when the payload length doesn't
change, and a separate bool to skip TCP checksum computation.
Replace both parameters with a single uint32_t csum_flags that encodes:
- IP4_CSUM (bit 31): compute IPv4 header checksum from scratch
- TCP_CSUM (bit 30): compute TCP checksum
- IP4_CMASK (low 16 bits): cached IPv4 header checksum value
When IP4_CSUM is not set, the cached checksum is extracted from the low
16 bits. This is cleaner than the pointer-based approach, and also
avoids a potential dangling pointer issue: a subsequent patch makes
tcp_fill_headers() access ip4h via with_header(), which scopes it to a
temporary variable, so a pointer to ip4h->check would become invalid
after the with_header() block.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp.c | 25 +++++++++++++------------
tcp_buf.c | 23 ++++++++++++-----------
tcp_internal.h | 7 +++++--
tcp_vu.c | 28 +++++++++++++++++-----------
4 files changed, 47 insertions(+), 36 deletions(-)
diff --git a/tcp.c b/tcp.c
index 6b0e25f33bf1..680b1afa2521 100644
--- a/tcp.c
+++ b/tcp.c
@@ -946,9 +946,10 @@ static void tcp_fill_header(struct tcphdr *th,
* @th: Pointer to TCP header
* @payload: TCP payload
* @dlen: TCP payload length
- * @ip4_check: IPv4 checksum, if already known
+ * @csum_flags: TCP_CSUM if TCP checksum must be computed,
+ * IP4_CSUM if IPv4 checksum must be computed,
+ * otherwise IPv4 checksum is provided in IP4_CMASK
* @seq: Sequence number for this segment
- * @no_tcp_csum: Do not set TCP checksum
*
* Return: frame length (including L2 headers)
*/
@@ -956,8 +957,7 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
- size_t dlen, const uint16_t *ip4_check, uint32_t seq,
- bool no_tcp_csum)
+ size_t dlen, uint32_t csum_flags, uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = dlen + sizeof(*th);
@@ -977,13 +977,14 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
ip4h->saddr = src4->s_addr;
ip4h->daddr = dst4->s_addr;
- if (ip4_check)
- ip4h->check = *ip4_check;
- else
+ if (csum_flags & IP4_CSUM) {
ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP,
*src4, *dst4);
+ } else {
+ ip4h->check = csum_flags & IP4_CMASK;
+ }
- if (!no_tcp_csum) {
+ if (csum_flags & TCP_CSUM) {
psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
*src4, *dst4);
}
@@ -1003,7 +1004,7 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
ip6_set_flow_lbl(ip6h, conn->sock);
- if (!no_tcp_csum) {
+ if (csum_flags & TCP_CSUM) {
psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
&ip6h->saddr,
&ip6h->daddr);
@@ -1018,10 +1019,10 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_fill_header(th, conn, seq);
- if (no_tcp_csum)
- th->check = 0;
- else
+ if (csum_flags & TCP_CSUM)
tcp_update_csum(psum, th, payload, l4len);
+ else
+ th->check = 0;
return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN);
}
diff --git a/tcp_buf.c b/tcp_buf.c
index 27151854033c..a27d9733616c 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -166,14 +166,15 @@ static void tcp_l2_buf_pad(struct iovec *iov)
* @c: Execution context
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
- * @check: Checksum, if already known
+ * @csum_flags: TCP_CSUM if TCP checksum must be computed,
+ * IP4_CSUM if IPv4 checksum must be computed,
+ * otherwise IPv4 checksum is provided in IP4_CMASK
* @seq: Sequence number for this segment
- * @no_tcp_csum: Do not set TCP checksum
*/
static void tcp_l2_buf_fill_headers(const struct ctx *c,
struct tcp_tap_conn *conn,
- struct iovec *iov, const uint16_t *check,
- uint32_t seq, bool no_tcp_csum)
+ struct iovec *iov, uint32_t csum_flags,
+ uint32_t seq)
{
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
struct tcphdr th_storage, *th = IOV_REMOVE_HEADER(&tail, th_storage);
@@ -191,8 +192,7 @@ static void tcp_l2_buf_fill_headers(const struct ctx *c,
ip6h = iov[TCP_IOV_IP].iov_base;
l2len = tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &tail,
- iov_tail_size(&tail), check, seq,
- no_tcp_csum);
+ iov_tail_size(&tail), csum_flags, seq);
tap_hdr_update(taph, l2len);
}
@@ -234,7 +234,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (flags & KEEPALIVE)
seq--;
- tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, IP4_CSUM | TCP_CSUM, seq);
tcp_l2_buf_pad(iov);
@@ -271,7 +271,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq, bool push)
{
struct tcp_payload_t *payload;
- const uint16_t *check = NULL;
+ uint32_t check = IP4_CSUM;
struct iovec *iov;
conn->seq_to_tap = seq + dlen;
@@ -280,9 +280,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (CONN_V4(conn)) {
if (no_csum) {
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
- struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+ const struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
- check = &iph->check;
+ /* overwrite IP4_CSUM flag as we set the checksum */
+ check = iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
} else if (CONN_V6(conn)) {
@@ -296,7 +297,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.ack = 1;
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
- tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, TCP_CSUM | check, seq);
tcp_l2_buf_pad(iov);
diff --git a/tcp_internal.h b/tcp_internal.h
index a0fa19f4ed11..40472c9973c8 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -183,12 +183,15 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
struct tcp_info_linux;
+#define IP4_CSUM 0x80000000
+#define IP4_CMASK 0x0000FFFF
+#define TCP_CSUM 0x40000000
+
size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
- size_t dlen, const uint16_t *ip4_check, uint32_t seq,
- bool no_tcp_csum);
+ size_t dlen, uint32_t csum_flags, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo);
diff --git a/tcp_vu.c b/tcp_vu.c
index cae6926334b9..23d2b62acacb 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -134,7 +134,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
seq--;
tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload,
- optlen, NULL, seq, !*c->pcap);
+ optlen, IP4_CSUM | (*c->pcap ? TCP_CSUM : 0), seq);
vu_pad(flags_elem[0].in_sg, 1, hdrlen + optlen);
vu_flush(vdev, vq, flags_elem, 1, hdrlen + optlen);
@@ -281,13 +281,15 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
* @iov: Pointer to the array of IO vectors
* @iov_cnt: Number of entries in @iov
* @dlen: Data length
- * @check: Checksum, if already known
- * @no_tcp_csum: Do not set TCP checksum
+ * @csum_flags: Pointer to checksum flags (input/output)
+ * TCP_CSUM if TCP checksum must be computed,
+ * IP4_CSUM if IPv4 checksum must be computed,
+ * otherwise IPv4 checksum is provided in IP4_CMASK
* @push: Set PSH flag, last segment in a batch
*/
static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
struct iovec *iov, size_t iov_cnt, size_t dlen,
- const uint16_t **check, bool no_tcp_csum, bool push)
+ uint32_t *csum_flags, bool push)
{
const struct flowside *toside = TAPFLOW(conn);
bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
@@ -331,9 +333,11 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
th->psh = push;
tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload, dlen,
- *check, conn->seq_to_tap, no_tcp_csum);
+ *csum_flags, conn->seq_to_tap);
+
+ /* Preserve TCP_CSUM, overwrite IP4_CSUM as we set the checksum */
if (ip4h)
- *check = &ip4h->check;
+ *csum_flags = (*csum_flags & TCP_CSUM) | ip4h->check;
}
/**
@@ -349,12 +353,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
struct vu_dev *vdev = c->vdev;
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ uint32_t already_sent, check;
ssize_t len, previous_dlen;
int i, iov_cnt, head_cnt;
size_t hdrlen, fillsize;
int v6 = CONN_V6(conn);
- uint32_t already_sent;
- const uint16_t *check;
if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
debug("Got packet, but RX virtqueue not usable yet");
@@ -441,7 +444,10 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
*/
hdrlen = tcp_vu_hdrlen(v6);
- for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) {
+ check = IP4_CSUM;
+ if (*c->pcap)
+ check |= TCP_CSUM;
+ for (i = 0, previous_dlen = -1; i < head_cnt; i++) {
struct iovec *iov = &elem[head[i]].in_sg[0];
int buf_cnt = head[i + 1] - head[i];
size_t frame_size = iov_size(iov, buf_cnt);
@@ -457,10 +463,10 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
/* The IPv4 header checksum varies only with dlen */
if (previous_dlen != dlen)
- check = NULL;
+ check |= IP4_CSUM;
previous_dlen = dlen;
- tcp_vu_prepare(c, conn, iov, buf_cnt, dlen, &check, !*c->pcap, push);
+ tcp_vu_prepare(c, conn, iov, buf_cnt, dlen, &check, push);
vu_pad(elem[head[i]].in_sg, buf_cnt, dlen + hdrlen);
vu_flush(vdev, vq, &elem[head[i]], buf_cnt, dlen + hdrlen);
--
2.53.0
^ permalink raw reply [flat|nested] 5+ messages in thread* [PATCH v4 2/4] tcp_vu: Build headers on the stack and write them into the iovec
2026-04-01 21:55 [PATCH v4 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 1/4] tcp: Encode checksum computation flags in a single parameter Laurent Vivier
@ 2026-04-01 21:55 ` Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 3/4] tcp_vu: Support multibuffer frames in tcp_vu_sock_recv() Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 4/4] tcp_vu: Support multibuffer frames in tcp_vu_send_flag() Laurent Vivier
3 siblings, 0 replies; 5+ messages in thread
From: Laurent Vivier @ 2026-04-01 21:55 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
tcp_vu_prepare() currently assumes the first iovec element provided by
the guest is large enough to hold all L2-L4 headers, and builds them
in place via pointer casts into iov[0].iov_base. This assumption is
enforced by an assert().
Since the headers in the buffer are uninitialized anyway, we can just
as well build the Ethernet, IP, and TCP headers on the stack instead,
and write them into the iovec with IOV_PUSH_HEADER(). This mirrors the
approach already used in udp_vu_prepare(), and prepares for support of
elements with multiple iovecs.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp_vu.c | 60 ++++++++++++++++++++++++--------------------------------
1 file changed, 26 insertions(+), 34 deletions(-)
diff --git a/tcp_vu.c b/tcp_vu.c
index 23d2b62acacb..484f60774448 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -295,49 +295,41 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
size_t hdrlen = tcp_vu_hdrlen(v6);
struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen);
- char *base = iov[0].iov_base;
- struct ipv6hdr *ip6h = NULL;
- struct iphdr *ip4h = NULL;
- struct tcphdr *th;
- struct ethhdr *eh;
-
- /* we guess the first iovec provided by the guest can embed
- * all the headers needed by L2 frame, including any padding
- */
- assert(iov[0].iov_len >= hdrlen);
+ struct ipv6hdr ip6h;
+ struct iphdr ip4h;
+ struct tcphdr th;
+ struct ethhdr eh;
- eh = vu_eth(base);
-
- memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
+ memcpy(eh.h_dest, c->guest_mac, sizeof(eh.h_dest));
/* initialize header */
- if (!v6) {
- eh->h_proto = htons(ETH_P_IP);
-
- ip4h = vu_ip(base);
- *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
- th = vu_payloadv4(base);
- } else {
- eh->h_proto = htons(ETH_P_IPV6);
+ if (!v6)
+ ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+ else
+ ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
- ip6h = vu_ip(base);
- *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+ memset(&th, 0, sizeof(th));
+ th.doff = sizeof(th) / 4;
+ th.ack = 1;
+ th.psh = push;
- th = vu_payloadv6(base);
- }
+ tcp_fill_headers(c, conn, &eh, v6 ? NULL : &ip4h, v6 ? &ip6h : NULL, &th,
+ &payload, dlen, *csum_flags, conn->seq_to_tap);
- memset(th, 0, sizeof(*th));
- th->doff = sizeof(*th) / 4;
- th->ack = 1;
- th->psh = push;
+ /* Preserve TCP_CSUM, overwrite IP4_CSUM as we set the checksum */
+ if (!v6)
+ *csum_flags = (*csum_flags & TCP_CSUM) | ip4h.check;
- tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload, dlen,
- *csum_flags, conn->seq_to_tap);
+ /* write headers */
+ payload = IOV_TAIL(iov, iov_cnt, VNET_HLEN);
- /* Preserve TCP_CSUM, overwrite IP4_CSUM as we set the checksum */
- if (ip4h)
- *csum_flags = (*csum_flags & TCP_CSUM) | ip4h->check;
+ IOV_PUSH_HEADER(&payload, eh);
+ if (!v6)
+ IOV_PUSH_HEADER(&payload, ip4h);
+ else
+ IOV_PUSH_HEADER(&payload, ip6h);
+ IOV_PUSH_HEADER(&payload, th);
}
/**
--
2.53.0
^ permalink raw reply [flat|nested] 5+ messages in thread* [PATCH v4 3/4] tcp_vu: Support multibuffer frames in tcp_vu_sock_recv()
2026-04-01 21:55 [PATCH v4 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 1/4] tcp: Encode checksum computation flags in a single parameter Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 2/4] tcp_vu: Build headers on the stack and write them into the iovec Laurent Vivier
@ 2026-04-01 21:55 ` Laurent Vivier
2026-04-01 21:55 ` [PATCH v4 4/4] tcp_vu: Support multibuffer frames in tcp_vu_send_flag() Laurent Vivier
3 siblings, 0 replies; 5+ messages in thread
From: Laurent Vivier @ 2026-04-01 21:55 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Previously, tcp_vu_sock_recv() assumed a 1:1 mapping between virtqueue
elements and iovecs (one iovec per element), enforced by an ASSERT.
This prevented the use of virtqueue elements with multiple buffers
(e.g. when mergeable rx buffers are not negotiated and headers are
provided in a separate buffer).
Introduce a struct vu_frame to track per-frame metadata: the range of
elements and iovecs that make up each frame, and the frame's total size.
This replaces the head[] array which only tracked element indices.
A separate iov_msg[] array is built for recvmsg() by cloning the data
portions (after stripping headers) using iov_tail helpers.
The frame-truncation logic after recvmsg() then walks the frame and
element arrays to adjust the iovec and element counts accordingly.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp_vu.c | 174 ++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 113 insertions(+), 61 deletions(-)
diff --git a/tcp_vu.c b/tcp_vu.c
index 484f60774448..4172307618c3 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -35,9 +35,24 @@
#include "vu_common.h"
#include <time.h>
-static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
-static int head[VIRTQUEUE_MAX_SIZE + 1];
+
+/**
+ * struct vu_frame - Descriptor for a TCP frame mapped to virtqueue elements
+ * @idx_element: Index of first element in elem[] for this frame
+ * @num_element: Number of virtqueue elements used by this frame
+ * @idx_iovec: Index of first iovec in iov_vu[] for this frame
+ * @num_iovec: Number of iovecs covering this frame's buffers
+ * @size: Total frame size including all headers
+ */
+static struct vu_frame {
+ int idx_element;
+ int num_element;
+ int idx_iovec;
+ int num_iovec;
+ size_t size;
+} frame[VIRTQUEUE_MAX_SIZE];
/**
* tcp_vu_hdrlen() - Sum size of all headers, from TCP to virtio-net
@@ -173,8 +188,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @v6: Set for IPv6 connections
* @already_sent: Number of bytes already sent
* @fillsize: Maximum bytes to fill in guest-side receiving window
- * @iov_cnt: number of iov (output)
- * @head_cnt: Pointer to store the count of head iov entries (output)
+ * @elem_used: number of element (output)
+ * @frame_cnt: Pointer to store the number of frames (output)
*
* Return: number of bytes received from the socket, or a negative error code
* on failure.
@@ -182,57 +197,77 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
const struct tcp_tap_conn *conn, bool v6,
uint32_t already_sent, size_t fillsize,
- int *iov_cnt, int *head_cnt)
+ int *elem_used, int *frame_cnt)
{
+ static struct iovec iov_msg[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
const struct vu_dev *vdev = c->vdev;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
size_t hdrlen, iov_used;
int s = conn->sock;
+ ssize_t ret, dlen;
int elem_cnt;
- ssize_t ret;
- int i;
-
- *iov_cnt = 0;
+ int i, j;
hdrlen = tcp_vu_hdrlen(v6);
+ *elem_used = 0;
+
iov_used = 0;
elem_cnt = 0;
- *head_cnt = 0;
+ *frame_cnt = 0;
while (fillsize > 0 && elem_cnt < ARRAY_SIZE(elem) &&
- iov_used < VIRTQUEUE_MAX_SIZE) {
- size_t frame_size, dlen, in_total;
- struct iovec *iov;
+ iov_used < ARRAY_SIZE(iov_vu) &&
+ *frame_cnt < ARRAY_SIZE(frame)) {
+ size_t frame_size, in_total;
int cnt;
cnt = vu_collect(vdev, vq, &elem[elem_cnt],
ARRAY_SIZE(elem) - elem_cnt,
- &iov_vu[DISCARD_IOV_NUM + iov_used],
- VIRTQUEUE_MAX_SIZE - iov_used, &in_total,
+ &iov_vu[iov_used],
+ ARRAY_SIZE(iov_vu) - iov_used, &in_total,
MIN(mss, fillsize) + hdrlen,
&frame_size);
if (cnt == 0)
break;
- assert((size_t)cnt == in_total); /* one iovec per element */
+
+ frame[*frame_cnt].idx_element = elem_cnt;
+ frame[*frame_cnt].num_element = cnt;
+ frame[*frame_cnt].idx_iovec = iov_used;
+ frame[*frame_cnt].num_iovec = in_total;
+ frame[*frame_cnt].size = frame_size;
+ (*frame_cnt)++;
iov_used += in_total;
- dlen = frame_size - hdrlen;
+ elem_cnt += cnt;
- /* reserve space for headers in iov */
- iov = &elem[elem_cnt].in_sg[0];
- assert(iov->iov_len >= hdrlen);
- iov->iov_base = (char *)iov->iov_base + hdrlen;
- iov->iov_len -= hdrlen;
- head[(*head_cnt)++] = elem_cnt;
+ fillsize -= frame_size - hdrlen;
+ }
- fillsize -= dlen;
- elem_cnt += cnt;
+ /* build an iov array without headers */
+ for (i = 0, j = DISCARD_IOV_NUM; i < *frame_cnt &&
+ j < ARRAY_SIZE(iov_msg); i++) {
+ struct iov_tail data;
+ ssize_t cnt;
+
+ data = IOV_TAIL(&iov_vu[frame[i].idx_iovec],
+ frame[i].num_iovec, 0);
+ iov_drop_header(&data, hdrlen);
+
+ cnt = iov_tail_clone(&iov_msg[j], ARRAY_SIZE(iov_msg) - j,
+ &data);
+ if (cnt == -1)
+ die("Missing entries in iov_msg");
+
+ j += cnt;
}
- if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
+ if (tcp_prepare_iov(&mh_sock, iov_msg, already_sent,
+ j - DISCARD_IOV_NUM)) {
/* Expect caller to do a TCP reset */
+ vu_queue_rewind(vq, elem_cnt);
return -1;
+ }
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -246,32 +281,50 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
if (!peek_offset_cap)
ret -= already_sent;
- i = iov_skip_bytes(&iov_vu[DISCARD_IOV_NUM], iov_used,
- MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN),
- NULL);
- if ((size_t)i < iov_used)
- i++;
+ dlen = ret;
- /* adjust head count */
- while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
- (*head_cnt)--;
+ /* truncate frame */
+ *elem_used = 0;
+ for (i = 0; i < *frame_cnt; i++) {
+ struct vu_frame *f = &frame[i];
- /* mark end of array */
- head[*head_cnt] = i;
- *iov_cnt = i;
+ if ((size_t)ret <= f->size - hdrlen) {
+ unsigned cnt;
- /* release unused buffers */
- vu_queue_rewind(vq, elem_cnt - i);
+ cnt = iov_skip_bytes(&iov_vu[f->idx_iovec], f->num_iovec,
+ MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN),
+ NULL);
+ if (cnt < (unsigned)f->num_iovec)
+ cnt++;
+
+ f->size = ret + hdrlen;
+ f->num_iovec = cnt;
- /* restore space for headers in iov */
- for (i = 0; i < *head_cnt; i++) {
- struct iovec *iov = &elem[head[i]].in_sg[0];
+ for (j = 0; j < f->num_element; j++) {
+ struct vu_virtq_element *e;
- iov->iov_base = (char *)iov->iov_base - hdrlen;
- iov->iov_len += hdrlen;
+ e = &elem[f->idx_element + j];
+ if (cnt <= e->in_num) {
+ e->in_num = cnt;
+ j++;
+ break;
+ }
+ cnt -= e->in_num;
+ }
+ f->num_element = j;
+ *elem_used += j;
+ i++;
+ break;
+ }
+ *elem_used += f->num_element;
+ ret -= f->size - hdrlen;
}
+ *frame_cnt = i;
- return ret;
+ /* release unused buffers */
+ vu_queue_rewind(vq, elem_cnt - *elem_used);
+
+ return dlen;
}
/**
@@ -347,7 +400,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
uint32_t already_sent, check;
ssize_t len, previous_dlen;
- int i, iov_cnt, head_cnt;
+ int i, elem_cnt, frame_cnt;
size_t hdrlen, fillsize;
int v6 = CONN_V6(conn);
@@ -385,7 +438,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
* data from the socket
*/
len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize,
- &iov_cnt, &head_cnt);
+ &elem_cnt, &frame_cnt);
if (len < 0) {
if (len != -EAGAIN && len != -EWOULDBLOCK) {
tcp_rst(c, conn);
@@ -399,6 +452,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (!len) {
+ vu_queue_rewind(vq, elem_cnt);
if (already_sent) {
conn_flag(c, conn, STALLED);
} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
@@ -439,32 +493,30 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
check = IP4_CSUM;
if (*c->pcap)
check |= TCP_CSUM;
- for (i = 0, previous_dlen = -1; i < head_cnt; i++) {
- struct iovec *iov = &elem[head[i]].in_sg[0];
- int buf_cnt = head[i + 1] - head[i];
- size_t frame_size = iov_size(iov, buf_cnt);
- bool push = i == head_cnt - 1;
+ for (i = 0, previous_dlen = -1; i < frame_cnt; i++) {
+ struct iovec *iov = &iov_vu[frame[i].idx_iovec];
+ int iov_cnt = frame[i].num_iovec;
+ bool push = i == frame_cnt - 1;
ssize_t dlen;
- assert(frame_size >= hdrlen);
+ assert(frame[i].size >= hdrlen);
- dlen = frame_size - hdrlen;
- if (dlen > len)
- dlen = len;
- len -= dlen;
+ dlen = frame[i].size - hdrlen;
/* The IPv4 header checksum varies only with dlen */
if (previous_dlen != dlen)
check |= IP4_CSUM;
previous_dlen = dlen;
- tcp_vu_prepare(c, conn, iov, buf_cnt, dlen, &check, push);
+ tcp_vu_prepare(c, conn, iov, iov_cnt, dlen, &check, push);
- vu_pad(elem[head[i]].in_sg, buf_cnt, dlen + hdrlen);
- vu_flush(vdev, vq, &elem[head[i]], buf_cnt, dlen + hdrlen);
+ vu_pad(&iov[frame[i].idx_iovec], frame[i].num_iovec,
+ dlen + hdrlen);
+ vu_flush(vdev, vq, &elem[frame[i].idx_element],
+ frame[i].num_element, dlen + hdrlen);
if (*c->pcap)
- pcap_iov(iov, buf_cnt, VNET_HLEN,
+ pcap_iov(iov, iov_cnt, VNET_HLEN,
dlen + hdrlen - VNET_HLEN);
conn->seq_to_tap += dlen;
--
2.53.0
^ permalink raw reply [flat|nested] 5+ messages in thread* [PATCH v4 4/4] tcp_vu: Support multibuffer frames in tcp_vu_send_flag()
2026-04-01 21:55 [PATCH v4 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element Laurent Vivier
` (2 preceding siblings ...)
2026-04-01 21:55 ` [PATCH v4 3/4] tcp_vu: Support multibuffer frames in tcp_vu_sock_recv() Laurent Vivier
@ 2026-04-01 21:55 ` Laurent Vivier
3 siblings, 0 replies; 5+ messages in thread
From: Laurent Vivier @ 2026-04-01 21:55 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Build the Ethernet, IP, and TCP headers on the stack instead of
directly in the buffer via pointer casts, then write them into the
iovec with IOV_PUSH_HEADER(). This mirrors the approach already used
in tcp_vu_prepare() and udp_vu_prepare().
Remove the vu_eth(), vu_ip(), vu_payloadv4() and vu_payloadv6() helpers
from vu_common.h, as they are no longer used anywhere.
Introduce tcp_vu_send_dup() to handle DUP_ACK duplication using
vu_collect() and iov_memcopy() instead of a plain memcpy(), so that
the duplicated frame is also properly scattered across multiple iovecs.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
iov.c | 1 -
tcp_vu.c | 145 ++++++++++++++++++++++++++++++----------------------
vu_common.h | 20 --------
3 files changed, 85 insertions(+), 81 deletions(-)
diff --git a/iov.c b/iov.c
index d5fb4e81a502..c103ba9c3250 100644
--- a/iov.c
+++ b/iov.c
@@ -208,7 +208,6 @@ void iov_memset(const struct iovec *iov, size_t iov_cnt, size_t offset, int c,
*
* Return: total number of bytes copied
*/
-/* cppcheck-suppress unusedFunction */
size_t iov_memcopy(struct iovec *dst_iov, size_t dst_iov_cnt, size_t dst_offs,
const struct iovec *iov, size_t iov_cnt, size_t offs,
size_t length)
diff --git a/tcp_vu.c b/tcp_vu.c
index 4172307618c3..1927b14e0962 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -74,6 +74,44 @@ static size_t tcp_vu_hdrlen(bool v6)
return hdrlen;
}
+/**
+ * tcp_vu_send_dup() - Duplicate a frame into a new virtqueue element
+ * @c: Execution context
+ * @vq: Receive virtqueue
+ * @dest_elem: Destination virtqueue element to collect
+ * @dest_iov: Destination iovec array for collected buffers
+ * @max_dest_iov: Maximum number of entries in @dest_iov
+ * @src_iov: Source iovec array containing the frame to duplicate
+ * @src_cnt: Number of entries in @src_iov
+ *
+ * Return: number of virtqueue elements collected (0 if none available)
+ */
+static int tcp_vu_send_dup(const struct ctx *c, struct vu_virtq *vq,
+ struct vu_virtq_element *dest_elem,
+ struct iovec *dest_iov, size_t max_dest_iov,
+ const struct iovec *src_iov, size_t src_cnt,
+ size_t vnlen)
+{
+ const struct vu_dev *vdev = c->vdev;
+ size_t dest_cnt;
+ int elem_cnt;
+
+ elem_cnt = vu_collect(vdev, vq, dest_elem, 1, dest_iov, max_dest_iov,
+ &dest_cnt, vnlen, NULL);
+ if (elem_cnt == 0)
+ return 0;
+
+ iov_memcopy(dest_iov, dest_cnt, 0, src_iov, src_cnt, 0,
+ MAX(VNET_HLEN + ETH_ZLEN, vnlen));
+
+ vu_flush(vdev, vq, dest_elem, elem_cnt, vnlen);
+
+ if (*c->pcap)
+ pcap_iov(dest_iov, dest_cnt, VNET_HLEN, vnlen - VNET_HLEN);
+
+ return elem_cnt;
+}
+
/**
* tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
* @c: Execution context
@@ -86,15 +124,15 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
struct vu_dev *vdev = c->vdev;
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ size_t optlen, hdrlen, iov_cnt, iov_used;
struct vu_virtq_element flags_elem[2];
- struct ipv6hdr *ip6h = NULL;
- struct iphdr *ip4h = NULL;
- struct iovec flags_iov[2];
- struct tcp_syn_opts *opts;
+ struct iovec flags_iov[64];
+ struct tcp_syn_opts opts;
struct iov_tail payload;
- size_t optlen, hdrlen;
- struct tcphdr *th;
- struct ethhdr *eh;
+ struct ipv6hdr ip6h;
+ struct iphdr ip4h;
+ struct tcphdr th;
+ struct ethhdr eh;
uint32_t seq;
int elem_cnt;
int ret;
@@ -102,79 +140,66 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
hdrlen = tcp_vu_hdrlen(CONN_V6(conn));
elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
- &flags_iov[0], 1, NULL,
- hdrlen + sizeof(*opts), NULL);
- if (elem_cnt != 1)
+ flags_iov, ARRAY_SIZE(flags_iov), &iov_cnt,
+ hdrlen + sizeof(opts), NULL);
+ if (elem_cnt == 0)
return -1;
- assert(flags_elem[0].in_num == 1);
- assert(flags_elem[0].in_sg[0].iov_len >=
- MAX(hdrlen + sizeof(*opts), ETH_ZLEN + VNET_HLEN));
-
- eh = vu_eth(flags_elem[0].in_sg[0].iov_base);
-
- memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
-
- if (CONN_V4(conn)) {
- eh->h_proto = htons(ETH_P_IP);
-
- ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base);
- *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
-
- th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
- } else {
- eh->h_proto = htons(ETH_P_IPV6);
+ memcpy(eh.h_dest, c->guest_mac, sizeof(eh.h_dest));
- ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
- *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
- th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
- }
+ if (CONN_V4(conn))
+ ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+ else
+ ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
- memset(th, 0, sizeof(*th));
- th->doff = sizeof(*th) / 4;
- th->ack = 1;
+ memset(&th, 0, sizeof(th));
seq = conn->seq_to_tap;
- opts = (struct tcp_syn_opts *)(th + 1);
- ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen);
+ ret = tcp_prepare_flags(c, conn, flags, &th, &opts, &optlen);
if (ret <= 0) {
- vu_queue_rewind(vq, 1);
+ vu_queue_rewind(vq, elem_cnt);
return ret;
}
- payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
-
if (flags & KEEPALIVE)
seq--;
- tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload,
+ iov_used = iov_skip_bytes(flags_iov, iov_cnt,
+ MAX(optlen + hdrlen, VNET_HLEN + ETH_ZLEN),
+ NULL);
+ if (iov_used < iov_cnt)
+ iov_used++;
+ iov_cnt = iov_used;
+
+ payload = IOV_TAIL(flags_elem[0].in_sg, iov_cnt, hdrlen);
+ iov_from_buf(payload.iov, payload.cnt, payload.off, &opts, optlen);
+ tcp_fill_headers(c, conn, &eh, CONN_V4(conn) ? &ip4h : NULL,
+ CONN_V6(conn) ? &ip6h : NULL, &th, &payload,
optlen, IP4_CSUM | (*c->pcap ? TCP_CSUM : 0), seq);
- vu_pad(flags_elem[0].in_sg, 1, hdrlen + optlen);
- vu_flush(vdev, vq, flags_elem, 1, hdrlen + optlen);
+ vu_pad(flags_elem[0].in_sg, iov_cnt, hdrlen + optlen);
+ vu_flush(vdev, vq, flags_elem, elem_cnt, hdrlen + optlen);
+
+ /* write headers */
+ payload = IOV_TAIL(flags_elem[0].in_sg, iov_cnt, VNET_HLEN);
+
+ IOV_PUSH_HEADER(&payload, eh);
+ if (CONN_V4(conn))
+ IOV_PUSH_HEADER(&payload, ip4h);
+ else
+ IOV_PUSH_HEADER(&payload, ip6h);
+ IOV_PUSH_HEADER(&payload, th);
if (*c->pcap)
- pcap_iov(&flags_elem[0].in_sg[0], 1, VNET_HLEN,
+ pcap_iov(flags_elem[0].in_sg, iov_cnt, VNET_HLEN,
hdrlen + optlen - VNET_HLEN);
if (flags & DUP_ACK) {
- elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
- &flags_iov[1], 1, NULL,
- hdrlen + optlen, NULL);
- if (elem_cnt == 1 &&
- flags_elem[1].in_sg[0].iov_len >=
- flags_elem[0].in_sg[0].iov_len) {
- memcpy(flags_elem[1].in_sg[0].iov_base,
- flags_elem[0].in_sg[0].iov_base,
- flags_elem[0].in_sg[0].iov_len);
-
- vu_flush(vdev, vq, &flags_elem[1], 1, hdrlen + optlen);
-
- if (*c->pcap)
- pcap_iov(&flags_elem[1].in_sg[0], 1, VNET_HLEN,
- hdrlen + optlen - VNET_HLEN);
- }
+ tcp_vu_send_dup(c, vq, &flags_elem[elem_cnt],
+ &flags_iov[iov_cnt],
+ ARRAY_SIZE(flags_iov) - iov_cnt,
+ flags_elem[0].in_sg, iov_cnt,
+ hdrlen + optlen);
}
vu_queue_notify(vdev, vq);
diff --git a/vu_common.h b/vu_common.h
index 51f70084a7cb..817384175a1d 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -15,26 +15,6 @@
#include "ip.h"
#include "virtio.h"
-static inline void *vu_eth(void *base)
-{
- return ((char *)base + VNET_HLEN);
-}
-
-static inline void *vu_ip(void *base)
-{
- return (struct ethhdr *)vu_eth(base) + 1;
-}
-
-static inline void *vu_payloadv4(void *base)
-{
- return (struct iphdr *)vu_ip(base) + 1;
-}
-
-static inline void *vu_payloadv6(void *base)
-{
- return (struct ipv6hdr *)vu_ip(base) + 1;
-}
-
int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int max_elem,
struct iovec *in_sg, size_t max_in_sg, size_t *in_total,
--
2.53.0
^ permalink raw reply [flat|nested] 5+ messages in thread