From: David Gibson <david@gibson.dropbear.id.au>
To: Laurent Vivier <lvivier@redhat.com>
Cc: passt-dev@passt.top
Subject: Re: [PATCH v6 3/4] tcp_vu: Support multibuffer frames in tcp_vu_sock_recv()
Date: Mon, 11 May 2026 18:24:41 +1000 [thread overview]
Message-ID: <agGSSYTjte8o-DTi@zatzit> (raw)
In-Reply-To: <20260416161618.3826904-4-lvivier@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 10840 bytes --]
On Thu, Apr 16, 2026 at 06:16:17PM +0200, Laurent Vivier wrote:
> Previously, tcp_vu_sock_recv() assumed a 1:1 mapping between virtqueue
> elements and iovecs (one iovec per element), enforced by an ASSERT.
> This prevented the use of virtqueue elements with multiple buffers
> (e.g. when mergeable rx buffers are not negotiated and headers are
> provided in a separate buffer).
>
> Introduce a struct vu_frame to track per-frame metadata: the range of
> elements and iovecs that make up each frame, and the frame's total size.
> This replaces the head[] array which only tracked element indices.
>
> A separate iov_msg[] array is built for recvmsg() by cloning the data
> portions (after stripping headers) using iov_tail helpers.
>
> The frame truncation step after recvmsg() then properly walks the frame
> and element arrays to adjust iovec counts and element counts.
>
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
LGTM apart from a couple of minor points noted below.
> ---
> tcp_vu.c | 174 ++++++++++++++++++++++++++++++++++++-------------------
> 1 file changed, 113 insertions(+), 61 deletions(-)
>
> diff --git a/tcp_vu.c b/tcp_vu.c
> index 2017aec90342..96b16007701d 100644
> --- a/tcp_vu.c
> +++ b/tcp_vu.c
> @@ -35,9 +35,24 @@
> #include "vu_common.h"
> #include <time.h>
>
> -static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
> +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
> static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
> -static int head[VIRTQUEUE_MAX_SIZE + 1];
> +
> +/**
> + * struct vu_frame - Descriptor for a TCP frame mapped to virtqueue elements
> + * @idx_element: Index of first element in elem[] for this frame
> + * @num_element: Number of virtqueue elements used by this frame
> + * @idx_iovec: Index of first iovec in iov_vu[] for this frame
> + * @num_iovec: Number of iovecs covering this frame's buffers
> + * @size: Total frame size including all headers
> + */
> +static struct vu_frame {
> + int idx_element;
> + int num_element;
> + int idx_iovec;
> + int num_iovec;
> + size_t size;
> +} frame[VIRTQUEUE_MAX_SIZE];
>
> /**
> * tcp_vu_hdrlen() - Sum size of all headers, from TCP to virtio-net
> @@ -174,8 +189,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
> * @v6: Set for IPv6 connections
> * @already_sent: Number of bytes already sent
> * @fillsize: Maximum bytes to fill in guest-side receiving window
> - * @iov_cnt: number of iov (output)
> - * @head_cnt: Pointer to store the count of head iov entries (output)
> + * @elem_used: number of element (output)
> + * @frame_cnt: Pointer to store the number of frames (output)
> *
> * Return: number of bytes received from the socket, or a negative error code
> * on failure.
> @@ -183,57 +198,77 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
> static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
> const struct tcp_tap_conn *conn, bool v6,
> uint32_t already_sent, size_t fillsize,
> - int *iov_cnt, int *head_cnt)
> + int *elem_used, int *frame_cnt)
> {
> + static struct iovec iov_msg[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
> const struct vu_dev *vdev = c->vdev;
> struct msghdr mh_sock = { 0 };
> uint16_t mss = MSS_GET(conn);
> size_t hdrlen, iov_used;
> int s = conn->sock;
> + ssize_t ret, dlen;
> int elem_cnt;
> - ssize_t ret;
> - int i;
> -
> - *iov_cnt = 0;
> + int i, j;
>
> hdrlen = tcp_vu_hdrlen(v6);
>
> + *elem_used = 0;
> +
> iov_used = 0;
> elem_cnt = 0;
> - *head_cnt = 0;
> + *frame_cnt = 0;
> while (fillsize > 0 && elem_cnt < ARRAY_SIZE(elem) &&
> - iov_used < VIRTQUEUE_MAX_SIZE) {
> - size_t frame_size, dlen, in_total;
> - struct iovec *iov;
> + iov_used < ARRAY_SIZE(iov_vu) &&
> + *frame_cnt < ARRAY_SIZE(frame)) {
> + size_t frame_size, in_total;
> int cnt;
>
> cnt = vu_collect(vdev, vq, &elem[elem_cnt],
> ARRAY_SIZE(elem) - elem_cnt,
> - &iov_vu[DISCARD_IOV_NUM + iov_used],
> - VIRTQUEUE_MAX_SIZE - iov_used, &in_total,
> + &iov_vu[iov_used],
> + ARRAY_SIZE(iov_vu) - iov_used, &in_total,
> MIN(mss, fillsize) + hdrlen,
> &frame_size);
> if (cnt == 0)
> break;
> - assert((size_t)cnt == in_total); /* one iovec per element */
> +
> + frame[*frame_cnt].idx_element = elem_cnt;
> + frame[*frame_cnt].num_element = cnt;
> + frame[*frame_cnt].idx_iovec = iov_used;
> + frame[*frame_cnt].num_iovec = in_total;
> + frame[*frame_cnt].size = frame_size;
> + (*frame_cnt)++;
>
> iov_used += in_total;
> - dlen = frame_size - hdrlen;
> + elem_cnt += cnt;
>
> - /* reserve space for headers in iov */
> - iov = &elem[elem_cnt].in_sg[0];
> - assert(iov->iov_len >= hdrlen);
> - iov->iov_base = (char *)iov->iov_base + hdrlen;
> - iov->iov_len -= hdrlen;
> - head[(*head_cnt)++] = elem_cnt;
> + fillsize -= frame_size - hdrlen;
> + }
>
> - fillsize -= dlen;
> - elem_cnt += cnt;
> + /* build an iov array without headers */
> + for (i = 0, j = DISCARD_IOV_NUM; i < *frame_cnt &&
> + j < ARRAY_SIZE(iov_msg); i++) {
> + struct iov_tail data;
> + ssize_t cnt;
> +
> + data = IOV_TAIL(&iov_vu[frame[i].idx_iovec],
> + frame[i].num_iovec, 0);
> + iov_drop_header(&data, hdrlen);
> +
> + cnt = iov_tail_clone(&iov_msg[j], ARRAY_SIZE(iov_msg) - j,
> + &data);
> + if (cnt == -1)
> + die("Missing entries in iov_msg");
Is a fatal error really what we want here?
> +
> + j += cnt;
> }
>
> - if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
> + if (tcp_prepare_iov(&mh_sock, iov_msg, already_sent,
> + j - DISCARD_IOV_NUM)) {
> /* Expect caller to do a TCP reset */
> + vu_queue_rewind(vq, elem_cnt);
> return -1;
> + }
>
> do
> ret = recvmsg(s, &mh_sock, MSG_PEEK);
> @@ -247,32 +282,50 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
> if (!peek_offset_cap)
> ret -= already_sent;
>
> - i = iov_skip_bytes(&iov_vu[DISCARD_IOV_NUM], iov_used,
> - MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN),
> - NULL);
> - if ((size_t)i < iov_used)
> - i++;
> + dlen = ret;
>
> - /* adjust head count */
> - while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
> - (*head_cnt)--;
> + /* truncate frame */
> + *elem_used = 0;
Is this redundant with the *elem_used = 0 right at the top of the function?
> + for (i = 0; i < *frame_cnt; i++) {
> + struct vu_frame *f = &frame[i];
>
> - /* mark end of array */
> - head[*head_cnt] = i;
> - *iov_cnt = i;
> + if ((size_t)ret <= f->size - hdrlen) {
> + unsigned cnt;
>
> - /* release unused buffers */
> - vu_queue_rewind(vq, elem_cnt - i);
> + cnt = iov_skip_bytes(&iov_vu[f->idx_iovec], f->num_iovec,
> + MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN),
> + NULL);
> + if (cnt < (unsigned)f->num_iovec)
> + cnt++;
> +
> + f->size = ret + hdrlen;
> + f->num_iovec = cnt;
>
> - /* restore space for headers in iov */
> - for (i = 0; i < *head_cnt; i++) {
> - struct iovec *iov = &elem[head[i]].in_sg[0];
> + for (j = 0; j < f->num_element; j++) {
> + struct vu_virtq_element *e;
>
> - iov->iov_base = (char *)iov->iov_base - hdrlen;
> - iov->iov_len += hdrlen;
> + e = &elem[f->idx_element + j];
> + if (cnt <= e->in_num) {
> + e->in_num = cnt;
> + j++;
> + break;
> + }
> + cnt -= e->in_num;
> + }
> + f->num_element = j;
> + *elem_used += j;
> + i++;
> + break;
> + }
> + *elem_used += f->num_element;
> + ret -= f->size - hdrlen;
> }
> + *frame_cnt = i;
>
> - return ret;
> + /* release unused buffers */
> + vu_queue_rewind(vq, elem_cnt - *elem_used);
> +
> + return dlen;
> }
>
> /**
> @@ -348,7 +401,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> uint32_t already_sent, check;
> ssize_t len, previous_dlen;
> - int i, iov_cnt, head_cnt;
> + int i, elem_cnt, frame_cnt;
> size_t hdrlen, fillsize;
> int v6 = CONN_V6(conn);
>
> @@ -386,7 +439,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> * data from the socket
> */
> len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize,
> - &iov_cnt, &head_cnt);
> + &elem_cnt, &frame_cnt);
> if (len < 0) {
> if (len != -EAGAIN && len != -EWOULDBLOCK) {
> tcp_rst(c, conn);
> @@ -400,6 +453,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> }
>
> if (!len) {
> + vu_queue_rewind(vq, elem_cnt);
> if (already_sent) {
> conn_flag(c, conn, STALLED);
> } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
> @@ -440,34 +494,32 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> check = IP4_CSUM;
> if (*c->pcap)
> check |= TCP_CSUM;
> - for (i = 0, previous_dlen = -1; i < head_cnt; i++) {
> - struct iovec *iov = &elem[head[i]].in_sg[0];
> - int buf_cnt = head[i + 1] - head[i];
> - size_t frame_size = iov_size(iov, buf_cnt);
> - bool push = i == head_cnt - 1;
> + for (i = 0, previous_dlen = -1; i < frame_cnt; i++) {
> + struct iovec *iov = &iov_vu[frame[i].idx_iovec];
> + int iov_cnt = frame[i].num_iovec;
> + bool push = i == frame_cnt - 1;
> ssize_t dlen;
>
> - assert(frame_size >= hdrlen);
> + assert(frame[i].size >= hdrlen);
>
> - dlen = frame_size - hdrlen;
> - if (dlen > len)
> - dlen = len;
> - len -= dlen;
> + dlen = frame[i].size - hdrlen;
>
> /* The IPv4 header checksum varies only with dlen */
> if (previous_dlen != dlen)
> check |= IP4_CSUM;
> previous_dlen = dlen;
>
> - tcp_vu_prepare(c, conn, iov, buf_cnt, dlen, &check, push);
> + tcp_vu_prepare(c, conn, iov, iov_cnt, dlen, &check, push);
>
> - vu_pad(elem[head[i]].in_sg, buf_cnt, dlen + hdrlen);
> - vu_flush(vdev, vq, &elem[head[i]], buf_cnt, dlen + hdrlen);
> + vu_pad(&iov[frame[i].idx_iovec], frame[i].num_iovec,
> + dlen + hdrlen);
>
> if (*c->pcap) {
> - pcap_iov(iov, buf_cnt, VNET_HLEN,
> + pcap_iov(iov, iov_cnt, VNET_HLEN,
> dlen + hdrlen - VNET_HLEN);
> }
> + vu_flush(vdev, vq, &elem[frame[i].idx_element],
> + frame[i].num_element, dlen + hdrlen);
>
> conn->seq_to_tap += dlen;
> }
> --
> 2.53.0
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
next prev parent reply other threads:[~2026-05-11 8:25 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 16:16 [PATCH v6 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element Laurent Vivier
2026-04-16 16:16 ` [PATCH v6 1/4] tcp: Encode checksum computation flags in a single parameter Laurent Vivier
2026-05-09 23:45 ` Jon Maloy
2026-05-11 7:49 ` David Gibson
2026-04-16 16:16 ` [PATCH v6 2/4] tcp_vu: Build headers on the stack and write them into the iovec Laurent Vivier
2026-05-09 23:57 ` Jon Maloy
2026-05-11 7:54 ` David Gibson
2026-04-16 16:16 ` [PATCH v6 3/4] tcp_vu: Support multibuffer frames in tcp_vu_sock_recv() Laurent Vivier
2026-04-17 14:56 ` Laurent Vivier
2026-05-10 1:33 ` Jon Maloy
2026-05-11 8:24 ` David Gibson [this message]
2026-04-16 16:16 ` [PATCH v6 4/4] tcp_vu: Support multibuffer frames in tcp_vu_send_flag() Laurent Vivier
2026-05-10 2:03 ` Jon Maloy
2026-05-11 10:52 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=agGSSYTjte8o-DTi@zatzit \
--to=david@gibson.dropbear.id.au \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).