On Mon, Mar 23, 2026 at 05:52:56PM +0100, Laurent Vivier wrote: > Previously, tcp_vu_sock_recv() assumed a 1:1 mapping between virtqueue > elements and iovecs (one iovec per element), enforced by an ASSERT. > This prevented the use of virtqueue elements with multiple buffers > (e.g. when mergeable rx buffers are not negotiated and headers are > provided in a separate buffer). > > Introduce a struct vu_frame to track per-frame metadata: the range of > elements and iovecs that make up each frame, and the frame's total size. > This replaces the head[] array which only tracked element indices. > > A separate iov_msg[] array is built for recvmsg() by cloning the data > portions (after stripping headers) using iov_tail helpers. > > Then a frame truncation after recvmsg() properly walks the frame and > element arrays to adjust iovec counts and element counts. > > Signed-off-by: Laurent Vivier > --- > tcp_vu.c | 161 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 107 insertions(+), 54 deletions(-) > > diff --git a/tcp_vu.c b/tcp_vu.c > index a39f6ea018e9..96cd9da1caae 100644 > --- a/tcp_vu.c > +++ b/tcp_vu.c > @@ -35,9 +35,24 @@ > #include "vu_common.h" > #include > > -static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM]; > +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; > static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; > -static int head[VIRTQUEUE_MAX_SIZE + 1]; > + > +/** > + * struct vu_frame - Descriptor for a TCP frame mapped to virtqueue elements > + * @idx_element: Index of first element in elem[] for this frame > + * @num_element: Number of virtqueue elements used by this frame > + * @idx_iovec: Index of first iovec in iov_vu[] for this frame > + * @num_iovec: Number of iovecs covering this frame's buffers > + * @size: Total frame size including all headers "all headers" is a bit context dependent. I assume it includes the Ethernet header, but I'm not sure about virtio_net_hdr_mrg_rxbuf. If it doesn't this should be called l2len. If it does.. maybe we need to invent a term. Usually I wouldn't consider the "frame" to include like mrg_rxbuf (I'd consider that a "hw" descriptor followed by the frame). > + */ > +static struct vu_frame { > + int idx_element; > + int num_element; > + int idx_iovec; > + int num_iovec; > + size_t size; > +} frame[VIRTQUEUE_MAX_SIZE]; > > /** > * tcp_vu_hdrlen() - Sum size of all headers, from TCP to virtio-net > @@ -173,8 +188,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) > * @v6: Set for IPv6 connections > * @already_sent: Number of bytes already sent > * @fillsize: Maximum bytes to fill in guest-side receiving window > - * @iov_cnt: number of iov (output) > - * @head_cnt: Pointer to store the count of head iov entries (output) > + * @elem_used: number of element (output) > + * @frame_cnt: Pointer to store the number of frames (output) > * > * Return: number of bytes received from the socket, or a negative error code > * on failure. > @@ -182,58 +197,77 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) > static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, > const struct tcp_tap_conn *conn, bool v6, > uint32_t already_sent, size_t fillsize, > - int *iov_cnt, int *head_cnt) > + int *elem_used, int *frame_cnt) > { > + static struct iovec iov_msg[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM]; What's the rationale for making this static? > const struct vu_dev *vdev = c->vdev; > struct msghdr mh_sock = { 0 }; > uint16_t mss = MSS_GET(conn); > size_t hdrlen, iov_used; > int s = conn->sock; > + ssize_t ret, dlen; > int elem_cnt; > - ssize_t ret; > - int i; > - > - *iov_cnt = 0; > + int i, j; > > hdrlen = tcp_vu_hdrlen(v6); > > + *elem_used = 0; > + > iov_used = 0; > elem_cnt = 0; > - *head_cnt = 0; > + *frame_cnt = 0; > while (fillsize > 0 && elem_cnt < ARRAY_SIZE(elem) && > - iov_used < VIRTQUEUE_MAX_SIZE) { > - size_t frame_size, dlen, in_total; > - struct iovec *iov; > + iov_used < ARRAY_SIZE(iov_vu) && > + *frame_cnt < ARRAY_SIZE(frame)) { > + size_t frame_size, in_total; > int cnt; > > cnt = vu_collect(vdev, vq, &elem[elem_cnt], > ARRAY_SIZE(elem) - elem_cnt, > - &iov_vu[DISCARD_IOV_NUM + iov_used], > - VIRTQUEUE_MAX_SIZE - iov_used, &in_total, > + &iov_vu[iov_used], > + ARRAY_SIZE(iov_vu) - iov_used, &in_total, > MIN(mss, fillsize) + hdrlen, > &frame_size); > if (cnt == 0) > break; > - assert((size_t)cnt == in_total); /* one iovec per element */ > + > + frame[*frame_cnt].idx_element = elem_cnt; > + frame[*frame_cnt].num_element = cnt; > + frame[*frame_cnt].idx_iovec = iov_used; > + frame[*frame_cnt].num_iovec = in_total; > + frame[*frame_cnt].size = frame_size; > + (*frame_cnt)++; > > iov_used += in_total; > - dlen = frame_size - hdrlen; > + elem_cnt += cnt; > > - /* reserve space for headers in iov */ > - iov = &elem[elem_cnt].in_sg[0]; > - assert(iov->iov_len >= hdrlen); > + fillsize -= frame_size - hdrlen; > + } > > - iov->iov_base = (char *)iov->iov_base + hdrlen; > - iov->iov_len -= hdrlen; > - head[(*head_cnt)++] = elem_cnt; > + /* build an iov array without headers */ > + for (i = 0, j = DISCARD_IOV_NUM; i < *frame_cnt && > + j < ARRAY_SIZE(iov_msg); i++) { > + struct iov_tail data; > + ssize_t cnt; > > - fillsize -= dlen; > - elem_cnt += cnt; > + data = IOV_TAIL(&iov_vu[frame[i].idx_iovec], > + frame[i].num_iovec, 0); > + iov_drop_header(&data, hdrlen); > + > + cnt = iov_tail_clone(&iov_msg[j], ARRAY_SIZE(iov_msg) - j, > + &data); > + if (cnt == -1) > + die("Missing entries in iov_msg"); IIUC that would indicate a bug in the sizing / setup of the arrays, in which case it should be an assert(). > + > + j += cnt; > } > > - if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt)) > + if (tcp_prepare_iov(&mh_sock, iov_msg, already_sent, > + j - DISCARD_IOV_NUM)) { > /* Expect caller to do a TCP reset */ > + vu_queue_rewind(vq, elem_cnt); > return -1; > + } > > do > ret = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -247,28 +281,47 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, > if (!peek_offset_cap) > ret -= already_sent; > > - i = vu_pad(&iov_vu[DISCARD_IOV_NUM], iov_used, hdrlen, ret); > + dlen = ret; > > - /* adjust head count */ > - while (*head_cnt > 0 && head[*head_cnt - 1] >= i) > - (*head_cnt)--; > + /* truncate frame */ > + *elem_used = 0; > + for (i = 0; i < *frame_cnt; i++) { > + struct vu_frame *f = &frame[i]; > > - /* mark end of array */ > - head[*head_cnt] = i; > - *iov_cnt = i; > + if ((size_t)ret <= f->size - hdrlen) { > + unsigned cnt; > > - /* release unused buffers */ > - vu_queue_rewind(vq, elem_cnt - i); > + cnt = vu_pad(&iov_vu[f->idx_iovec], f->num_iovec, > + 0, hdrlen + ret); > > - /* restore space for headers in iov */ > - for (i = 0; i < *head_cnt; i++) { > - struct iovec *iov = &elem[head[i]].in_sg[0]; > + f->size = ret + hdrlen; > + f->num_iovec = cnt; > > - iov->iov_base = (char *)iov->iov_base - hdrlen; > - iov->iov_len += hdrlen; > + for (j = 0; j < f->num_element; j++) { > + struct vu_virtq_element *e; > + > + e = &elem[f->idx_element + j]; > + if (cnt <= e->in_num) { > + e->in_num = cnt; > + j++; > + break; > + } > + cnt -= e->in_num; > + } > + f->num_element = j; > + *elem_used += j; > + i++; > + break; > + } > + *elem_used += f->num_element; > + ret -= f->size - hdrlen; > } > + *frame_cnt = i; > > - return ret; > + /* release unused buffers */ > + vu_queue_rewind(vq, elem_cnt - *elem_used); > + > + return dlen; > } > > /** > @@ -341,7 +394,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > struct vu_dev *vdev = c->vdev; > struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > ssize_t len, previous_dlen; > - int i, iov_cnt, head_cnt; > + int i, elem_cnt, frame_cnt; > size_t hdrlen, fillsize; > int v6 = CONN_V6(conn); > uint32_t already_sent; > @@ -381,7 +434,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > * data from the socket > */ > len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize, > - &iov_cnt, &head_cnt); > + &elem_cnt, &frame_cnt); > if (len < 0) { > if (len != -EAGAIN && len != -EWOULDBLOCK) { > tcp_rst(c, conn); > @@ -395,6 +448,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > } > > if (!len) { > + vu_queue_rewind(vq, elem_cnt); > if (already_sent) { > conn_flag(c, conn, STALLED); > } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == > @@ -432,33 +486,32 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > */ > > hdrlen = tcp_vu_hdrlen(v6); > - for (i = 0, previous_dlen = -1, check = -1; i < head_cnt; i++) { > - struct iovec *iov = &elem[head[i]].in_sg[0]; > - int buf_cnt = head[i + 1] - head[i]; > - size_t frame_size = iov_size(iov, buf_cnt); > - bool push = i == head_cnt - 1; > + for (i = 0, previous_dlen = -1, check = -1; i < frame_cnt; i++) { > + struct iovec *iov = &iov_vu[frame[i].idx_iovec]; > + int iov_cnt = frame[i].num_iovec; > + bool push = i == frame_cnt - 1; > ssize_t dlen; > > - assert(frame_size >= hdrlen); > + assert(frame[i].size >= hdrlen); > > - dlen = frame_size - hdrlen; > - vu_set_vnethdr(iov->iov_base, buf_cnt); > + dlen = frame[i].size - hdrlen; > + vu_set_vnethdr(iov->iov_base, frame[i].num_element); > > /* The IPv4 header checksum varies only with dlen */ > if (previous_dlen != dlen) > check = -1; > previous_dlen = dlen; > > - tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push); > + tcp_vu_prepare(c, conn, iov, iov_cnt, &check, !*c->pcap, push); > > if (*c->pcap) > - pcap_iov(iov, buf_cnt, VNET_HLEN); > + pcap_iov(iov, iov_cnt, VNET_HLEN); > > conn->seq_to_tap += dlen; > } > > /* send packets */ > - vu_flush(vdev, vq, elem, iov_cnt); > + vu_flush(vdev, vq, elem, elem_cnt); > > conn_flag(c, conn, ACK_FROM_TAP_DUE); > > -- > 2.53.0 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson