On Thu, Apr 16, 2026 at 06:16:17PM +0200, Laurent Vivier wrote: > Previously, tcp_vu_sock_recv() assumed a 1:1 mapping between virtqueue > elements and iovecs (one iovec per element), enforced by an ASSERT. > This prevented the use of virtqueue elements with multiple buffers > (e.g. when mergeable rx buffers are not negotiated and headers are > provided in a separate buffer). > > Introduce a struct vu_frame to track per-frame metadata: the range of > elements and iovecs that make up each frame, and the frame's total size. > This replaces the head[] array which only tracked element indices. > > A separate iov_msg[] array is built for recvmsg() by cloning the data > portions (after stripping headers) using iov_tail helpers. > > Then a frame truncation after recvmsg() properly walks the frame and > element arrays to adjust iovec counts and element counts. > > Signed-off-by: Laurent Vivier LGTM apart from a couple of minor points noted below. > --- > tcp_vu.c | 174 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 113 insertions(+), 61 deletions(-) > > diff --git a/tcp_vu.c b/tcp_vu.c > index 2017aec90342..96b16007701d 100644 > --- a/tcp_vu.c > +++ b/tcp_vu.c > @@ -35,9 +35,24 @@ > #include "vu_common.h" > #include > > -static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM]; > +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; > static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; > -static int head[VIRTQUEUE_MAX_SIZE + 1]; > + > +/** > + * struct vu_frame - Descriptor for a TCP frame mapped to virtqueue elements > + * @idx_element: Index of first element in elem[] for this frame > + * @num_element: Number of virtqueue elements used by this frame > + * @idx_iovec: Index of first iovec in iov_vu[] for this frame > + * @num_iovec: Number of iovecs covering this frame's buffers > + * @size: Total frame size including all headers > + */ > +static struct vu_frame { > + int idx_element; > + int num_element; > + int idx_iovec; > + int num_iovec; > + size_t size; > +} frame[VIRTQUEUE_MAX_SIZE]; > > /** > * tcp_vu_hdrlen() - Sum size of all headers, from TCP to virtio-net > @@ -174,8 +189,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) > * @v6: Set for IPv6 connections > * @already_sent: Number of bytes already sent > * @fillsize: Maximum bytes to fill in guest-side receiving window > - * @iov_cnt: number of iov (output) > - * @head_cnt: Pointer to store the count of head iov entries (output) > + * @elem_used: number of element (output) > + * @frame_cnt: Pointer to store the number of frames (output) > * > * Return: number of bytes received from the socket, or a negative error code > * on failure. > @@ -183,57 +198,77 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) > static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, > const struct tcp_tap_conn *conn, bool v6, > uint32_t already_sent, size_t fillsize, > - int *iov_cnt, int *head_cnt) > + int *elem_used, int *frame_cnt) > { > + static struct iovec iov_msg[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM]; > const struct vu_dev *vdev = c->vdev; > struct msghdr mh_sock = { 0 }; > uint16_t mss = MSS_GET(conn); > size_t hdrlen, iov_used; > int s = conn->sock; > + ssize_t ret, dlen; > int elem_cnt; > - ssize_t ret; > - int i; > - > - *iov_cnt = 0; > + int i, j; > > hdrlen = tcp_vu_hdrlen(v6); > > + *elem_used = 0; > + > iov_used = 0; > elem_cnt = 0; > - *head_cnt = 0; > + *frame_cnt = 0; > while (fillsize > 0 && elem_cnt < ARRAY_SIZE(elem) && > - iov_used < VIRTQUEUE_MAX_SIZE) { > - size_t frame_size, dlen, in_total; > - struct iovec *iov; > + iov_used < ARRAY_SIZE(iov_vu) && > + *frame_cnt < ARRAY_SIZE(frame)) { > + size_t frame_size, in_total; > int cnt; > > cnt = vu_collect(vdev, vq, &elem[elem_cnt], > ARRAY_SIZE(elem) - elem_cnt, > - &iov_vu[DISCARD_IOV_NUM + iov_used], > - VIRTQUEUE_MAX_SIZE - iov_used, &in_total, > + &iov_vu[iov_used], > + ARRAY_SIZE(iov_vu) - iov_used, &in_total, > MIN(mss, fillsize) + hdrlen, > &frame_size); > if (cnt == 0) > break; > - assert((size_t)cnt == in_total); /* one iovec per element */ > + > + frame[*frame_cnt].idx_element = elem_cnt; > + frame[*frame_cnt].num_element = cnt; > + frame[*frame_cnt].idx_iovec = iov_used; > + frame[*frame_cnt].num_iovec = in_total; > + frame[*frame_cnt].size = frame_size; > + (*frame_cnt)++; > > iov_used += in_total; > - dlen = frame_size - hdrlen; > + elem_cnt += cnt; > > - /* reserve space for headers in iov */ > - iov = &elem[elem_cnt].in_sg[0]; > - assert(iov->iov_len >= hdrlen); > - iov->iov_base = (char *)iov->iov_base + hdrlen; > - iov->iov_len -= hdrlen; > - head[(*head_cnt)++] = elem_cnt; > + fillsize -= frame_size - hdrlen; > + } > > - fillsize -= dlen; > - elem_cnt += cnt; > + /* build an iov array without headers */ > + for (i = 0, j = DISCARD_IOV_NUM; i < *frame_cnt && > + j < ARRAY_SIZE(iov_msg); i++) { > + struct iov_tail data; > + ssize_t cnt; > + > + data = IOV_TAIL(&iov_vu[frame[i].idx_iovec], > + frame[i].num_iovec, 0); > + iov_drop_header(&data, hdrlen); > + > + cnt = iov_tail_clone(&iov_msg[j], ARRAY_SIZE(iov_msg) - j, > + &data); > + if (cnt == -1) > + die("Missing entries in iov_msg"); Is a fatal error really what we want here? > + > + j += cnt; > } > > - if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt)) > + if (tcp_prepare_iov(&mh_sock, iov_msg, already_sent, > + j - DISCARD_IOV_NUM)) { > /* Expect caller to do a TCP reset */ > + vu_queue_rewind(vq, elem_cnt); > return -1; > + } > > do > ret = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -247,32 +282,50 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, > if (!peek_offset_cap) > ret -= already_sent; > > - i = iov_skip_bytes(&iov_vu[DISCARD_IOV_NUM], iov_used, > - MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN), > - NULL); > - if ((size_t)i < iov_used) > - i++; > + dlen = ret; > > - /* adjust head count */ > - while (*head_cnt > 0 && head[*head_cnt - 1] >= i) > - (*head_cnt)--; > + /* truncate frame */ > + *elem_used = 0; Is this redundant with the *elem_used = 0 right at the top of the function? > + for (i = 0; i < *frame_cnt; i++) { > + struct vu_frame *f = &frame[i]; > > - /* mark end of array */ > - head[*head_cnt] = i; > - *iov_cnt = i; > + if ((size_t)ret <= f->size - hdrlen) { > + unsigned cnt; > > - /* release unused buffers */ > - vu_queue_rewind(vq, elem_cnt - i); > + cnt = iov_skip_bytes(&iov_vu[f->idx_iovec], f->num_iovec, > + MAX(hdrlen + ret, VNET_HLEN + ETH_ZLEN), > + NULL); > + if (cnt < (unsigned)f->num_iovec) > + cnt++; > + > + f->size = ret + hdrlen; > + f->num_iovec = cnt; > > - /* restore space for headers in iov */ > - for (i = 0; i < *head_cnt; i++) { > - struct iovec *iov = &elem[head[i]].in_sg[0]; > + for (j = 0; j < f->num_element; j++) { > + struct vu_virtq_element *e; > > - iov->iov_base = (char *)iov->iov_base - hdrlen; > - iov->iov_len += hdrlen; > + e = &elem[f->idx_element + j]; > + if (cnt <= e->in_num) { > + e->in_num = cnt; > + j++; > + break; > + } > + cnt -= e->in_num; > + } > + f->num_element = j; > + *elem_used += j; > + i++; > + break; > + } > + *elem_used += f->num_element; > + ret -= f->size - hdrlen; > } > + *frame_cnt = i; > > - return ret; > + /* release unused buffers */ > + vu_queue_rewind(vq, elem_cnt - *elem_used); > + > + return dlen; > } > > /** > @@ -348,7 +401,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > uint32_t already_sent, check; > ssize_t len, previous_dlen; > - int i, iov_cnt, head_cnt; > + int i, elem_cnt, frame_cnt; > size_t hdrlen, fillsize; > int v6 = CONN_V6(conn); > > @@ -386,7 +439,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > * data from the socket > */ > len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize, > - &iov_cnt, &head_cnt); > + &elem_cnt, &frame_cnt); > if (len < 0) { > if (len != -EAGAIN && len != -EWOULDBLOCK) { > tcp_rst(c, conn); > @@ -400,6 +453,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > } > > if (!len) { > + vu_queue_rewind(vq, elem_cnt); > if (already_sent) { > conn_flag(c, conn, STALLED); > } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == > @@ -440,34 +494,32 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > check = IP4_CSUM; > if (*c->pcap) > check |= TCP_CSUM; > - for (i = 0, previous_dlen = -1; i < head_cnt; i++) { > - struct iovec *iov = &elem[head[i]].in_sg[0]; > - int buf_cnt = head[i + 1] - head[i]; > - size_t frame_size = iov_size(iov, buf_cnt); > - bool push = i == head_cnt - 1; > + for (i = 0, previous_dlen = -1; i < frame_cnt; i++) { > + struct iovec *iov = &iov_vu[frame[i].idx_iovec]; > + int iov_cnt = frame[i].num_iovec; > + bool push = i == frame_cnt - 1; > ssize_t dlen; > > - assert(frame_size >= hdrlen); > + assert(frame[i].size >= hdrlen); > > - dlen = frame_size - hdrlen; > - if (dlen > len) > - dlen = len; > - len -= dlen; > + dlen = frame[i].size - hdrlen; > > /* The IPv4 header checksum varies only with dlen */ > if (previous_dlen != dlen) > check |= IP4_CSUM; > previous_dlen = dlen; > > - tcp_vu_prepare(c, conn, iov, buf_cnt, dlen, &check, push); > + tcp_vu_prepare(c, conn, iov, iov_cnt, dlen, &check, push); > > - vu_pad(elem[head[i]].in_sg, buf_cnt, dlen + hdrlen); > - vu_flush(vdev, vq, &elem[head[i]], buf_cnt, dlen + hdrlen); > + vu_pad(&iov[frame[i].idx_iovec], frame[i].num_iovec, > + dlen + hdrlen); > > if (*c->pcap) { > - pcap_iov(iov, buf_cnt, VNET_HLEN, > + pcap_iov(iov, iov_cnt, VNET_HLEN, > dlen + hdrlen - VNET_HLEN); > } > + vu_flush(vdev, vq, &elem[frame[i].idx_element], > + frame[i].num_element, dlen + hdrlen); > > conn->seq_to_tap += dlen; > } > -- > 2.53.0 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson