On Mon, Mar 23, 2026 at 03:31:48PM +0100, Laurent Vivier wrote: > Replace direct iovec pointer arithmetic in UDP vhost-user handling with > iov_tail operations. > > udp_vu_sock_recv() now takes an iov/cnt pair instead of using the > file-scoped iov_vu array, and returns the data length rather than the > iov count. Internally it uses IOV_TAIL() to create a view past the > L2/L3/L4 headers, and iov_tail_clone() to build the recvmsg() iovec, > removing the manual pointer offset and restore pattern. > > udp_vu_prepare() and udp_vu_csum() take a const struct iov_tail * > instead of referencing iov_vu directly, making data flow explicit. > > udp_vu_csum() uses iov_drop_header() and IOV_REMOVE_HEADER() to locate > the UDP header and payload, replacing manual offset calculations via > vu_payloadv4()/vu_payloadv6(). > > Signed-off-by: Laurent Vivier Reviewed-by: David Gibson Though a couple of nits below. > --- > udp_vu.c | 124 +++++++++++++++++++++++++++++++------------------------ > 1 file changed, 71 insertions(+), 53 deletions(-) > > diff --git a/udp_vu.c b/udp_vu.c > index 6a1e1696b19f..6638cb0ddc5f 100644 > --- a/udp_vu.c > +++ b/udp_vu.c > @@ -59,21 +59,26 @@ static size_t udp_vu_hdrlen(bool v6) > /** > * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers > * @c: Execution context > + * @iov: IO vector for the frame (in/out) > + * @cnt: Number of IO vector entries (in/out) > * @vq: virtqueue to use to receive data > * @s: Socket to receive from > * @v6: Set for IPv6 connections > - * @dlen: Size of received data (output) > * > - * Return: number of iov entries used to store the datagram, 0 if the datagram > + * Return: size of received data, 0 if the datagram > * was discarded because the virtqueue is not ready, -1 on error > */ > -static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, > - bool v6, ssize_t *dlen) > +static ssize_t udp_vu_sock_recv(const struct ctx *c, struct iovec *iov, > + size_t *cnt, unsigned *elem_used, > + struct vu_virtq *vq, int s, bool v6) > { > const struct vu_dev *vdev = c->vdev; > - int elem_cnt, elem_used, iov_used; > struct msghdr msg = { 0 }; > - size_t iov_cnt, hdrlen; > + struct iov_tail payload; > + size_t hdrlen, iov_used; > + unsigned elem_cnt; > + unsigned i, j; > + ssize_t dlen; > > assert(!c->no_udp); > > @@ -83,6 +88,7 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, > if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) > debug_perror("Failed to discard datagram"); > > + *cnt = 0; > return 0; > } > > @@ -90,67 +96,70 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, > hdrlen = udp_vu_hdrlen(v6); > > elem_cnt = vu_collect(vdev, vq, elem, ARRAY_SIZE(elem), > - iov_vu, ARRAY_SIZE(iov_vu), &iov_cnt, > + iov, *cnt, &iov_used, > IP_MAX_MTU + ETH_HLEN + VNET_HLEN, NULL); > if (elem_cnt == 0) > return -1; > > - assert((size_t)elem_cnt == iov_cnt); /* one iovec per element */ > + assert((size_t)elem_cnt == iov_used); /* one iovec per element */ > > - /* reserve space for the headers */ > - assert(iov_vu[0].iov_len >= MAX(hdrlen, ETH_ZLEN + VNET_HLEN)); > - iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen; > - iov_vu[0].iov_len -= hdrlen; > + payload = IOV_TAIL(iov, iov_used, hdrlen); > > - /* read data from the socket */ > - msg.msg_iov = iov_vu; > - msg.msg_iovlen = iov_cnt; > + struct iovec msg_iov[payload.cnt]; > + msg.msg_iov = msg_iov; > + msg.msg_iovlen = iov_tail_clone(msg.msg_iov, payload.cnt, &payload); > > - *dlen = recvmsg(s, &msg, 0); > - if (*dlen < 0) { > + /* read data from the socket */ > + dlen = recvmsg(s, &msg, 0); > + if (dlen < 0) { > vu_queue_rewind(vq, elem_cnt); > return -1; > } > > - /* restore the pointer to the headers address */ > - iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen; > - iov_vu[0].iov_len += hdrlen; > + *cnt = vu_pad(iov, iov_used, 0, dlen + hdrlen); > > - iov_used = vu_pad(iov_vu, iov_cnt, 0, *dlen + hdrlen); > - elem_used = iov_used; /* one iovec per element */ > + *elem_used = 0; > + for (i = 0, j = 0; j < *cnt && i < elem_cnt; i++) { Nit, this would be slightly easier to read with the i condition before the j condition. > + if (j + elem[i].in_num > *cnt) > + elem[i].in_num = *cnt - j; > + j += elem[i].in_num; > + (*elem_used)++; > + } > > - vu_set_vnethdr(iov_vu[0].iov_base, elem_used); > + vu_set_vnethdr(iov[0].iov_base, *elem_used); > > /* release unused buffers */ > - vu_queue_rewind(vq, elem_cnt - elem_used); > + vu_queue_rewind(vq, elem_cnt - *elem_used); > > - return iov_used; > + return dlen; > } > > /** > * udp_vu_prepare() - Prepare the packet header > * @c: Execution context > + * @data: IO vector tail for the frame > * @toside: Address information for one side of the flow > * @dlen: Packet data length > * > * Return: Layer-4 length > */ > -static size_t udp_vu_prepare(const struct ctx *c, > +static size_t udp_vu_prepare(const struct ctx *c, const struct iov_tail *data, > const struct flowside *toside, ssize_t dlen) > { > + const struct iovec *iov = data->iov; > struct ethhdr *eh; > size_t l4len; > > /* ethernet header */ > - eh = vu_eth(iov_vu[0].iov_base); > + eh = vu_eth(iov[0].iov_base); > > memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); > memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); > > /* initialize header */ > if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { > - struct iphdr *iph = vu_ip(iov_vu[0].iov_base); > - struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base); > + struct iphdr *iph = vu_ip(iov[0].iov_base); > + struct udp_payload_t *bp = vu_payloadv4(iov[0].iov_base); > > eh->h_proto = htons(ETH_P_IP); > > @@ -158,8 +167,8 @@ static size_t udp_vu_prepare(const struct ctx *c, > > l4len = udp_update_hdr4(iph, bp, toside, dlen, true); > } else { > - struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base); > - struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base); > + struct ipv6hdr *ip6h = vu_ip(iov[0].iov_base); > + struct udp_payload_t *bp = vu_payloadv6(iov[0].iov_base); > > eh->h_proto = htons(ETH_P_IPV6); > > @@ -174,25 +183,29 @@ static size_t udp_vu_prepare(const struct ctx *c, > /** > * udp_vu_csum() - Calculate and set checksum for a UDP packet > * @toside: Address information for one side of the flow > - * @iov_used: Number of used iov_vu items > + * @data: IO vector tail for the frame (including vnet header) > */ > -static void udp_vu_csum(const struct flowside *toside, int iov_used) > +static void udp_vu_csum(const struct flowside *toside, > + const struct iov_tail *data) > { > const struct in_addr *src4 = inany_v4(&toside->oaddr); > const struct in_addr *dst4 = inany_v4(&toside->eaddr); > - char *base = iov_vu[0].iov_base; > - struct udp_payload_t *bp; > - struct iov_tail data; > + struct iov_tail payload = *data; > + struct udphdr *uh, uh_storage; > + bool ipv4 = src4 && dst4; > + > + IOV_DROP_HEADER(&payload, struct virtio_net_hdr_mrg_rxbuf); > + IOV_DROP_HEADER(&payload, struct ethhdr); > + if (ipv4) > + IOV_DROP_HEADER(&payload, struct iphdr); > + else > + IOV_DROP_HEADER(&payload, struct ipv6hdr); > + uh = IOV_REMOVE_HEADER(&payload, uh_storage); If uh_storage is actually used, this function is not correct (we never write it back to the iov). That doesn't add any limitation that wasn't already there, but it's arguably less obvious. I'm guessing you address this later in the series, but maybe a FIXME comment? > - if (src4 && dst4) { > - bp = vu_payloadv4(base); > - data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); > - csum_udp4(&bp->uh, *src4, *dst4, &data); > - } else { > - bp = vu_payloadv6(base); > - data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); > - csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); > - } > + if (ipv4) > + csum_udp4(uh, *src4, *dst4, &payload); > + else > + csum_udp6(uh, &toside->oaddr.a6, &toside->eaddr.a6, &payload); > } > > /** > @@ -208,23 +221,28 @@ void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx) > bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); > struct vu_dev *vdev = c->vdev; > struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + struct iov_tail data; > int i; > > for (i = 0; i < n; i++) { > + unsigned elem_used; > + size_t iov_cnt; > ssize_t dlen; > - int iov_used; > > - iov_used = udp_vu_sock_recv(c, vq, s, v6, &dlen); > - if (iov_used < 0) > + iov_cnt = ARRAY_SIZE(iov_vu); > + dlen = udp_vu_sock_recv(c, iov_vu, &iov_cnt, &elem_used, vq, > + s, v6); > + if (dlen < 0) > break; > > - if (iov_used > 0) { > - udp_vu_prepare(c, toside, dlen); > + if (iov_cnt > 0) { > + data = IOV_TAIL(iov_vu, iov_cnt, 0); > + udp_vu_prepare(c, &data, toside, dlen); > if (*c->pcap) { > - udp_vu_csum(toside, iov_used); > - pcap_iov(iov_vu, iov_used, VNET_HLEN); > + udp_vu_csum(toside, &data); > + pcap_iov(data.iov, data.cnt, VNET_HLEN); > } > - vu_flush(vdev, vq, elem, iov_used); > + vu_flush(vdev, vq, elem, elem_used); > } > } > } > -- > 2.53.0 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson