On Fri, Jun 21, 2024 at 04:56:40PM +0200, Laurent Vivier wrote:
> add virtio and vhost-user functions to connect with QEMU.
> 
>   $ ./passt --vhost-user
> 
> and
> 
>   # qemu-system-x86_64 ... -m 4G \
>         -object memory-backend-memfd,id=memfd0,share=on,size=4G \
>         -numa node,memdev=memfd0 \
>         -chardev socket,id=chr0,path=/tmp/passt_1.socket \
>         -netdev vhost-user,id=netdev0,chardev=chr0 \
>         -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \
>         ...
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  Makefile       |   4 +-
>  checksum.c     |   1 -
>  conf.c         |  18 +-
>  iov.c          |   1 -
>  packet.c       |   6 +
>  packet.h       |   2 +
>  passt.c        |  12 +-
>  passt.h        |   2 +
>  pcap.c         |   1 -
>  tap.c          |  87 ++++++--
>  tap.h          |   3 +-
>  tcp.c          |  17 +-
>  tcp_vu.c       | 547 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tcp_vu.h       |   9 +
>  udp.c          |  54 +++--
>  udp_internal.h |  39 ++++
>  udp_vu.c       | 237 +++++++++++++++++++++
>  udp_vu.h       |   8 +
>  vhost_user.c   |   6 -
>  virtio.c       |   1 -
>  20 files changed, 988 insertions(+), 67 deletions(-)
>  create mode 100644 tcp_vu.c
>  create mode 100644 tcp_vu.h
>  create mode 100644 udp_internal.h
>  create mode 100644 udp_vu.c
>  create mode 100644 udp_vu.h
> 
> diff --git a/Makefile b/Makefile
> index b2da6ad62103..d22388726099 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
>  PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
>  	icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
>  	ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
> -	tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c
> +	tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c
>  QRAP_SRCS = qrap.c
>  SRCS = $(PASST_SRCS) $(QRAP_SRCS)
>  
> @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
>  	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
>  	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
>  	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
> -	udp.h util.h vhost_user.h virtio.h
> +	tcp_vu.h udp.h udp_internal.h udp_vu.h util.h vhost_user.h virtio.h
>  HEADERS = $(PASST_HEADERS) seccomp.h
>  
>  C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
> diff --git a/checksum.c b/checksum.c
> index 006614fcbb28..aa5b7ae1cb66 100644
> --- a/checksum.c
> +++ b/checksum.c
> @@ -501,7 +501,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init)
>   *
>   * Return: 16-bit folded, complemented checksum
>   */
> -/* cppcheck-suppress unusedFunction */
>  uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init)
>  {
>  	unsigned int i;
> diff --git a/conf.c b/conf.c
> index 94b3ed6fa659..2c9a6da05666 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -45,6 +45,7 @@
>  #include "lineread.h"
>  #include "isolation.h"
>  #include "log.h"
> +#include "vhost_user.h"
>  
>  /**
>   * next_chunk - Return the next piece of a string delimited by a character
> @@ -751,6 +752,9 @@ static void usage(const char *name, FILE *f, int status)
>  			"  -s, --socket PATH	UNIX domain socket path\n"
>  			"    default: probe free path starting from "
>  			UNIX_SOCK_PATH "\n", 1);
> +		info(   "  --vhost-user		Enable vhost-user mode");
> +		info(   "    UNIX domain socket is provided by -s option");
> +		info(   "  --print-capabilities	print back-end capabilities in JSON format");

Probably worth nothing this is only meaningful for vhost-user mode.

>  	}
>  
>  	fprintf(f,
> @@ -1175,6 +1179,7 @@ void conf(struct ctx *c, int argc, char **argv)
>  		{"help",	no_argument,		NULL,		'h' },
>  		{"socket",	required_argument,	NULL,		's' },
>  		{"fd",		required_argument,	NULL,		'F' },
> +		{"socket-path",	required_argument,	NULL,		's' }, /* vhost-user mandatory */

Maybe put this next to the --socket option to make it clearer it's an
alias for it.

"vhost-user mandatory" isn't that clear to me - initially I though it
meant the user had to supply this option for vhost-user mode, rather
than that the vhost-user interface mandates this option exists.

>  		{"ns-ifname",	required_argument,	NULL,		'I' },
>  		{"pcap",	required_argument,	NULL,		'p' },
>  		{"pid",		required_argument,	NULL,		'P' },
> @@ -1221,6 +1226,8 @@ void conf(struct ctx *c, int argc, char **argv)
>  		{"config-net",	no_argument,		NULL,		17 },
>  		{"no-copy-routes", no_argument,		NULL,		18 },
>  		{"no-copy-addrs", no_argument,		NULL,		19 },
> +		{"vhost-user",	no_argument,		NULL,		20 },
> +		{"print-capabilities", no_argument,	NULL,		21 }, /* vhost-user mandatory */
>  		{ 0 },
>  	};
>  	char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
> @@ -1373,7 +1380,6 @@ void conf(struct ctx *c, int argc, char **argv)
>  				       sizeof(c->ip6.ifname_out), "%s", optarg);
>  			if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out))
>  				die("Invalid interface name: %s", optarg);
> -

Unrelated whitespace change.

>  			break;
>  		case 17:
>  			if (c->mode != MODE_PASTA)
> @@ -1395,6 +1401,16 @@ void conf(struct ctx *c, int argc, char **argv)
>  			warn("--no-copy-addrs will be dropped soon");
>  			c->no_copy_addrs = copy_addrs_opt = true;
>  			break;
> +		case 20:
> +			if (c->mode == MODE_PASTA) {
> +				err("--vhost-user is for passt mode only");
> +				usage(argv[0], stdout, EXIT_SUCCESS);
> +			}
> +			c->mode = MODE_VU;
> +			break;
> +		case 21:
> +			vu_print_capabilities();
> +			break;
>  		case 'd':
>  			if (c->debug)
>  				die("Multiple --debug options given");
> diff --git a/iov.c b/iov.c
> index 793788b5d2bc..4215baf7c3b9 100644
> --- a/iov.c
> +++ b/iov.c
> @@ -170,7 +170,6 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
>   *
>   * Returns:	The number of iovec needed to store @size bytes.
>   */
> -/* cppcheck-suppress unusedFunction */
>  size_t iov_count(const struct iovec *iov, size_t iov_cnt,
>                   size_t size, size_t *last_iov_length)
>  {
> diff --git a/packet.c b/packet.c
> index af2a539a1794..3c5fc39df6d7 100644
> --- a/packet.c
> +++ b/packet.c
> @@ -25,6 +25,12 @@
>  static int packet_check_range(const struct pool *p, size_t offset, size_t len,
>  			      const char *start, const char *func, int line)
>  {
> +	ASSERT(p->buf);
> +
> +	if (p->buf_size == 0)
> +		return vu_packet_check_range((void *)p->buf, offset, len, start,
> +					     func, line);
> +
>  	if (start < p->buf) {
>  		if (func) {
>  			trace("add packet start %p before buffer start %p, "
> diff --git a/packet.h b/packet.h
> index 8377dcf678bb..0aec6d9410aa 100644
> --- a/packet.h
> +++ b/packet.h
> @@ -22,6 +22,8 @@ struct pool {
>  	struct iovec pkt[1];
>  };
>  
> +int vu_packet_check_range(void *buf, size_t offset, size_t len,
> +			  const char *start, const char *func, int line);
>  void packet_add_do(struct pool *p, size_t len, const char *start,
>  		   const char *func, int line);
>  void *packet_get_do(const struct pool *p, const size_t idx,
> diff --git a/passt.c b/passt.c
> index 9d21c545b9cf..8c0490782a7f 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -274,6 +274,7 @@ int main(int argc, char **argv)
>  	pasta_netns_quit_init(&c);
>  
>  	tap_sock_init(&c);
> +	vu_init(&c);
>  
>  	secret_init(&c);
>  
> @@ -367,11 +368,20 @@ loop:
>  			tcp_timer_handler(&c, ref);
>  			break;
>  		case EPOLL_TYPE_UDP:
> -			udp_buf_sock_handler(&c, ref, eventmask, &now);
> +			if (c.mode == MODE_VU)
> +				udp_vu_sock_handler(&c, ref, eventmask, &now);
> +			else
> +				udp_buf_sock_handler(&c, ref, eventmask, &now);
>  			break;
>  		case EPOLL_TYPE_PING:
>  			icmp_sock_handler(&c, ref);
>  			break;
> +		case EPOLL_TYPE_VHOST_CMD:
> +			tap_handler_vu(&c, eventmask);
> +			break;
> +		case EPOLL_TYPE_VHOST_KICK:
> +			vu_kick_cb(&c, ref);
> +			break;
>  		default:
>  			/* Can't happen */
>  			ASSERT(0);
> diff --git a/passt.h b/passt.h
> index af10d0bfe4ef..f15f28c89d39 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -22,6 +22,7 @@ union epoll_ref;
>  #include "fwd.h"
>  #include "tcp.h"
>  #include "udp.h"
> +#include "udp_vu.h"
>  #include "vhost_user.h"
>  
>  /**
> @@ -122,6 +123,7 @@ struct fqdn {
>  enum passt_modes {
>  	MODE_PASST,
>  	MODE_PASTA,
> +	MODE_VU,
>  };
>  
>  /**
> diff --git a/pcap.c b/pcap.c
> index 507be2ac1edf..d4d0ec62b944 100644
> --- a/pcap.c
> +++ b/pcap.c
> @@ -142,7 +142,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
>   *		containing packet data to write, including L2 header
>   * @iovcnt:	Number of buffers (@iov entries)
>   */
> -/* cppcheck-suppress unusedFunction */
>  void pcap_iov(const struct iovec *iov, size_t iovcnt)
>  {
>  	struct timespec now;
> diff --git a/tap.c b/tap.c
> index be272d25b642..e3274d39131a 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -58,6 +58,7 @@
>  #include "packet.h"
>  #include "tap.h"
>  #include "log.h"
> +#include "vhost_user.h"
>  
>  /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
>  static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
> @@ -78,16 +79,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
>  	struct iovec iov[2];
>  	size_t iovcnt = 0;
>  
> -	if (c->mode == MODE_PASST) {
> +	switch (c->mode) {
> +	case MODE_PASST:
>  		iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
>  		iovcnt++;
> -	}
> -
> -	iov[iovcnt].iov_base = (void *)data;
> -	iov[iovcnt].iov_len = l2len;
> -	iovcnt++;
> +		/* fall through */
> +	case MODE_PASTA:
> +		iov[iovcnt].iov_base = (void *)data;
> +		iov[iovcnt].iov_len = l2len;
> +		iovcnt++;
>  
> -	tap_send_frames(c, iov, iovcnt, 1);
> +		tap_send_frames(c, iov, iovcnt, 1);
> +		break;
> +	case MODE_VU:
> +		vu_send(c, data, l2len);
> +		break;
> +	}
>  }
>  
>  /**
> @@ -416,10 +423,19 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
>  	if (!nframes)
>  		return 0;
>  
> -	if (c->mode == MODE_PASTA)
> +	switch (c->mode) {
> +	case MODE_PASTA:
>  		m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
> -	else
> +		break;
> +	case MODE_PASST:
>  		m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
> +		break;
> +	case MODE_VU:
> +		ASSERT(0);
> +	default:

This should be an ASSERT(0) as well, yes?  We shouldn't be able to get
here without a mode being set.

> +		m = 0;
> +		break;
> +	}
>  
>  	if (m < nframes)
>  		debug("tap: failed to send %zu frames of %zu",
> @@ -1180,11 +1196,17 @@ static void tap_sock_unix_init(struct ctx *c)
>  	ev.data.u64 = ref.u64;
>  	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
>  
> -	info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
> -	info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
> -	     c->sock_path);
> -	info("or qrap, for earlier qemu versions:");
> -	info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
> +	if (c->mode == MODE_VU) {
> +		info("You can start qemu with:");
> +		info("    kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
> +		     c->sock_path);
> +	} else {
> +		info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):");
> +		info("    kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s",
> +		     c->sock_path);
> +		info("or qrap, for earlier qemu versions:");
> +		info("    ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
> +	}
>  }
>  
>  /**
> @@ -1194,8 +1216,8 @@ static void tap_sock_unix_init(struct ctx *c)
>   */
>  void tap_listen_handler(struct ctx *c, uint32_t events)
>  {
> -	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
>  	struct epoll_event ev = { 0 };
> +	union epoll_ref ref;
>  	int v = INT_MAX / 2;
>  	struct ucred ucred;
>  	socklen_t len;
> @@ -1235,7 +1257,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
>  		trace("tap: failed to set SO_SNDBUF to %i", v);
>  
>  	ref.fd = c->fd_tap;
> -	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
> +	if (c->mode == MODE_VU) {
> +		ref.type = EPOLL_TYPE_VHOST_CMD;
> +		ev.events = EPOLLIN | EPOLLRDHUP;
> +	} else {
> +		ref.type = EPOLL_TYPE_TAP_PASST;
> +		ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
> +	}
>  	ev.data.u64 = ref.u64;
>  	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
>  }
> @@ -1324,10 +1352,22 @@ void tap_sock_init(struct ctx *c)
>  
>  	pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
>  	pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
> +	if (c->mode == MODE_VU) {
> +		pool_tap4_storage.buf = NULL;
> +		pool_tap4_storage.buf_size = 0;
> +		pool_tap6_storage.buf = NULL;
> +		pool_tap6_storage.buf_size = 0;

It seems a bit of a layering violation to initialize the pool with
PACKET_INIT() then mangle its internals in the vhost-user case.  Could
we use a different PACKET_INIT invocation for the VU case instead?

> +	}
>  
>  	for (i = 0; i < TAP_SEQS; i++) {
>  		tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
>  		tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
> +		if (c->mode == MODE_VU) {
> +			tap4_l4[i].p.buf = NULL;
> +			tap4_l4[i].p.buf_size = 0;
> +			tap6_l4[i].p.buf = NULL;
> +			tap6_l4[i].p.buf_size = 0;

Same here, of course.

> +		}
>  	}
>  
>  	if (c->fd_tap != -1) { /* Passed as --fd */
> @@ -1336,12 +1376,21 @@ void tap_sock_init(struct ctx *c)
>  
>  		ASSERT(c->one_off);
>  		ref.fd = c->fd_tap;
> -		if (c->mode == MODE_PASST)
> +		switch (c->mode) {
> +		case MODE_PASST:
>  			ref.type = EPOLL_TYPE_TAP_PASST;
> -		else
> +			ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
> +			break;
> +		case MODE_PASTA:
>  			ref.type = EPOLL_TYPE_TAP_PASTA;
> +			ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
> +			break;
> +		case MODE_VU:
> +			ref.type = EPOLL_TYPE_VHOST_CMD;
> +			ev.events = EPOLLIN | EPOLLRDHUP;
> +			break;

I suspect one of our static checkers will complain at some point if we
don't put a default case with an ASSERT here.

> +		}
>  
> -		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
>  		ev.data.u64 = ref.u64;
>  		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
>  		return;
> diff --git a/tap.h b/tap.h
> index 3b2dde41ae8d..d9c6d4f57093 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
>   */
>  static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
>  {
> -	thdr->vnet_len = htonl(l2len);
> +	if (thdr)
> +		thdr->vnet_len = htonl(l2len);
>  }
>  
>  struct in_addr tap_ip4_daddr(const struct ctx *c);
> diff --git a/tcp.c b/tcp.c
> index 68524235347c..8709dd6d97bb 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -304,6 +304,7 @@
>  #include "flow_table.h"
>  #include "tcp_internal.h"
>  #include "tcp_buf.h"
> +#include "tcp_vu.h"
>  
>  #define TCP_HASH_TABLE_LOAD		70		/* % */
>  #define TCP_HASH_TABLE_SIZE		(FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
> @@ -1049,7 +1050,10 @@ static size_t tcp_fill_headers4(const struct ctx *c,
>  
>  	tcp_fill_header(th, conn, seq);
>  
> -	tcp_update_check_tcp4(iph, th);
> +	if (c->mode != MODE_VU)
> +		tcp_update_check_tcp4(iph, th);
> +	else
> +		th->check = 0;
>  
>  	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
>  
> @@ -1094,7 +1098,10 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>  
>  	tcp_fill_header(th, conn, seq);
>  
> -	tcp_update_check_tcp6(ip6h, th);
> +	if (c->mode != MODE_VU)
> +		tcp_update_check_tcp6(ip6h, th);
> +	else
> +		th->check = 0;
>  
>  	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
>  
> @@ -1362,6 +1369,9 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
>   */
>  int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_send_flag(c, conn, flags);
> +
>  	return tcp_buf_send_flag(c, conn, flags);
>  }
>  
> @@ -1808,6 +1818,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>   */
>  static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  {
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_data_from_sock(c, conn);
> +
>  	return tcp_buf_data_from_sock(c, conn);
>  }
>  
> diff --git a/tcp_vu.c b/tcp_vu.c
> new file mode 100644
> index 000000000000..f27890f63c0e
> --- /dev/null
> +++ b/tcp_vu.c
> @@ -0,0 +1,547 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later

Needs Copyright notice, author information and general description here.

> +
> +#include <errno.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +#include <netinet/ip.h>
> +
> +#include <sys/socket.h>
> +
> +#include <linux/tcp.h>
> +#include <linux/virtio_net.h>
> +
> +#include "util.h"
> +#include "ip.h"
> +#include "passt.h"
> +#include "siphash.h"
> +#include "inany.h"
> +#include "vhost_user.h"
> +#include "tcp.h"
> +#include "pcap.h"
> +#include "flow.h"
> +#include "tcp_conn.h"
> +#include "flow_table.h"
> +#include "tcp_vu.h"
> +#include "tcp_internal.h"
> +#include "checksum.h"
> +
> +#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> +#define CONN_V6(conn)		(!CONN_V4(conn))
> +
> +/**
> + * struct tcp_payload_t - TCP header and data to send segments with payload
> + * @th:		TCP header
> + * @data:	TCP data
> + */
> +struct tcp_payload_t {
> +	struct tcphdr th;
> +	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> +};

This could be common with tcp_buf.c, couldn't it?

> +
> +/**
> + * struct tcp_flags_t - TCP header and data to send zero-length
> + *                      segments (flags)
> + * @th:		TCP header
> + * @opts	TCP options
> + */
> +struct tcp_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +};

Likewise here.

> +
> +/* vhost-user */
> +static const struct virtio_net_hdr vu_header = {
> +	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
> +	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
> +};
> +
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	size_t tlen, vnet_hdrlen, l4len, optlen;
> +	struct virtio_net_hdr_mrg_rxbuf *vh;
> +	struct iovec l2_iov[TCP_NUM_IOVS];
> +	VuVirtqElement elem;
> +	struct iovec in_sg;
> +	struct ethhdr *eh;
> +	int nb_ack;
> +	int ret;
> +
> +	elem.out_num = 0;
> +	elem.out_sg = NULL;
> +	elem.in_num = 1;
> +	elem.in_sg = &in_sg;
> +	ret = vu_queue_pop(vdev, vq, &elem);
> +	if (ret < 0)
> +		return 0;
> +
> +	if (elem.in_num < 1) {
> +		err("virtio-net receive queue contains no in buffers");
> +		vu_queue_rewind(vdev, vq, 1);
> +		return 0;
> +	}
> +
> +	vh = elem.in_sg[0].iov_base;

AFAICT, the code below requires that in_sg[0] be large enough to
contain the frame, plus a virtio_net_hdr_mrg_rxbuf.  Seems like that
we should ASSERT() that somewhere.

If I'm understanding correctly that the virtio_net_hdr_mrg_rxbuf is a
kind of pseudo-header you need for each frame, I'm wondering if it
could be integrated into the tap_hdr mechanisms.

> +
> +	vh->hdr = vu_header;
> +	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);

sizeof(*vh) maybe?

> +		vh->num_buffers = htole16(1);
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	}
> +
> +	l2_iov[TCP_IOV_TAP].iov_base = NULL;
> +	l2_iov[TCP_IOV_TAP].iov_len = 0;
> +	l2_iov[TCP_IOV_ETH].iov_base = (char *)elem.in_sg[0].iov_base + vnet_hdrlen;
> +	l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +
> +	eh = l2_iov[TCP_IOV_ETH].iov_base;

You could initialise eh first, then set l2_iov[TCP_IOV_ETH] using
IOV_OF_LVALUE().

> +
> +	memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +	memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +	if (CONN_V4(conn)) {
> +		struct tcp_flags_t *payload;
> +		struct iphdr *iph;
> +		uint32_t seq;
> +
> +		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
> +						      l2_iov[TCP_IOV_ETH].iov_len;
> +		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
> +							   l2_iov[TCP_IOV_IP].iov_len;
> +

Similar thing for iph and TCP_IOV_IP.

> +		eh->h_proto = htons(ETH_P_IP);
> +
> +		iph = l2_iov[TCP_IOV_IP].iov_base;
> +		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);


Hrm... once l2_iov[TCP_IOV_IP] is set up like this, couldn't you share
the actual initialisation of the header fields with the tcp_buf code?

> +		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
> +		payload->th = (struct tcphdr){
> +			.doff = offsetof(struct tcp_flags_t, opts) / 4,
> +			.ack = 1
> +		};

Similarly the TCP header.

> +		seq = conn->seq_to_tap;
> +		ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL,
> +						seq);

Hrm, I guess you sort of are, but I wonder if we can make a longer
stretch of code common here.

> +		/* cppcheck-suppress unreadVariable */
> +		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +		tlen =  l4len + sizeof(*iph) + sizeof(struct ethhdr);

'l2len' would be the preferred name for this quantity now.

> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv4_header_psum(l4len,
> +							      IPPROTO_TCP,
> +				/* cppcheck-suppress unknownEvaluationOrder */
> +				(struct in_addr){ .s_addr = iph->saddr },

I think using locals of type struct in_addr would be cleaner here, and
avoid the cppcheck warning more elegantly.

> +				(struct in_addr){ .s_addr = iph->daddr });
> +
> +			payload->th.check = 0;
> +			payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	} else {
> +		struct tcp_flags_t *payload;
> +		struct ipv6hdr *ip6h;
> +		uint32_t seq;
> +
> +		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
> +						      l2_iov[TCP_IOV_ETH].iov_len;
> +		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
> +							   l2_iov[TCP_IOV_IP].iov_len;
> +
> +		eh->h_proto = htons(ETH_P_IPV6);
> +
> +		ip6h = l2_iov[TCP_IOV_IP].iov_base;
> +		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
> +		payload->th = (struct tcphdr){
> +			.doff = offsetof(struct tcp_flags_t, opts) / 4,
> +			.ack = 1
> +		};
> +
> +		seq = conn->seq_to_tap;
> +		ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL,
> +						seq);
> +		/* cppcheck-suppress unreadVariable */
> +		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +		tlen =  l4len + sizeof(*ip6h) + sizeof(struct ethhdr);
> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv6_header_psum(l4len,
> +							      IPPROTO_TCP,
> +							      &ip6h->saddr,
> +							      &ip6h->daddr);
> +
> +			payload->th.check = 0;
> +			payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	}
> +
> +	pcap((void *)eh, tlen);
> +
> +	tlen += vnet_hdrlen;
> +	vu_queue_fill(vdev, vq, &elem, tlen, 0);
> +	nb_ack = 1;
> +
> +	if (flags & DUP_ACK) {
> +		VuVirtqElement elem_dup;
> +		struct iovec in_sg_dup;
> +
> +		elem_dup.out_num = 0;
> +		elem_dup.out_sg = NULL;
> +		elem_dup.in_num = 1;
> +		elem_dup.in_sg = &in_sg_dup;
> +		ret = vu_queue_pop(vdev, vq, &elem_dup);
> +		if (ret == 0) {
> +			if (elem_dup.in_num < 1 || elem_dup.in_sg[0].iov_len < tlen) {
> +				vu_queue_rewind(vdev, vq, 1);
> +			} else {
> +				memcpy(elem_dup.in_sg[0].iov_base, vh, tlen);
> +				nb_ack++;
> +			}
> +		}
> +	}
> +
> +	vu_queue_flush(vdev, vq, nb_ack);
> +	vu_queue_notify(vdev, vq);
> +
> +	return 0;
> +}
> +
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> +	static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
> +	static VuVirtqElement elem[VIRTQUEUE_MAX_SIZE];
> +	static struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	size_t l2_hdrlen, vnet_hdrlen, fillsize;
> +	int s = conn->sock, v4 = CONN_V4(conn);
> +	struct iovec l2_iov[TCP_NUM_IOVS];
> +	int i, ret, iov_cnt, iov_used;
> +	struct msghdr mh_sock = { 0 };
> +	uint16_t mss = MSS_GET(conn);
> +	static int in_sg_count;
> +	uint32_t already_sent;
> +	const uint16_t *check;
> +	struct iovec *first;
> +	bool has_mrg_rxbuf;
> +	int segment_size;
> +	int num_buffers;
> +	ssize_t len;
> +
> +	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
> +		err("Got packet, but no available descriptors on RX virtq.");

flow_err() to give more useful information here.

> +		return 0;
> +	}
> +
> +	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> +
> +	if (SEQ_LT(already_sent, 0)) {
> +		/* RFC 761, section 2.1. */
> +		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> +			   conn->seq_ack_from_tap, conn->seq_to_tap);
> +		conn->seq_to_tap = conn->seq_ack_from_tap;
> +		already_sent = 0;
> +	}
> +
> +	if (!wnd_scaled || already_sent >= wnd_scaled) {
> +		conn_flag(c, conn, STALLED);
> +		conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +		return 0;
> +	}

Duplicating some of this subtle TCP core logic between the buf and vu
paths worries me quite a bit :/.

> +
> +	/* Set up buffer descriptors we'll fill completely and partially. */
> +
> +	fillsize = wnd_scaled;
> +
> +	iov_vu[0].iov_base = tcp_buf_discard;
> +	iov_vu[0].iov_len = already_sent;
> +	fillsize -= already_sent;
> +
> +	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
> +	if (has_mrg_rxbuf)
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	else
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);

"l2_hdrlen" is a bad name, since this is including headers both
"above" L2 (tcp, IP) and "below" L2 (vnet_hdrlen).

> +	if (v4)
> +		l2_hdrlen += sizeof(struct iphdr);
> +	else
> +		l2_hdrlen += sizeof(struct ipv6hdr);
> +
> +	iov_cnt = 0;
> +	in_sg_count = 0;
> +	segment_size = 0;
> +	while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 &&
> +			       in_sg_count < ARRAY_SIZE(in_sg)) {
> +
> +		elem[iov_cnt].out_num = 0;
> +		elem[iov_cnt].out_sg = NULL;
> +		elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count;
> +		elem[iov_cnt].in_sg = &in_sg[in_sg_count];
> +		ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]);
> +		if (ret < 0)
> +			break;
> +
> +		if (elem[iov_cnt].in_num < 1) {
> +			err("virtio-net receive queue contains no in buffers");
> +			goto err;
> +		}
> +		in_sg_count += elem[iov_cnt].in_num;
> +
> +		ASSERT(elem[iov_cnt].in_num == 1);
> +		ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen);
> +
> +		if (segment_size == 0) {
> +			iov_vu[iov_cnt + 1].iov_base =
> +					(char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen;
> +			iov_vu[iov_cnt + 1].iov_len =
> +					elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen;

Do we need to verify somehwere that this buffer is large enough for
all the headers?

> +		} else {
> +			iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base;
> +			iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len;
> +		}
> +
> +		if (iov_vu[iov_cnt + 1].iov_len > fillsize)
> +			iov_vu[iov_cnt + 1].iov_len = fillsize;
> +
> +		segment_size += iov_vu[iov_cnt + 1].iov_len;
> +		if (!has_mrg_rxbuf) {
> +			segment_size = 0;
> +		} else if (segment_size >= mss) {
> +			iov_vu[iov_cnt + 1].iov_len -= segment_size - mss;
> +			segment_size = 0;
> +		}

If I'm understanding this correctly, we're adjusting the size of each
TCP packet we generate to match the size of the buffer that the guest
provides.  Is that correct?  In practice, how large are these buffers
going to me - could this dramatically our typical segment size in
practice?

> +		fillsize -= iov_vu[iov_cnt + 1].iov_len;
> +
> +		iov_cnt++;
> +	}
> +	if (iov_cnt == 0)
> +		return 0;
> +
> +	ret = 0;
> +	mh_sock.msg_iov = iov_vu;
> +	mh_sock.msg_iovlen = iov_cnt + 1;
> +
> +	do
> +		len = recvmsg(s, &mh_sock, MSG_PEEK);
> +	while (len < 0 && errno == EINTR);
> +
> +	if (len < 0)
> +		goto err;
> +
> +	if (!len) {
> +		vu_queue_rewind(vdev, vq, iov_cnt);
> +		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> +			ret = tcp_vu_send_flag(c, conn, FIN | ACK);
> +			if (ret) {
> +				tcp_rst(c, conn);
> +				return ret;
> +			}
> +
> +			conn_event(c, conn, TAP_FIN_SENT);
> +		}
> +
> +		return 0;
> +	}
> +
> +	len -= already_sent;
> +	if (len <= 0) {
> +		conn_flag(c, conn, STALLED);
> +		vu_queue_rewind(vdev, vq, iov_cnt);
> +		return 0;
> +	}
> +
> +	conn_flag(c, conn, ~STALLED);
> +
> +	/* Likely, some new data was acked too. */
> +	tcp_update_seqack_wnd(c, conn, 0, NULL);
> +
> +	/* initialize headers */
> +	iov_used = 0;
> +	num_buffers = 0;
> +	check = NULL;
> +	segment_size = 0;
> +	for (i = 0; i < iov_cnt && len; i++) {
> +
> +		if (segment_size == 0)
> +			first = &iov_vu[i + 1];
> +
> +		if (iov_vu[i + 1].iov_len > (size_t)len)
> +			iov_vu[i + 1].iov_len = len;
> +
> +		len -= iov_vu[i + 1].iov_len;
> +		iov_used++;
> +
> +		segment_size += iov_vu[i + 1].iov_len;
> +		num_buffers++;
> +
> +		if (segment_size >= mss || len == 0 ||
> +		    i + 1 == iov_cnt || !has_mrg_rxbuf) {
> +			char *base = (char *)first->iov_base - l2_hdrlen;
> +			size_t size = first->iov_len + l2_hdrlen;
> +			struct virtio_net_hdr_mrg_rxbuf *vh;
> +			struct ethhdr *eh;
> +			size_t l4len;
> +
> +			vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
> +
> +			vh->hdr = vu_header;
> +			if (has_mrg_rxbuf)
> +				vh->num_buffers = htole16(num_buffers);
> +
> +			l2_iov[TCP_IOV_TAP].iov_base = NULL;
> +			l2_iov[TCP_IOV_TAP].iov_len = 0;
> +			l2_iov[TCP_IOV_ETH].iov_base = base + vnet_hdrlen;
> +			l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +
> +			eh = l2_iov[TCP_IOV_ETH].iov_base;

Again, this could be done a bit more neatly using IOV_OF_LVALUE().

Also... IIUC, the only purpose of l2_iov[] here is to communicate the
various pieces of the packet to the header initiailizing functions.
That seems like a poor choice compared to specifically typed pointers
for each piece (which, yes, I realise would mean a bunch of changes in
the existing code).

> +			memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +			memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +			/* initialize header */
> +			if (v4) {
> +				struct tcp_payload_t *payload;
> +				struct iphdr *iph;
> +
> +				l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
> +								      l2_iov[TCP_IOV_ETH].iov_len;
> +				l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +				l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
> +									   l2_iov[TCP_IOV_IP].iov_len;
> +
> +
> +				eh->h_proto = htons(ETH_P_IP);
> +
> +				iph = l2_iov[TCP_IOV_IP].iov_base;
> +				*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
> +				payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
> +				payload->th = (struct tcphdr){
> +					.doff = offsetof(struct tcp_payload_t, data) / 4,
> +					.ack = 1
> +				};
> +
> +				l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov,
> +								segment_size,
> +								len ? check : NULL,
> +								conn->seq_to_tap);
> +				l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv4_header_psum(l4len,
> +									      IPPROTO_TCP,
> +				/* cppcheck-suppress unknownEvaluationOrder */
> +				(struct in_addr){ .s_addr = iph->saddr },
> +				(struct in_addr){ .s_addr = iph->daddr });
> +
> +					first->iov_base = &payload->th;
> +					first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr);
> +					payload->th.check = 0;
> +					payload->th.check = csum_iov(first, num_buffers, sum);
> +				}
> +
> +				check = &iph->check;
> +			} else {
> +				struct tcp_payload_t *payload;
> +				struct ipv6hdr *ip6h;
> +
> +				l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
> +								      l2_iov[TCP_IOV_ETH].iov_len;
> +				l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +				l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
> +									   l2_iov[TCP_IOV_IP].iov_len;
> +
> +
> +				eh->h_proto = htons(ETH_P_IPV6);
> +
> +				ip6h = l2_iov[TCP_IOV_IP].iov_base;
> +				*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +				payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
> +				payload->th = (struct tcphdr){
> +					.doff = offsetof(struct tcp_payload_t, data) / 4,
> +					.ack = 1
> +				};
> +;
> +				l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov,
> +								segment_size,
> +								NULL, conn->seq_to_tap);
> +				l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv6_header_psum(l4len,
> +									      IPPROTO_TCP,
> +									      &ip6h->saddr,
> +									      &ip6h->daddr);
> +
> +					first->iov_base = &payload->th;
> +					first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr);
> +
> +					payload->th.check = 0;
> +					payload->th.check = csum_iov(first, num_buffers, sum);
> +				}
> +			}
> +
> +			/* set iov for pcap logging */
> +			first->iov_base = eh;
> +			first->iov_len = size - vnet_hdrlen;
> +
> +			pcap_iov(first, num_buffers);
> +
> +			/* set iov_len for vu_queue_fill_by_index(); */
> +
> +			first->iov_base = base;
> +			first->iov_len = size;
> +
> +			conn->seq_to_tap += segment_size;
> +
> +			segment_size = 0;
> +			num_buffers = 0;
> +		}
> +	}
> +
> +	/* release unused buffers */
> +	vu_queue_rewind(vdev, vq, iov_cnt - iov_used);
> +
> +	/* send packets */
> +	for (i = 0; i < iov_used; i++)
> +		vu_queue_fill(vdev, vq, &elem[i], iov_vu[i + 1].iov_len, i);
> +
> +	vu_queue_flush(vdev, vq, iov_used);
> +	vu_queue_notify(vdev, vq);
> +
> +	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +
> +	return 0;
> +err:
> +	vu_queue_rewind(vdev, vq, iov_cnt);
> +
> +	if (errno != EAGAIN && errno != EWOULDBLOCK) {
> +		ret = -errno;
> +		tcp_rst(c, conn);
> +	}
> +
> +	return ret;
> +}
> diff --git a/tcp_vu.h b/tcp_vu.h
> new file mode 100644
> index 000000000000..b8c57a543ed5
> --- /dev/null
> +++ b/tcp_vu.h
> @@ -0,0 +1,9 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#ifndef TCP_VU_H
> +#define TCP_VU_H
> +
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
> +
> +#endif  /*TCP_VU_H */
> diff --git a/udp.c b/udp.c
> index dba75d7fecbd..90d58b691c83 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -121,9 +121,7 @@
>  #include "tap.h"
>  #include "pcap.h"
>  #include "log.h"
> -
> -#define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
> -#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
> +#include "udp_internal.h"
>  
>  /**
>   * struct udp_tap_port - Port tracking based on tap-facing source port
> @@ -171,20 +169,8 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)
>  
>  /* Static buffers */
>  
> -/**
> - * struct udp_payload_t - UDP header and data for inbound messages
> - * @uh:		UDP header
> - * @data:	UDP data
> - */
> -static struct udp_payload_t {
> -	struct udphdr uh;
> -	char data[USHRT_MAX - sizeof(struct udphdr)];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -udp_payload[UDP_MAX_FRAMES];
> +/* UDP header and data for inbound messages */
> +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
>  
>  /* Ethernet header for IPv4 frames */
>  static struct ethhdr udp4_eth_hdr;
> @@ -239,11 +225,11 @@ static struct mmsghdr	udp6_l2_mh_sock		[UDP_MAX_FRAMES];
>  /* recvmmsg()/sendmmsg() data for "spliced" connections */
>  static struct iovec	udp_iov_splice		[UDP_MAX_FRAMES];
>  
> -static struct sockaddr_in udp4_localname = {
> +struct sockaddr_in udp4_localname = {
>  	.sin_family = AF_INET,
>  	.sin_addr = IN4ADDR_LOOPBACK_INIT,
>  };
> -static struct sockaddr_in6 udp6_localname = {
> +struct sockaddr_in6 udp6_localname = {
>  	.sin6_family = AF_INET6,
>  	.sin6_addr = IN6ADDR_LOOPBACK_INIT,
>  };
> @@ -564,11 +550,11 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
>   *
>   * Return: size of IPv4 payload (UDP header + data)
>   */
> -static size_t udp_update_hdr4(const struct ctx *c,
> -			      struct iphdr *ip4h, const struct sockaddr_in *s_in,
> -			      struct udp_payload_t *bp,
> -			      in_port_t dstport, size_t dlen,
> -			      const struct timespec *now)
> +size_t udp_update_hdr4(const struct ctx *c,
> +		       struct iphdr *ip4h, const struct sockaddr_in *s_in,
> +		       struct udp_payload_t *bp,
> +		       in_port_t dstport, size_t dlen,
> +		       const struct timespec *now)
>  {
>  	const struct in_addr dst = c->ip4.addr_seen;
>  	in_port_t srcport = ntohs(s_in->sin_port);
> @@ -603,7 +589,10 @@ static size_t udp_update_hdr4(const struct ctx *c,
>  	bp->uh.source = s_in->sin_port;
>  	bp->uh.dest = htons(dstport);
>  	bp->uh.len = htons(l4len);
> -	csum_udp4(&bp->uh, src, dst, bp->data, dlen);
> +	if (c->mode != MODE_VU)
> +		csum_udp4(&bp->uh, src, dst, bp->data, dlen);
> +	else
> +		bp->uh.check = 0;
>  
>  	return l4len;
>  }
> @@ -620,11 +609,11 @@ static size_t udp_update_hdr4(const struct ctx *c,
>   *
>   * Return: size of IPv6 payload (UDP header + data)
>   */
> -static size_t udp_update_hdr6(const struct ctx *c,
> -			      struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
> -			      struct udp_payload_t *bp,
> -			      in_port_t dstport, size_t dlen,
> -			      const struct timespec *now)
> +size_t udp_update_hdr6(const struct ctx *c,
> +		       struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
> +		       struct udp_payload_t *bp,
> +		       in_port_t dstport, size_t dlen,
> +		       const struct timespec *now)
>  {
>  	const struct in6_addr *src = &s_in6->sin6_addr;
>  	const struct in6_addr *dst = &c->ip6.addr_seen;
> @@ -675,7 +664,10 @@ static size_t udp_update_hdr6(const struct ctx *c,
>  	bp->uh.source = s_in6->sin6_port;
>  	bp->uh.dest = htons(dstport);
>  	bp->uh.len = ip6h->payload_len;
> -	csum_udp6(&bp->uh, src, dst, bp->data, dlen);
> +	if (c->mode != MODE_VU)
> +		csum_udp6(&bp->uh, src, dst, bp->data, dlen);
> +	else
> +		bp->uh.check = 0xffff; /* zero checksum is invalid with IPv6 */
>  
>  	return l4len;
>  }
> diff --git a/udp_internal.h b/udp_internal.h
> new file mode 100644
> index 000000000000..898d1e103cb8
> --- /dev/null
> +++ b/udp_internal.h
> @@ -0,0 +1,39 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + * Copyright (c) 2021 Red Hat GmbH
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#ifndef UDP_INTERNAL_H
> +#define UDP_INTERNAL_H
> +
> +#define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
> +#define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
> +
> +extern struct sockaddr_in udp4_localname;
> +extern struct sockaddr_in6 udp6_localname;
> +
> +/**
> + * struct udp_payload_t - UDP header and data for inbound messages
> + * @uh:		UDP header
> + * @data:	UDP data
> + */
> +struct udp_payload_t {
> +	struct udphdr uh;
> +	char data[USHRT_MAX - sizeof(struct udphdr)];
> +#ifdef __AVX2__
> +} __attribute__ ((packed, aligned(32)));
> +#else
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> +#endif
> +
> +size_t udp_update_hdr4(const struct ctx *c,
> +		       struct iphdr *ip4h, const struct sockaddr_in *s_in,
> +		       struct udp_payload_t *bp,
> +		       in_port_t dstport, size_t dlen,
> +		       const struct timespec *now);
> +size_t udp_update_hdr6(const struct ctx *c,
> +		       struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
> +		       struct udp_payload_t *bp,
> +		       in_port_t dstport, size_t dlen,
> +		       const struct timespec *now);
> +#endif /* UDP_INTERNAL_H */
> diff --git a/udp_vu.c b/udp_vu.c
> new file mode 100644
> index 000000000000..deb649028153
> --- /dev/null
> +++ b/udp_vu.c
> @@ -0,0 +1,237 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#include <unistd.h>
> +#include <net/ethernet.h>
> +#include <net/if.h>
> +#include <netinet/in.h>
> +#include <netinet/ip.h>
> +#include <netinet/udp.h>
> +#include <stdint.h>
> +#include <stddef.h>
> +#include <sys/uio.h>
> +#include <linux/virtio_net.h>
> +
> +#include "checksum.h"
> +#include "util.h"
> +#include "ip.h"
> +#include "passt.h"
> +#include "pcap.h"
> +#include "log.h"
> +#include "vhost_user.h"
> +#include "udp_internal.h"
> +#include "udp_vu.h"
> +
> +/* vhost-user */
> +static const struct virtio_net_hdr vu_header = {
> +	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
> +	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
> +};
> +
> +static struct iovec     iov_vu		[VIRTQUEUE_MAX_SIZE];
> +static VuVirtqElement	elem		[VIRTQUEUE_MAX_SIZE];
> +static struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
> +static int in_sg_count;
> +
> +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref,
> +			 uint32_t events, const struct timespec *now)
> +{
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	bool has_mrg_rxbuf, v6 = ref.udp.v6;
> +	in_port_t dstport = ref.udp.port;
> +	size_t l2_hdrlen, vnet_hdrlen;
> +	struct msghdr msg;
> +	int i, virtqueue_max;

As with TCP, it kind of feels like we should be able to share more of
the skeleton of this path.  I'm worried about the amount of logic
duplication we have, in terms of maintainability.

> +
> +	if (c->no_udp || !(events & EPOLLIN))
> +		return;
> +
> +	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
> +	if (has_mrg_rxbuf) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		virtqueue_max = VIRTQUEUE_MAX_SIZE;
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +		virtqueue_max = 1;
> +	}
> +	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr);
> +
> +	if (v6) {
> +		l2_hdrlen += sizeof(struct ipv6hdr);
> +
> +		udp6_localname.sin6_port = htons(dstport);
> +		msg.msg_name = &udp6_localname;
> +		msg.msg_namelen = sizeof(udp6_localname);
> +	} else {
> +		l2_hdrlen += sizeof(struct iphdr);
> +
> +		udp4_localname.sin_port = htons(dstport);
> +		msg.msg_name = &udp4_localname;
> +		msg.msg_namelen = sizeof(udp4_localname);
> +	}
> +
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = 0;
> +
> +	for (i = 0; i < UDP_MAX_FRAMES; i++) {
> +		struct virtio_net_hdr_mrg_rxbuf *vh;
> +		size_t size, fillsize, remaining;
> +		int iov_cnt, iov_used;
> +		struct ethhdr *eh;
> +		ssize_t data_len;
> +		size_t l4len;
> +		char *base;
> +
> +		fillsize = USHRT_MAX;
> +		iov_cnt = 0;
> +		in_sg_count = 0;
> +		while (fillsize && iov_cnt < virtqueue_max &&
> +				in_sg_count < ARRAY_SIZE(in_sg)) {
> +			int ret;
> +
> +			elem[iov_cnt].out_num = 0;
> +			elem[iov_cnt].out_sg = NULL;
> +			elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count;
> +			elem[iov_cnt].in_sg = &in_sg[in_sg_count];
> +			ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]);
> +			if (ret < 0)
> +				break;
> +			in_sg_count += elem[iov_cnt].in_num;
> +
> +			if (elem[iov_cnt].in_num < 1) {
> +				err("virtio-net receive queue contains no in buffers");
> +				vu_queue_rewind(vdev, vq, iov_cnt);
> +				return;
> +			}
> +			ASSERT(elem[iov_cnt].in_num == 1);
> +			ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen);
> +
> +			if (iov_cnt == 0) {
> +				base = elem[iov_cnt].in_sg[0].iov_base;
> +				size = elem[iov_cnt].in_sg[0].iov_len;
> +
> +				/* keep space for the headers */
> +				iov_vu[0].iov_base = base + l2_hdrlen;
> +				iov_vu[0].iov_len = size - l2_hdrlen;
> +			} else {
> +				iov_vu[iov_cnt].iov_base = elem[iov_cnt].in_sg[0].iov_base;
> +				iov_vu[iov_cnt].iov_len = elem[iov_cnt].in_sg[0].iov_len;
> +			}
> +
> +			if (iov_vu[iov_cnt].iov_len > fillsize)
> +				iov_vu[iov_cnt].iov_len = fillsize;
> +
> +			fillsize -= iov_vu[iov_cnt].iov_len;
> +
> +			iov_cnt++;
> +		}
> +		if (iov_cnt == 0)
> +			break;
> +
> +		msg.msg_iov = iov_vu;
> +		msg.msg_iovlen = iov_cnt;
> +
> +		data_len = recvmsg(ref.fd, &msg, 0);
> +		if (data_len < 0) {
> +			vu_queue_rewind(vdev, vq, iov_cnt);
> +			return;
> +		}
> +
> +		/* restore original values */
> +		iov_vu[0].iov_base = base;
> +		iov_vu[0].iov_len = size;
> +
> +		/* count the numbers of buffer filled by recvmsg() */
> +		iov_used = iov_count(iov_vu, iov_cnt, l2_hdrlen + data_len,
> +				     &remaining);
> +		ASSERT(iov_used <= iov_cnt);
> +		if (iov_used > 0) {
> +			ASSERT(iov_vu[iov_used - 1].iov_len >= remaining);
> +			iov_vu[iov_used - 1].iov_len = remaining;
> +			/* update size */
> +			if (iov_used - 1 == 0)
> +				size = iov_vu[0].iov_len;
> +		}
> +
> +		/* release unused buffers */
> +		vu_queue_rewind(vdev, vq, iov_cnt - iov_used);
> +
> +		/* vnet_header */
> +		vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
> +		vh->hdr = vu_header;
> +		if (has_mrg_rxbuf)
> +			vh->num_buffers = htole16(iov_used);
> +
> +		/* ethernet header */
> +		eh = (struct ethhdr *)(base + vnet_hdrlen);
> +
> +		memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +		memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +		/* initialize header */
> +		if (v6) {
> +			struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
> +			struct udp_payload_t *bp = (struct udp_payload_t *)(ip6h + 1);
> +
> +			eh->h_proto = htons(ETH_P_IPV6);
> +
> +			*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);
> +
> +			l4len = udp_update_hdr6(c, ip6h, &udp6_localname, bp,
> +						dstport, data_len, now);
> +			if (*c->pcap) {
> +				uint32_t sum;
> +
> +				sum = proto_ipv6_header_psum(l4len, IPPROTO_UDP,
> +							     &ip6h->saddr,
> +							     &ip6h->daddr);
> +
> +				iov_vu[0].iov_base = &bp->uh;
> +				iov_vu[0].iov_len = size - l2_hdrlen +
> +						    sizeof(bp->uh);
> +				bp->uh.check = 0; /* by default, set to 0xffff */
> +				bp->uh.check = csum_iov(iov_vu, iov_used, sum);
> +			}
> +		} else {
> +			struct iphdr *iph = (struct iphdr *)(eh + 1);
> +			struct udp_payload_t *bp = (struct udp_payload_t *)(iph + 1);
> +
> +			eh->h_proto = htons(ETH_P_IP);
> +
> +			*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);
> +
> +			l4len = udp_update_hdr4(c, iph, &udp4_localname, bp,
> +						dstport, data_len, now);
> +			if (*c->pcap) {
> +				uint32_t sum;
> +
> +				sum = proto_ipv4_header_psum(l4len, IPPROTO_UDP,
> +				/* cppcheck-suppress unknownEvaluationOrder */
> +				(struct in_addr){ .s_addr = iph->saddr },
> +				(struct in_addr){ .s_addr = iph->daddr });
> +
> +				iov_vu[0].iov_base = &bp->uh;
> +				iov_vu[0].iov_len = size - l2_hdrlen +
> +						    sizeof(bp->uh);
> +				bp->uh.check = csum_iov(iov_vu, iov_used, sum);
> +			}
> +		}
> +
> +		/* set iov for pcap logging */
> +		iov_vu[0].iov_base = base + vnet_hdrlen;
> +		iov_vu[0].iov_len = size - vnet_hdrlen;
> +		pcap_iov(iov_vu, iov_used);
> +
> +		/* set iov_len for vu_queue_fill_by_index(); */
> +		iov_vu[0].iov_base = base;
> +		iov_vu[0].iov_len = size;
> +
> +		/* send packets */
> +		for (i = 0; i < iov_used; i++)
> +			vu_queue_fill(vdev, vq, &elem[i], iov_vu[i].iov_len, i);
> +
> +		vu_queue_flush(vdev, vq, iov_used);
> +		vu_queue_notify(vdev, vq);
> +	}
> +}
> diff --git a/udp_vu.h b/udp_vu.h
> new file mode 100644
> index 000000000000..e01ce047ee0a
> --- /dev/null
> +++ b/udp_vu.h
> @@ -0,0 +1,8 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#ifndef UDP_VU_H
> +#define UDP_VU_H
> +
> +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref,
> +			 uint32_t events, const struct timespec *now);
> +#endif /* UDP_VU_H */
> diff --git a/vhost_user.c b/vhost_user.c
> index 4ac0a3e53499..a3d156558359 100644
> --- a/vhost_user.c
> +++ b/vhost_user.c
> @@ -28,7 +28,6 @@
>  
>  #define VHOST_USER_VERSION 1
>  
> -/* cppcheck-suppress unusedFunction */
>  void vu_print_capabilities(void)
>  {
>  	printf("{\n");
> @@ -332,7 +331,6 @@ static bool map_ring(VuDev *vdev, VuVirtq *vq)
>  	return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
>  }
>  
> -/* cppcheck-suppress unusedFunction */
>  int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start,
>  			  const char *func, int line)
>  {
> @@ -545,7 +543,6 @@ static int vu_wait_queue(const VuVirtq *vq)
>  	return 0;
>  }
>  
> -/* cppcheck-suppress unusedFunction */
>  int vu_send(const struct ctx *c, const void *buf, size_t size)
>  {
>  	VuDev *vdev = (VuDev *)&c->vdev;
> @@ -730,7 +727,6 @@ static void vu_handle_tx(VuDev *vdev, int index)
>  	}
>  }
>  
> -/* cppcheck-suppress unusedFunction */
>  void vu_kick_cb(struct ctx *c, union epoll_ref ref)
>  {
>  	VuDev *vdev = &c->vdev;
> @@ -927,7 +923,6 @@ static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg)
>  	return false;
>  }
>  
> -/* cppcheck-suppress unusedFunction */
>  void vu_init(struct ctx *c)
>  {
>  	int i;
> @@ -988,7 +983,6 @@ static void vu_cleanup(VuDev *vdev)
>   * @c:		Execution context
>   * @events:	epoll events
>   */
> -/* cppcheck-suppress unusedFunction */
>  void tap_handler_vu(struct ctx *c, uint32_t events)
>  {
>  	VuDev *dev = &c->vdev;
> diff --git a/virtio.c b/virtio.c
> index 5d58e56204b3..8c651070bba5 100644
> --- a/virtio.c
> +++ b/virtio.c
> @@ -367,7 +367,6 @@ void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len)
>  	vu_queue_detach_element(dev, vq, index, len);
>  }
>  
> -/* cppcheck-suppress unusedFunction */
>  bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
>  {
>  	(void)dev;

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson