On Thu, Aug 15, 2024 at 05:50:22PM +0200, Laurent Vivier wrote: > Add vhost_user.c and vhost_user.h that define the functions needed > to implement vhost-user backend. > > Signed-off-by: Laurent Vivier > --- > Makefile | 4 +- > iov.c | 1 - > vhost_user.c | 1271 ++++++++++++++++++++++++++++++++++++++++++++++++++ > vhost_user.h | 202 ++++++++ > virtio.c | 5 - > virtio.h | 2 +- > 6 files changed, 1476 insertions(+), 9 deletions(-) > create mode 100644 vhost_user.c > create mode 100644 vhost_user.h > > diff --git a/Makefile b/Makefile > index f171c7955ac9..4ccefffacfde 100644 > --- a/Makefile > +++ b/Makefile > @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) > PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ > icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ > ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ > - tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c virtio.c > + tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c > QRAP_SRCS = qrap.c > SRCS = $(PASST_SRCS) $(QRAP_SRCS) > > @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ > flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ > lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ > siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ > - udp.h udp_flow.h util.h virtio.h > + udp.h udp_flow.h util.h vhost_user.h virtio.h > HEADERS = $(PASST_HEADERS) seccomp.h > > C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; > diff --git a/iov.c b/iov.c > index 3f9e229a305f..3741db21790f 100644 > --- a/iov.c > +++ b/iov.c > @@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, > * > * Returns: The number of bytes successfully copied. > */ > -/* cppcheck-suppress unusedFunction */ > size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, > size_t offset, const void *buf, size_t bytes) > { > diff --git a/vhost_user.c b/vhost_user.c > new file mode 100644 > index 000000000000..c4cd25fae84e > --- /dev/null > +++ b/vhost_user.c > @@ -0,0 +1,1271 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * > + * vhost-user API, command management and virtio interface > + * > + * Copyright Red Hat > + * Author: Laurent Vivier > + */ > +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c > + * licensed under the following terms: > + * > + * Copyright IBM, Corp. 2007 > + * Copyright (c) 2016 Red Hat, Inc. > + * > + * Authors: > + * Anthony Liguori > + * Marc-André Lureau > + * Victor Kaplansky > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or > + * later. See the COPYING file in the top-level directory. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "util.h" > +#include "passt.h" > +#include "tap.h" > +#include "vhost_user.h" > + > +/* vhost-user version we are compatible with */ > +#define VHOST_USER_VERSION 1 > + > +/** > + * vu_print_capabilities() - print vhost-user capabilities > + * this is part of the vhost-user backend > + * convention. 
> + */ > +/* cppcheck-suppress unusedFunction */ > +void vu_print_capabilities(void) > +{ > + info("{"); > + info(" \"type\": \"net\""); > + info("}"); > + exit(EXIT_SUCCESS); > +} > + > +/** > + * vu_request_to_string() - convert a vhost-user request number to its name > + * @req: request number > + * > + * Return: the name of request number > + */ > +static const char *vu_request_to_string(unsigned int req) > +{ > + if (req < VHOST_USER_MAX) { > +#define REQ(req) [req] = #req > + static const char * const vu_request_str[] = { Adding VHOST_USER_MAX as an explicit array length can act as a useful sanity check here. > + REQ(VHOST_USER_NONE), > + REQ(VHOST_USER_GET_FEATURES), > + REQ(VHOST_USER_SET_FEATURES), > + REQ(VHOST_USER_SET_OWNER), > + REQ(VHOST_USER_RESET_OWNER), > + REQ(VHOST_USER_SET_MEM_TABLE), > + REQ(VHOST_USER_SET_LOG_BASE), > + REQ(VHOST_USER_SET_LOG_FD), > + REQ(VHOST_USER_SET_VRING_NUM), > + REQ(VHOST_USER_SET_VRING_ADDR), > + REQ(VHOST_USER_SET_VRING_BASE), > + REQ(VHOST_USER_GET_VRING_BASE), > + REQ(VHOST_USER_SET_VRING_KICK), > + REQ(VHOST_USER_SET_VRING_CALL), > + REQ(VHOST_USER_SET_VRING_ERR), > + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), > + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), > + REQ(VHOST_USER_GET_QUEUE_NUM), > + REQ(VHOST_USER_SET_VRING_ENABLE), > + REQ(VHOST_USER_SEND_RARP), > + REQ(VHOST_USER_NET_SET_MTU), > + REQ(VHOST_USER_SET_BACKEND_REQ_FD), > + REQ(VHOST_USER_IOTLB_MSG), > + REQ(VHOST_USER_SET_VRING_ENDIAN), > + REQ(VHOST_USER_GET_CONFIG), > + REQ(VHOST_USER_SET_CONFIG), > + REQ(VHOST_USER_POSTCOPY_ADVISE), > + REQ(VHOST_USER_POSTCOPY_LISTEN), > + REQ(VHOST_USER_POSTCOPY_END), > + REQ(VHOST_USER_GET_INFLIGHT_FD), > + REQ(VHOST_USER_SET_INFLIGHT_FD), > + REQ(VHOST_USER_GPU_SET_SOCKET), > + REQ(VHOST_USER_VRING_KICK), > + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), > + REQ(VHOST_USER_ADD_MEM_REG), > + REQ(VHOST_USER_REM_MEM_REG), > + REQ(VHOST_USER_MAX), > + }; > +#undef REQ > + return vu_request_str[req]; > + } > + > + return "unknown"; > +} > + > +/** > + * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual > + * address > + * @dev: Vhost-user device > + * @qemu_addr: front-end userspace address > + * > + * Return: the memory address in our process virtual address space. > + */ > +static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) > +{ > + unsigned int i; > + > + /* Find matching memory region. */ > + for (i = 0; i < dev->nregions; i++) { > + const struct vu_dev_region *r = &dev->regions[i]; > + > + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { > + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ > + return (void *)(qemu_addr - r->qva + r->mmap_addr + > + r->mmap_offset); > + } > + } > + > + return NULL; > +} > + > +/** > + * vmsg_close_fds() - Close all file descriptors of a given message > + * @vmsg: Vhost-user message with the list of the file descriptors > + */ > +static void vmsg_close_fds(const struct vhost_user_msg *vmsg) > +{ > + int i; > + > + for (i = 0; i < vmsg->fd_num; i++) > + close(vmsg->fds[i]); > +} > + > +/** > + * vu_remove_watch() - Remove a file descriptor from an our passt epoll > + * file descriptor > + * @vdev: Vhost-user device > + * @fd: file descriptor to remove > + */ > +static void vu_remove_watch(const struct vu_dev *vdev, int fd) > +{ > + (void)vdev; > + (void)fd; Uh... this doesn't seem to do what the function comment says. 
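If a later patch is going to fill these in, even a rough sketch would make that clearer. Untested, and assuming the epoll fd is reachable through the device's context as elsewhere in passt (the 'epollfd' member name is my guess), I'd expect something along these lines:

	static void vu_remove_watch(const struct vu_dev *vdev, int fd)
	{
		/* Stop polling the kick eventfd; EPOLL_CTL_DEL takes no
		 * event argument, and a failure is harmless if @fd was
		 * never registered.
		 */
		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
	}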
> +} > + > +/** > + * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags > + * and fd_num > + * @vmsg: Vhost-user message > + * @val: 64bit value to reply > + */ > +static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) > +{ > + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ > + vmsg->hdr.size = sizeof(vmsg->payload.u64); > + vmsg->payload.u64 = val; > + vmsg->fd_num = 0; > +} > + > +/** > + * vu_message_read_default() - Read incoming vhost-user message from the > + * front-end > + * @conn_fd: Vhost-user command socket > + * @vmsg: Vhost-user message > + * > + * Return: -1 there is an error, > + * 0 if recvmsg() has been interrupted, > + * 1 if a message has been received > + */ > +static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) > +{ > + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * > + sizeof(int))] = { 0 }; > + struct iovec iov = { > + .iov_base = (char *)vmsg, > + .iov_len = VHOST_USER_HDR_SIZE, > + }; > + struct msghdr msg = { > + .msg_iov = &iov, > + .msg_iovlen = 1, > + .msg_control = control, > + .msg_controllen = sizeof(control), > + }; > + ssize_t ret, sz_payload; > + struct cmsghdr *cmsg; > + size_t fd_size; > + > + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); > + if (ret < 0) { > + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) > + return 0; > + return -1; > + } > + > + vmsg->fd_num = 0; > + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; > + cmsg = CMSG_NXTHDR(&msg, cmsg)) { > + if (cmsg->cmsg_level == SOL_SOCKET && > + cmsg->cmsg_type == SCM_RIGHTS) { > + fd_size = cmsg->cmsg_len - CMSG_LEN(0); > + ASSERT(fd_size / sizeof(int) <= > + VHOST_MEMORY_BASELINE_NREGIONS); IIUC, this could be tripped by a bug in the peer (qemu?) rather than in our own code. In which case I think a die() would be more appropriate than an ASSERT(). > + vmsg->fd_num = fd_size / sizeof(int); > + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); > + break; > + } > + } > + > + sz_payload = vmsg->hdr.size; > + if ((size_t)sz_payload > sizeof(vmsg->payload)) { > + die("Error: too big message request: %d," > + " size: vmsg->size: %zd, " > + "while sizeof(vmsg->payload) = %zu", > + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); > + } > + > + if (sz_payload) { > + do { > + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); > + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); > + > + if (ret < sz_payload) > + die_perror("Error while reading"); > + } > + > + return 1; > +} > + > +/** > + * vu_message_write() - send a message to the front-end > + * @conn_fd: Vhost-user command socket > + * @vmsg: Vhost-user message > + * > + * #syscalls:vu sendmsg > + */ > +static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) > +{ > + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; > + struct iovec iov = { > + .iov_base = (char *)vmsg, > + .iov_len = VHOST_USER_HDR_SIZE, > + }; > + struct msghdr msg = { > + .msg_iov = &iov, > + .msg_iovlen = 1, > + .msg_control = control, > + }; > + const uint8_t *p = (uint8_t *)vmsg; > + int rc; > + > + memset(control, 0, sizeof(control)); I think this is redundant with the { 0 } initialiser. 
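(The rule is C99 6.7.8 p21: once any initialiser is given, the remaining elements are zeroed as if the object had static storage duration. A throwaway test, if anyone doubts it:

	#include <assert.h>
	#include <string.h>

	int main(void)
	{
		char a[64] = { 0 };	/* one explicit element, rest implicit */
		char b[64];

		memset(b, 0, sizeof(b));
		assert(!memcmp(a, b, sizeof(a)));	/* all 64 bytes match */
		return 0;
	}

so the memset() can just go.)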
> + ASSERT(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); > + if (vmsg->fd_num > 0) { > + size_t fdsize = vmsg->fd_num * sizeof(int); > + struct cmsghdr *cmsg; > + > + msg.msg_controllen = CMSG_SPACE(fdsize); > + cmsg = CMSG_FIRSTHDR(&msg); > + cmsg->cmsg_len = CMSG_LEN(fdsize); > + cmsg->cmsg_level = SOL_SOCKET; > + cmsg->cmsg_type = SCM_RIGHTS; > + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); > + } else { > + msg.msg_controllen = 0; I believe since you have a C99 initialiser on 'msg', fields not explicitly mentioned will be initialised to 0, making this redundant. > + } > + > + do { > + rc = sendmsg(conn_fd, &msg, 0); > + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); > + > + if (vmsg->hdr.size) { > + do { > + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, > + vmsg->hdr.size); Is there any particular reason to send the payload as a separate write(), rather than including it as a second entry in the iov to sendmsg above? Or indeed as part of the first entry, since AFAICT the payload is contiguous with the header. > + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); > + } Checking for short writes seems like a good idea. Even if it shouldn't ever happen, a die() would be much easier to debug than some cryptic failure because of truncated data. > + if (rc <= 0) > + die_perror("Error while writing"); > +} > + > +/** > + * vu_send_reply() - Update message flags and send it to front-end > + * @conn_fd: Vhost-user command socket > + * @vmsg: Vhost-user message > + */ > +static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) > +{ > + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; > + msg->hdr.flags |= VHOST_USER_VERSION; > + msg->hdr.flags |= VHOST_USER_REPLY_MASK; > + > + vu_message_write(conn_fd, msg); > +} > + > +/** > + * vu_get_features_exec() - Provide back-end features bitmask to front-end > + * @vmsg: Vhost-user message > + * > + * Return: true as a reply is requested > + */ > +static bool vu_get_features_exec(struct vhost_user_msg *msg) > +{ > + uint64_t features = > + 1ULL << VIRTIO_F_VERSION_1 | > + 1ULL << VIRTIO_NET_F_MRG_RXBUF | > + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; > + > + vmsg_set_reply_u64(msg, features); > + > + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); > + > + return true; > +} > + > +/** > + * vu_set_enable_all_rings() - Enable/disable all the virtqueues > + * @vdev: Vhost-user device > + * @enable: New virtqueues state > + */ > +static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) > +{ > + uint16_t i; > + > + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) > + vdev->vq[i].enable = enable; > +} > + > +/** > + * vu_set_features_exec() - Enable features of the back-end > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_features_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + debug("u64: 0x%016"PRIx64, msg->payload.u64); A number of these debug() messages look like they'd be pretty cryptic, with no indication of which part of passt they're coming from. This one is especially bad.
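A file-local wrapper would fix that wholesale. Just a sketch, assuming debug() keeps its printf-style signature:

	#define vu_debug(fmt, ...) debug("vhost-user: " fmt, ##__VA_ARGS__)

	/* e.g.: */
	vu_debug("SET_FEATURES: 0x%016"PRIx64, msg->payload.u64);

At least then the reader knows which subsystem, and ideally which request, the value belongs to.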
> + > + vdev->features = msg->payload.u64; > + /* We only support devices conforming to VIRTIO 1.0 or > + * later > + */ > + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) > + die("virtio legacy devices aren't supported by passt"); > + > + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) > + vu_set_enable_all_rings(vdev, true); > + > + /* virtio-net features */ > + > + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || You checked this is set above, making this test redundant, no? > + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { > + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); > + } else { > + vdev->hdrlen = sizeof(struct virtio_net_hdr); > + } > + > + return false; > +} > + > +/** > + * vu_set_owner_exec() - Session start flag, do nothing in our case > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_owner_exec(void) > +{ > + return false; > +} > + > +/** > + * map_ring() - Convert ring front-end (QEMU) addresses to our process > + * virtual address space. > + * @vdev: Vhost-user device > + * @vq: Virtqueue > + * > + * Return: true if ring cannot be mapped to our address space > + */ > +static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) > +{ > + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); > + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); > + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); > + > + debug("Setting virtq addresses:"); > + debug(" vring_desc at %p", (void *)vq->vring.desc); > + debug(" vring_used at %p", (void *)vq->vring.used); > + debug(" vring_avail at %p", (void *)vq->vring.avail); > + > + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); > +} > + > +/** > + * vu_packet_check_range() - Check if a given memory zone is contained in > + * a mapped guest memory region > + * @buf: Array of the available memory regions > + * @offset: Offset of data range in packet descriptor > + * @size: Length of desired data range > + * @start: Start of the packet descriptor > + * > + * Return: 0 if the zone in a mapped memory region, -1 otherwise > + */ > +/* cppcheck-suppress unusedFunction */ > +int vu_packet_check_range(void *buf, size_t offset, size_t len, > + const char *start) > +{ > + struct vu_dev_region *dev_region; > + > + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { > + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ > + char *m = (char *)dev_region->mmap_addr; > + > + if (m <= start && > + start + offset + len < m + dev_region->mmap_offset + > + dev_region->size) > + return 0; > + } > + > + return -1; > +} > + > +/** > + * vu_set_mem_table_exec() - Sets the memory map regions to be able to > + * translate the vring addresses. > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + * > + * #syscalls:vu mmap munmap > + */ > +static bool vu_set_mem_table_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + struct vhost_user_memory m = msg->payload.memory, *memory = &m; Is there a reason to take a copy of the message, rather than just referencing into msg as passed? > + unsigned int i; > + > + for (i = 0; i < vdev->nregions; i++) { > + struct vu_dev_region *r = &vdev->regions[i]; > + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ > + void *mm = (void *)r->mmap_addr; > + > + if (mm) > + munmap(mm, r->size + r->mmap_offset); Do we actually ever need to change the mapping of the regions? If not we can avoid this unmapping loop. 
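If we never expect a second SET_MEM_TABLE, a guard would both document and enforce that - just a sketch:

	if (vdev->nregions)
		die("vhost-user: SET_MEM_TABLE sent more than once");

and then the munmap() loop can go. If the front-end can legitimately resend the table, though, the loop has to stay.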
> + } > + vdev->nregions = memory->nregions; > + > + debug("Nregions: %u", memory->nregions); > + for (i = 0; i < vdev->nregions; i++) { > + struct vhost_user_memory_region *msg_region = &memory->regions[i]; > + struct vu_dev_region *dev_region = &vdev->regions[i]; > + void *mmap_addr; > + > + debug("Region %d", i); > + debug(" guest_phys_addr: 0x%016"PRIx64, > + msg_region->guest_phys_addr); > + debug(" memory_size: 0x%016"PRIx64, > + msg_region->memory_size); > + debug(" userspace_addr 0x%016"PRIx64, > + msg_region->userspace_addr); > + debug(" mmap_offset 0x%016"PRIx64, > + msg_region->mmap_offset); > + > + dev_region->gpa = msg_region->guest_phys_addr; > + dev_region->size = msg_region->memory_size; > + dev_region->qva = msg_region->userspace_addr; > + dev_region->mmap_offset = msg_region->mmap_offset; > + > + /* We don't use offset argument of mmap() since the > + * mapped address has to be page aligned, and we use huge > + * pages. We do what now? > + */ > + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, > + PROT_READ | PROT_WRITE, MAP_SHARED | > + MAP_NORESERVE, msg->fds[i], 0); > + > + if (mmap_addr == MAP_FAILED) > + die_perror("region mmap error"); > + > + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; > + debug(" mmap_addr: 0x%016"PRIx64, > + dev_region->mmap_addr); > + > + close(msg->fds[i]); > + } > + > + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { > + if (vdev->vq[i].vring.desc) { > + if (map_ring(vdev, &vdev->vq[i])) > + die("remapping queue %d during setmemtable", i); > + } > + } > + > + return false; > +} > + > +/** > + * vu_set_vring_num_exec() - Set the size of the queue (vring size) > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_num_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + unsigned int idx = msg->payload.state.index; > + unsigned int num = msg->payload.state.num; > + > + debug("State.index: %u", idx); > + debug("State.num: %u", num); > + vdev->vq[idx].vring.num = num; > + > + return false; > +} > + > +/** > + * vu_set_vring_addr_exec() - Set the addresses of the vring > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_addr_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; Again, any reason to copy the message? > + struct vu_virtq *vq = &vdev->vq[vra->index]; > + > + debug("vhost_vring_addr:"); > + debug(" index: %d", vra->index); > + debug(" flags: %d", vra->flags); > + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); > + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); > + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); > + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); > + > + vq->vra = *vra; .. and then copy it again? 
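Unless I'm missing something, the whole dance could collapse to a pointer into the message, keeping only the copy we actually want:

	const struct vhost_vring_addr *vra = &msg->payload.addr;
	struct vu_virtq *vq = &vdev->vq[vra->index];

	/* ... debug() calls reading through vra, as now ... */

	vq->vra = *vra;	/* the single copy that needs to persist */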
> + vq->vring.flags = vra->flags; > + vq->vring.log_guest_addr = vra->log_guest_addr; > + > + if (map_ring(vdev, vq)) > + die("Invalid vring_addr message"); > + > + vq->used_idx = le16toh(vq->vring.used->idx); > + > + if (vq->last_avail_idx != vq->used_idx) { > + debug("Last avail index != used index: %u != %u", > + vq->last_avail_idx, vq->used_idx); > + } > + > + return false; > +} > +/** > + * vu_set_vring_base_exec() - Sets the next index to use for descriptors > + * in this vring > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_base_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + unsigned int idx = msg->payload.state.index; > + unsigned int num = msg->payload.state.num; > + > + debug("State.index: %u", idx); > + debug("State.num: %u", num); > + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; > + > + return false; > +} > + > +/** > + * vu_get_vring_base_exec() - Stops the vring and returns the current > + * descriptor index or indices > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as a reply is requested > + */ > +static bool vu_get_vring_base_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + unsigned int idx = msg->payload.state.index; > + > + debug("State.index: %u", idx); > + msg->payload.state.num = vdev->vq[idx].last_avail_idx; > + msg->hdr.size = sizeof(msg->payload.state); > + > + vdev->vq[idx].started = false; > + > + if (vdev->vq[idx].call_fd != -1) { > + close(vdev->vq[idx].call_fd); > + vdev->vq[idx].call_fd = -1; > + } > + if (vdev->vq[idx].kick_fd != -1) { > + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); > + close(vdev->vq[idx].kick_fd); > + vdev->vq[idx].kick_fd = -1; > + } > + > + return true; > +} > + > +/** > + * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor > + * @vdev: vhost-user device > + * @fd: file descriptor to add > + */ > +static void vu_set_watch(const struct vu_dev *vdev, int fd) > +{ > + (void)vdev; > + (void)fd; As with remove, this doesn't appear to do what the function comment says. Are these placeholders? A TODO comment would make that clearer, if so. > +} > + > +/** > + * vu_wait_queue() - wait new free entries in the virtqueue s/wait/wait for/? 
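Re the placeholder question above: if these are stubs for a later patch, marking them as such costs nothing, e.g.:

	static void vu_set_watch(const struct vu_dev *vdev, int fd)
	{
		/* TODO: register @fd in the passt epoll set once the
		 * data path is wired up.
		 */
		(void)vdev;
		(void)fd;
	}

and likewise for the remove side.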
> + * @vq: virtqueue to wait on > + */ > +static int vu_wait_queue(const struct vu_virtq *vq) > +{ > + eventfd_t kick_data; > + ssize_t rc; > + int status; > + > + /* wait the kernel to put new entries in the queue */ > + status = fcntl(vq->kick_fd, F_GETFL); > + if (status == -1) > + return -1; > + > + status = fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK); > + if (status == -1) > + return -1; > + rc = eventfd_read(vq->kick_fd, &kick_data); > + status = fcntl(vq->kick_fd, F_SETFL, status); > + if (status == -1) > + return -1; > + > + if (rc == -1) > + return -1; > + > + return 0; > +} > + > +/** > + * vu_send() - Send a buffer to the front-end using the RX virtqueue > + * @vdev: vhost-user device > + * @buf: address of the buffer > + * @size: size of the buffer > + * > + * Return: number of bytes sent, -1 if there is an error > + */ > +/* cppcheck-suppress unusedFunction */ > +int vu_send(struct vu_dev *vdev, const void *buf, size_t size) > +{ > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; > + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; > + size_t lens[VIRTQUEUE_MAX_SIZE]; > + __virtio16 *num_buffers_ptr = NULL; > + size_t hdrlen = vdev->hdrlen; > + int in_sg_count = 0; > + size_t offset = 0; > + int i = 0, j; > + > + debug("vu_send size %zu hdrlen %zu", size, hdrlen); > + > + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { > + err("Got packet, but no available descriptors on RX virtq."); > + return 0; > + } > + > + while (offset < size) { > + size_t len; > + int total; > + int ret; > + > + total = 0; > + > + if (i == ARRAY_SIZE(elem) || > + in_sg_count == ARRAY_SIZE(in_sg)) { > + err("virtio-net unexpected long buffer chain"); > + goto err; > + } > + > + elem[i].out_num = 0; > + elem[i].out_sg = NULL; > + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; > + elem[i].in_sg = &in_sg[in_sg_count]; > + > + ret = vu_queue_pop(vdev, vq, &elem[i]); > + if (ret < 0) { > + if (vu_wait_queue(vq) != -1) > + continue; > + if (i) { > + err("virtio-net unexpected empty queue: " > + "i %d mergeable %d offset %zd, size %zd, " > + "features 0x%" PRIx64, > + i, vu_has_feature(vdev, > + VIRTIO_NET_F_MRG_RXBUF), > + offset, size, vdev->features); > + } > + offset = -1; > + goto err; > + } > + in_sg_count += elem[i].in_num; Initially I thought this would consume the entire in_sg array on the first loop iteration, but I guess vu_queue_pop() reduces in_num from the value we initialise above. > + if (elem[i].in_num < 1) { I realise it doesn't really matter in this context, but it makes more sense to me for this check to go _before_ we use in_num to update in_sg_count. > + err("virtio-net receive queue contains no in buffers"); > + vu_queue_detach_element(vq); > + offset = -1; > + goto err; > + } > + > + if (i == 0) { > + struct virtio_net_hdr hdr = { > + .flags = VIRTIO_NET_HDR_F_DATA_VALID, > + .gso_type = VIRTIO_NET_HDR_GSO_NONE, > + }; > + > + ASSERT(offset == 0); > + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); Is this necessarily our bug, or could it be caused by the peer giving unreasonably small buffers? If the latter, then a die() would make more sense.
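i.e. if it can be the peer's fault, something like:

	if (elem[i].in_sg[0].iov_len < hdrlen)
		die("virtio-net: first RX buffer too small for header"
		    " (%zu < %zu)",
		    elem[i].in_sg[0].iov_len, hdrlen);

gives a broken front-end a meaningful error, rather than tripping an assertion in our own code.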
> + > + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, > + &hdr, sizeof(hdr)); > + > + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + > + len); > + > + total += hdrlen; > + } > + > + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, total, > + (char *)buf + offset, size - offset); > + > + total += len; > + offset += len; > + > + /* If buffers can't be merged, at this point we > + * must have consumed the complete packet. > + * Otherwise, drop it. > + */ > + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && > + offset < size) { > + vu_queue_unpop(vq); > + goto err; > + } > + > + lens[i] = total; > + i++; > + } > + > + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) > + *num_buffers_ptr = htole16(i); > + > + for (j = 0; j < i; j++) { > + debug("filling total %zd idx %d", lens[j], j); > + vu_queue_fill(vq, &elem[j], lens[j], j); > + } > + > + vu_queue_flush(vq, i); > + vu_queue_notify(vdev, vq); > + > + debug("vhost-user sent %zu", offset); > + > + return offset; > +err: > + for (j = 0; j < i; j++) > + vu_queue_detach_element(vq); > + > + return offset; > +} > + > +/** > + * vu_handle_tx() - Receive data from the TX virtqueue > + * @vdev: vhost-user device > + * @index: index of the virtqueue > + */ > +static void vu_handle_tx(struct vu_dev *vdev, int index, > + const struct timespec *now) > +{ > + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; > + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; > + struct vu_virtq *vq = &vdev->vq[index]; > + int hdrlen = vdev->hdrlen; > + int out_sg_count; > + int count; > + > + if (!VHOST_USER_IS_QUEUE_TX(index)) { > + debug("index %d is not a TX queue", index); > + return; > + } > + > + tap_flush_pools(); > + > + count = 0; > + out_sg_count = 0; > + while (1) { > + int ret; > + > + > + elem[count].out_num = 1; > + elem[count].out_sg = &out_sg[out_sg_count]; > + elem[count].in_num = 0; > + elem[count].in_sg = NULL; > + ret = vu_queue_pop(vdev, vq, &elem[count]); > + if (ret < 0) > + break; > + out_sg_count += elem[count].out_num; > + > + if (elem[count].out_num < 1) { > + debug("virtio-net header not in first element"); > + break; > + } > + ASSERT(elem[count].out_num == 1); > + > + tap_add_packet(vdev->context, > + elem[count].out_sg[0].iov_len - hdrlen, > + (char *)elem[count].out_sg[0].iov_base + hdrlen); > + count++; > + } > + tap_handler(vdev->context, now); > + > + if (count) { > + int i; > + > + for (i = 0; i < count; i++) > + vu_queue_fill(vq, &elem[i], 0, i); > + vu_queue_flush(vq, count); > + vu_queue_notify(vdev, vq); > + } > +} > + > +/** > + * vu_kick_cb() - Called on a kick event to start to receive data > + * @vdev: vhost-user device > + * @ref: epoll reference information Missing @now argument > + */ > +/* cppcheck-suppress unusedFunction */ > +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, > + const struct timespec *now) > +{ > + eventfd_t kick_data; > + ssize_t rc; > + int idx; > + > + for (idx = 0; idx < VHOST_USER_MAX_QUEUES; idx++) > + if (vdev->vq[idx].kick_fd == ref.fd) Eventually I think it would be preferable to put the vq index directly into the epoll ref, rather than having to scan through the queues for the right one. I'm ok with that being a follow up change, though. 
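For illustration, the shape I have in mind - field names and widths made up, not the actual epoll_ref layout:

	union vu_kick_ref {
		struct {
			int32_t fd;		/* kick eventfd */
			uint32_t queue;		/* vq index: no scan needed */
		};
		uint64_t u64;	/* fits in epoll_event.data.u64 */
	};

so vu_kick_cb() could index &vdev->vq[ref.queue] directly.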
> + break; > + > + if (idx == VHOST_USER_MAX_QUEUES) > + return; > + > + rc = eventfd_read(ref.fd, &kick_data); > + if (rc == -1) > + die_perror("kick eventfd_read()"); > + > + debug("Got kick_data: %016"PRIx64" idx:%d", > + kick_data, idx); > + if (VHOST_USER_IS_QUEUE_TX(idx)) > + vu_handle_tx(vdev, idx, now); > +} > + > +/** > + * vu_check_queue_msg_file() - Check if a message is valid, > + * close fds if NOFD bit is set > + * @vmsg: Vhost-user message > + */ > +static void vu_check_queue_msg_file(struct vhost_user_msg *msg) > +{ > + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + > + if (idx >= VHOST_USER_MAX_QUEUES) > + die("Invalid queue index: %u", idx); > + > + if (nofd) { > + vmsg_close_fds(msg); > + return; > + } > + > + if (msg->fd_num != 1) > + die("Invalid fds in request: %d", msg->hdr.request); > +} > + > +/** > + * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers > + * to the vring > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_kick_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + > + debug("u64: 0x%016"PRIx64, msg->payload.u64); > + > + vu_check_queue_msg_file(msg); > + > + if (vdev->vq[idx].kick_fd != -1) { > + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); > + close(vdev->vq[idx].kick_fd); > + } > + > + vdev->vq[idx].kick_fd = nofd ? -1 : msg->fds[0]; > + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); > + > + vdev->vq[idx].started = true; > + > + if (vdev->vq[idx].kick_fd != -1 && VHOST_USER_IS_QUEUE_TX(idx)) { > + vu_set_watch(vdev, vdev->vq[idx].kick_fd); > + debug("Waiting for kicks on fd: %d for vq: %d", > + vdev->vq[idx].kick_fd, idx); > + } > + > + return false; > +} > + > +/** > + * vu_set_vring_call_exec() - Set the event file descriptor to signal when > + * buffers are used > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_call_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + > + debug("u64: 0x%016"PRIx64, msg->payload.u64); > + > + vu_check_queue_msg_file(msg); > + > + if (vdev->vq[idx].call_fd != -1) > + close(vdev->vq[idx].call_fd); > + > + vdev->vq[idx].call_fd = nofd ? 
-1 : msg->fds[0]; > + > + /* in case of I/O hang after reconnecting */ > + if (vdev->vq[idx].call_fd != -1) > + eventfd_write(msg->fds[0], 1); > + > + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); > + > + return false; > +} > + > +/** > + * vu_set_vring_err_exec() - Set the event file descriptor to signal when > + * error occurs > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_err_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + > + debug("u64: 0x%016"PRIx64, msg->payload.u64); > + > + vu_check_queue_msg_file(msg); > + > + if (vdev->vq[idx].err_fd != -1) { > + close(vdev->vq[idx].err_fd); > + vdev->vq[idx].err_fd = -1; > + } > + > + /* cppcheck-suppress redundantAssignment */ > + vdev->vq[idx].err_fd = nofd ? -1 : msg->fds[0]; > + > + return false; > +} > + > +/** > + * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features > + * to the front-end > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as a reply is requested > + */ > +static bool vu_get_protocol_features_exec(struct vhost_user_msg *msg) > +{ > + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; > + > + vmsg_set_reply_u64(msg, features); > + > + return true; > +} > + > +/** > + * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_protocol_features_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + uint64_t features = msg->payload.u64; > + > + debug("u64: 0x%016"PRIx64, features); > + > + vdev->protocol_features = msg->payload.u64; > + > + if (vu_has_protocol_feature(vdev, > + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && > + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || > + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { > + /* > + * The use case for using messages for kick/call is simulation, to make > + * the kick and call synchronous. To actually get that behaviour, both > + * of the other features are required. > + * Theoretically, one could use only kick messages, or do them without > + * having F_REPLY_ACK, but too many (possibly pending) messages on the > + * socket will eventually cause the master to hang, to avoid this in > + * scenarios where not desired enforce that the settings are in a way > + * that actually enables the simulation case. 
> + */ > + die("F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); > + } > + > + return false; > +} > + > +/** > + * vu_get_queue_num_exec() - Tell how many queues we support > + * @vmsg: Vhost-user message > + * > + * Return: true as a reply is requested > + */ > +static bool vu_get_queue_num_exec(struct vhost_user_msg *msg) > +{ > + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); > + return true; > +} > + > +/** > + * vu_set_vring_enable_exec() - Enable or disable corresponding vring > + * @vdev: Vhost-user device > + * @vmsg: Vhost-user message > + * > + * Return: false as no reply is requested > + */ > +static bool vu_set_vring_enable_exec(struct vu_dev *vdev, > + struct vhost_user_msg *msg) > +{ > + unsigned int enable = msg->payload.state.num; > + unsigned int idx = msg->payload.state.index; > + > + debug("State.index: %u", idx); > + debug("State.enable: %u", enable); > + > + if (idx >= VHOST_USER_MAX_QUEUES) > + die("Invalid vring_enable index: %u", idx); > + > + vdev->vq[idx].enable = enable; > + return false; > +} > + > +/** > + * vu_init() - Initialize vhost-user device structure > + * @c: execution context > + * @vdev: vhost-user device > + */ > +/* cppcheck-suppress unusedFunction */ > +void vu_init(struct ctx *c, struct vu_dev *vdev) > +{ > + int i; > + > + vdev->context = c; > + vdev->hdrlen = 0; > + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { > + vdev->vq[i] = (struct vu_virtq){ > + .call_fd = -1, > + .kick_fd = -1, > + .err_fd = -1, > + .notification = true, > + }; > + } > +} > + > +/** > + * vu_cleanup() - Reset vhost-user device > + * @vdev: vhost-user device > + */ > +void vu_cleanup(struct vu_dev *vdev) > +{ > + unsigned int i; > + > + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { > + struct vu_virtq *vq = &vdev->vq[i]; > + > + vq->started = false; > + vq->notification = true; > + > + if (vq->call_fd != -1) { > + close(vq->call_fd); > + vq->call_fd = -1; > + } > + if (vq->err_fd != -1) { > + close(vq->err_fd); > + vq->err_fd = -1; > + } > + if (vq->kick_fd != -1) { > + vu_remove_watch(vdev, vq->kick_fd); > + close(vq->kick_fd); > + vq->kick_fd = -1; > + } > + > + vq->vring.desc = 0; > + vq->vring.used = 0; > + vq->vring.avail = 0; > + } > + vdev->hdrlen = 0; > + > + for (i = 0; i < vdev->nregions; i++) { > + const struct vu_dev_region *r = &vdev->regions[i]; > + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ > + void *m = (void *)r->mmap_addr; > + > + if (m) > + munmap(m, r->size + r->mmap_offset); > + } > + vdev->nregions = 0; > +} > + > +/** > + * vu_sock_reset() - Reset connection socket > + * @vdev: vhost-user device > + */ > +static void vu_sock_reset(struct vu_dev *vdev) > +{ > + (void)vdev; Placeholder? > +} > + > +/** > + * tap_handler_vu() - Packet handler for vhost-user > + * @vdev: vhost-user device > + * @fd: vhost-user message socket > + * @events: epoll events > + */ > +/* cppcheck-suppress unusedFunction */ > +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events) I think this name is misleading. While we are re-using fd_tap for the vhost-user control socket, this is quite unlike most of the other tap_handler functions: those are generally related to getting a new packet from the "tap" interface - it's the main entry point into the data path from the guest. This is, instead, a control path function, more akin to tap_listen_handler() (also not a great name). Maybe "vu_socket_handler()" or "vu_control_handler()"? 
tap_handler_vu() I'd expect to be the function that handles notifications on the queue receiving packets from the guest. > +{ > + struct vhost_user_msg msg = { 0 }; > + bool need_reply, reply_requested; > + int ret; > + > + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { > + vu_sock_reset(vdev); > + return; > + } > + > + ret = vu_message_read_default(fd, &msg); > + if (ret < 0) > + die_perror("Error while recvmsg"); > + if (ret == 0) { > + vu_sock_reset(vdev); > + return; > + } > + debug("================ Vhost user message ================"); > + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), > + msg.hdr.request); > + debug("Flags: 0x%x", msg.hdr.flags); > + debug("Size: %u", msg.hdr.size); > + > + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; > + switch (msg.hdr.request) { > + case VHOST_USER_GET_FEATURES: > + reply_requested = vu_get_features_exec(&msg); > + break; > + case VHOST_USER_SET_FEATURES: > + reply_requested = vu_set_features_exec(vdev, &msg); > + break; > + case VHOST_USER_GET_PROTOCOL_FEATURES: > + reply_requested = vu_get_protocol_features_exec(&msg); > + break; > + case VHOST_USER_SET_PROTOCOL_FEATURES: > + reply_requested = vu_set_protocol_features_exec(vdev, &msg); > + break; > + case VHOST_USER_GET_QUEUE_NUM: > + reply_requested = vu_get_queue_num_exec(&msg); > + break; > + case VHOST_USER_SET_OWNER: > + reply_requested = vu_set_owner_exec(); > + break; > + case VHOST_USER_SET_MEM_TABLE: > + reply_requested = vu_set_mem_table_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_NUM: > + reply_requested = vu_set_vring_num_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_ADDR: > + reply_requested = vu_set_vring_addr_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_BASE: > + reply_requested = vu_set_vring_base_exec(vdev, &msg); > + break; > + case VHOST_USER_GET_VRING_BASE: > + reply_requested = vu_get_vring_base_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_KICK: > + reply_requested = vu_set_vring_kick_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_CALL: > + reply_requested = vu_set_vring_call_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_ERR: > + reply_requested = vu_set_vring_err_exec(vdev, &msg); > + break; > + case VHOST_USER_SET_VRING_ENABLE: > + reply_requested = vu_set_vring_enable_exec(vdev, &msg); > + break; > + case VHOST_USER_NONE: > + vu_cleanup(vdev); > + return; > + default: > + die("Unhandled request: %d", msg.hdr.request); > + } > + > + if (!reply_requested && need_reply) { > + msg.payload.u64 = 0; > + msg.hdr.flags = 0; > + msg.hdr.size = sizeof(msg.payload.u64); > + msg.fd_num = 0; > + reply_requested = true; > + } > + > + if (reply_requested) > + vu_send_reply(fd, &msg); > +} > diff --git a/vhost_user.h b/vhost_user.h > new file mode 100644 > index 000000000000..135856dc2873 > --- /dev/null > +++ b/vhost_user.h > @@ -0,0 +1,202 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright Red Hat > + * Author: Laurent Vivier > + * > + * vhost-user API, command management and virtio interface > + */ > + > +/* some parts from subprojects/libvhost-user/libvhost-user.h */ > + > +#ifndef VHOST_USER_H > +#define VHOST_USER_H > + > +#include "virtio.h" > +#include "iov.h" > + > +#define VHOST_USER_F_PROTOCOL_FEATURES 30 > + > +#define VHOST_MEMORY_BASELINE_NREGIONS 8 > + > +/** > + * enum vhost_user_protocol_feature - List of available vhost-user features > + */ > +enum vhost_user_protocol_feature { > + VHOST_USER_PROTOCOL_F_MQ = 0, > + 
VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, > + VHOST_USER_PROTOCOL_F_RARP = 2, > + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, > + VHOST_USER_PROTOCOL_F_NET_MTU = 4, > + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, > + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, > + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, > + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, > + VHOST_USER_PROTOCOL_F_CONFIG = 9, > + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, > + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, > + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, > + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, > + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, > + > + VHOST_USER_PROTOCOL_F_MAX > +}; > + > +/** > + * enum vhost_user_request - list of available vhost-user request > + */ > +enum vhost_user_request { > + VHOST_USER_NONE = 0, > + VHOST_USER_GET_FEATURES = 1, > + VHOST_USER_SET_FEATURES = 2, > + VHOST_USER_SET_OWNER = 3, > + VHOST_USER_RESET_OWNER = 4, > + VHOST_USER_SET_MEM_TABLE = 5, > + VHOST_USER_SET_LOG_BASE = 6, > + VHOST_USER_SET_LOG_FD = 7, > + VHOST_USER_SET_VRING_NUM = 8, > + VHOST_USER_SET_VRING_ADDR = 9, > + VHOST_USER_SET_VRING_BASE = 10, > + VHOST_USER_GET_VRING_BASE = 11, > + VHOST_USER_SET_VRING_KICK = 12, > + VHOST_USER_SET_VRING_CALL = 13, > + VHOST_USER_SET_VRING_ERR = 14, > + VHOST_USER_GET_PROTOCOL_FEATURES = 15, > + VHOST_USER_SET_PROTOCOL_FEATURES = 16, > + VHOST_USER_GET_QUEUE_NUM = 17, > + VHOST_USER_SET_VRING_ENABLE = 18, > + VHOST_USER_SEND_RARP = 19, > + VHOST_USER_NET_SET_MTU = 20, > + VHOST_USER_SET_BACKEND_REQ_FD = 21, > + VHOST_USER_IOTLB_MSG = 22, > + VHOST_USER_SET_VRING_ENDIAN = 23, > + VHOST_USER_GET_CONFIG = 24, > + VHOST_USER_SET_CONFIG = 25, > + VHOST_USER_CREATE_CRYPTO_SESSION = 26, > + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, > + VHOST_USER_POSTCOPY_ADVISE = 28, > + VHOST_USER_POSTCOPY_LISTEN = 29, > + VHOST_USER_POSTCOPY_END = 30, > + VHOST_USER_GET_INFLIGHT_FD = 31, > + VHOST_USER_SET_INFLIGHT_FD = 32, > + VHOST_USER_GPU_SET_SOCKET = 33, > + VHOST_USER_VRING_KICK = 35, > + VHOST_USER_GET_MAX_MEM_SLOTS = 36, > + VHOST_USER_ADD_MEM_REG = 37, > + VHOST_USER_REM_MEM_REG = 38, > + VHOST_USER_MAX > +}; > + > +/** > + * struct vhost_user_header - Vhost-user message header > + * @request: Request type of the message > + * @flags: Request flags > + * @size: The following payload size > + */ > +struct vhost_user_header { > + enum vhost_user_request request; > + > +#define VHOST_USER_VERSION_MASK 0x3 > +#define VHOST_USER_REPLY_MASK (0x1 << 2) > +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) > + uint32_t flags; > + uint32_t size; /* the following payload size */ > +} __attribute__ ((__packed__)); > + > +/** > + * struct vhost_user_memory_region - Front-end shared memory region information > + * @guest_phys_addr: Guest physical address of the region > + * @memory_size: Memory size > + * @userspace_addr: front-end (QEMU) userspace address > + * @mmap_offset: region offset in the shared memory area > + */ > +struct vhost_user_memory_region { > + uint64_t guest_phys_addr; > + uint64_t memory_size; > + uint64_t userspace_addr; > + uint64_t mmap_offset; > +}; > + > +/** > + * struct vhost_user_memory - List of all the shared memory regions > + * @nregions: Number of memory regions > + * @padding: Padding > + * @regions: Memory regions list > + */ > +struct vhost_user_memory { > + uint32_t nregions; > + uint32_t padding; > + struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; > +}; > + > +/** > + * union vhost_user_payload - Vhost-user message payload > + * @u64: 64bit payload > + * @state: 
Vring state payload > + * @addr: Vring addresses payload > + * vhost_user_memory: Memory regions information payload > + */ > +union vhost_user_payload { > +#define VHOST_USER_VRING_IDX_MASK 0xff > +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) > + uint64_t u64; > + struct vhost_vring_state state; > + struct vhost_vring_addr addr; > + struct vhost_user_memory memory; > +}; > + > +/** > + * struct vhost_user_msg - Vhost-use message > + * @hdr: Message header > + * @payload: Message payload > + * @fds: File descriptors associated with the message > + * in the ancillary data. > + * (shared memory or event file descriptors) > + * @fd_num: Number of file descriptors > + */ > +struct vhost_user_msg { > + struct vhost_user_header hdr; > + union vhost_user_payload payload; > + > + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; > + int fd_num; > +} __attribute__ ((__packed__)); > +#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header) > + > +/* index of the RX virtqueue */ > +#define VHOST_USER_RX_QUEUE 0 > +/* index of the TX virtqueue */ > +#define VHOST_USER_TX_QUEUE 1 > + > +/* in case of multiqueue, we RX and TX queues are interleaved */ > +#define VHOST_USER_IS_QUEUE_TX(n) (n % 2) > +#define VHOST_USER_IS_QUEUE_RX(n) (!(n % 2)) > + > +/** > + * vu_queue_enabled - Return state of a virtqueue > + * @vq: Virtqueue to check > + * > + * Return: true if the virqueue is enabled, false otherwise > + */ > +static inline bool vu_queue_enabled(const struct vu_virtq *vq) > +{ > + return vq->enable; > +} > + > +/** > + * vu_queue_started - Return state of a virtqueue > + * @vq: Virtqueue to check > + * > + * Return: true if the virqueue is started, false otherwise > + */ > +static inline bool vu_queue_started(const struct vu_virtq *vq) > +{ > + return vq->started; > +} > + > +int vu_send(struct vu_dev *vdev, const void *buf, size_t size); > +void vu_print_capabilities(void); > +void vu_init(struct ctx *c, struct vu_dev *vdev); > +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, > + const struct timespec *now); > +void vu_cleanup(struct vu_dev *vdev); > +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events); > +#endif /* VHOST_USER_H */ > diff --git a/virtio.c b/virtio.c > index 8354f6052aee..d02e6e04701d 100644 > --- a/virtio.c > +++ b/virtio.c > @@ -323,7 +323,6 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq) > * @dev: Vhost-user device > * @vq: Virtqueue > */ > -/* cppcheck-suppress unusedFunction */ > void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) > { > if (!vq->vring.avail) > @@ -500,7 +499,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i > * > * Return: -1 if there is an error, 0 otherwise > */ > -/* cppcheck-suppress unusedFunction */ > int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) > { > unsigned int head; > @@ -550,7 +548,6 @@ void vu_queue_detach_element(struct vu_virtq *vq) > * vu_queue_unpop() - Push back the previously popped element from the virqueue > * @vq: Virtqueue > */ > -/* cppcheck-suppress unusedFunction */ > void vu_queue_unpop(struct vu_virtq *vq) > { > vq->last_avail_idx--; > @@ -618,7 +615,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, > * @len: Size of the element > * @idx: Used ring entry index > */ > -/* cppcheck-suppress unusedFunction */ > void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, > unsigned int len, unsigned int idx) > { > @@ -642,7 +638,6 @@ static inline void 
vring_used_idx_set(struct vu_virtq *vq, uint16_t val) > * @vq: Virtqueue > * @count: Number of entry to flush > */ > -/* cppcheck-suppress unusedFunction */ > void vu_queue_flush(struct vu_virtq *vq, unsigned int count) > { > uint16_t old, new; > diff --git a/virtio.h b/virtio.h > index af9cadc990b9..242e788e07e9 100644 > --- a/virtio.h > +++ b/virtio.h > @@ -106,6 +106,7 @@ struct vu_dev_region { > * @hdrlen: Virtio -net header length > */ > struct vu_dev { > + struct ctx *context; > uint32_t nregions; > struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; > struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; > @@ -162,7 +163,6 @@ static inline bool vu_has_feature(const struct vu_dev *vdev, > * > * Return: True if the feature is available > */ > -/* cppcheck-suppress unusedFunction */ > static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, > unsigned int fbit) > { -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson