public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 0/3] Add vhost-net kernel support
@ 2025-04-01 11:38 Eugenio Pérez
  2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Eugenio Pérez @ 2025-04-01 11:38 UTC (permalink / raw)
  To: passt-dev; +Cc: jmaloy, sbrivio, lvivier, dgibson

vhost-net is a kernel device that allows to read packets from a tap
device using virtio queues instead of regular read(2) and write(2).
This enables a more eficient packet processing, as the memory can
be written directly by the kernel to the userspace and back, instead
of wasting bandwith on copies, and it enables to batch many packets
in a single notification (through eventfds) both tx and rx.

---
RFC: At this moment only to receive packets from the vhost kernel to
the pasta binary is supported. It does not even support to refill more
rx descriptors so the kernel can keep receiving packets! Just for early
review to make sure we're on the same page.

Eugenio Pérez (3):
  tap: specify the packet pool
  tap: implement vhost_call_cb
  tap: add die() on vhost error

 epoll_type.h |   4 +
 passt.c      |   8 ++
 passt.h      |  13 +-
 tap.c        | 366 +++++++++++++++++++++++++++++++++++++++++++++++++--
 tap.h        |   6 +-
 vu_common.c  |   7 +-
 6 files changed, 387 insertions(+), 17 deletions(-)

-- 
2.49.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/3] tap: specify the packet pool
  2025-04-01 11:38 [PATCH 0/3] Add vhost-net kernel support Eugenio Pérez
@ 2025-04-01 11:38 ` Eugenio Pérez
  2025-04-03  1:07   ` David Gibson
  2025-04-01 11:38 ` [PATCH 2/3] tap: implement vhost_call_cb Eugenio Pérez
  2025-04-01 11:38 ` [PATCH 3/3] tap: add die() on vhost error Eugenio Pérez
  2 siblings, 1 reply; 13+ messages in thread
From: Eugenio Pérez @ 2025-04-01 11:38 UTC (permalink / raw)
  To: passt-dev; +Cc: jmaloy, sbrivio, lvivier, dgibson

In vhost-kernel we need to allocate a storage for rx so kernel can
write the buffers async. We need to tell tap_add_packet where to find
them.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 tap.c       | 34 ++++++++++++++++++++++++----------
 tap.h       |  4 ++--
 vu_common.c |  7 ++++---
 3 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/tap.c b/tap.c
index 182a115..ce859ba 100644
--- a/tap.c
+++ b/tap.c
@@ -1031,10 +1031,16 @@ void tap_flush_pools(void)
  * @c:		Execution context
  * @now:	Current timestamp
  */
-void tap_handler(struct ctx *c, const struct timespec *now)
+void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4,
+		struct pool *pool6)
 {
-	tap4_handler(c, pool_tap4, now);
-	tap6_handler(c, pool_tap6, now);
+	if (!pool4)
+		pool4 = pool_tap4;
+	if (!pool6)
+		pool6 = pool_tap6;
+
+	tap4_handler(c, pool4, now);
+	tap6_handler(c, pool6, now);
 }
 
 /**
@@ -1042,11 +1048,19 @@ void tap_handler(struct ctx *c, const struct timespec *now)
  * @c:		Execution context
  * @l2len:	Total L2 packet length
  * @p:		Packet buffer
+ * @pool4	Pool for tap ipv4 packets. If NULL, is pool_tap4
+ * @pool6	Pool for tap ipv6 packets. If NULL, is pool_tap6
  */
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
+					struct pool *pool4, struct pool *pool6)
 {
 	const struct ethhdr *eh;
 
+	if (!pool4)
+		pool4 = pool_tap4;
+	if (!pool6)
+		pool6 = pool_tap6;
+
 	pcap(p, l2len);
 
 	eh = (struct ethhdr *)p;
@@ -1059,10 +1073,10 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
 	switch (ntohs(eh->h_proto)) {
 	case ETH_P_ARP:
 	case ETH_P_IP:
-		packet_add(pool_tap4, l2len, p);
+		packet_add(pool4, l2len, p);
 		break;
 	case ETH_P_IPV6:
-		packet_add(pool_tap6, l2len, p);
+		packet_add(pool6, l2len, p);
 		break;
 	default:
 		break;
@@ -1142,7 +1156,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		p += sizeof(uint32_t);
 		n -= sizeof(uint32_t);
 
-		tap_add_packet(c, l2len, p);
+		tap_add_packet(c, l2len, p, pool_tap4, pool_tap6);
 
 		p += l2len;
 		n -= l2len;
@@ -1151,7 +1165,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	partial_len = n;
 	partial_frame = p;
 
-	tap_handler(c, now);
+	tap_handler(c, now, NULL, NULL);
 }
 
 /**
@@ -1207,10 +1221,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 		    len > (ssize_t)L2_MAX_LEN_PASTA)
 			continue;
 
-		tap_add_packet(c, len, pkt_buf + n);
+		tap_add_packet(c, len, pkt_buf + n, pool_tap4, pool_tap6);
 	}
 
-	tap_handler(c, now);
+	tap_handler(c, now, NULL, NULL);
 }
 
 /**
diff --git a/tap.h b/tap.h
index dd39fd8..0b5ad17 100644
--- a/tap.h
+++ b/tap.h
@@ -118,7 +118,7 @@ void tap_sock_reset(struct ctx *c);
 void tap_sock_update_pool(void *base, size_t size);
 void tap_backend_init(struct ctx *c);
 void tap_flush_pools(void);
-void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
+void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, struct pool *pool6);
+void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, struct pool *pool4, struct pool *pool6);
 
 #endif /* TAP_H */
diff --git a/vu_common.c b/vu_common.c
index 686a09b..4fe982f 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -191,7 +191,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			tap_add_packet(vdev->context,
 				       elem[count].out_sg[0].iov_len - hdrlen,
 				       (char *)elem[count].out_sg[0].iov_base +
-				        hdrlen);
+				        hdrlen, NULL, NULL);
 		} else {
 			/* vnet header can be in a separate iovec */
 			if (elem[count].out_num != 2) {
@@ -203,13 +203,14 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			} else {
 				tap_add_packet(vdev->context,
 					       elem[count].out_sg[1].iov_len,
-					       (char *)elem[count].out_sg[1].iov_base);
+					       (char *)elem[count].out_sg[1].iov_base,
+						   NULL, NULL);
 			}
 		}
 
 		count++;
 	}
-	tap_handler(vdev->context, now);
+	tap_handler(vdev->context, now, NULL, NULL);
 
 	if (count) {
 		int i;
-- 
@@ -191,7 +191,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			tap_add_packet(vdev->context,
 				       elem[count].out_sg[0].iov_len - hdrlen,
 				       (char *)elem[count].out_sg[0].iov_base +
-				        hdrlen);
+				        hdrlen, NULL, NULL);
 		} else {
 			/* vnet header can be in a separate iovec */
 			if (elem[count].out_num != 2) {
@@ -203,13 +203,14 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 			} else {
 				tap_add_packet(vdev->context,
 					       elem[count].out_sg[1].iov_len,
-					       (char *)elem[count].out_sg[1].iov_base);
+					       (char *)elem[count].out_sg[1].iov_base,
+						   NULL, NULL);
 			}
 		}
 
 		count++;
 	}
-	tap_handler(vdev->context, now);
+	tap_handler(vdev->context, now, NULL, NULL);
 
 	if (count) {
 		int i;
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-01 11:38 [PATCH 0/3] Add vhost-net kernel support Eugenio Pérez
  2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
@ 2025-04-01 11:38 ` Eugenio Pérez
  2025-04-02  7:16   ` Stefano Brivio
  2025-04-03  1:45   ` David Gibson
  2025-04-01 11:38 ` [PATCH 3/3] tap: add die() on vhost error Eugenio Pérez
  2 siblings, 2 replies; 13+ messages in thread
From: Eugenio Pérez @ 2025-04-01 11:38 UTC (permalink / raw)
  To: passt-dev; +Cc: jmaloy, sbrivio, lvivier, dgibson

This is the main callback when the tap device has processed any buffer.
Another possibility is to reuse the tap callback for this, so less code
changes are needed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 epoll_type.h |   2 +
 passt.c      |   4 +
 passt.h      |  12 +-
 tap.c        | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 tap.h        |   2 +
 5 files changed, 334 insertions(+), 2 deletions(-)

diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..6284c79 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -44,6 +44,8 @@ enum epoll_type {
 	EPOLL_TYPE_REPAIR_LISTEN,
 	/* TCP_REPAIR helper socket */
 	EPOLL_TYPE_REPAIR,
+	/* vhost-kernel call socket */
+	EPOLL_TYPE_VHOST_CALL,
 
 	EPOLL_NUM_TYPES,
 };
diff --git a/passt.c b/passt.c
index cd06772..19c5d5f 100644
--- a/passt.c
+++ b/passt.c
@@ -79,6 +79,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
 	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
 	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
+	[EPOLL_TYPE_VHOST_CALL]		= "vhost-kernel call socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -357,6 +358,9 @@ loop:
 		case EPOLL_TYPE_REPAIR:
 			repair_handler(&c, eventmask);
 			break;
+		case EPOLL_TYPE_VHOST_CALL:
+			vhost_call_cb(&c, ref, &now);
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
diff --git a/passt.h b/passt.h
index 8f45091..eb5aa03 100644
--- a/passt.h
+++ b/passt.h
@@ -45,7 +45,7 @@ union epoll_ref;
  * @icmp:	ICMP-specific reference part
  * @data:	Data handled by protocol handlers
  * @nsdir_fd:	netns dirfd for fallback timer checking if namespace is gone
- * @queue:	vhost-user queue index for this fd
+ * @queue:	vhost queue index for this fd
  * @u64:	Opaque reference for epoll_ctl() and epoll_wait()
  */
 union epoll_ref {
@@ -271,11 +271,14 @@ struct ctx {
 	int fd_tap;
 	int fd_repair_listen;
 	int fd_repair;
+	/* TODO document all added fields */
+	int fd_vhost;
 	unsigned char our_tap_mac[ETH_ALEN];
 	unsigned char guest_mac[ETH_ALEN];
 	uint16_t mtu;
 
 	uint64_t hash_secret[2];
+	uint64_t virtio_features;
 
 	int ifi4;
 	struct ip4_ctx ip4;
@@ -299,6 +302,13 @@ struct ctx {
 	int no_icmp;
 	struct icmp_ctx icmp;
 
+	struct {
+		uint16_t last_used_idx;
+
+		int kick_fd;
+		int call_fd;
+	} vq[2];
+
 	int no_dns;
 	int no_dns_search;
 	int no_dhcp_dns;
diff --git a/tap.c b/tap.c
index ce859ba..fbe83aa 100644
--- a/tap.c
+++ b/tap.c
@@ -31,6 +31,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <sys/eventfd.h>
 #include <sys/uio.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
 #define TAP_SEQS		128 /* Different L4 tuples in one batch */
 #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
 
+#define VHOST_VIRTIO 0xAF
+#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));
+static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
+static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
+
+/* TODO is it better to have 1024 or 65520 bytes per packet? */
+#define VHOST_NDESCS 1024
+static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
+static union {
+	struct vring_avail avail;
+	char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
+} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE)));
+static union {
+	struct vring_used used;
+	char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
+} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));
+
+/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */
+#define N_VHOST_REGIONS 7
+union {
+	struct vhost_memory mem;
+	char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
+} vhost_memory = {
+	.mem = {
+		.nregions = N_VHOST_REGIONS,
+	},
+};
+
 /**
  * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
  * @c:		Execution context
@@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 	tap_start_connection(c);
 }
 
+static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
+{
+	struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
+	uint32_t i;
+	uint16_t used_idx, last_used;
+
+	/* TODO think if this has races with previous eventfd_read */
+	/* TODO we could improve performance with a shadow_used_idx */
+	used_idx = le16toh(used->idx);
+
+	smp_rmb();
+
+	if (used_idx == c->vq[qid].last_used_idx) {
+		*len = 0;
+		return NULL;
+	}
+
+	last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
+	i = le32toh(used->ring[last_used].id);
+	*len = le32toh(used->ring[last_used].len);
+
+	if (i > VHOST_NDESCS) {
+		/* TODO imporove this, it will cause infinite loop */
+		warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);
+		return NULL;
+	}
+
+	if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
+		warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
+		return NULL;
+	}
+
+	/* TODO check if the id is valid and it has not been double used */
+	c->vq[qid].last_used_idx++;
+	return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+}
+
+/* container is tx but we receive it from vhost POV */
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
+{
+	eventfd_read(ref.fd, (eventfd_t[]){ 0 });
+
+	tap_flush_pools();
+
+	while (true) {
+		struct virtio_net_hdr_mrg_rxbuf *hdr;
+		unsigned len;
+
+		hdr = virtqueue_get_buf(c, ref.queue, &len);
+		if (!hdr)
+			break;
+
+		if (len < sizeof(*hdr)) {
+			warn("vhost: invalid len %u", len);
+			continue;
+		}
+
+		/* TODO this will break from this moment */
+		if (hdr->num_buffers != 1) {
+			warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
+			continue;
+		}
+
+		/* TODO fix the v6 pool to support ipv6 */
+		tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
+	}
+
+	tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
+}
+
+/* TODO: Actually refill */
+static void rx_pkt_refill(int kick_fd)
+{
+	for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
+		vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+		vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
+		vring_desc[0][i].flags = VRING_DESC_F_WRITE;
+	}
+
+	vring_avail_0.avail.idx = VHOST_NDESCS;
+	eventfd_write(kick_fd, 1);
+}
+
 /**
  * tap_ns_tun() - Get tuntap fd in namespace
  * @c:		Execution context
@@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
  */
 static int tap_ns_tun(void *arg)
 {
+	/* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
+	static const uint64_t features =
+		(1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
 	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
 	int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
 	struct ctx *c = (struct ctx *)arg;
-	int fd, rc;
+	int fd, vhost_fd, rc;
 
 	c->fd_tap = -1;
 	memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
@@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
 	if (fd < 0)
 		die_perror("Failed to open() /dev/net/tun");
 
+	vhost_fd = open("/dev/vhost-net", flags);
+	if (vhost_fd < 0)
+		die_perror("Failed to open() /dev/vhost-net");
+
+	rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
+	if (rc < 0)
+		die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
+
+	rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
+	if (rc < 0)
+		die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
+
+	/* TODO inform more explicitely */
+	fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
+	fprintf(stderr, "req features: %lx\n", features);
+	c->virtio_features &= features;
+	if (c->virtio_features != features)
+		die("vhost does not support required features");
+
+	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+		struct vhost_vring_file file = {
+			.index = i,
+		};
+		union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
+					.queue = i };
+		struct epoll_event ev;
+
+		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+		ref.fd = file.fd;
+		if (file.fd < 0)
+			die_perror("Failed to create call eventfd");
+
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");
+
+		ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
+		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+		if (rc < 0)
+			die_perror("Failed to add call eventfd to epoll");
+		c->vq[i].call_fd = file.fd;
+	}
+
+	/* 1:1 translation */
+	vhost_memory.mem.regions[0] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_desc,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_desc),
+		.userspace_addr = (uintptr_t)&vring_desc,
+	};
+	vhost_memory.mem.regions[1] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_avail_0,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(vring_avail_0),
+		.userspace_addr = (uintptr_t)&vring_avail_0,
+	};
+	vhost_memory.mem.regions[2] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_avail_1,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(vring_avail_1),
+		.userspace_addr = (uintptr_t)&vring_avail_1,
+	};
+	vhost_memory.mem.regions[3] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_used_0,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(vring_avail_0),
+		.userspace_addr = (uintptr_t)&vring_used_0,
+	};
+	vhost_memory.mem.regions[4] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_used_1,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(vring_avail_1),
+		.userspace_addr = (uintptr_t)&vring_used_1,
+	};
+	vhost_memory.mem.regions[5] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(virtio_rx_pkt_buf),
+		.userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
+	};
+	vhost_memory.mem.regions[6] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)pkt_buf,
+		/* memory size should include the last byte, but we probably never send
+			* ptrs there so...
+			*/
+		.memory_size = sizeof(pkt_buf),
+		.userspace_addr = (uintptr_t)pkt_buf,
+	};
+	static_assert(6 < N_VHOST_REGIONS);
+
+	rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
+	if (rc < 0)
+		die_perror(
+			"VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
+
+	/* TODO: probably it increases RX perf */
+#if 0
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+
+	if (ioctl(fd, TUNGETIFF, &ifr) != 0)
+		die_perror("Unable to query TUNGETIFF on FD %d", fd);
+	}
+
+	if (ifr.ifr_flags & IFF_VNET_HDR)
+		net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
+#endif
+	rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
+	if (rc < 0)
+		die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
+
+	/* Duplicating foreach queue to follow the exact order from QEMU */
+	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+		struct vhost_vring_addr addr = {
+			.index = i,
+			.desc_user_addr = (unsigned long)vring_desc[i],
+			.avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
+										(unsigned long)&vring_avail_1,
+			.used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
+										(unsigned long)&vring_used_1,
+			/* GPA addr */
+			.log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
+									   (unsigned long)&vring_used_1,
+		};
+		struct vhost_vring_state state = {
+			.index = i,
+			.num = VHOST_NDESCS,
+		};
+		struct vhost_vring_file file = {
+			.index = i,
+		};
+
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed");
+
+		fprintf(stderr, "qid: %d\n", i);
+		fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr);
+		fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr);
+		fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr);
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed");
+
+		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+		if (file.fd < 0)
+			die_perror("Failed to create kick eventfd");
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed");
+		c->vq[i].kick_fd = file.fd;
+	}
+
 	rc = ioctl(fd, (int)TUNSETIFF, &ifr);
 	if (rc < 0)
 		die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
@@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
 	if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
 		die("Tap device opened but no network interface found");
 
+	rx_pkt_refill(c->vq[0].kick_fd);
+
+	/* Duplicating foreach queue to follow the exact order from QEMU */
+	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+		struct vhost_vring_file file = {
+			.index = i,
+			.fd = fd,
+		};
+
+		fprintf(stderr, "qid: %d\n", file.index);
+		fprintf(stderr, "tap fd: %d\n", file.fd);
+		rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
+		if (rc < 0)
+			die_perror(
+				"VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed");
+	}
+
 	c->fd_tap = fd;
+	c->fd_vhost = vhost_fd;
 
 	return 0;
 }
diff --git a/tap.h b/tap.h
index 0b5ad17..91d3f62 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
+
 unsigned long tap_l2_max_len(const struct ctx *c);
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-- 
@@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
 		thdr->vnet_len = htonl(l2len);
 }
 
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
+
 unsigned long tap_l2_max_len(const struct ctx *c);
 void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
 void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/3] tap: add die() on vhost error
  2025-04-01 11:38 [PATCH 0/3] Add vhost-net kernel support Eugenio Pérez
  2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
  2025-04-01 11:38 ` [PATCH 2/3] tap: implement vhost_call_cb Eugenio Pérez
@ 2025-04-01 11:38 ` Eugenio Pérez
  2025-04-03  1:50   ` David Gibson
  2 siblings, 1 reply; 13+ messages in thread
From: Eugenio Pérez @ 2025-04-01 11:38 UTC (permalink / raw)
  To: passt-dev; +Cc: jmaloy, sbrivio, lvivier, dgibson

In case the kernel needs to signal an error.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 epoll_type.h |  2 ++
 passt.c      |  4 ++++
 passt.h      |  1 +
 tap.c        | 16 ++++++++++++++++
 4 files changed, 23 insertions(+)

diff --git a/epoll_type.h b/epoll_type.h
index 6284c79..6b320db 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -46,6 +46,8 @@ enum epoll_type {
 	EPOLL_TYPE_REPAIR,
 	/* vhost-kernel call socket */
 	EPOLL_TYPE_VHOST_CALL,
+	/* vhost-kernel error socket */
+	EPOLL_TYPE_VHOST_ERROR,

 	EPOLL_NUM_TYPES,
 };
diff --git a/passt.c b/passt.c
index 19c5d5f..2779e0b 100644
--- a/passt.c
+++ b/passt.c
@@ -80,6 +80,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
 	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
 	[EPOLL_TYPE_VHOST_CALL]		= "vhost-kernel call socket",
+	[EPOLL_TYPE_VHOST_ERROR]	= "vhost-kernel error socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -361,6 +362,9 @@ loop:
 		case EPOLL_TYPE_VHOST_CALL:
 			vhost_call_cb(&c, ref, &now);
 			break;
+		case EPOLL_TYPE_VHOST_ERROR:
+			die("Error on vhost-kernel socket");
+			break;
 		default:
 			/* Can't happen */
 			ASSERT(0);
diff --git a/passt.h b/passt.h
index eb5aa03..9e42f3b 100644
--- a/passt.h
+++ b/passt.h
@@ -307,6 +307,7 @@ struct ctx {

 		int kick_fd;
 		int call_fd;
+		int err_fd;
 	} vq[2];

 	int no_dns;
diff --git a/tap.c b/tap.c
index fbe83aa..b02d3da 100644
--- a/tap.c
+++ b/tap.c
@@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg)
 		if (rc < 0)
 			die_perror("Failed to add call eventfd to epoll");
 		c->vq[i].call_fd = file.fd;
+
+		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+		if (file.fd < 0)
+			die_perror("Failed to create error eventfd");
+
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed");
+
+		ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd;
+		ev.data.u64 = ref.u64;
+		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+		if (rc < 0)
+			die_perror("Failed to add error eventfd to epoll");
+		c->vq[i].err_fd = file.fd;
 	}

 	/* 1:1 translation */
--
2.49.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-01 11:38 ` [PATCH 2/3] tap: implement vhost_call_cb Eugenio Pérez
@ 2025-04-02  7:16   ` Stefano Brivio
  2025-04-02 12:03     ` Eugenio Perez Martin
  2025-04-03  1:48     ` David Gibson
  2025-04-03  1:45   ` David Gibson
  1 sibling, 2 replies; 13+ messages in thread
From: Stefano Brivio @ 2025-04-02  7:16 UTC (permalink / raw)
  To: Eugenio Pérez; +Cc: passt-dev, jmaloy, lvivier, dgibson

A couple of general notes:

- there's no need to Cc: people already on this list unless you need
  their attention specifically (it can get quite noisy...)

- things kind of make sense to me, many are hard to evaluate at this
  early stage, so I noted below just some specific comments/questions
  here, but in the sense of "being on the same page" my current answer
  is... yes, I guess so!

- I'm not reviewing 1/3 and 3/3 right away as I guess you'll revisit
  them anyway, I'm just not sure we need a separate pool... but I'm not
  sure if that's temporary either.

On Tue,  1 Apr 2025 13:38:08 +0200
Eugenio Pérez <eperezma@redhat.com> wrote:

> This is the main callback when the tap device has processed any buffer.
> Another possibility is to reuse the tap callback for this, so less code
> changes are needed.
> 
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>  epoll_type.h |   2 +
>  passt.c      |   4 +
>  passt.h      |  12 +-
>  tap.c        | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  tap.h        |   2 +
>  5 files changed, 334 insertions(+), 2 deletions(-)
> 
> diff --git a/epoll_type.h b/epoll_type.h
> index 7f2a121..6284c79 100644
> --- a/epoll_type.h
> +++ b/epoll_type.h
> @@ -44,6 +44,8 @@ enum epoll_type {
>  	EPOLL_TYPE_REPAIR_LISTEN,
>  	/* TCP_REPAIR helper socket */
>  	EPOLL_TYPE_REPAIR,
> +	/* vhost-kernel call socket */
> +	EPOLL_TYPE_VHOST_CALL,
>  
>  	EPOLL_NUM_TYPES,
>  };
> diff --git a/passt.c b/passt.c
> index cd06772..19c5d5f 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -79,6 +79,7 @@ char *epoll_type_str[] = {
>  	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
>  	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
>  	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
> +	[EPOLL_TYPE_VHOST_CALL]		= "vhost-kernel call socket",
>  };
>  static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
>  	      "epoll_type_str[] doesn't match enum epoll_type");
> @@ -357,6 +358,9 @@ loop:
>  		case EPOLL_TYPE_REPAIR:
>  			repair_handler(&c, eventmask);
>  			break;
> +		case EPOLL_TYPE_VHOST_CALL:
> +			vhost_call_cb(&c, ref, &now);
> +			break;
>  		default:
>  			/* Can't happen */
>  			ASSERT(0);
> diff --git a/passt.h b/passt.h
> index 8f45091..eb5aa03 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -45,7 +45,7 @@ union epoll_ref;
>   * @icmp:	ICMP-specific reference part
>   * @data:	Data handled by protocol handlers
>   * @nsdir_fd:	netns dirfd for fallback timer checking if namespace is gone
> - * @queue:	vhost-user queue index for this fd
> + * @queue:	vhost queue index for this fd
>   * @u64:	Opaque reference for epoll_ctl() and epoll_wait()
>   */
>  union epoll_ref {
> @@ -271,11 +271,14 @@ struct ctx {
>  	int fd_tap;
>  	int fd_repair_listen;
>  	int fd_repair;
> +	/* TODO document all added fields */
> +	int fd_vhost;
>  	unsigned char our_tap_mac[ETH_ALEN];
>  	unsigned char guest_mac[ETH_ALEN];
>  	uint16_t mtu;
>  
>  	uint64_t hash_secret[2];
> +	uint64_t virtio_features;
>  
>  	int ifi4;
>  	struct ip4_ctx ip4;
> @@ -299,6 +302,13 @@ struct ctx {
>  	int no_icmp;
>  	struct icmp_ctx icmp;
>  
> +	struct {
> +		uint16_t last_used_idx;
> +
> +		int kick_fd;
> +		int call_fd;
> +	} vq[2];
> +
>  	int no_dns;
>  	int no_dns_search;
>  	int no_dhcp_dns;
> diff --git a/tap.c b/tap.c
> index ce859ba..fbe83aa 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -31,6 +31,7 @@
>  #include <sys/types.h>
>  #include <sys/stat.h>
>  #include <fcntl.h>
> +#include <sys/eventfd.h>

Why do we need eventfds here? Is there anything peculiar in the
protocol, or it's all stuff that can be done with "regular" file
descriptors plus epoll?

>  #include <sys/uio.h>
>  #include <stdbool.h>
>  #include <stdlib.h>
> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
>  #define TAP_SEQS		128 /* Different L4 tuples in one batch */
>  #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
>  
> +#define VHOST_VIRTIO 0xAF
> +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
> +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
> +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)

Coding style (no strict requirement): align those nicely/table-like if
possible.

> +
> +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));
> +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
> +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
> +
> +/* TODO is it better to have 1024 or 65520 bytes per packet? */

In general 65535 bytes (including the Ethernet header) appears to be a
good idea, but actual profiling would be nice in the long term.

> +#define VHOST_NDESCS 1024
> +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));

Coding style, here and a bit all over the place: wrap to 80 columns
("net" kernel-like style).

> +static union {
> +	struct vring_avail avail;
> +	char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
> +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE)));
> +static union {
> +	struct vring_used used;
> +	char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
> +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));
> +
> +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */

What's the "all descs ring"? A short "theory of operation" section
might help eventually.

> +#define N_VHOST_REGIONS 7
> +union {
> +	struct vhost_memory mem;
> +	char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
> +} vhost_memory = {
> +	.mem = {
> +		.nregions = N_VHOST_REGIONS,
> +	},
> +};
> +
>  /**
>   * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
>   * @c:		Execution context
> @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
>  	tap_start_connection(c);
>  }
>  
> +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
> +{
> +	struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
> +	uint32_t i;
> +	uint16_t used_idx, last_used;
> +
> +	/* TODO think if this has races with previous eventfd_read */
> +	/* TODO we could improve performance with a shadow_used_idx */
> +	used_idx = le16toh(used->idx);
> +
> +	smp_rmb();
> +
> +	if (used_idx == c->vq[qid].last_used_idx) {
> +		*len = 0;
> +		return NULL;
> +	}
> +
> +	last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
> +	i = le32toh(used->ring[last_used].id);
> +	*len = le32toh(used->ring[last_used].len);
> +
> +	if (i > VHOST_NDESCS) {
> +		/* TODO imporove this, it will cause infinite loop */
> +		warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);
> +		return NULL;
> +	}
> +
> +	if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
> +		warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
> +		return NULL;
> +	}
> +
> +	/* TODO check if the id is valid and it has not been double used */
> +	c->vq[qid].last_used_idx++;
> +	return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> +}
> +
> +/* container is tx but we receive it from vhost POV */
> +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
> +{
> +	eventfd_read(ref.fd, (eventfd_t[]){ 0 });
> +
> +	tap_flush_pools();
> +
> +	while (true) {
> +		struct virtio_net_hdr_mrg_rxbuf *hdr;
> +		unsigned len;
> +
> +		hdr = virtqueue_get_buf(c, ref.queue, &len);
> +		if (!hdr)
> +			break;
> +
> +		if (len < sizeof(*hdr)) {
> +			warn("vhost: invalid len %u", len);
> +			continue;
> +		}
> +
> +		/* TODO this will break from this moment */
> +		if (hdr->num_buffers != 1) {
> +			warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
> +			continue;
> +		}
> +
> +		/* TODO fix the v6 pool to support ipv6 */
> +		tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
> +	}
> +
> +	tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
> +}
> +
> +/* TODO: Actually refill */
> +static void rx_pkt_refill(int kick_fd)
> +{
> +	for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
> +		vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> +		vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
> +		vring_desc[0][i].flags = VRING_DESC_F_WRITE;
> +	}
> +
> +	vring_avail_0.avail.idx = VHOST_NDESCS;
> +	eventfd_write(kick_fd, 1);
> +}
> +
>  /**
>   * tap_ns_tun() - Get tuntap fd in namespace
>   * @c:		Execution context
> @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
>   */
>  static int tap_ns_tun(void *arg)
>  {
> +	/* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
> +	static const uint64_t features =
> +		(1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
>  	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };

I kind of wonder, by the way, if IFF_TUN simplifies things here. It's
something we should already add, as an option, see also:
https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life
easier for any reason you might consider adding it right away.

>  	int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
>  	struct ctx *c = (struct ctx *)arg;
> -	int fd, rc;
> +	int fd, vhost_fd, rc;
>  
>  	c->fd_tap = -1;
>  	memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
> @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
>  	if (fd < 0)
>  		die_perror("Failed to open() /dev/net/tun");
>  
> +	vhost_fd = open("/dev/vhost-net", flags);
> +	if (vhost_fd < 0)
> +		die_perror("Failed to open() /dev/vhost-net");

Note pretty much to future self: this will need adjustments to AppArmor
and SELinux policies.

> +
> +	rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
> +	if (rc < 0)
> +		die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
> +
> +	rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
> +	if (rc < 0)
> +		die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
> +
> +	/* TODO inform more explicitely */
> +	fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
> +	fprintf(stderr, "req features: %lx\n", features);
> +	c->virtio_features &= features;
> +	if (c->virtio_features != features)
> +		die("vhost does not support required features");
> +
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {

No declarations directly in loops (it hides them somehow).

> +		struct vhost_vring_file file = {
> +			.index = i,
> +		};
> +		union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
> +					.queue = i };
> +		struct epoll_event ev;
> +
> +		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		ref.fd = file.fd;
> +		if (file.fd < 0)
> +			die_perror("Failed to create call eventfd");
> +
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");

Same as net kernel style: if it's more than a line, even as a single
statement, use curly brackets (rationale: somebody later adds another
statement without noticing and... oops).

> +
> +		ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
> +		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
> +		if (rc < 0)
> +			die_perror("Failed to add call eventfd to epoll");
> +		c->vq[i].call_fd = file.fd;
> +	}
> +
> +	/* 1:1 translation */
> +	vhost_memory.mem.regions[0] = (struct vhost_memory_region){

Space between cast and initialiser, ") {", for consistency. I'll wait
before we have some kind of theory of operation / general description
before actually looking into those, I'm not sure about the exact role of
those seven regions.

> +		.guest_phys_addr = (uintptr_t)&vring_desc,
> +		/* memory size should include the last byte, but we probably never send
> +		 * ptrs there so...
> +		 */
> +		.memory_size = sizeof(vring_desc),
> +		.userspace_addr = (uintptr_t)&vring_desc,
> +	};
> +	vhost_memory.mem.regions[1] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_avail_0,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_0),
> +		.userspace_addr = (uintptr_t)&vring_avail_0,
> +	};
> +	vhost_memory.mem.regions[2] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_avail_1,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_1),
> +		.userspace_addr = (uintptr_t)&vring_avail_1,
> +	};
> +	vhost_memory.mem.regions[3] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_used_0,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_0),
> +		.userspace_addr = (uintptr_t)&vring_used_0,
> +	};
> +	vhost_memory.mem.regions[4] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_used_1,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_1),
> +		.userspace_addr = (uintptr_t)&vring_used_1,
> +	};
> +	vhost_memory.mem.regions[5] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(virtio_rx_pkt_buf),
> +		.userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
> +	};
> +	vhost_memory.mem.regions[6] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)pkt_buf,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(pkt_buf),
> +		.userspace_addr = (uintptr_t)pkt_buf,
> +	};
> +	static_assert(6 < N_VHOST_REGIONS);
> +
> +	rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
> +	if (rc < 0)
> +		die_perror(
> +			"VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
> +
> +	/* TODO: probably it increases RX perf */
> +#if 0
> +	struct ifreq ifr;
> +	memset(&ifr, 0, sizeof(ifr));
> +
> +	if (ioctl(fd, TUNGETIFF, &ifr) != 0)
> +		die_perror("Unable to query TUNGETIFF on FD %d", fd);
> +	}
> +
> +	if (ifr.ifr_flags & IFF_VNET_HDR)
> +		net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
> +#endif
> +	rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
> +	if (rc < 0)
> +		die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
> +
> +	/* Duplicating foreach queue to follow the exact order from QEMU */
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> +		struct vhost_vring_addr addr = {
> +			.index = i,
> +			.desc_user_addr = (unsigned long)vring_desc[i],
> +			.avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
> +										(unsigned long)&vring_avail_1,
> +			.used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
> +										(unsigned long)&vring_used_1,
> +			/* GPA addr */
> +			.log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
> +									   (unsigned long)&vring_used_1,
> +		};
> +		struct vhost_vring_state state = {
> +			.index = i,
> +			.num = VHOST_NDESCS,
> +		};
> +		struct vhost_vring_file file = {
> +			.index = i,
> +		};
> +
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed");
> +
> +		fprintf(stderr, "qid: %d\n", i);
> +		fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr);
> +		fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr);
> +		fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr);
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed");
> +
> +		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		if (file.fd < 0)
> +			die_perror("Failed to create kick eventfd");
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed");
> +		c->vq[i].kick_fd = file.fd;
> +	}
> +
>  	rc = ioctl(fd, (int)TUNSETIFF, &ifr);
>  	if (rc < 0)
>  		die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
> @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
>  	if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
>  		die("Tap device opened but no network interface found");
>  
> +	rx_pkt_refill(c->vq[0].kick_fd);
> +
> +	/* Duplicating foreach queue to follow the exact order from QEMU */
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> +		struct vhost_vring_file file = {
> +			.index = i,
> +			.fd = fd,
> +		};
> +
> +		fprintf(stderr, "qid: %d\n", file.index);
> +		fprintf(stderr, "tap fd: %d\n", file.fd);
> +		rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed");
> +	}
> +
>  	c->fd_tap = fd;
> +	c->fd_vhost = vhost_fd;
>  
>  	return 0;
>  }
> diff --git a/tap.h b/tap.h
> index 0b5ad17..91d3f62 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
>  		thdr->vnet_len = htonl(l2len);
>  }
>  
> +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
> +
>  unsigned long tap_l2_max_len(const struct ctx *c);
>  void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
>  void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-02  7:16   ` Stefano Brivio
@ 2025-04-02 12:03     ` Eugenio Perez Martin
  2025-04-03  1:48     ` David Gibson
  1 sibling, 0 replies; 13+ messages in thread
From: Eugenio Perez Martin @ 2025-04-02 12:03 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev, jmaloy, lvivier, dgibson

On Wed, Apr 2, 2025 at 9:16 AM Stefano Brivio <sbrivio@redhat.com> wrote:
>
> A couple of general notes:
>
> - there's no need to Cc: people already on this list unless you need
>   their attention specifically (it can get quite noisy...)
>

Got it, I'll do for the next time.

> - things kind of make sense to me, many are hard to evaluate at this
>   early stage, so I noted below just some specific comments/questions
>   here, but in the sense of "being on the same page" my current answer
>   is... yes, I guess so!
>
> - I'm not reviewing 1/3 and 3/3 right away as I guess you'll revisit
>   them anyway, I'm just not sure we need a separate pool... but I'm not
>   sure if that's temporary either.
>
> On Tue,  1 Apr 2025 13:38:08 +0200
> Eugenio Pérez <eperezma@redhat.com> wrote:
>
> > This is the main callback when the tap device has processed any buffer.
> > Another possibility is to reuse the tap callback for this, so less code
> > changes are needed.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >  epoll_type.h |   2 +
> >  passt.c      |   4 +
> >  passt.h      |  12 +-
> >  tap.c        | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  tap.h        |   2 +
> >  5 files changed, 334 insertions(+), 2 deletions(-)
> >
> > diff --git a/epoll_type.h b/epoll_type.h
> > index 7f2a121..6284c79 100644
> > --- a/epoll_type.h
> > +++ b/epoll_type.h
> > @@ -44,6 +44,8 @@ enum epoll_type {
> >       EPOLL_TYPE_REPAIR_LISTEN,
> >       /* TCP_REPAIR helper socket */
> >       EPOLL_TYPE_REPAIR,
> > +     /* vhost-kernel call socket */
> > +     EPOLL_TYPE_VHOST_CALL,
> >
> >       EPOLL_NUM_TYPES,
> >  };
> > diff --git a/passt.c b/passt.c
> > index cd06772..19c5d5f 100644
> > --- a/passt.c
> > +++ b/passt.c
> > @@ -79,6 +79,7 @@ char *epoll_type_str[] = {
> >       [EPOLL_TYPE_VHOST_KICK]         = "vhost-user kick socket",
> >       [EPOLL_TYPE_REPAIR_LISTEN]      = "TCP_REPAIR helper listening socket",
> >       [EPOLL_TYPE_REPAIR]             = "TCP_REPAIR helper socket",
> > +     [EPOLL_TYPE_VHOST_CALL]         = "vhost-kernel call socket",
> >  };
> >  static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
> >             "epoll_type_str[] doesn't match enum epoll_type");
> > @@ -357,6 +358,9 @@ loop:
> >               case EPOLL_TYPE_REPAIR:
> >                       repair_handler(&c, eventmask);
> >                       break;
> > +             case EPOLL_TYPE_VHOST_CALL:
> > +                     vhost_call_cb(&c, ref, &now);
> > +                     break;
> >               default:
> >                       /* Can't happen */
> >                       ASSERT(0);
> > diff --git a/passt.h b/passt.h
> > index 8f45091..eb5aa03 100644
> > --- a/passt.h
> > +++ b/passt.h
> > @@ -45,7 +45,7 @@ union epoll_ref;
> >   * @icmp:    ICMP-specific reference part
> >   * @data:    Data handled by protocol handlers
> >   * @nsdir_fd:        netns dirfd for fallback timer checking if namespace is gone
> > - * @queue:   vhost-user queue index for this fd
> > + * @queue:   vhost queue index for this fd
> >   * @u64:     Opaque reference for epoll_ctl() and epoll_wait()
> >   */
> >  union epoll_ref {
> > @@ -271,11 +271,14 @@ struct ctx {
> >       int fd_tap;
> >       int fd_repair_listen;
> >       int fd_repair;
> > +     /* TODO document all added fields */
> > +     int fd_vhost;
> >       unsigned char our_tap_mac[ETH_ALEN];
> >       unsigned char guest_mac[ETH_ALEN];
> >       uint16_t mtu;
> >
> >       uint64_t hash_secret[2];
> > +     uint64_t virtio_features;
> >
> >       int ifi4;
> >       struct ip4_ctx ip4;
> > @@ -299,6 +302,13 @@ struct ctx {
> >       int no_icmp;
> >       struct icmp_ctx icmp;
> >
> > +     struct {
> > +             uint16_t last_used_idx;
> > +
> > +             int kick_fd;
> > +             int call_fd;
> > +     } vq[2];
> > +
> >       int no_dns;
> >       int no_dns_search;
> >       int no_dhcp_dns;
> > diff --git a/tap.c b/tap.c
> > index ce859ba..fbe83aa 100644
> > --- a/tap.c
> > +++ b/tap.c
> > @@ -31,6 +31,7 @@
> >  #include <sys/types.h>
> >  #include <sys/stat.h>
> >  #include <fcntl.h>
> > +#include <sys/eventfd.h>
>
> Why do we need eventfds here? Is there anything peculiar in the
> protocol, or it's all stuff that can be done with "regular" file
> descriptors plus epoll?
>

Yes, vhost-net trusts eventfd for signalling.

> >  #include <sys/uio.h>
> >  #include <stdbool.h>
> >  #include <stdlib.h>
> > @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
> >  #define TAP_SEQS             128 /* Different L4 tuples in one batch */
> >  #define FRAGMENT_MSG_RATE    10  /* # seconds between fragment warnings */
> >
> > +#define VHOST_VIRTIO 0xAF
> > +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> > +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> > +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> > +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
> > +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> > +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> > +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> > +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> > +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> > +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
> > +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
>
> Coding style (no strict requirement): align those nicely/table-like if
> possible.
>

I'll fix it (and the rest of code style comments) in the next RFC!

> > +
> > +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));
> > +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
> > +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
> > +
> > +/* TODO is it better to have 1024 or 65520 bytes per packet? */
>
> In general 65535 bytes (including the Ethernet header) appears to be a
> good idea, but actual profiling would be nice in the long term.
>

That can be done for sure.

> > +#define VHOST_NDESCS 1024
> > +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
>
> Coding style, here and a bit all over the place: wrap to 80 columns
> ("net" kernel-like style).
>
> > +static union {
> > +     struct vring_avail avail;
> > +     char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
> > +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE)));
> > +static union {
> > +     struct vring_used used;
> > +     char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
> > +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));
> > +
> > +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */
>
> What's the "all descs ring"? A short "theory of operation" section
> might help eventually.
>

The vhost kernel module expects these descriptors ring for the buffer
addresses & length. I can add more docs for sure.

> > +#define N_VHOST_REGIONS 7
> > +union {
> > +     struct vhost_memory mem;
> > +     char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
> > +} vhost_memory = {
> > +     .mem = {
> > +             .nregions = N_VHOST_REGIONS,
> > +     },
> > +};
> > +
> >  /**
> >   * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
> >   * @c:               Execution context
> > @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
> >       tap_start_connection(c);
> >  }
> >
> > +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
> > +{
> > +     struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
> > +     uint32_t i;
> > +     uint16_t used_idx, last_used;
> > +
> > +     /* TODO think if this has races with previous eventfd_read */
> > +     /* TODO we could improve performance with a shadow_used_idx */
> > +     used_idx = le16toh(used->idx);
> > +
> > +     smp_rmb();
> > +
> > +     if (used_idx == c->vq[qid].last_used_idx) {
> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +
> > +     last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
> > +     i = le32toh(used->ring[last_used].id);
> > +     *len = le32toh(used->ring[last_used].len);
> > +
> > +     if (i > VHOST_NDESCS) {
> > +             /* TODO imporove this, it will cause infinite loop */
> > +             warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);
> > +             return NULL;
> > +     }
> > +
> > +     if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
> > +             warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
> > +             return NULL;
> > +     }
> > +
> > +     /* TODO check if the id is valid and it has not been double used */
> > +     c->vq[qid].last_used_idx++;
> > +     return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> > +}
> > +
> > +/* container is tx but we receive it from vhost POV */
> > +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
> > +{
> > +     eventfd_read(ref.fd, (eventfd_t[]){ 0 });
> > +
> > +     tap_flush_pools();
> > +
> > +     while (true) {
> > +             struct virtio_net_hdr_mrg_rxbuf *hdr;
> > +             unsigned len;
> > +
> > +             hdr = virtqueue_get_buf(c, ref.queue, &len);
> > +             if (!hdr)
> > +                     break;
> > +
> > +             if (len < sizeof(*hdr)) {
> > +                     warn("vhost: invalid len %u", len);
> > +                     continue;
> > +             }
> > +
> > +             /* TODO this will break from this moment */
> > +             if (hdr->num_buffers != 1) {
> > +                     warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
> > +                     continue;
> > +             }
> > +
> > +             /* TODO fix the v6 pool to support ipv6 */
> > +             tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
> > +     }
> > +
> > +     tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
> > +}
> > +
> > +/* TODO: Actually refill */
> > +static void rx_pkt_refill(int kick_fd)
> > +{
> > +     for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
> > +             vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> > +             vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
> > +             vring_desc[0][i].flags = VRING_DESC_F_WRITE;
> > +     }
> > +
> > +     vring_avail_0.avail.idx = VHOST_NDESCS;
> > +     eventfd_write(kick_fd, 1);
> > +}
> > +
> >  /**
> >   * tap_ns_tun() - Get tuntap fd in namespace
> >   * @c:               Execution context
> > @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
> >   */
> >  static int tap_ns_tun(void *arg)
> >  {
> > +     /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
> > +     static const uint64_t features =
> > +             (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
> >       struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
>
> I kind of wonder, by the way, if IFF_TUN simplifies things here. It's
> something we should already add, as an option, see also:
> https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life
> easier for any reason you might consider adding it right away.
>

I'm not sure, but maybe we can reuse something, yes!

> >       int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
> >       struct ctx *c = (struct ctx *)arg;
> > -     int fd, rc;
> > +     int fd, vhost_fd, rc;
> >
> >       c->fd_tap = -1;
> >       memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
> > @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
> >       if (fd < 0)
> >               die_perror("Failed to open() /dev/net/tun");
> >
> > +     vhost_fd = open("/dev/vhost-net", flags);
> > +     if (vhost_fd < 0)
> > +             die_perror("Failed to open() /dev/vhost-net");
>
> Note pretty much to future self: this will need adjustments to AppArmor
> and SELinux policies.
>
> > +
> > +     rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
> > +     if (rc < 0)
> > +             die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
> > +
> > +     rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
> > +     if (rc < 0)
> > +             die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
> > +
> > +     /* TODO inform more explicitely */
> > +     fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
> > +     fprintf(stderr, "req features: %lx\n", features);
> > +     c->virtio_features &= features;
> > +     if (c->virtio_features != features)
> > +             die("vhost does not support required features");
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
>
> No declarations directly in loops (it hides them somehow).
>
> > +             struct vhost_vring_file file = {
> > +                     .index = i,
> > +             };
> > +             union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
> > +                                     .queue = i };
> > +             struct epoll_event ev;
> > +
> > +             file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > +             ref.fd = file.fd;
> > +             if (file.fd < 0)
> > +                     die_perror("Failed to create call eventfd");
> > +
> > +             rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");
>
> Same as net kernel style: if it's more than a line, even as a single
> statement, use curly brackets (rationale: somebody later adds another
> statement without noticing and... oops).
>
> > +
> > +             ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
> > +             rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
> > +             if (rc < 0)
> > +                     die_perror("Failed to add call eventfd to epoll");
> > +             c->vq[i].call_fd = file.fd;
> > +     }
> > +
> > +     /* 1:1 translation */
> > +     vhost_memory.mem.regions[0] = (struct vhost_memory_region){
>
> Space between cast and initialiser, ") {", for consistency. I'll wait
> before we have some kind of theory of operation / general description
> before actually looking into those, I'm not sure about the exact role of
> those seven regions.
>

In the case of QEMU, vhost reads the descriptor ring with addresses in
the guest address space, as it is a direct communication between them
without QEMU intervening. These regions tells vhost how to translate
these addresses to addresses in the QEMU address space so vhost can
use copy_to_user and copy_from_user.

> > +             .guest_phys_addr = (uintptr_t)&vring_desc,
> > +             /* memory size should include the last byte, but we probably never send
> > +              * ptrs there so...
> > +              */
> > +             .memory_size = sizeof(vring_desc),
> > +             .userspace_addr = (uintptr_t)&vring_desc,
> > +     };
> > +     vhost_memory.mem.regions[1] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)&vring_avail_0,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(vring_avail_0),
> > +             .userspace_addr = (uintptr_t)&vring_avail_0,
> > +     };
> > +     vhost_memory.mem.regions[2] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)&vring_avail_1,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(vring_avail_1),
> > +             .userspace_addr = (uintptr_t)&vring_avail_1,
> > +     };
> > +     vhost_memory.mem.regions[3] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)&vring_used_0,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(vring_avail_0),
> > +             .userspace_addr = (uintptr_t)&vring_used_0,
> > +     };
> > +     vhost_memory.mem.regions[4] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)&vring_used_1,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(vring_avail_1),
> > +             .userspace_addr = (uintptr_t)&vring_used_1,
> > +     };
> > +     vhost_memory.mem.regions[5] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(virtio_rx_pkt_buf),
> > +             .userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
> > +     };
> > +     vhost_memory.mem.regions[6] = (struct vhost_memory_region){
> > +             .guest_phys_addr = (uintptr_t)pkt_buf,
> > +             /* memory size should include the last byte, but we probably never send
> > +                     * ptrs there so...
> > +                     */
> > +             .memory_size = sizeof(pkt_buf),
> > +             .userspace_addr = (uintptr_t)pkt_buf,
> > +     };
> > +     static_assert(6 < N_VHOST_REGIONS);
> > +
> > +     rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
> > +     if (rc < 0)
> > +             die_perror(
> > +                     "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
> > +
> > +     /* TODO: probably it increases RX perf */
> > +#if 0
> > +     struct ifreq ifr;
> > +     memset(&ifr, 0, sizeof(ifr));
> > +
> > +     if (ioctl(fd, TUNGETIFF, &ifr) != 0)
> > +             die_perror("Unable to query TUNGETIFF on FD %d", fd);
> > +     }
> > +
> > +     if (ifr.ifr_flags & IFF_VNET_HDR)
> > +             net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
> > +#endif
> > +     rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
> > +     if (rc < 0)
> > +             die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
> > +
> > +     /* Duplicating foreach queue to follow the exact order from QEMU */
> > +     for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> > +             struct vhost_vring_addr addr = {
> > +                     .index = i,
> > +                     .desc_user_addr = (unsigned long)vring_desc[i],
> > +                     .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
> > +                                                                             (unsigned long)&vring_avail_1,
> > +                     .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
> > +                                                                             (unsigned long)&vring_used_1,
> > +                     /* GPA addr */
> > +                     .log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
> > +                                                                        (unsigned long)&vring_used_1,
> > +             };
> > +             struct vhost_vring_state state = {
> > +                     .index = i,
> > +                     .num = VHOST_NDESCS,
> > +             };
> > +             struct vhost_vring_file file = {
> > +                     .index = i,
> > +             };
> > +
> > +             rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed");
> > +
> > +             fprintf(stderr, "qid: %d\n", i);
> > +             fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr);
> > +             fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr);
> > +             fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr);
> > +             rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed");
> > +
> > +             file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > +             if (file.fd < 0)
> > +                     die_perror("Failed to create kick eventfd");
> > +             rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed");
> > +             c->vq[i].kick_fd = file.fd;
> > +     }
> > +
> >       rc = ioctl(fd, (int)TUNSETIFF, &ifr);
> >       if (rc < 0)
> >               die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
> > @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
> >       if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
> >               die("Tap device opened but no network interface found");
> >
> > +     rx_pkt_refill(c->vq[0].kick_fd);
> > +
> > +     /* Duplicating foreach queue to follow the exact order from QEMU */
> > +     for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> > +             struct vhost_vring_file file = {
> > +                     .index = i,
> > +                     .fd = fd,
> > +             };
> > +
> > +             fprintf(stderr, "qid: %d\n", file.index);
> > +             fprintf(stderr, "tap fd: %d\n", file.fd);
> > +             rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed");
> > +     }
> > +
> >       c->fd_tap = fd;
> > +     c->fd_vhost = vhost_fd;
> >
> >       return 0;
> >  }
> > diff --git a/tap.h b/tap.h
> > index 0b5ad17..91d3f62 100644
> > --- a/tap.h
> > +++ b/tap.h
> > @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
> >               thdr->vnet_len = htonl(l2len);
> >  }
> >
> > +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
> > +
> >  unsigned long tap_l2_max_len(const struct ctx *c);
> >  void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
> >  void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/3] tap: specify the packet pool
  2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
@ 2025-04-03  1:07   ` David Gibson
  2025-04-03  6:40     ` Eugenio Perez Martin
  0 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2025-04-03  1:07 UTC (permalink / raw)
  To: Eugenio Pérez; +Cc: passt-dev, jmaloy, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 5014 bytes --]

On Tue, Apr 01, 2025 at 01:38:07PM +0200, Eugenio Pérez wrote:
> In vhost-kernel we need to allocate a storage for rx so kernel can
> write the buffers async. We need to tell tap_add_packet where to find
> them.
> 
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>

As discussed on our call, I don't think we need this.  We should be
able to re-use pool_tap[46] for vhost-net.

> ---
>  tap.c       | 34 ++++++++++++++++++++++++----------
>  tap.h       |  4 ++--
>  vu_common.c |  7 ++++---
>  3 files changed, 30 insertions(+), 15 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index 182a115..ce859ba 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1031,10 +1031,16 @@ void tap_flush_pools(void)
>   * @c:		Execution context
>   * @now:	Current timestamp
>   */
> -void tap_handler(struct ctx *c, const struct timespec *now)
> +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4,
> +		struct pool *pool6)
>  {
> -	tap4_handler(c, pool_tap4, now);
> -	tap6_handler(c, pool_tap6, now);
> +	if (!pool4)
> +		pool4 = pool_tap4;
> +	if (!pool6)
> +		pool6 = pool_tap6;
> +
> +	tap4_handler(c, pool4, now);
> +	tap6_handler(c, pool6, now);
>  }
>  
>  /**
> @@ -1042,11 +1048,19 @@ void tap_handler(struct ctx *c, const struct timespec *now)
>   * @c:		Execution context
>   * @l2len:	Total L2 packet length
>   * @p:		Packet buffer
> + * @pool4	Pool for tap ipv4 packets. If NULL, is pool_tap4
> + * @pool6	Pool for tap ipv6 packets. If NULL, is pool_tap6
>   */
> -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
> +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p,
> +					struct pool *pool4, struct pool *pool6)
>  {
>  	const struct ethhdr *eh;
>  
> +	if (!pool4)
> +		pool4 = pool_tap4;
> +	if (!pool6)
> +		pool6 = pool_tap6;
> +
>  	pcap(p, l2len);
>  
>  	eh = (struct ethhdr *)p;
> @@ -1059,10 +1073,10 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
>  	switch (ntohs(eh->h_proto)) {
>  	case ETH_P_ARP:
>  	case ETH_P_IP:
> -		packet_add(pool_tap4, l2len, p);
> +		packet_add(pool4, l2len, p);
>  		break;
>  	case ETH_P_IPV6:
> -		packet_add(pool_tap6, l2len, p);
> +		packet_add(pool6, l2len, p);
>  		break;
>  	default:
>  		break;
> @@ -1142,7 +1156,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
>  		p += sizeof(uint32_t);
>  		n -= sizeof(uint32_t);
>  
> -		tap_add_packet(c, l2len, p);
> +		tap_add_packet(c, l2len, p, pool_tap4, pool_tap6);
>  
>  		p += l2len;
>  		n -= l2len;
> @@ -1151,7 +1165,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
>  	partial_len = n;
>  	partial_frame = p;
>  
> -	tap_handler(c, now);
> +	tap_handler(c, now, NULL, NULL);
>  }
>  
>  /**
> @@ -1207,10 +1221,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
>  		    len > (ssize_t)L2_MAX_LEN_PASTA)
>  			continue;
>  
> -		tap_add_packet(c, len, pkt_buf + n);
> +		tap_add_packet(c, len, pkt_buf + n, pool_tap4, pool_tap6);
>  	}
>  
> -	tap_handler(c, now);
> +	tap_handler(c, now, NULL, NULL);
>  }
>  
>  /**
> diff --git a/tap.h b/tap.h
> index dd39fd8..0b5ad17 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -118,7 +118,7 @@ void tap_sock_reset(struct ctx *c);
>  void tap_sock_update_pool(void *base, size_t size);
>  void tap_backend_init(struct ctx *c);
>  void tap_flush_pools(void);
> -void tap_handler(struct ctx *c, const struct timespec *now);
> -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
> +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, struct pool *pool6);
> +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, struct pool *pool4, struct pool *pool6);
>  
>  #endif /* TAP_H */
> diff --git a/vu_common.c b/vu_common.c
> index 686a09b..4fe982f 100644
> --- a/vu_common.c
> +++ b/vu_common.c
> @@ -191,7 +191,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
>  			tap_add_packet(vdev->context,
>  				       elem[count].out_sg[0].iov_len - hdrlen,
>  				       (char *)elem[count].out_sg[0].iov_base +
> -				        hdrlen);
> +				        hdrlen, NULL, NULL);
>  		} else {
>  			/* vnet header can be in a separate iovec */
>  			if (elem[count].out_num != 2) {
> @@ -203,13 +203,14 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
>  			} else {
>  				tap_add_packet(vdev->context,
>  					       elem[count].out_sg[1].iov_len,
> -					       (char *)elem[count].out_sg[1].iov_base);
> +					       (char *)elem[count].out_sg[1].iov_base,
> +						   NULL, NULL);
>  			}
>  		}
>  
>  		count++;
>  	}
> -	tap_handler(vdev->context, now);
> +	tap_handler(vdev->context, now, NULL, NULL);
>  
>  	if (count) {
>  		int i;

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-01 11:38 ` [PATCH 2/3] tap: implement vhost_call_cb Eugenio Pérez
  2025-04-02  7:16   ` Stefano Brivio
@ 2025-04-03  1:45   ` David Gibson
  1 sibling, 0 replies; 13+ messages in thread
From: David Gibson @ 2025-04-03  1:45 UTC (permalink / raw)
  To: Eugenio Pérez; +Cc: passt-dev, jmaloy, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 18680 bytes --]

On Tue, Apr 01, 2025 at 01:38:08PM +0200, Eugenio Pérez wrote:
> This is the main callback when the tap device has processed any buffer.
> Another possibility is to reuse the tap callback for this, so less code
> changes are needed.

I think using a new epoll_type and a new callback is the right
approach here.  We mostly try to avoid having multi-layered
multiplexing of things.

> 
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>  epoll_type.h |   2 +
>  passt.c      |   4 +
>  passt.h      |  12 +-
>  tap.c        | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  tap.h        |   2 +
>  5 files changed, 334 insertions(+), 2 deletions(-)
> 
> diff --git a/epoll_type.h b/epoll_type.h
> index 7f2a121..6284c79 100644
> --- a/epoll_type.h
> +++ b/epoll_type.h
> @@ -44,6 +44,8 @@ enum epoll_type {
>  	EPOLL_TYPE_REPAIR_LISTEN,
>  	/* TCP_REPAIR helper socket */
>  	EPOLL_TYPE_REPAIR,
> +	/* vhost-kernel call socket */
> +	EPOLL_TYPE_VHOST_CALL,
>  
>  	EPOLL_NUM_TYPES,
>  };
> diff --git a/passt.c b/passt.c
> index cd06772..19c5d5f 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -79,6 +79,7 @@ char *epoll_type_str[] = {
>  	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
>  	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
>  	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
> +	[EPOLL_TYPE_VHOST_CALL]		= "vhost-kernel call socket",
>  };
>  static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
>  	      "epoll_type_str[] doesn't match enum epoll_type");
> @@ -357,6 +358,9 @@ loop:
>  		case EPOLL_TYPE_REPAIR:
>  			repair_handler(&c, eventmask);
>  			break;
> +		case EPOLL_TYPE_VHOST_CALL:
> +			vhost_call_cb(&c, ref, &now);
> +			break;
>  		default:
>  			/* Can't happen */
>  			ASSERT(0);
> diff --git a/passt.h b/passt.h
> index 8f45091..eb5aa03 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -45,7 +45,7 @@ union epoll_ref;
>   * @icmp:	ICMP-specific reference part
>   * @data:	Data handled by protocol handlers
>   * @nsdir_fd:	netns dirfd for fallback timer checking if namespace is gone
> - * @queue:	vhost-user queue index for this fd
> + * @queue:	vhost queue index for this fd
>   * @u64:	Opaque reference for epoll_ctl() and epoll_wait()
>   */
>  union epoll_ref {
> @@ -271,11 +271,14 @@ struct ctx {
>  	int fd_tap;
>  	int fd_repair_listen;
>  	int fd_repair;
> +	/* TODO document all added fields */
> +	int fd_vhost;
>  	unsigned char our_tap_mac[ETH_ALEN];
>  	unsigned char guest_mac[ETH_ALEN];
>  	uint16_t mtu;
>  
>  	uint64_t hash_secret[2];
> +	uint64_t virtio_features;
>  
>  	int ifi4;
>  	struct ip4_ctx ip4;
> @@ -299,6 +302,13 @@ struct ctx {
>  	int no_icmp;
>  	struct icmp_ctx icmp;
>  
> +	struct {
> +		uint16_t last_used_idx;
> +
> +		int kick_fd;
> +		int call_fd;
> +	} vq[2];

It might be possible to share some of these with vhost-user as well,
but that's a detail.

>  	int no_dns;
>  	int no_dns_search;
>  	int no_dhcp_dns;
> diff --git a/tap.c b/tap.c
> index ce859ba..fbe83aa 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -31,6 +31,7 @@
>  #include <sys/types.h>
>  #include <sys/stat.h>
>  #include <fcntl.h>
> +#include <sys/eventfd.h>
>  #include <sys/uio.h>
>  #include <stdbool.h>
>  #include <stdlib.h>
> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
>  #define TAP_SEQS		128 /* Different L4 tuples in one batch */
>  #define FRAGMENT_MSG_RATE	10  /* # seconds between fragment warnings */
>  
> +#define VHOST_VIRTIO 0xAF
> +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
> +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
> +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)

I think there's enough complexity in the vhost stuff that it's
probably worth putting it into new files rather than adding to tap.c

> +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));

Again as discussed on the call, I think we can just re-use the
existing pkt_buf for this.

> +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
> +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
> +
> +/* TODO is it better to have 1024 or 65520 bytes per packet? */

I'm a little confused by this comment.  We certainly want to allow 64k
packets/frames, but the define below seems like it's a number of
descriptors not a size of frame.

> +#define VHOST_NDESCS 1024
> +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
> +static union {
> +	struct vring_avail avail;
> +	char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
> +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE)));
> +static union {
> +	struct vring_used used;
> +	char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
> +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));

Some comments describing what these unions do would be helpful.

> +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */
> +#define N_VHOST_REGIONS 7
> +union {
> +	struct vhost_memory mem;
> +	char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
> +} vhost_memory = {
> +	.mem = {
> +		.nregions = N_VHOST_REGIONS,
> +	},
> +};
> +
>  /**
>   * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
>   * @c:		Execution context
> @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
>  	tap_start_connection(c);
>  }
>  
> +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
> +{
> +	struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
> +	uint32_t i;
> +	uint16_t used_idx, last_used;
> +
> +	/* TODO think if this has races with previous eventfd_read */
> +	/* TODO we could improve performance with a shadow_used_idx */
> +	used_idx = le16toh(used->idx);
> +
> +	smp_rmb();
> +	if (used_idx == c->vq[qid].last_used_idx) {
> +		*len = 0;
> +		return NULL;
> +	}
> +
> +	last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
> +	i = le32toh(used->ring[last_used].id);
> +	*len = le32toh(used->ring[last_used].len);
> +
> +	if (i > VHOST_NDESCS) {
> +		/* TODO imporove this, it will cause infinite loop */
> +		warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);

Does this indicate the kernel has done something wrong / unexpected?
If so, it's probably ok to just die().

> +		return NULL;
> +	}
> +
> +	if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
> +		warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
> +		return NULL;

As discussed on the call, we always want there to be room for maximum
size of frame.  You can actually define that maximum frame size to
suit yourself, say L2_MAX_LEN_VHOST similar to the existing
L2_MAX_LEN_PASTA, L2_MAX_LEN_VU etc, but it should be in the vicinity
of 64k.  The you should set the number of descriptors based on how
many frames you can it in the buffer, rather than changing the maximum
frame size based on the number of descriptors.

> +	}
> +
> +	/* TODO check if the id is valid and it has not been double used */
> +	c->vq[qid].last_used_idx++;
> +	return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> +}
> +
> +/* container is tx but we receive it from vhost POV */

You're probably already aware, but we're pretty strict about having
kerneldoc style function header comments.

> +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)

For consistency with the other backends, I'd suggest calling this
"tap_vhost_input()".

> +{
> +	eventfd_read(ref.fd, (eventfd_t[]){ 0 });
> +
> +	tap_flush_pools();
> +
> +	while (true) {
> +		struct virtio_net_hdr_mrg_rxbuf *hdr;
> +		unsigned len;
> +
> +		hdr = virtqueue_get_buf(c, ref.queue, &len);
> +		if (!hdr)
> +			break;
> +
> +		if (len < sizeof(*hdr)) {
> +			warn("vhost: invalid len %u", len);
> +			continue;
> +		}
> +
> +		/* TODO this will break from this moment */
> +		if (hdr->num_buffers != 1) {
> +			warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
> +			continue;
> +		}
> +
> +		/* TODO fix the v6 pool to support ipv6 */
> +		tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
> +	}
> +
> +	tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
> +}
> +
> +/* TODO: Actually refill */
> +static void rx_pkt_refill(int kick_fd)
> +{
> +	for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
> +		vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
> +		vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
> +		vring_desc[0][i].flags = VRING_DESC_F_WRITE;
> +	}
> +
> +	vring_avail_0.avail.idx = VHOST_NDESCS;
> +	eventfd_write(kick_fd, 1);
> +}
> +
>  /**
>   * tap_ns_tun() - Get tuntap fd in namespace
>   * @c:		Execution context
> @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
>   */
>  static int tap_ns_tun(void *arg)
>  {
> +	/* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
> +	static const uint64_t features =
> +		(1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
>  	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
>  	int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
>  	struct ctx *c = (struct ctx *)arg;
> -	int fd, rc;
> +	int fd, vhost_fd, rc;
>  
>  	c->fd_tap = -1;
>  	memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
> @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
>  	if (fd < 0)
>  		die_perror("Failed to open() /dev/net/tun");
>

Probably worth moving most of the vhost user specific stuff into a
separate function.

> +	vhost_fd = open("/dev/vhost-net", flags);
> +	if (vhost_fd < 0)
> +		die_perror("Failed to open() /dev/vhost-net");
> +
> +	rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
> +	if (rc < 0)
> +		die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
> +
> +	rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
> +	if (rc < 0)
> +		die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
> +
> +	/* TODO inform more explicitely */
> +	fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
> +	fprintf(stderr, "req features: %lx\n", features);
> +	c->virtio_features &= features;
> +	if (c->virtio_features != features)
> +		die("vhost does not support required features");
> +
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> +		struct vhost_vring_file file = {
> +			.index = i,
> +		};
> +		union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
> +					.queue = i };
> +		struct epoll_event ev;
> +
> +		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		ref.fd = file.fd;
> +		if (file.fd < 0)
> +			die_perror("Failed to create call eventfd");
> +
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");
> +
> +		ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
> +		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
> +		if (rc < 0)
> +			die_perror("Failed to add call eventfd to epoll");
> +		c->vq[i].call_fd = file.fd;
> +	}
> +
> +	/* 1:1 translation */
> +	vhost_memory.mem.regions[0] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_desc,
> +		/* memory size should include the last byte, but we probably never send
> +		 * ptrs there so...

Not sure what this comment is getting at.

> +		 */
> +		.memory_size = sizeof(vring_desc),
> +		.userspace_addr = (uintptr_t)&vring_desc,
> +	};
> +	vhost_memory.mem.regions[1] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_avail_0,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_0),
> +		.userspace_addr = (uintptr_t)&vring_avail_0,
> +	};
> +	vhost_memory.mem.regions[2] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_avail_1,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_1),
> +		.userspace_addr = (uintptr_t)&vring_avail_1,
> +	};
> +	vhost_memory.mem.regions[3] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_used_0,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_0),
> +		.userspace_addr = (uintptr_t)&vring_used_0,
> +	};
> +	vhost_memory.mem.regions[4] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)&vring_used_1,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(vring_avail_1),
> +		.userspace_addr = (uintptr_t)&vring_used_1,
> +	};
> +	vhost_memory.mem.regions[5] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(virtio_rx_pkt_buf),
> +		.userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
> +	};
> +	vhost_memory.mem.regions[6] = (struct vhost_memory_region){
> +		.guest_phys_addr = (uintptr_t)pkt_buf,
> +		/* memory size should include the last byte, but we probably never send
> +			* ptrs there so...
> +			*/
> +		.memory_size = sizeof(pkt_buf),
> +		.userspace_addr = (uintptr_t)pkt_buf,
> +	};
> +	static_assert(6 < N_VHOST_REGIONS);
> +
> +	rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
> +	if (rc < 0)
> +		die_perror(
> +			"VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
> +
> +	/* TODO: probably it increases RX perf */
> +#if 0
> +	struct ifreq ifr;
> +	memset(&ifr, 0, sizeof(ifr));
> +
> +	if (ioctl(fd, TUNGETIFF, &ifr) != 0)
> +		die_perror("Unable to query TUNGETIFF on FD %d", fd);
> +	}
> +
> +	if (ifr.ifr_flags & IFF_VNET_HDR)
> +		net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
> +#endif
> +	rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
> +	if (rc < 0)
> +		die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
> +
> +	/* Duplicating foreach queue to follow the exact order from QEMU */
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> +		struct vhost_vring_addr addr = {
> +			.index = i,
> +			.desc_user_addr = (unsigned long)vring_desc[i],
> +			.avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
> +										(unsigned long)&vring_avail_1,
> +			.used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
> +										(unsigned long)&vring_used_1,
> +			/* GPA addr */
> +			.log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
> +									   (unsigned long)&vring_used_1,
> +		};
> +		struct vhost_vring_state state = {
> +			.index = i,
> +			.num = VHOST_NDESCS,
> +		};
> +		struct vhost_vring_file file = {
> +			.index = i,
> +		};
> +
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed");
> +
> +		fprintf(stderr, "qid: %d\n", i);
> +		fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr);
> +		fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr);
> +		fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr);
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed");
> +
> +		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		if (file.fd < 0)
> +			die_perror("Failed to create kick eventfd");
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed");
> +		c->vq[i].kick_fd = file.fd;
> +	}
> +
>  	rc = ioctl(fd, (int)TUNSETIFF, &ifr);
>  	if (rc < 0)
>  		die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
> @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
>  	if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
>  		die("Tap device opened but no network interface found");
>  
> +	rx_pkt_refill(c->vq[0].kick_fd);
> +
> +	/* Duplicating foreach queue to follow the exact order from QEMU */
> +	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
> +		struct vhost_vring_file file = {
> +			.index = i,
> +			.fd = fd,
> +		};
> +
> +		fprintf(stderr, "qid: %d\n", file.index);
> +		fprintf(stderr, "tap fd: %d\n", file.fd);
> +		rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed");
> +	}
> +
>  	c->fd_tap = fd;
> +	c->fd_vhost = vhost_fd;
>  
>  	return 0;
>  }
> diff --git a/tap.h b/tap.h
> index 0b5ad17..91d3f62 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
>  		thdr->vnet_len = htonl(l2len);
>  }
>  
> +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
> +
>  unsigned long tap_l2_max_len(const struct ctx *c);
>  void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
>  void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-02  7:16   ` Stefano Brivio
  2025-04-02 12:03     ` Eugenio Perez Martin
@ 2025-04-03  1:48     ` David Gibson
  2025-04-03  4:28       ` Stefano Brivio
  1 sibling, 1 reply; 13+ messages in thread
From: David Gibson @ 2025-04-03  1:48 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Eugenio Pérez, passt-dev, jmaloy, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 1230 bytes --]

On Wed, Apr 02, 2025 at 09:16:22AM +0200, Stefano Brivio wrote:
[snip]
> >  static int tap_ns_tun(void *arg)
> >  {
> > +	/* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
> > +	static const uint64_t features =
> > +		(1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
> >  	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
> 
> I kind of wonder, by the way, if IFF_TUN simplifies things here. It's
> something we should already add, as an option, see also:
> https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life
> easier for any reason you might consider adding it right away.

I agree that supporting IFF_TUN would be a good idea in general.  I
think trying to do it in the middle of this will be unnecessary
difficulty though.  There are a bunch of places throughout the code
that assume we have Ethernet headers, and changing that will require a
pretty wide ranging rework.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/3] tap: add die() on vhost error
  2025-04-01 11:38 ` [PATCH 3/3] tap: add die() on vhost error Eugenio Pérez
@ 2025-04-03  1:50   ` David Gibson
  2025-04-03  6:36     ` Eugenio Perez Martin
  0 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2025-04-03  1:50 UTC (permalink / raw)
  To: Eugenio Pérez; +Cc: passt-dev, jmaloy, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 2914 bytes --]

On Tue, Apr 01, 2025 at 01:38:09PM +0200, Eugenio Pérez wrote:
> In case the kernel needs to signal an error.
> 
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>  epoll_type.h |  2 ++
>  passt.c      |  4 ++++
>  passt.h      |  1 +
>  tap.c        | 16 ++++++++++++++++
>  4 files changed, 23 insertions(+)
> 
> diff --git a/epoll_type.h b/epoll_type.h
> index 6284c79..6b320db 100644
> --- a/epoll_type.h
> +++ b/epoll_type.h
> @@ -46,6 +46,8 @@ enum epoll_type {
>  	EPOLL_TYPE_REPAIR,
>  	/* vhost-kernel call socket */
>  	EPOLL_TYPE_VHOST_CALL,
> +	/* vhost-kernel error socket */
> +	EPOLL_TYPE_VHOST_ERROR,
> 
>  	EPOLL_NUM_TYPES,
>  };
> diff --git a/passt.c b/passt.c
> index 19c5d5f..2779e0b 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -80,6 +80,7 @@ char *epoll_type_str[] = {
>  	[EPOLL_TYPE_REPAIR_LISTEN]	= "TCP_REPAIR helper listening socket",
>  	[EPOLL_TYPE_REPAIR]		= "TCP_REPAIR helper socket",
>  	[EPOLL_TYPE_VHOST_CALL]		= "vhost-kernel call socket",
> +	[EPOLL_TYPE_VHOST_ERROR]	= "vhost-kernel error socket",
>  };
>  static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
>  	      "epoll_type_str[] doesn't match enum epoll_type");
> @@ -361,6 +362,9 @@ loop:
>  		case EPOLL_TYPE_VHOST_CALL:
>  			vhost_call_cb(&c, ref, &now);
>  			break;
> +		case EPOLL_TYPE_VHOST_ERROR:
> +			die("Error on vhost-kernel socket");
> +			break;
>  		default:
>  			/* Can't happen */
>  			ASSERT(0);
> diff --git a/passt.h b/passt.h
> index eb5aa03..9e42f3b 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -307,6 +307,7 @@ struct ctx {
> 
>  		int kick_fd;
>  		int call_fd;
> +		int err_fd;
>  	} vq[2];
> 
>  	int no_dns;
> diff --git a/tap.c b/tap.c
> index fbe83aa..b02d3da 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg)
>  		if (rc < 0)
>  			die_perror("Failed to add call eventfd to epoll");
>  		c->vq[i].call_fd = file.fd;
> +
> +		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		if (file.fd < 0)
> +			die_perror("Failed to create error eventfd");
> +
> +		rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file);
> +		if (rc < 0)
> +			die_perror(
> +				"VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed");
> +
> +		ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd;
> +		ev.data.u64 = ref.u64;
> +		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
> +		if (rc < 0)
> +			die_perror("Failed to add error eventfd to epoll");
> +		c->vq[i].err_fd = file.fd;

If it's possible to re-initialize vhost, eventually it would be nice
to do so eventually, but for now die()ing is perfectly reasonable.

>  	}
> 
>  	/* 1:1 translation */

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/3] tap: implement vhost_call_cb
  2025-04-03  1:48     ` David Gibson
@ 2025-04-03  4:28       ` Stefano Brivio
  0 siblings, 0 replies; 13+ messages in thread
From: Stefano Brivio @ 2025-04-03  4:28 UTC (permalink / raw)
  To: David Gibson, Eugenio Pérez; +Cc: passt-dev, jmaloy, lvivier, dgibson

On Thu, 3 Apr 2025 12:48:42 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Apr 02, 2025 at 09:16:22AM +0200, Stefano Brivio wrote:
> [snip]
> > >  static int tap_ns_tun(void *arg)
> > >  {
> > > +	/* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
> > > +	static const uint64_t features =
> > > +		(1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
> > >  	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };  
> > 
> > I kind of wonder, by the way, if IFF_TUN simplifies things here. It's
> > something we should already add, as an option, see also:
> > https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life
> > easier for any reason you might consider adding it right away.  
> 
> I agree that supporting IFF_TUN would be a good idea in general.  I
> think trying to do it in the middle of this will be unnecessary
> difficulty though.  There are a bunch of places throughout the code
> that assume we have Ethernet headers, and changing that will require a
> pretty wide ranging rework.

Maybe, yes, hence "if it makes your life easier for any reason". If it
does not, it's of course out of scope here...

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/3] tap: add die() on vhost error
  2025-04-03  1:50   ` David Gibson
@ 2025-04-03  6:36     ` Eugenio Perez Martin
  0 siblings, 0 replies; 13+ messages in thread
From: Eugenio Perez Martin @ 2025-04-03  6:36 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Thu, Apr 3, 2025 at 3:51 AM David Gibson <david@gibson.dropbear.id.au> wrote:
>
> On Tue, Apr 01, 2025 at 01:38:09PM +0200, Eugenio Pérez wrote:
> > In case the kernel needs to signal an error.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >  epoll_type.h |  2 ++
> >  passt.c      |  4 ++++
> >  passt.h      |  1 +
> >  tap.c        | 16 ++++++++++++++++
> >  4 files changed, 23 insertions(+)
> >
> > diff --git a/epoll_type.h b/epoll_type.h
> > index 6284c79..6b320db 100644
> > --- a/epoll_type.h
> > +++ b/epoll_type.h
> > @@ -46,6 +46,8 @@ enum epoll_type {
> >       EPOLL_TYPE_REPAIR,
> >       /* vhost-kernel call socket */
> >       EPOLL_TYPE_VHOST_CALL,
> > +     /* vhost-kernel error socket */
> > +     EPOLL_TYPE_VHOST_ERROR,
> >
> >       EPOLL_NUM_TYPES,
> >  };
> > diff --git a/passt.c b/passt.c
> > index 19c5d5f..2779e0b 100644
> > --- a/passt.c
> > +++ b/passt.c
> > @@ -80,6 +80,7 @@ char *epoll_type_str[] = {
> >       [EPOLL_TYPE_REPAIR_LISTEN]      = "TCP_REPAIR helper listening socket",
> >       [EPOLL_TYPE_REPAIR]             = "TCP_REPAIR helper socket",
> >       [EPOLL_TYPE_VHOST_CALL]         = "vhost-kernel call socket",
> > +     [EPOLL_TYPE_VHOST_ERROR]        = "vhost-kernel error socket",
> >  };
> >  static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
> >             "epoll_type_str[] doesn't match enum epoll_type");
> > @@ -361,6 +362,9 @@ loop:
> >               case EPOLL_TYPE_VHOST_CALL:
> >                       vhost_call_cb(&c, ref, &now);
> >                       break;
> > +             case EPOLL_TYPE_VHOST_ERROR:
> > +                     die("Error on vhost-kernel socket");
> > +                     break;
> >               default:
> >                       /* Can't happen */
> >                       ASSERT(0);
> > diff --git a/passt.h b/passt.h
> > index eb5aa03..9e42f3b 100644
> > --- a/passt.h
> > +++ b/passt.h
> > @@ -307,6 +307,7 @@ struct ctx {
> >
> >               int kick_fd;
> >               int call_fd;
> > +             int err_fd;
> >       } vq[2];
> >
> >       int no_dns;
> > diff --git a/tap.c b/tap.c
> > index fbe83aa..b02d3da 100644
> > --- a/tap.c
> > +++ b/tap.c
> > @@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg)
> >               if (rc < 0)
> >                       die_perror("Failed to add call eventfd to epoll");
> >               c->vq[i].call_fd = file.fd;
> > +
> > +             file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > +             if (file.fd < 0)
> > +                     die_perror("Failed to create error eventfd");
> > +
> > +             rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file);
> > +             if (rc < 0)
> > +                     die_perror(
> > +                             "VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed");
> > +
> > +             ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd;
> > +             ev.data.u64 = ref.u64;
> > +             rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
> > +             if (rc < 0)
> > +                     die_perror("Failed to add error eventfd to epoll");
> > +             c->vq[i].err_fd = file.fd;
>
> If it's possible to re-initialize vhost, eventually it would be nice
> to do so eventually, but for now die()ing is perfectly reasonable.
>

That's a good point, adding TODO!


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/3] tap: specify the packet pool
  2025-04-03  1:07   ` David Gibson
@ 2025-04-03  6:40     ` Eugenio Perez Martin
  0 siblings, 0 replies; 13+ messages in thread
From: Eugenio Perez Martin @ 2025-04-03  6:40 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, jmaloy, sbrivio, lvivier, dgibson

On Thu, Apr 3, 2025 at 3:51 AM David Gibson <david@gibson.dropbear.id.au> wrote:
>
> On Tue, Apr 01, 2025 at 01:38:07PM +0200, Eugenio Pérez wrote:
> > In vhost-kernel we need to allocate a storage for rx so kernel can
> > write the buffers async. We need to tell tap_add_packet where to find
> > them.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>
> As discussed on our call, I don't think we need this.  We should be
> able to re-use pool_tap[46] for vhost-net.
>

Right, I'm able to transmit from the guest reusing pool_tap[46] now, thanks!


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2025-04-03  6:41 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-01 11:38 [PATCH 0/3] Add vhost-net kernel support Eugenio Pérez
2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
2025-04-03  1:07   ` David Gibson
2025-04-03  6:40     ` Eugenio Perez Martin
2025-04-01 11:38 ` [PATCH 2/3] tap: implement vhost_call_cb Eugenio Pérez
2025-04-02  7:16   ` Stefano Brivio
2025-04-02 12:03     ` Eugenio Perez Martin
2025-04-03  1:48     ` David Gibson
2025-04-03  4:28       ` Stefano Brivio
2025-04-03  1:45   ` David Gibson
2025-04-01 11:38 ` [PATCH 3/3] tap: add die() on vhost error Eugenio Pérez
2025-04-03  1:50   ` David Gibson
2025-04-03  6:36     ` Eugenio Perez Martin

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).