From: "Eugenio Pérez" <eperezma@redhat.com>
To: passt-dev@passt.top
Cc: jmaloy@redhat.com, sbrivio@redhat.com, lvivier@redhat.com,
dgibson@redhat.com
Subject: [PATCH 2/3] tap: implement vhost_call_cb
Date: Tue, 1 Apr 2025 13:38:08 +0200 [thread overview]
Message-ID: <20250401113809.1765282-3-eperezma@redhat.com> (raw)
In-Reply-To: <20250401113809.1765282-1-eperezma@redhat.com>
This is the main callback when the tap device has processed any buffer.
Another possibility is to reuse the tap callback for this, so less code
changes are needed.
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
epoll_type.h | 2 +
passt.c | 4 +
passt.h | 12 +-
tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
tap.h | 2 +
5 files changed, 334 insertions(+), 2 deletions(-)
diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..6284c79 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -44,6 +44,8 @@ enum epoll_type {
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
+ /* vhost-kernel call socket */
+ EPOLL_TYPE_VHOST_CALL,
EPOLL_NUM_TYPES,
};
diff --git a/passt.c b/passt.c
index cd06772..19c5d5f 100644
--- a/passt.c
+++ b/passt.c
@@ -79,6 +79,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
+ [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -357,6 +358,9 @@ loop:
case EPOLL_TYPE_REPAIR:
repair_handler(&c, eventmask);
break;
+ case EPOLL_TYPE_VHOST_CALL:
+ vhost_call_cb(&c, ref, &now);
+ break;
default:
/* Can't happen */
ASSERT(0);
diff --git a/passt.h b/passt.h
index 8f45091..eb5aa03 100644
--- a/passt.h
+++ b/passt.h
@@ -45,7 +45,7 @@ union epoll_ref;
* @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers
* @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
- * @queue: vhost-user queue index for this fd
+ * @queue: vhost queue index for this fd
* @u64: Opaque reference for epoll_ctl() and epoll_wait()
*/
union epoll_ref {
@@ -271,11 +271,14 @@ struct ctx {
int fd_tap;
int fd_repair_listen;
int fd_repair;
+ /* TODO document all added fields */
+ int fd_vhost;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint16_t mtu;
uint64_t hash_secret[2];
+ uint64_t virtio_features;
int ifi4;
struct ip4_ctx ip4;
@@ -299,6 +302,13 @@ struct ctx {
int no_icmp;
struct icmp_ctx icmp;
+ struct {
+ uint16_t last_used_idx;
+
+ int kick_fd;
+ int call_fd;
+ } vq[2];
+
int no_dns;
int no_dns_search;
int no_dhcp_dns;
diff --git a/tap.c b/tap.c
index ce859ba..fbe83aa 100644
--- a/tap.c
+++ b/tap.c
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
+#include <sys/eventfd.h>
#include <sys/uio.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
+#define VHOST_VIRTIO 0xAF
+#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));
+static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
+static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
+
+/* TODO is it better to have 1024 or 65520 bytes per packet? */
+#define VHOST_NDESCS 1024
+static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
+static union {
+ struct vring_avail avail;
+ char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
+} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE)));
+static union {
+ struct vring_used used;
+ char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
+} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));
+
+/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */
+#define N_VHOST_REGIONS 7
+union {
+ struct vhost_memory mem;
+ char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
+} vhost_memory = {
+ .mem = {
+ .nregions = N_VHOST_REGIONS,
+ },
+};
+
/**
* tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
* @c: Execution context
@@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
tap_start_connection(c);
}
+static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
+{
+ struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
+ uint32_t i;
+ uint16_t used_idx, last_used;
+
+ /* TODO think if this has races with previous eventfd_read */
+ /* TODO we could improve performance with a shadow_used_idx */
+ used_idx = le16toh(used->idx);
+
+ smp_rmb();
+
+ if (used_idx == c->vq[qid].last_used_idx) {
+ *len = 0;
+ return NULL;
+ }
+
+ last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
+ i = le32toh(used->ring[last_used].id);
+ *len = le32toh(used->ring[last_used].len);
+
+ if (i > VHOST_NDESCS) {
+ /* TODO imporove this, it will cause infinite loop */
+ warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);
+ return NULL;
+ }
+
+ if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
+ warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
+ return NULL;
+ }
+
+ /* TODO check if the id is valid and it has not been double used */
+ c->vq[qid].last_used_idx++;
+ return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+}
+
+/* container is tx but we receive it from vhost POV */
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
+{
+ eventfd_read(ref.fd, (eventfd_t[]){ 0 });
+
+ tap_flush_pools();
+
+ while (true) {
+ struct virtio_net_hdr_mrg_rxbuf *hdr;
+ unsigned len;
+
+ hdr = virtqueue_get_buf(c, ref.queue, &len);
+ if (!hdr)
+ break;
+
+ if (len < sizeof(*hdr)) {
+ warn("vhost: invalid len %u", len);
+ continue;
+ }
+
+ /* TODO this will break from this moment */
+ if (hdr->num_buffers != 1) {
+ warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
+ continue;
+ }
+
+ /* TODO fix the v6 pool to support ipv6 */
+ tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
+ }
+
+ tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
+}
+
+/* TODO: Actually refill */
+static void rx_pkt_refill(int kick_fd)
+{
+ for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
+ vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+ vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
+ vring_desc[0][i].flags = VRING_DESC_F_WRITE;
+ }
+
+ vring_avail_0.avail.idx = VHOST_NDESCS;
+ eventfd_write(kick_fd, 1);
+}
+
/**
* tap_ns_tun() - Get tuntap fd in namespace
* @c: Execution context
@@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
*/
static int tap_ns_tun(void *arg)
{
+ /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */
+ static const uint64_t features =
+ (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
struct ctx *c = (struct ctx *)arg;
- int fd, rc;
+ int fd, vhost_fd, rc;
c->fd_tap = -1;
memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
@@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
if (fd < 0)
die_perror("Failed to open() /dev/net/tun");
+ vhost_fd = open("/dev/vhost-net", flags);
+ if (vhost_fd < 0)
+ die_perror("Failed to open() /dev/vhost-net");
+
+ rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
+ if (rc < 0)
+ die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
+
+ rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
+ if (rc < 0)
+ die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
+
+ /* TODO inform more explicitely */
+ fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
+ fprintf(stderr, "req features: %lx\n", features);
+ c->virtio_features &= features;
+ if (c->virtio_features != features)
+ die("vhost does not support required features");
+
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+ struct vhost_vring_file file = {
+ .index = i,
+ };
+ union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
+ .queue = i };
+ struct epoll_event ev;
+
+ file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ ref.fd = file.fd;
+ if (file.fd < 0)
+ die_perror("Failed to create call eventfd");
+
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");
+
+ ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
+ rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+ if (rc < 0)
+ die_perror("Failed to add call eventfd to epoll");
+ c->vq[i].call_fd = file.fd;
+ }
+
+ /* 1:1 translation */
+ vhost_memory.mem.regions[0] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_desc,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_desc),
+ .userspace_addr = (uintptr_t)&vring_desc,
+ };
+ vhost_memory.mem.regions[1] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_avail_0,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_0),
+ .userspace_addr = (uintptr_t)&vring_avail_0,
+ };
+ vhost_memory.mem.regions[2] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_avail_1,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_1),
+ .userspace_addr = (uintptr_t)&vring_avail_1,
+ };
+ vhost_memory.mem.regions[3] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_used_0,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_0),
+ .userspace_addr = (uintptr_t)&vring_used_0,
+ };
+ vhost_memory.mem.regions[4] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_used_1,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_1),
+ .userspace_addr = (uintptr_t)&vring_used_1,
+ };
+ vhost_memory.mem.regions[5] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(virtio_rx_pkt_buf),
+ .userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
+ };
+ vhost_memory.mem.regions[6] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)pkt_buf,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(pkt_buf),
+ .userspace_addr = (uintptr_t)pkt_buf,
+ };
+ static_assert(6 < N_VHOST_REGIONS);
+
+ rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
+
+ /* TODO: probably it increases RX perf */
+#if 0
+ struct ifreq ifr;
+ memset(&ifr, 0, sizeof(ifr));
+
+ if (ioctl(fd, TUNGETIFF, &ifr) != 0)
+ die_perror("Unable to query TUNGETIFF on FD %d", fd);
+ }
+
+ if (ifr.ifr_flags & IFF_VNET_HDR)
+ net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
+#endif
+ rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
+ if (rc < 0)
+ die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
+
+ /* Duplicating foreach queue to follow the exact order from QEMU */
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+ struct vhost_vring_addr addr = {
+ .index = i,
+ .desc_user_addr = (unsigned long)vring_desc[i],
+ .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
+ (unsigned long)&vring_avail_1,
+ .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
+ (unsigned long)&vring_used_1,
+ /* GPA addr */
+ .log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
+ (unsigned long)&vring_used_1,
+ };
+ struct vhost_vring_state state = {
+ .index = i,
+ .num = VHOST_NDESCS,
+ };
+ struct vhost_vring_file file = {
+ .index = i,
+ };
+
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed");
+
+ fprintf(stderr, "qid: %d\n", i);
+ fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr);
+ fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr);
+ fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr);
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed");
+
+ file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (file.fd < 0)
+ die_perror("Failed to create kick eventfd");
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed");
+ c->vq[i].kick_fd = file.fd;
+ }
+
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
if (rc < 0)
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
@@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
die("Tap device opened but no network interface found");
+ rx_pkt_refill(c->vq[0].kick_fd);
+
+ /* Duplicating foreach queue to follow the exact order from QEMU */
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+ struct vhost_vring_file file = {
+ .index = i,
+ .fd = fd,
+ };
+
+ fprintf(stderr, "qid: %d\n", file.index);
+ fprintf(stderr, "tap fd: %d\n", file.fd);
+ rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed");
+ }
+
c->fd_tap = fd;
+ c->fd_vhost = vhost_fd;
return 0;
}
diff --git a/tap.h b/tap.h
index 0b5ad17..91d3f62 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
+
unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
--
@@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
+
unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
--
2.49.0
next prev parent reply other threads:[~2025-04-01 11:38 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-01 11:38 [PATCH 0/3] Add vhost-net kernel support Eugenio Pérez
2025-04-01 11:38 ` [PATCH 1/3] tap: specify the packet pool Eugenio Pérez
2025-04-03 1:07 ` David Gibson
2025-04-03 6:40 ` Eugenio Perez Martin
2025-04-01 11:38 ` Eugenio Pérez [this message]
2025-04-02 7:16 ` [PATCH 2/3] tap: implement vhost_call_cb Stefano Brivio
2025-04-02 12:03 ` Eugenio Perez Martin
2025-04-03 1:48 ` David Gibson
2025-04-03 4:28 ` Stefano Brivio
2025-04-03 1:45 ` David Gibson
2025-04-01 11:38 ` [PATCH 3/3] tap: add die() on vhost error Eugenio Pérez
2025-04-03 1:50 ` David Gibson
2025-04-03 6:36 ` Eugenio Perez Martin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250401113809.1765282-3-eperezma@redhat.com \
--to=eperezma@redhat.com \
--cc=dgibson@redhat.com \
--cc=jmaloy@redhat.com \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).