public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: "Eugenio Pérez" <eperezma@redhat.com>
To: passt-dev@passt.top
Cc: jasowang@redhat.com
Subject: [RFC v2 07/11] tap: support tx through vhost
Date: Wed,  9 Jul 2025 19:47:44 +0200	[thread overview]
Message-ID: <20250709174748.3514693-8-eperezma@redhat.com> (raw)
In-Reply-To: <20250709174748.3514693-1-eperezma@redhat.com>

No users enable vhost right now; this patch just defines the functions.

The use of the virtqueue is similar to the rx case.  pasta fills the
descriptor table with the packet data it wants to send to the namespace.
Each descriptor points to a buffer in memory, with an address and a
length.  The number of descriptors is again defined by VHOST_NDESCS.

Afterwards it writes the descriptor index into the avail->ring[] array,
then increments avail->idx to make it visible to the kernel, then kicks
the virtqueue 1 event fd.

When the kernel does not need the buffer anymore, it writes its id into
used_ring->ring[] and increments used_ring->idx.  Normally, the kernel
also notifies pasta through the call eventfd of virtqueue 1, but we
don't monitor that eventfd.  Instead, we check whether we can reuse the
buffers only when we produce, making the code simpler and more
performant.

Like on the rx path, we assume descriptors are used in the same order
they were made available. This is also consistent with behavior seen in
QEMU's virtio-net implementation.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 arp.c     |  2 +-
 tap.c     | 84 +++++++++++++++++++++++++++++++++++++++++++++++--------
 tap.h     |  4 +--
 tcp.c     |  2 +-
 tcp_buf.c |  2 +-
 udp.c     |  2 +-
 6 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/arp.c b/arp.c
index fc482bb..ea786a0 100644
--- a/arp.c
+++ b/arp.c
@@ -80,7 +80,7 @@ int arp(const struct ctx *c, const struct pool *p)
 	memcpy(eh->h_dest,	eh->h_source,	sizeof(eh->h_dest));
 	memcpy(eh->h_source,	c->our_tap_mac,	sizeof(eh->h_source));
 
-	tap_send_single(c, eh, l2len);
+	tap_send_single(c, eh, l2len, false);
 
 	return 1;
 }
diff --git a/tap.c b/tap.c
index 5667fbe..7ccac86 100644
--- a/tap.c
+++ b/tap.c
@@ -121,11 +121,19 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
 static_assert(!(VHOST_NDESCS & (VHOST_NDESCS - 1)),
 			 "Number of vhost descs must be a power of two by standard");
 static struct {
+	/* Descriptor index we're using. This is not the same as avail idx in
+	 * split: this takes into account the chained descs */
+	uint16_t vring_idx;
+
 	/* Number of free descriptors */
 	uint16_t num_free;
 
 	/* Last used idx processed */
 	uint16_t last_used_idx;
+
+	/* Descriptors in use */
+	/* desc info: number of descriptors in the chain */
+	uint16_t ndescs[VHOST_NDESCS];
 } vqs[2];
 
 static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
@@ -176,7 +184,7 @@ unsigned long tap_l2_max_len(const struct ctx *c)
  * @data:	Packet buffer
  * @l2len:	Total L2 packet length
  */
-void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
+void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost)
 {
 	uint32_t vnet_len = htonl(l2len);
 	struct iovec iov[2];
@@ -192,7 +200,7 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
 		iov[iovcnt].iov_len = l2len;
 		iovcnt++;
 
-		tap_send_frames(c, iov, iovcnt, 1);
+		tap_send_frames(c, iov, iovcnt, 1, vhost);
 		break;
 	case MODE_VU:
 		vu_send_single(c, data, l2len);
@@ -314,7 +322,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
 	char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
 
 	memcpy(data, in, dlen);
-	tap_send_single(c, buf, dlen + (data - buf));
+	tap_send_single(c, buf, dlen + (data - buf), false);
 }
 
 /**
@@ -336,7 +344,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
 	memcpy(icmp4h, in, l4len);
 	csum_icmp4(icmp4h, icmp4h + 1, l4len - sizeof(*icmp4h));
 
-	tap_send_single(c, buf, l4len + ((char *)icmp4h - buf));
+	tap_send_single(c, buf, l4len + ((char *)icmp4h - buf), false);
 }
 
 /**
@@ -421,7 +429,7 @@ void tap_udp6_send(const struct ctx *c,
 	char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
 
 	memcpy(data, in, dlen);
-	tap_send_single(c, buf, dlen + (data - buf));
+	tap_send_single(c, buf, dlen + (data - buf), false);
 }
 
 /**
@@ -444,7 +452,7 @@ void tap_icmp6_send(const struct ctx *c,
 	memcpy(icmp6h, in, l4len);
 	csum_icmp6(icmp6h, src, dst, icmp6h + 1, l4len - sizeof(*icmp6h));
 
-	tap_send_single(c, buf, l4len + ((char *)icmp6h - buf));
+	tap_send_single(c, buf, l4len + ((char *)icmp6h - buf), false);
 }
 
 static void vhost_kick(struct vring_used *used, int kick_fd) {
@@ -459,8 +467,9 @@ static void vhost_kick(struct vring_used *used, int kick_fd) {
 		eventfd_write(kick_fd, 1);
 }
 
+
 /**
- * tap_send_frames_pasta() - Send multiple frames to the pasta tap
+ * tap_send_frames_vhost() - Send multiple frames to the pasta tap
  * @c:			Execution context
  * @iov:		Array of buffers
  * @bufs_per_frame:	Number of buffers (iovec entries) per frame
@@ -470,16 +479,68 @@ static void vhost_kick(struct vring_used *used, int kick_fd) {
  * @bufs_per_frame contiguous buffers representing a single frame.
  *
  * Return: number of frames successfully sent
+ */
+static size_t tap_send_frames_vhost(const struct ctx *c,
+				    const struct iovec *iov,
+				    size_t bufs_per_frame, size_t nframes)
+{
+	size_t i;
+
+	for (i = 0; i < nframes; i++) {
+		size_t j;
+
+		if (vqs[1].num_free < bufs_per_frame)
+			return i;
+
+		vring_avail_1.avail.ring[(vring_avail_1.avail.idx + i) % VHOST_NDESCS] = htole16(vqs[1].vring_idx) % VHOST_NDESCS;
+		vqs[1].ndescs[(vring_avail_1.avail.idx + i) % VHOST_NDESCS] = bufs_per_frame;
+		vqs[1].num_free -= bufs_per_frame;
+
+		for (j = 0; j < bufs_per_frame; ++j) {
+			struct vring_desc *desc = &vring_desc[1][vqs[1].vring_idx % VHOST_NDESCS];
+			const struct iovec *iov_i = &iov[i * bufs_per_frame + j];
+
+			desc->addr = (uint64_t)iov_i->iov_base;
+			desc->len = iov_i->iov_len;
+			desc->flags = (j == bufs_per_frame - 1) ? 0 : htole16(VRING_DESC_F_NEXT);
+			vqs[1].vring_idx++;
+		}
+	}
+
+	smp_wmb();
+	vring_avail_1.avail.idx = htole16(le16toh(vring_avail_1.avail.idx) + nframes);
+
+	vhost_kick(&vring_used_1.used, c->vq[1].kick_fd);
+
+	return nframes;
+}
+
+
+/**
+ * tap_send_frames_pasta() - Send multiple frames to the pasta tap
+ * @c:			Execution context
+ * @iov:		Array of buffers
+ * @bufs_per_frame:	Number of buffers (iovec entries) per frame
+ * @nframes:		Number of frames to send
+ * @vhost:             Use vhost-kernel or not
+ *
+ * @iov must have total length @bufs_per_frame * @nframes, with each set of
+ * @bufs_per_frame contiguous buffers representing a single frame.
+ *
+ * Return: number of frames successfully sent (or queued)
  *
  * #syscalls:pasta write
  */
 static size_t tap_send_frames_pasta(const struct ctx *c,
 				    const struct iovec *iov,
-				    size_t bufs_per_frame, size_t nframes)
+				    size_t bufs_per_frame, size_t nframes, bool vhost)
 {
 	size_t nbufs = bufs_per_frame * nframes;
 	size_t i;
 
+	if (vhost)
+		return tap_send_frames_vhost(c, iov, bufs_per_frame, nframes);
+
 	for (i = 0; i < nbufs; i += bufs_per_frame) {
 		ssize_t rc = writev(c->fd_tap, iov + i, bufs_per_frame);
 		size_t framelen = iov_size(iov + i, bufs_per_frame);
@@ -563,14 +624,15 @@ static size_t tap_send_frames_passt(const struct ctx *c,
  * @iov:		Array of buffers, each containing one frame (with L2 headers)
  * @bufs_per_frame:	Number of buffers (iovec entries) per frame
  * @nframes:		Number of frames to send
+ * @vhost:		Use vhost-kernel or not
  *
  * @iov must have total length @bufs_per_frame * @nframes, with each set of
  * @bufs_per_frame contiguous buffers representing a single frame.
  *
- * Return: number of frames actually sent
+ * Return: number of frames actually sent (or queued)
  */
 size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
-		       size_t bufs_per_frame, size_t nframes)
+		       size_t bufs_per_frame, size_t nframes, bool vhost)
 {
 	size_t m;
 
@@ -579,7 +641,7 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
 
 	switch (c->mode) {
 	case MODE_PASTA:
-		m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
+		m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes, vhost);
 		break;
 	case MODE_PASST:
 		m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
diff --git a/tap.h b/tap.h
index ff8cee5..e924dfb 100644
--- a/tap.h
+++ b/tap.h
@@ -111,9 +111,9 @@ void tap_udp6_send(const struct ctx *c,
 void tap_icmp6_send(const struct ctx *c,
 		    const struct in6_addr *src, const struct in6_addr *dst,
 		    const void *in, size_t l4len);
-void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
+void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost);
 size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
-		       size_t bufs_per_frame, size_t nframes);
+		       size_t bufs_per_frame, size_t nframes, bool vhost);
 void eth_update_mac(struct ethhdr *eh,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
diff --git a/tcp.c b/tcp.c
index f43c1e2..05f5b4c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1935,7 +1935,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
 
 	tcp_update_csum(psum, rsth, &payload);
 	rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
-	tap_send_single(c, buf, rst_l2len);
+	tap_send_single(c, buf, rst_l2len, false);
 }
 
 /**
diff --git a/tcp_buf.c b/tcp_buf.c
index 6d79d67..242086d 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -141,7 +141,7 @@ void tcp_payload_flush(const struct ctx *c)
 	size_t m;
 
 	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp_payload_used);
+			    tcp_payload_used, false);
 	if (m != tcp_payload_used) {
 		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
 			       tcp_payload_used - m);
diff --git a/udp.c b/udp.c
index 65a52e0..d017d99 100644
--- a/udp.c
+++ b/udp.c
@@ -809,7 +809,7 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 	for (i = 0; i < n; i++)
 		udp_tap_prepare(udp_mh_recv, i, toside, false);
 
-	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n, false);
 }
 
 /**
-- 
@@ -809,7 +809,7 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
 	for (i = 0; i < n; i++)
 		udp_tap_prepare(udp_mh_recv, i, toside, false);
 
-	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+	tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n, false);
 }
 
 /**
-- 
2.50.0


  parent reply	other threads:[~2025-07-09 17:48 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-09 17:47 [RFC v2 00/11] Add vhost-net kernel support Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 01/11] tap: implement vhost_call_cb Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 02/11] tap: add die() on vhost error Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 03/11] tap: replace tx tap hdr with virtio_nethdr_mrg_rxbuf Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 04/11] tcp: export memory regions to vhost Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 05/11] virtio: Fill .next in tx queue Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 06/11] tap: move static iov_sock to tcp_buf_data_from_sock Eugenio Pérez
2025-07-09 17:47 ` Eugenio Pérez [this message]
2025-07-09 17:47 ` [RFC v2 08/11] tap: add tap_free_old_xmit Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 09/11] tcp: start conversion to circular buffer Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 10/11] tap: add poll(2) to used_idx Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 11/11] tcp_buf: adding TCP tx circular buffer Eugenio Pérez
2025-07-10  9:46 ` [RFC v2 00/11] Add vhost-net kernel support Eugenio Perez Martin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250709174748.3514693-8-eperezma@redhat.com \
    --to=eperezma@redhat.com \
    --cc=jasowang@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).