public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v2 06/10] tap: Re-introduce EPOLLET for tap connections
Date: Fri, 13 Sep 2024 14:32:10 +1000	[thread overview]
Message-ID: <20240913043214.1753014-7-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240913043214.1753014-1-david@gibson.dropbear.id.au>

Since 4684f603446b ("tap: Don't use EPOLLET on Qemu sockets") we've only
used level-triggered events for the tap device.  Prior to that we used
EPOLLET inconsistently, which was confusing (though not incorrect AFAICT).

We want to add support for EPOLLOUT events on the tap connection, and
without EPOLLET that would require toggling EPOLLOUT on and off, which is
awkward.  So, re-introduce EPOLLET, but now use it uniformly for all tap
modes.  The main change this requires is making sure that on EPOLLIN we
loop until there's no more data to process.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tap.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/tap.c b/tap.c
index 41af6a6d..c1db2960 100644
--- a/tap.c
+++ b/tap.c
@@ -985,8 +985,10 @@ static void tap_sock_reset(struct ctx *c)
  * tap_passt_input() - Handler for new data on the socket to qemu
  * @c:		Execution context
  * @now:	Current timestamp
+ *
+ * Return: true if there may be additional data to read, false otherwise
  */
-static void tap_passt_input(struct ctx *c, const struct timespec *now)
+static bool tap_passt_input(struct ctx *c, const struct timespec *now)
 {
 	static const char *partial_frame;
 	static ssize_t partial_len = 0;
@@ -1013,7 +1015,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 			err_perror("Receive error on guest connection, reset");
 			tap_sock_reset(c);
 		}
-		return;
+		return false;
 	}
 
 	p = pkt_buf;
@@ -1025,7 +1027,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
 			err("Bad frame size from guest, resetting connection");
 			tap_sock_reset(c);
-			return;
+			return false;
 		}
 
 		if (l2len + sizeof(uint32_t) > (size_t)n)
@@ -1045,6 +1047,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	partial_frame = p;
 
 	tap_handler(c, now);
+
+	return true;
 }
 
 /**
@@ -1061,16 +1065,20 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 		return;
 	}
 
-	if (events & EPOLLIN)
-		tap_passt_input(c, now);
+	if (events & EPOLLIN) {
+		while (tap_passt_input(c, now))
+			;
+	}
 }
 
 /**
  * tap_pasta_input() - Handler for new data on the socket to hypervisor
  * @c:		Execution context
  * @now:	Current timestamp
+ *
+ * Return: true if there may be additional data to read, false otherwise
  */
-static void tap_pasta_input(struct ctx *c, const struct timespec *now)
+static bool tap_pasta_input(struct ctx *c, const struct timespec *now)
 {
 	ssize_t n, len;
 
@@ -1102,6 +1110,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 	}
 
 	tap_handler(c, now);
+
+	return len > 0;
 }
 
 /**
@@ -1116,8 +1126,10 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
 		die("Disconnect event on /dev/net/tun device, exiting");
 
-	if (events & EPOLLIN)
-		tap_pasta_input(c, now);
+	if (events & EPOLLIN) {
+		while (tap_pasta_input(c, now))
+			;
+	}
 }
 
 /**
@@ -1251,7 +1263,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
 	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
@@ -1307,7 +1319,7 @@ static void tap_sock_tun_init(struct ctx *c)
 	pasta_ns_conf(c);
 
 	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
@@ -1340,7 +1352,7 @@ void tap_sock_init(struct ctx *c)
 		else
 			ref.type = EPOLL_TYPE_TAP_PASTA;
 
-		ev.events = EPOLLIN | EPOLLRDHUP;
+		ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
-- 
@@ -985,8 +985,10 @@ static void tap_sock_reset(struct ctx *c)
  * tap_passt_input() - Handler for new data on the socket to qemu
  * @c:		Execution context
  * @now:	Current timestamp
+ *
+ * Return: true if there may be additional data to read, false otherwise
  */
-static void tap_passt_input(struct ctx *c, const struct timespec *now)
+static bool tap_passt_input(struct ctx *c, const struct timespec *now)
 {
 	static const char *partial_frame;
 	static ssize_t partial_len = 0;
@@ -1013,7 +1015,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 			err_perror("Receive error on guest connection, reset");
 			tap_sock_reset(c);
 		}
-		return;
+		return false;
 	}
 
 	p = pkt_buf;
@@ -1025,7 +1027,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 		if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
 			err("Bad frame size from guest, resetting connection");
 			tap_sock_reset(c);
-			return;
+			return false;
 		}
 
 		if (l2len + sizeof(uint32_t) > (size_t)n)
@@ -1045,6 +1047,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
 	partial_frame = p;
 
 	tap_handler(c, now);
+
+	return true;
 }
 
 /**
@@ -1061,16 +1065,20 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
 		return;
 	}
 
-	if (events & EPOLLIN)
-		tap_passt_input(c, now);
+	if (events & EPOLLIN) {
+		while (tap_passt_input(c, now))
+			;
+	}
 }
 
 /**
  * tap_pasta_input() - Handler for new data on the socket to hypervisor
  * @c:		Execution context
  * @now:	Current timestamp
+ *
+ * Return: true if there may be additional data to read, false otherwise
  */
-static void tap_pasta_input(struct ctx *c, const struct timespec *now)
+static bool tap_pasta_input(struct ctx *c, const struct timespec *now)
 {
 	ssize_t n, len;
 
@@ -1102,6 +1110,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
 	}
 
 	tap_handler(c, now);
+
+	return len > 0;
 }
 
 /**
@@ -1116,8 +1126,10 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
 	if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
 		die("Disconnect event on /dev/net/tun device, exiting");
 
-	if (events & EPOLLIN)
-		tap_pasta_input(c, now);
+	if (events & EPOLLIN) {
+		while (tap_pasta_input(c, now))
+			;
+	}
 }
 
 /**
@@ -1251,7 +1263,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
 	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
@@ -1307,7 +1319,7 @@ static void tap_sock_tun_init(struct ctx *c)
 	pasta_ns_conf(c);
 
 	ref.fd = c->fd_tap;
-	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
@@ -1340,7 +1352,7 @@ void tap_sock_init(struct ctx *c)
 		else
 			ref.type = EPOLL_TYPE_TAP_PASTA;
 
-		ev.events = EPOLLIN | EPOLLRDHUP;
+		ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
 		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
-- 
2.46.0


  parent reply	other threads:[~2024-09-13  4:32 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-13  4:32 [PATCH v2 00/10] RFC: Clean up TCP epoll mask handling David Gibson
2024-09-13  4:32 ` [PATCH v2 01/10] tcp: Make some extra functions private David Gibson
2024-09-13  4:32 ` [PATCH v2 02/10] tcp: Clean up tcpi_snd_wnd probing David Gibson
2024-09-17 21:54   ` Stefano Brivio
2024-09-18  1:27     ` David Gibson
2024-09-13  4:32 ` [PATCH v2 03/10] tcp: Simplify ifdef logic in tcp_update_seqack_wnd() David Gibson
2024-09-17 21:54   ` Stefano Brivio
2024-09-18  1:31     ` David Gibson
2024-09-13  4:32 ` [PATCH v2 04/10] tcp: Make tcp_update_seqack_wnd()s force_seq parameter explicitly boolean David Gibson
2024-09-13  4:32 ` [PATCH v2 05/10] tcp: On socket EPOLLOUT, send new ACK to tap immediately David Gibson
2024-09-13  4:32 ` David Gibson [this message]
2024-09-13  4:32 ` [PATCH v2 07/10] tap: Keep track of whether there might be space in the tap buffers David Gibson
2024-09-13  4:32 ` [PATCH v2 08/10] tcp: Keep track of connections blocked due to a full tap interface David Gibson
2024-09-13  4:32 ` [PATCH v2 09/10] tcp: Move deferred handling functions later in tcp.c David Gibson
2024-09-13  4:32 ` [PATCH v2 10/10] tcp: Simplify epoll event mask management David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240913043214.1753014-7-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).