public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v2 11/13] epoll: Split handling of listening TCP sockets into their own handler
Date: Thu, 10 Aug 2023 12:33:13 +1000	[thread overview]
Message-ID: <20230810023315.684784-12-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20230810023315.684784-1-david@gibson.dropbear.id.au>

tcp_sock_handler() handles both listening and connected TCP sockets, but
what it needs to do in those two cases has essentially nothing in common.
Therefore, give listening sockets their own epoll_type value and dispatch
directly to their own handler from the top level.  Furthermore, the two
handlers need almost entirely different information from the reference:
we re-(ab)used the index field in the tcp_epoll_ref to indicate the port
for the listening socket, but that's not the same meaning.  So, switch
listening sockets to their own reference type which we can lay out as we
please.  That lets us remove the listen and outbound fields from the
normal (connected) tcp_epoll_ref, reducing it to just the connection table
index.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c      |  8 ++++++--
 passt.h      |  8 ++++++--
 tcp.c        | 51 ++++++++++++++++++++++-----------------------------
 tcp.h        | 25 +++++++++++++++++--------
 tcp_splice.c |  4 ++--
 tcp_splice.h |  2 +-
 util.c       |  2 +-
 7 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/passt.c b/passt.c
index 57ef767..2a207b4 100644
--- a/passt.c
+++ b/passt.c
@@ -56,7 +56,8 @@
 char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
 
 char *epoll_type_str[EPOLL_TYPE_MAX + 1] = {
-	[EPOLL_TYPE_TCP]	= "TCP socket",
+	[EPOLL_TYPE_TCP]	= "connected TCP socket",
+	[EPOLL_TYPE_TCP_LISTEN]	= "listening TCP socket",
 	[EPOLL_TYPE_TCP_TIMER]	= "TCP timer",
 	[EPOLL_TYPE_UDP]	= "UDP socket",
 	[EPOLL_TYPE_ICMP]	= "ICMP socket",
@@ -323,7 +324,10 @@ loop:
 			break;
 		case EPOLL_TYPE_TCP:
 			if (!c.no_tcp)
-				tcp_sock_handler(&c, ref, eventmask, &now);
+				tcp_sock_handler(&c, ref, eventmask);
+			break;
+		case EPOLL_TYPE_TCP_LISTEN:
+			tcp_listen_handler(&c, ref, &now);
 			break;
 		case EPOLL_TYPE_TCP_TIMER:
 			tcp_timer_handler(&c, ref);
diff --git a/passt.h b/passt.h
index fc1efdb..a625d48 100644
--- a/passt.h
+++ b/passt.h
@@ -47,8 +47,10 @@ union epoll_ref;
 enum epoll_type {
 	/* Special value to indicate an invalid type */
 	EPOLL_TYPE_NONE = 0,
-	/* TCP sockets */
+	/* Connected TCP sockets */
 	EPOLL_TYPE_TCP,
+	/* Listening TCP sockets */
+	EPOLL_TYPE_TCP_LISTEN,
 	/* timerfds used for TCP timers */
 	EPOLL_TYPE_TCP_TIMER,
 	/* UDP sockets */
@@ -69,7 +71,8 @@ enum epoll_type {
  * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
  * @type:	Type of fd (tells us what to do with events)
  * @fd:		File descriptor number (implies < 2^24 total descriptors)
- * @tcp:	TCP-specific reference part
+ * @tcp:	TCP-specific reference part (connected sockets)
+ * @tcp_listen:	TCP-specific reference part (listening sockets)
  * @udp:	UDP-specific reference part
  * @icmp:	ICMP-specific reference part
  * @data:	Data handled by protocol handlers
@@ -83,6 +86,7 @@ union epoll_ref {
 		int32_t		fd:FD_REF_BITS;
 		union {
 			union tcp_epoll_ref tcp;
+			union tcp_listen_epoll_ref tcp_listen;
 			union udp_epoll_ref udp;
 			union icmp_epoll_ref icmp;
 			uint32_t data;
diff --git a/tcp.c b/tcp.c
index 98761a2..0322842 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2735,7 +2735,8 @@ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr)
  * @sa:		Peer socket address (from accept())
  * @now:	Current timestamp
  */
-static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
+static void tcp_tap_conn_from_sock(struct ctx *c,
+				   union tcp_listen_epoll_ref ref,
 				   struct tcp_tap_conn *conn, int s,
 				   struct sockaddr *sa,
 				   const struct timespec *now)
@@ -2747,7 +2748,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	conn_event(c, conn, SOCK_ACCEPTED);
 
 	inany_from_sockaddr(&conn->addr, &conn->sock_port, sa);
-	conn->tap_port = ref.tcp.index;
+	conn->tap_port = ref.port;
 
 	tcp_snat_inbound(c, &conn->addr);
 
@@ -2765,22 +2766,20 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * tcp_conn_from_sock() - Handle new connection request from listening socket
+ * tcp_listen_handler() - Handle new connection request from listening socket
  * @c:		Execution context
  * @ref:	epoll reference of listening socket
  * @now:	Current timestamp
  */
-static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
-			       const struct timespec *now)
+void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+			const struct timespec *now)
 {
 	struct sockaddr_storage sa;
 	union tcp_conn *conn;
 	socklen_t sl;
 	int s;
 
-	ASSERT(ref.tcp.listen);
-
-	if (c->tcp.conn_count >= TCP_MAX_CONNS)
+	if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
 	sl = sizeof(sa);
@@ -2796,11 +2795,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	conn = tc + c->tcp.conn_count++;
 
 	if (c->mode == MODE_PASTA &&
-	    tcp_splice_conn_from_sock(c, ref, &conn->splice,
+	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &conn->splice,
 				      s, (struct sockaddr *)&sa))
 		return;
 
-	tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
+	tcp_tap_conn_from_sock(c, ref.tcp_listen, &conn->tap, s,
 			       (struct sockaddr *)&sa, now);
 }
 
@@ -2926,19 +2925,10 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
- * @now:	Current timestamp
  */
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now)
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 {
-	union tcp_conn *conn;
-
-	if (ref.tcp.listen) {
-		tcp_conn_from_sock(c, ref, now);
-		return;
-	}
-
-	conn = tc + ref.tcp.index;
+	union tcp_conn *conn = tc + ref.tcp.index;
 
 	if (conn->c.spliced)
 		tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
@@ -2959,8 +2949,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
 			    const struct in_addr *addr, const char *ifname)
 {
-	in_port_t idx = port + c->tcp.fwd_in.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .index = idx };
+	union tcp_listen_epoll_ref tref = {
+		.port = port + c->tcp.fwd_in.delta[port],
+	};
 	int s;
 
 	s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32);
@@ -3019,9 +3010,10 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
  */
 static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 {
-	in_port_t idx = port + c->tcp.fwd_out.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
-				     .index = idx };
+	union tcp_listen_epoll_ref tref = {
+		.port = port + c->tcp.fwd_out.delta[port],
+		.ns = true,
+	};
 	struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
 	int s;
 
@@ -3044,9 +3036,10 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
  */
 static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
 {
-	in_port_t idx = port + c->tcp.fwd_out.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
-				     .index = idx };
+	union tcp_listen_epoll_ref tref = {
+		.port = port + c->tcp.fwd_out.delta[port],
+		.ns = true,
+	};
 	int s;
 
 	ASSERT(c->mode == MODE_PASTA);
diff --git a/tcp.h b/tcp.h
index 8eb7782..be296ec 100644
--- a/tcp.h
+++ b/tcp.h
@@ -14,8 +14,9 @@
 struct ctx;
 
 void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now);
+void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+			const struct timespec *now);
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
 int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 		    const struct pool *p, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
@@ -30,16 +31,24 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
 
 /**
  * union tcp_epoll_ref - epoll reference portion for TCP connections
- * @listen:		Set if this file descriptor is a listening socket
- * @outbound:		Listening socket maps to outbound, spliced connection
- * @index:		Index of connection in table, or port for bound sockets
+ * @index:		Index of connection in table
  * @u32:		Opaque u32 value of reference
  */
 union tcp_epoll_ref {
+	uint32_t index:20;
+	uint32_t u32;
+};
+
+/**
+ * union tcp_listen_epoll_ref - epoll reference portion for TCP listening
+ * @port:	Port number we're forwarding *to* (listening port plus delta)
+ * @ns:		True if listening within the pasta namespace
+ * @u32:	Opaque u32 value of reference
+ */
+union tcp_listen_epoll_ref {
 	struct {
-		uint32_t	listen:1,
-				outbound:1,
-				index:20;
+		in_port_t	port;
+		bool		ns;
 	};
 	uint32_t u32;
 };
diff --git a/tcp_splice.c b/tcp_splice.c
index 24995e2..64c1263 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -486,7 +486,7 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
  * Return: true if able to create a spliced connection, false otherwise
  * #syscalls:pasta setsockopt
  */
-bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
+bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
 			       struct tcp_splice_conn *conn, int s,
 			       const struct sockaddr *sa)
 {
@@ -516,7 +516,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	c->tcp.splice_conn_count++;
 	conn->a = s;
 
-	if (tcp_splice_new(c, conn, ref.tcp.index, ref.tcp.outbound))
+	if (tcp_splice_new(c, conn, ref.port, ref.ns))
 		conn_flag(c, conn, CLOSING);
 
 	return true;
diff --git a/tcp_splice.h b/tcp_splice.h
index 99dbac8..e7a583a 100644
--- a/tcp_splice.h
+++ b/tcp_splice.h
@@ -10,7 +10,7 @@ struct tcp_splice_conn;
 
 void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn,
 			     int s, uint32_t events);
-bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
+bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
 			       struct tcp_splice_conn *conn, int s,
 			       const struct sockaddr *sa);
 void tcp_splice_init(struct ctx *c);
diff --git a/util.c b/util.c
index 2cac7ba..d965f48 100644
--- a/util.c
+++ b/util.c
@@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 
 	switch (proto) {
 	case IPPROTO_TCP:
-		ref.type = EPOLL_TYPE_TCP;
+		ref.type = EPOLL_TYPE_TCP_LISTEN;
 		break;
 	case IPPROTO_UDP:
 		ref.type = EPOLL_TYPE_UDP;
-- 
2.41.0


  parent reply	other threads:[~2023-08-10  2:33 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-10  2:33 [PATCH v2 00/13] Clean up to tap errors and epoll dispatch David Gibson
2023-08-10  2:33 ` [PATCH v2 01/13] tap: Clean up tap reset path David Gibson
2023-08-10  2:33 ` [PATCH v2 02/13] tap: Clean up behaviour for errors on listening Unix socket David Gibson
2023-08-10  2:33 ` [PATCH v2 03/13] tap: Fold reset handling into tap_handler_pasta() David Gibson
2023-08-10  2:33 ` [PATCH v2 04/13] tap: Fold reset handling into tap_handler_passt() David Gibson
2023-08-10 19:49   ` Stefano Brivio
2023-08-11  3:07     ` David Gibson
2023-08-10  2:33 ` [PATCH v2 05/13] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
2023-08-10  2:33 ` [PATCH v2 06/13] epoll: Always use epoll_ref for the epoll data variable David Gibson
2023-08-10  2:33 ` [PATCH v2 07/13] epoll: Fold sock_handler into general switch on epoll event fd David Gibson
2023-08-10 19:49   ` Stefano Brivio
2023-08-11  3:11     ` David Gibson
2023-08-10  2:33 ` [PATCH v2 08/13] epoll: Split handling of ICMP and ICMPv6 sockets David Gibson
2023-08-10  2:33 ` [PATCH v2 09/13] epoll: Tiny cleanup to udp_sock_handler() David Gibson
2023-08-10  2:33 ` [PATCH v2 10/13] epoll: Split handling of TCP timerfds into its own handler function David Gibson
2023-08-10  2:33 ` David Gibson [this message]
2023-08-10  2:33 ` [PATCH v2 12/13] epoll: Split listening Unix domain socket into its own type David Gibson
2023-08-10  2:33 ` [PATCH v2 13/13] epoll: Use different epoll types for passt and pasta tap fds David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230810023315.684784-12-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).