public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 0/9] Clean up to epoll dispatch
@ 2023-08-07 13:46 David Gibson
  2023-08-07 13:46 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
                   ` (8 more replies)
  0 siblings, 9 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Getting from an epoll event to the relevant handler function is
currently several levels of functions and tests.  This series
simplifies this to be pretty close to a single switch on a value in
the epoll ref dispatching directly to the appropriate handler.

Based on the tap reset series.

David Gibson (9):
  epoll: Generalize epoll_ref to cover things other than sockets
  epoll: Always use epoll_ref for the epoll data variable
  epoll: Fold sock_handler into general switch on epoll event fd
  epoll: Split handling of ICMP and ICMPv6 sockets
  epoll: Tiny cleanup to udp_sock_handler()
  epoll: Split handling of TCP timerfds into its own handler function
  epoll: Split handling of listening TCP sockets into their own handler
  epoll: Split listening Unix domain socket into its own type
  epoll: Use different epoll types for passt and pasta tap fds

 icmp.c       | 118 +++++++++++++++++++++++++++++----------------------
 icmp.h       |   9 ++--
 passt.c      |  90 +++++++++++++++++++++++----------------
 passt.h      |  52 ++++++++++++++++++-----
 pasta.c      |   8 +++-
 tap.c        |  63 +++++++++++++--------------
 tap.h        |   7 ++-
 tcp.c        |  64 ++++++++++------------------
 tcp.h        |  11 +++--
 tcp_conn.h   |   4 +-
 tcp_splice.c |   4 +-
 udp.c        |  16 +++----
 util.c       |  27 +++++++++---
 13 files changed, 263 insertions(+), 210 deletions(-)

-- 
2.41.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-09 19:59   ` Stefano Brivio
  2023-08-07 13:46 ` [PATCH 2/9] epoll: Always use epoll_ref for the epoll data variable David Gibson
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

The epoll_ref type includes fields for the IP protocol of a socket, and the
socket fd.  However, we already have a few things in the epoll which aren't
protocol sockets, and we may have more in future.  Rename these fields to
an abstract "fd type" and file descriptor for more generality.

Similarly, rather than using existing IP protocol numbers for the type,
introduce our own number space.  For now these just correspond to the
supported protocols, but we'll expand on that in future.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 icmp.c       |  6 +++---
 passt.c      | 25 ++++++++++++-------------
 passt.h      | 40 +++++++++++++++++++++++++++++-----------
 tcp.c        | 22 +++++++++++-----------
 tcp_conn.h   |  4 ++--
 tcp_splice.c |  4 ++--
 udp.c        | 14 +++++++-------
 util.c       | 27 ++++++++++++++++++++-------
 8 files changed, 86 insertions(+), 56 deletions(-)

diff --git a/icmp.c b/icmp.c
index 676fa64..a4b6a47 100644
--- a/icmp.c
+++ b/icmp.c
@@ -79,7 +79,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
 	(void)events;
 	(void)now;
 
-	n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
+	n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
 	if (n < 0)
 		return;
 
@@ -182,7 +182,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
 				    bind_if, id, iref.u32);
 			if (s < 0)
 				goto fail_sock;
-			if (s > SOCKET_MAX) {
+			if (s > FD_REF_MAX) {
 				close(s);
 				return 1;
 			}
@@ -236,7 +236,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
 				    bind_if, id, iref.u32);
 			if (s < 0)
 				goto fail_sock;
-			if (s > SOCKET_MAX) {
+			if (s > FD_REF_MAX) {
 				close(s);
 				return 1;
 			}
diff --git a/passt.c b/passt.c
index 9123868..b42f42d 100644
--- a/passt.c
+++ b/passt.c
@@ -55,12 +55,11 @@
 
 char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
 
-char *ip_proto_str[IPPROTO_SCTP + 1] = {
-	[IPPROTO_ICMP]		= "ICMP",
-	[IPPROTO_TCP]		= "TCP",
-	[IPPROTO_UDP]		= "UDP",
-	[IPPROTO_ICMPV6]	= "ICMPV6",
-	[IPPROTO_SCTP]		= "SCTP",
+char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
+	[EPOLL_TYPE_TCP]	= "TCP socket",
+	[EPOLL_TYPE_UDP]	= "UDP socket",
+	[EPOLL_TYPE_ICMP]	= "ICMP socket",
+	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
 };
 
 /**
@@ -73,16 +72,16 @@ char *ip_proto_str[IPPROTO_SCTP + 1] = {
 static void sock_handler(struct ctx *c, union epoll_ref ref,
 			 uint32_t events, const struct timespec *now)
 {
-	trace("%s: %s packet from socket %i (events: 0x%08x)",
+	trace("%s: packet from %s %i (events: 0x%08x)",
 	      c->mode == MODE_PASST ? "passt" : "pasta",
-	      IP_PROTO_STR(ref.proto), ref.s, events);
+	      EPOLL_TYPE_STR(ref.type), ref.fd, events);
 
-	if (!c->no_tcp && ref.proto == IPPROTO_TCP)
-		tcp_sock_handler( c, ref, events, now);
-	else if (!c->no_udp && ref.proto == IPPROTO_UDP)
-		udp_sock_handler( c, ref, events, now);
+	if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP)
+		tcp_sock_handler(c, ref, events, now);
+	else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP)
+		udp_sock_handler(c, ref, events, now);
 	else if (!c->no_icmp &&
-		 (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6))
+		 (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6))
 		icmp_sock_handler(c, ref, events, now);
 }
 
diff --git a/passt.h b/passt.h
index edc4841..2110781 100644
--- a/passt.h
+++ b/passt.h
@@ -42,9 +42,27 @@ union epoll_ref;
 #include "udp.h"
 
 /**
- * union epoll_ref - Breakdown of reference for epoll socket bookkeeping
- * @proto:	IP protocol number
- * @s:		Socket number (implies 2^24-1 limit on number of descriptors)
+ * enum epoll_type - Different types of fds we poll over
+ */
+enum epoll_type {
+	/* Special value to indicate an invalid type */
+	EPOLL_TYPE_NONE = 0,
+	/* Sockets and timerfds for TCP handling */
+	EPOLL_TYPE_TCP,
+	/* UDP sockets */
+	EPOLL_TYPE_UDP,
+	/* IPv4 ICMP sockets */
+	EPOLL_TYPE_ICMP,
+	/* ICMPv6 sockets */
+	EPOLL_TYPE_ICMPV6,
+
+	EPOLL_TYPE_MAX = EPOLL_TYPE_ICMPV6,
+};
+
+/**
+ * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
+ * @type:	Type of fd (tells us what to do with events)
+ * @fd:		File descriptor number (implies < 2^24 total descriptors)
  * @tcp:	TCP-specific reference part
  * @udp:	UDP-specific reference part
  * @icmp:	ICMP-specific reference part
@@ -53,10 +71,10 @@ union epoll_ref;
  */
 union epoll_ref {
 	struct {
-		int32_t		proto:8,
-#define SOCKET_REF_BITS		24
-#define SOCKET_MAX		MAX_FROM_BITS(SOCKET_REF_BITS)
-				s:SOCKET_REF_BITS;
+		enum epoll_type type:8;
+#define FD_REF_BITS		24
+#define FD_REF_MAX		MAX_FROM_BITS(FD_REF_BITS)
+		int32_t		fd:FD_REF_BITS;
 		union {
 			union tcp_epoll_ref tcp;
 			union udp_epoll_ref udp;
@@ -78,10 +96,10 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
 #define PKT_BUF_BYTES		MAX(TAP_BUF_BYTES, 0)
 extern char pkt_buf		[PKT_BUF_BYTES];
 
-extern char *ip_proto_str[];
-#define IP_PROTO_STR(n)							\
-	(((uint8_t)(n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ?		\
-			  ip_proto_str[(n)] : "?")
+extern char *epoll_type_str[];
+#define EPOLL_TYPE_STR(n)						\
+	(((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ?	\
+	                                    epoll_type_str[(n)] : "?")
 
 #include <resolv.h>	/* For MAXNS below */
 
diff --git a/tcp.c b/tcp.c
index d9ecee5..18b781a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -643,7 +643,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
-	union epoll_ref ref = { .proto = IPPROTO_TCP, .s = conn->sock,
+	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
 				.tcp.index = CONN_IDX(conn) };
 	struct epoll_event ev = { .data.u64 = ref.u64 };
 
@@ -663,8 +663,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	conn->c.in_epoll = true;
 
 	if (conn->timer != -1) {
-		union epoll_ref ref_t = { .proto = IPPROTO_TCP,
-					  .s = conn->sock,
+		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP,
+					  .fd = conn->sock,
 					  .tcp.timer = 1,
 					  .tcp.index = CONN_IDX(conn) };
 		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
@@ -692,8 +692,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 
 	if (conn->timer == -1) {
-		union epoll_ref ref = { .proto = IPPROTO_TCP,
-					.s = conn->sock,
+		union epoll_ref ref = { .type = EPOLL_TYPE_TCP,
+					.fd = conn->sock,
 					.tcp.timer = 1,
 					.tcp.index = CONN_IDX(conn) };
 		struct epoll_event ev = { .data.u64 = ref.u64,
@@ -701,7 +701,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		int fd;
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
-		if (fd == -1 || fd > SOCKET_MAX) {
+		if (fd == -1 || fd > FD_REF_MAX) {
 			debug("TCP: failed to get timer: %s", strerror(errno));
 			if (fd > -1)
 				close(fd);
@@ -1908,7 +1908,7 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 
 	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
 
-	if (s > SOCKET_MAX) {
+	if (s > FD_REF_MAX) {
 		close(s);
 		return -EIO;
 	}
@@ -2791,7 +2791,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	 * https://github.com/llvm/llvm-project/issues/58992
 	 */
 	memset(&sa, 0, sizeof(struct sockaddr_in6));
-	s = accept4(ref.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
+	s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
 	if (s < 0)
 		return;
 
@@ -2948,7 +2948,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 	conn = tc + ref.tcp.index;
 
 	if (conn->c.spliced)
-		tcp_splice_sock_handler(c, &conn->splice, ref.s, events);
+		tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
 	else
 		tcp_tap_sock_handler(c, &conn->tap, events);
 }
@@ -2999,7 +2999,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
 int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
 		  const char *ifname, in_port_t port)
 {
-	int r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
+	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
 
 	if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
 		/* Attempt to get a dual stack socket */
@@ -3013,7 +3013,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
 		r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
 
-	if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
+	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
 		return 0;
 
 	return r4 < 0 ? r4 : r6;
diff --git a/tcp_conn.h b/tcp_conn.h
index 9e2b1bf..0b36940 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -62,7 +62,7 @@ struct tcp_tap_conn {
 	unsigned int	ws_to_tap	:TCP_WS_BITS;
 
 
-	int		sock		:SOCKET_REF_BITS;
+	int		sock		:FD_REF_BITS;
 
 	uint8_t		events;
 #define CLOSED			0
@@ -80,7 +80,7 @@ struct tcp_tap_conn {
 	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
 
 
-	int		timer		:SOCKET_REF_BITS;
+	int		timer		:FD_REF_BITS;
 
 	uint8_t		flags;
 #define STALLED			BIT(0)
diff --git a/tcp_splice.c b/tcp_splice.c
index 03e14c1..24995e2 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -173,9 +173,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 				struct tcp_splice_conn *conn)
 {
 	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
-	union epoll_ref ref_a = { .proto = IPPROTO_TCP, .s = conn->a,
+	union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a,
 				  .tcp.index = CONN_IDX(conn) };
-	union epoll_ref ref_b = { .proto = IPPROTO_TCP, .s = conn->b,
+	union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b,
 				  .tcp.index = CONN_IDX(conn) };
 	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
 	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
diff --git a/udp.c b/udp.c
index 5a852fb..62f8360 100644
--- a/udp.c
+++ b/udp.c
@@ -388,9 +388,9 @@ static void udp_sock6_iov_init(const struct ctx *c)
 int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
 {
 	struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
-	union epoll_ref ref = { .proto = IPPROTO_UDP,
+	union epoll_ref ref = { .type = EPOLL_TYPE_UDP,
 				.udp = { .splice = true, .ns = ns,
-					     .v6 = v6, .port = src }
+					 .v6 = v6, .port = src }
 			      };
 	struct udp_splice_port *sp;
 	int act, s;
@@ -406,7 +406,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
 	s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK,
 		   IPPROTO_UDP);
 
-	if (s > SOCKET_MAX) {
+	if (s > FD_REF_MAX) {
 		close(s);
 		return -EIO;
 	}
@@ -414,7 +414,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
 	if (s < 0)
 		return s;
 
-	ref.s = s;
+	ref.fd = s;
 
 	if (v6) {
 		struct sockaddr_in6 addr6 = {
@@ -767,7 +767,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		udp4_localname.sin_port = htons(dstport);
 	}
 
-	n = recvmmsg(ref.s, mmh_recv, n, 0, NULL);
+	n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL);
 	if (n <= 0)
 		return;
 
@@ -980,7 +980,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		  const void *addr, const char *ifname, in_port_t port)
 {
 	union udp_epoll_ref uref = { .u32 = 0 };
-	int s, r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
+	int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
 
 	if (ns) {
 		uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]);
@@ -1030,7 +1030,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		}
 	}
 
-	if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
+	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
 		return 0;
 
 	return r4 < 0 ? r4 : r6;
diff --git a/util.c b/util.c
index 019c56c..2cac7ba 100644
--- a/util.c
+++ b/util.c
@@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 	    const void *bind_addr, const char *ifname, uint16_t port,
 	    uint32_t data)
 {
-	union epoll_ref ref = { .proto = proto, .data = data };
+	union epoll_ref ref = { .data = data };
 	struct sockaddr_in addr4 = {
 		.sin_family = AF_INET,
 		.sin_port = htons(port),
@@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 	int fd, sl, y = 1, ret;
 	struct epoll_event ev;
 
-	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
-	    proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
+	switch (proto) {
+	case IPPROTO_TCP:
+		ref.type = EPOLL_TYPE_TCP;
+		break;
+	case IPPROTO_UDP:
+		ref.type = EPOLL_TYPE_UDP;
+		break;
+	case IPPROTO_ICMP:
+		ref.type = EPOLL_TYPE_ICMP;
+		break;
+	case IPPROTO_ICMPV6:
+		ref.type = EPOLL_TYPE_ICMPV6;
+		break;
+	default:
 		return -EPFNOSUPPORT;	/* Not implemented. */
+	}
 
 	if (af == AF_UNSPEC) {
 		if (!DUAL_STACK_SOCKETS || bind_addr)
@@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 		return ret;
 	}
 
-	if (fd > SOCKET_MAX) {
+	if (fd > FD_REF_MAX) {
 		close(fd);
 		return -EBADF;
 	}
 
-	ref.s = fd;
+	ref.fd = fd;
 
 	if (af == AF_INET) {
 		if (bind_addr)
@@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 		if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 			       ifname, strlen(ifname))) {
 			ret = -errno;
-			warn("Can't bind socket for %s port %u to %s, closing",
-			     ip_proto_str[proto], port, ifname);
+			warn("Can't bind %s socket for port %u to %s, closing",
+			     EPOLL_TYPE_STR(proto), port, ifname);
 			close(fd);
 			return ret;
 		}
-- 
@@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 	    const void *bind_addr, const char *ifname, uint16_t port,
 	    uint32_t data)
 {
-	union epoll_ref ref = { .proto = proto, .data = data };
+	union epoll_ref ref = { .data = data };
 	struct sockaddr_in addr4 = {
 		.sin_family = AF_INET,
 		.sin_port = htons(port),
@@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 	int fd, sl, y = 1, ret;
 	struct epoll_event ev;
 
-	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
-	    proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
+	switch (proto) {
+	case IPPROTO_TCP:
+		ref.type = EPOLL_TYPE_TCP;
+		break;
+	case IPPROTO_UDP:
+		ref.type = EPOLL_TYPE_UDP;
+		break;
+	case IPPROTO_ICMP:
+		ref.type = EPOLL_TYPE_ICMP;
+		break;
+	case IPPROTO_ICMPV6:
+		ref.type = EPOLL_TYPE_ICMPV6;
+		break;
+	default:
 		return -EPFNOSUPPORT;	/* Not implemented. */
+	}
 
 	if (af == AF_UNSPEC) {
 		if (!DUAL_STACK_SOCKETS || bind_addr)
@@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 		return ret;
 	}
 
-	if (fd > SOCKET_MAX) {
+	if (fd > FD_REF_MAX) {
 		close(fd);
 		return -EBADF;
 	}
 
-	ref.s = fd;
+	ref.fd = fd;
 
 	if (af == AF_INET) {
 		if (bind_addr)
@@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 		if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 			       ifname, strlen(ifname))) {
 			ret = -errno;
-			warn("Can't bind socket for %s port %u to %s, closing",
-			     ip_proto_str[proto], port, ifname);
+			warn("Can't bind %s socket for port %u to %s, closing",
+			     EPOLL_TYPE_STR(proto), port, ifname);
 			close(fd);
 			return ret;
 		}
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/9] epoll: Always use epoll_ref for the epoll data variable
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
  2023-08-07 13:46 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-07 13:46 ` [PATCH 3/9] epoll: Fold sock_handler into general switch on epoll event fd David Gibson
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

epoll_ref contains a variety of information useful when handling epoll
events on our sockets, and we place it in the epoll_event data field
returned by epoll.  However, for a few other things we use the 'fd' field
in the standard union of types for that data field.

This actually introduces a bug which is vanishingly unlikely to hit in
practice, but very nasty if it ever did: theoretically if we had a very
large file descriptor number for fd_tap or fd_tap_listen it could overflow
into bits that overlap with the 'proto' field in epoll_ref.  With some
very bad luck this could mean that we mistakenly think an event on a
regular socket is an event on fd_tap or fd_tap_listen.

More practically, using different (but overlapping) fields of the
epoll_data means we can't unify dispatch for the various different objects
in the epoll.  Therefore use the same epoll_ref as the data for the tap
fds and the netns quit fd, adding new fd type values to describe them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 11 ++++++-----
 passt.h |  6 +++++-
 pasta.c |  8 ++++++--
 tap.c   | 16 ++++++++++++----
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/passt.c b/passt.c
index b42f42d..195fccf 100644
--- a/passt.c
+++ b/passt.c
@@ -60,6 +60,8 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_UDP]	= "UDP socket",
 	[EPOLL_TYPE_ICMP]	= "ICMP socket",
 	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
+	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
+	[EPOLL_TYPE_TAP]	= "tap device",
 };
 
 /**
@@ -328,12 +330,11 @@ loop:
 
 	for (i = 0; i < nfds; i++) {
 		union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
-		int fd = events[i].data.fd;
 
-		if (fd == c.fd_tap || fd == c.fd_tap_listen)
-			tap_handler(&c, fd, events[i].events, &now);
-		else if (fd == quit_fd)
-			pasta_netns_quit_handler(&c, fd);
+		if (ref.type == EPOLL_TYPE_TAP)
+			tap_handler(&c, ref.fd, events[i].events, &now);
+		else if (ref.type == EPOLL_TYPE_NSQUIT)
+			pasta_netns_quit_handler(&c, quit_fd);
 		else
 			sock_handler(&c, ref, events[i].events, &now);
 	}
diff --git a/passt.h b/passt.h
index 2110781..8878a11 100644
--- a/passt.h
+++ b/passt.h
@@ -55,8 +55,12 @@ enum epoll_type {
 	EPOLL_TYPE_ICMP,
 	/* ICMPv6 sockets */
 	EPOLL_TYPE_ICMPV6,
+	/* inotify fd watching for end of netns (pasta) */
+	EPOLL_TYPE_NSQUIT,
+	/* tap char device, or qemu socket fd */
+	EPOLL_TYPE_TAP,
 
-	EPOLL_TYPE_MAX = EPOLL_TYPE_ICMPV6,
+	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
 };
 
 /**
diff --git a/pasta.c b/pasta.c
index 1e84680..dbe8e8c 100644
--- a/pasta.c
+++ b/pasta.c
@@ -365,7 +365,10 @@ void pasta_ns_conf(struct ctx *c)
 int pasta_netns_quit_init(struct ctx *c)
 {
 	int flags = O_NONBLOCK | O_CLOEXEC;
-	struct epoll_event ev = { .events = EPOLLIN };
+	union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT };
+	struct epoll_event ev = {
+		.events = EPOLLIN
+	};
 	int inotify_fd;
 
 	if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
@@ -381,7 +384,8 @@ int pasta_netns_quit_init(struct ctx *c)
 		return -1;
 	}
 
-	ev.data.fd = inotify_fd;
+	ref.fd = inotify_fd;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, inotify_fd, &ev);
 
 	return inotify_fd;
diff --git a/tap.c b/tap.c
index c33c6c7..ad0decf 100644
--- a/tap.c
+++ b/tap.c
@@ -1076,6 +1076,7 @@ restart:
 static void tap_sock_unix_init(struct ctx *c)
 {
 	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 	struct sockaddr_un addr = {
 		.sun_family = AF_UNIX,
@@ -1128,8 +1129,9 @@ static void tap_sock_unix_init(struct ctx *c)
 
 	listen(fd, 0);
 
-	ev.data.fd = c->fd_tap_listen = fd;
+	ref.fd = c->fd_tap_listen = fd;
 	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
 
 	info("You can now start qemu (>= 7.2, with commit 13c6be96618c):");
@@ -1145,6 +1147,7 @@ static void tap_sock_unix_init(struct ctx *c)
  */
 static void tap_sock_unix_new(struct ctx *c)
 {
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
@@ -1181,8 +1184,9 @@ static void tap_sock_unix_new(struct ctx *c)
 	    setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
-	ev.data.fd = c->fd_tap;
+	ref.fd = c->fd_tap;
 	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
 
@@ -1227,6 +1231,7 @@ static int tap_ns_tun(void *arg)
  */
 static void tap_sock_tun_init(struct ctx *c)
 {
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 
 	NS_CALL(tap_ns_tun, c);
@@ -1235,8 +1240,9 @@ static void tap_sock_tun_init(struct ctx *c)
 
 	pasta_ns_conf(c);
 
-	ev.data.fd = c->fd_tap;
+	ref.fd = c->fd_tap;
 	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
 
@@ -1258,11 +1264,13 @@ void tap_sock_init(struct ctx *c)
 	}
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
+		union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 		struct epoll_event ev = { 0 };
 		ASSERT(c->one_off);
 
-		ev.data.fd = c->fd_tap;
+		ref.fd = c->fd_tap;
 		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
 	}
-- 
@@ -1076,6 +1076,7 @@ restart:
 static void tap_sock_unix_init(struct ctx *c)
 {
 	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 	struct sockaddr_un addr = {
 		.sun_family = AF_UNIX,
@@ -1128,8 +1129,9 @@ static void tap_sock_unix_init(struct ctx *c)
 
 	listen(fd, 0);
 
-	ev.data.fd = c->fd_tap_listen = fd;
+	ref.fd = c->fd_tap_listen = fd;
 	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
 
 	info("You can now start qemu (>= 7.2, with commit 13c6be96618c):");
@@ -1145,6 +1147,7 @@ static void tap_sock_unix_init(struct ctx *c)
  */
 static void tap_sock_unix_new(struct ctx *c)
 {
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
@@ -1181,8 +1184,9 @@ static void tap_sock_unix_new(struct ctx *c)
 	    setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
 		trace("tap: failed to set SO_SNDBUF to %i", v);
 
-	ev.data.fd = c->fd_tap;
+	ref.fd = c->fd_tap;
 	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
 
@@ -1227,6 +1231,7 @@ static int tap_ns_tun(void *arg)
  */
 static void tap_sock_tun_init(struct ctx *c)
 {
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
 
 	NS_CALL(tap_ns_tun, c);
@@ -1235,8 +1240,9 @@ static void tap_sock_tun_init(struct ctx *c)
 
 	pasta_ns_conf(c);
 
-	ev.data.fd = c->fd_tap;
+	ref.fd = c->fd_tap;
 	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.u64 = ref.u64;
 	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 }
 
@@ -1258,11 +1264,13 @@ void tap_sock_init(struct ctx *c)
 	}
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
+		union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 		struct epoll_event ev = { 0 };
 		ASSERT(c->one_off);
 
-		ev.data.fd = c->fd_tap;
+		ref.fd = c->fd_tap;
 		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
 		return;
 	}
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/9] epoll: Fold sock_handler into general switch on epoll event fd
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
  2023-08-07 13:46 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
  2023-08-07 13:46 ` [PATCH 2/9] epoll: Always use epoll_ref for the epoll data variable David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-07 13:46 ` [PATCH 4/9] epoll: Split handling of ICMP and ICMPv6 sockets David Gibson
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

When we process events from epoll_wait(), we check for a number of special
cases before calling sock_handler() which then dispatches based on the
protocol type of the socket in the event.

Fold these cases together into a single switch on the fd type recorded in
the epoll data field.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 54 +++++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/passt.c b/passt.c
index 195fccf..32ea2ca 100644
--- a/passt.c
+++ b/passt.c
@@ -64,29 +64,6 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_TAP]	= "tap device",
 };
 
-/**
- * sock_handler() - Event handler for L4 sockets
- * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events
- * @now:	Current timestamp
- */
-static void sock_handler(struct ctx *c, union epoll_ref ref,
-			 uint32_t events, const struct timespec *now)
-{
-	trace("%s: packet from %s %i (events: 0x%08x)",
-	      c->mode == MODE_PASST ? "passt" : "pasta",
-	      EPOLL_TYPE_STR(ref.type), ref.fd, events);
-
-	if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP)
-		tcp_sock_handler(c, ref, events, now);
-	else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP)
-		udp_sock_handler(c, ref, events, now);
-	else if (!c->no_icmp &&
-		 (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6))
-		icmp_sock_handler(c, ref, events, now);
-}
-
 /**
  * post_handler() - Run periodic and deferred tasks for L4 protocol handlers
  * @c:		Execution context
@@ -330,13 +307,36 @@ loop:
 
 	for (i = 0; i < nfds; i++) {
 		union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
+		uint32_t eventmask = events[i].events;
 
-		if (ref.type == EPOLL_TYPE_TAP)
+		trace("%s: epoll event on %s %i (events: 0x%08x)",
+		      c.mode == MODE_PASST ? "passt" : "pasta",
+		      EPOLL_TYPE_STR(ref.type), ref.fd, events);
+
+		switch (ref.type) {
+		case EPOLL_TYPE_TAP:
 			tap_handler(&c, ref.fd, events[i].events, &now);
-		else if (ref.type == EPOLL_TYPE_NSQUIT)
+			break;
+		case EPOLL_TYPE_NSQUIT:
 			pasta_netns_quit_handler(&c, quit_fd);
-		else
-			sock_handler(&c, ref, events[i].events, &now);
+			break;
+		case EPOLL_TYPE_TCP:
+			if (!c.no_tcp)
+				tcp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		case EPOLL_TYPE_UDP:
+			if (!c.no_udp)
+				udp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		case EPOLL_TYPE_ICMP:
+		case EPOLL_TYPE_ICMPV6:
+			if (!c.no_icmp)
+				icmp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		default:
+			/* Can't happen */
+			ASSERT(0);
+		}
 	}
 
 	post_handler(&c, &now);
-- 
@@ -64,29 +64,6 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_TAP]	= "tap device",
 };
 
-/**
- * sock_handler() - Event handler for L4 sockets
- * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events
- * @now:	Current timestamp
- */
-static void sock_handler(struct ctx *c, union epoll_ref ref,
-			 uint32_t events, const struct timespec *now)
-{
-	trace("%s: packet from %s %i (events: 0x%08x)",
-	      c->mode == MODE_PASST ? "passt" : "pasta",
-	      EPOLL_TYPE_STR(ref.type), ref.fd, events);
-
-	if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP)
-		tcp_sock_handler(c, ref, events, now);
-	else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP)
-		udp_sock_handler(c, ref, events, now);
-	else if (!c->no_icmp &&
-		 (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6))
-		icmp_sock_handler(c, ref, events, now);
-}
-
 /**
  * post_handler() - Run periodic and deferred tasks for L4 protocol handlers
  * @c:		Execution context
@@ -330,13 +307,36 @@ loop:
 
 	for (i = 0; i < nfds; i++) {
 		union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
+		uint32_t eventmask = events[i].events;
 
-		if (ref.type == EPOLL_TYPE_TAP)
+		trace("%s: epoll event on %s %i (events: 0x%08x)",
+		      c.mode == MODE_PASST ? "passt" : "pasta",
+		      EPOLL_TYPE_STR(ref.type), ref.fd, events);
+
+		switch (ref.type) {
+		case EPOLL_TYPE_TAP:
 			tap_handler(&c, ref.fd, events[i].events, &now);
-		else if (ref.type == EPOLL_TYPE_NSQUIT)
+			break;
+		case EPOLL_TYPE_NSQUIT:
 			pasta_netns_quit_handler(&c, quit_fd);
-		else
-			sock_handler(&c, ref, events[i].events, &now);
+			break;
+		case EPOLL_TYPE_TCP:
+			if (!c.no_tcp)
+				tcp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		case EPOLL_TYPE_UDP:
+			if (!c.no_udp)
+				udp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		case EPOLL_TYPE_ICMP:
+		case EPOLL_TYPE_ICMPV6:
+			if (!c.no_icmp)
+				icmp_sock_handler(&c, ref, eventmask, &now);
+			break;
+		default:
+			/* Can't happen */
+			ASSERT(0);
+		}
 	}
 
 	post_handler(&c, &now);
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/9] epoll: Split handling of ICMP and ICMPv6 sockets
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (2 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 3/9] epoll: Fold sock_handler into general switch on epoll event fd David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-07 13:46 ` [PATCH 5/9] epoll: Tiny cleanup to udp_sock_handler() David Gibson
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

We have different epoll type values for ICMP and ICMPv6 sockets, but they
both call the same handler function, icmp_sock_handler().  However that
function does essentially nothing in common for the two cases.  So, split
it into icmp_sock_handler() and icmpv6_sock_handler() and dispatch them
separately from the top level.

While we're there remove some parameters that the function was never using
anyway.  Also move the test for c->no_icmp into the functions, so that all
the logic specific to ICMP is within the handler, rather than in the top
level dispatch code.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 icmp.c  | 112 ++++++++++++++++++++++++++++++++------------------------
 icmp.h  |   9 ++---
 passt.c |   5 ++-
 3 files changed, 70 insertions(+), 56 deletions(-)

diff --git a/icmp.c b/icmp.c
index a4b6a47..b676a1a 100644
--- a/icmp.c
+++ b/icmp.c
@@ -60,78 +60,94 @@ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
 static uint8_t icmp_act[IP_VERSIONS][DIV_ROUND_UP(ICMP_NUM_IDS, 8)];
 
 /**
- * icmp_sock_handler() - Handle new data from socket
+ * icmp_sock_handler() - Handle new data from IPv4 ICMP socket
  * @c:		Execution context
  * @ref:	epoll reference
- * @events:	epoll events bitmap
- * @now:	Current timestamp, unused
  */
-void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
-		       uint32_t events, const struct timespec *now)
+void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
 {
-	union icmp_epoll_ref *iref = &ref.icmp;
-	struct sockaddr_storage sr;
-	socklen_t sl = sizeof(sr);
 	char buf[USHRT_MAX];
+	struct icmphdr *ih = (struct icmphdr *)buf;
+	struct sockaddr_in sr;
+	socklen_t sl = sizeof(sr);
 	uint16_t seq, id;
 	ssize_t n;
 
-	(void)events;
-	(void)now;
+	if (c->no_icmp)
+		return;
 
+	/* FIXME: Workaround clang-tidy not realizing that recvfrom()
+	 * writes the socket address.  See
+	 * https://github.com/llvm/llvm-project/issues/58992
+	 */
+	memset(&sr, 0, sizeof(sr));
 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
 	if (n < 0)
 		return;
 
-	if (iref->v6) {
-		struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
-		struct icmp6hdr *ih = (struct icmp6hdr *)buf;
+	id = ntohs(ih->un.echo.id);
+	seq = ntohs(ih->un.echo.sequence);
 
-		id = ntohs(ih->icmp6_identifier);
-		seq = ntohs(ih->icmp6_sequence);
+	if (id != ref.icmp.id)
+		ih->un.echo.id = htons(ref.icmp.id);
 
-		/* If bind() fails e.g. because of a broken SELinux policy, this
-		 * might happen. Fix up the identifier to match the sent one.
-		 */
-		if (id != iref->id)
-			ih->icmp6_identifier = htons(iref->id);
+	if (c->mode == MODE_PASTA) {
+		if (icmp_id_map[V4][id].seq == seq)
+			return;
 
-		/* In PASTA mode, we'll get any reply we send, discard them. */
-		if (c->mode == MODE_PASTA) {
-			if (icmp_id_map[V6][id].seq == seq)
-				return;
-
-			icmp_id_map[V6][id].seq = seq;
-		}
+		icmp_id_map[V4][id].seq = seq;
+	}
 
-		debug("ICMPv6: echo %s to tap, ID: %i, seq: %i",
-		      (ih->icmp6_type == 128) ? "request" : "reply", id, seq);
+	debug("ICMP: echo %s to tap, ID: %i, seq: %i",
+	      (ih->type == ICMP_ECHO) ? "request" : "reply", id, seq);
 
-		tap_icmp6_send(c, &sr6->sin6_addr,
-			       tap_ip6_daddr(c, &sr6->sin6_addr), buf, n);
-	} else {
-		struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
-		struct icmphdr *ih = (struct icmphdr *)buf;
+	tap_icmp4_send(c, sr.sin_addr, tap_ip4_daddr(c), buf, n);
+}
 
-		id = ntohs(ih->un.echo.id);
-		seq = ntohs(ih->un.echo.sequence);
+/**
+ * icmpv6_sock_handler() - Handle new data from ICMPv6 socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ */
+void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref)
+{
+	char buf[USHRT_MAX];
+	struct icmp6hdr *ih = (struct icmp6hdr *)buf;
+	struct sockaddr_in6 sr;
+	socklen_t sl = sizeof(sr);
+	uint16_t seq, id;
+	ssize_t n;
 
-		if (id != iref->id)
-			ih->un.echo.id = htons(iref->id);
+	if (c->no_icmp)
+		return;
 
-		if (c->mode == MODE_PASTA) {
+	n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
+	if (n < 0)
+		return;
 
-			if (icmp_id_map[V4][id].seq == seq)
-				return;
+	id = ntohs(ih->icmp6_identifier);
+	seq = ntohs(ih->icmp6_sequence);
 
-			icmp_id_map[V4][id].seq = seq;
-		}
+	/* If bind() fails e.g. because of a broken SELinux policy,
+	 * this might happen. Fix up the identifier to match the sent
+	 * one.
+	 */
+	if (id != ref.icmp.id)
+		ih->icmp6_identifier = htons(ref.icmp.id);
 
-		debug("ICMP: echo %s to tap, ID: %i, seq: %i",
-		      (ih->type == ICMP_ECHO) ? "request" : "reply", id, seq);
+	/* In PASTA mode, we'll get any reply we send, discard them. */
+	if (c->mode == MODE_PASTA) {
+		if (icmp_id_map[V6][id].seq == seq)
+			return;
 
-		tap_icmp4_send(c, sr4->sin_addr, tap_ip4_daddr(c), buf, n);
+		icmp_id_map[V6][id].seq = seq;
 	}
+
+	debug("ICMPv6: echo %s to tap, ID: %i, seq: %i",
+	      (ih->icmp6_type == 128) ? "request" : "reply", id, seq);
+
+	tap_icmp6_send(c, &sr.sin6_addr,
+		       tap_ip6_daddr(c, &sr.sin6_addr), buf, n);
 }
 
 /**
@@ -150,11 +166,11 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
 	size_t plen;
 
 	if (af == AF_INET) {
-		union icmp_epoll_ref iref = { .v6 = 0 };
 		struct sockaddr_in sa = {
 			.sin_family = AF_INET,
 			.sin_addr = { .s_addr = htonl(INADDR_ANY) },
 		};
+		union icmp_epoll_ref iref;
 		struct icmphdr *ih;
 		int id, s;
 
@@ -204,12 +220,12 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
 			      id, ntohs(ih->un.echo.sequence));
 		}
 	} else if (af == AF_INET6) {
-		union icmp_epoll_ref iref = { .v6 = 1 };
 		struct sockaddr_in6 sa = {
 			.sin6_family = AF_INET6,
 			.sin6_addr = IN6ADDR_ANY_INIT,
 			.sin6_scope_id = c->ifi6,
 		};
+		union icmp_epoll_ref iref;
 		struct icmp6hdr *ih;
 		int id, s;
 
diff --git a/icmp.h b/icmp.h
index 29c7829..32f0c47 100644
--- a/icmp.h
+++ b/icmp.h
@@ -10,8 +10,8 @@
 
 struct ctx;
 
-void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
-		       uint32_t events, const struct timespec *now);
+void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
+void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref);
 int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
 		     const struct pool *p, const struct timespec *now);
 void icmp_timer(const struct ctx *c, const struct timespec *ts);
@@ -24,10 +24,7 @@ void icmp_init(void);
  * @id:			Associated echo identifier, needed if bind() fails
  */
 union icmp_epoll_ref {
-	struct {
-		uint32_t	v6:1,
-				id:16;
-	};
+	uint16_t id;
 	uint32_t u32;
 };
 
diff --git a/passt.c b/passt.c
index 32ea2ca..33c2162 100644
--- a/passt.c
+++ b/passt.c
@@ -329,9 +329,10 @@ loop:
 				udp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_ICMP:
+			icmp_sock_handler(&c, ref);
+			break;
 		case EPOLL_TYPE_ICMPV6:
-			if (!c.no_icmp)
-				icmp_sock_handler(&c, ref, eventmask, &now);
+			icmpv6_sock_handler(&c, ref);
 			break;
 		default:
 			/* Can't happen */
-- 
@@ -329,9 +329,10 @@ loop:
 				udp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_ICMP:
+			icmp_sock_handler(&c, ref);
+			break;
 		case EPOLL_TYPE_ICMPV6:
-			if (!c.no_icmp)
-				icmp_sock_handler(&c, ref, eventmask, &now);
+			icmpv6_sock_handler(&c, ref);
 			break;
 		default:
 			/* Can't happen */
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/9] epoll: Tiny cleanup to udp_sock_handler()
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (3 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 4/9] epoll: Split handling of ICMP and ICMPv6 sockets David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-07 13:46 ` [PATCH 6/9] epoll: Split handling of TCP timerfds into its own handler function David Gibson
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Move the test for c->no_udp into the function itself, rather than in the
dispatching switch statement to better localize the UDP specific logic, and
make for greated consistency with other handler functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 3 +--
 udp.c   | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/passt.c b/passt.c
index 33c2162..c7d7f56 100644
--- a/passt.c
+++ b/passt.c
@@ -325,8 +325,7 @@ loop:
 				tcp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_UDP:
-			if (!c.no_udp)
-				udp_sock_handler(&c, ref, eventmask, &now);
+			udp_sock_handler(&c, ref, eventmask, &now);
 			break;
 		case EPOLL_TYPE_ICMP:
 			icmp_sock_handler(&c, ref);
diff --git a/udp.c b/udp.c
index 62f8360..138e7ab 100644
--- a/udp.c
+++ b/udp.c
@@ -756,7 +756,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 	struct mmsghdr *mmh_recv;
 	int i, m;
 
-	if (!(events & EPOLLIN))
+	if (c->no_udp || !(events & EPOLLIN))
 		return;
 
 	if (v6) {
-- 
@@ -756,7 +756,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 	struct mmsghdr *mmh_recv;
 	int i, m;
 
-	if (!(events & EPOLLIN))
+	if (c->no_udp || !(events & EPOLLIN))
 		return;
 
 	if (v6) {
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/9] epoll: Split handling of TCP timerfds into its own handler function
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (4 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 5/9] epoll: Tiny cleanup to udp_sock_handler() David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-07 13:46 ` [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler David Gibson
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

tcp_sock_handler() actually handles several different types of fd events.
This includes timerfds that aren't sockets at all.  The handling of these
has essentially nothing in common with the other cases.  So, give the
TCP timers there own epoll_type value and dispatch directly to their
handler.  This also means we can remove the timer field from tcp_epoll_ref,
the information it encoded is now implicit in the epoll_type value.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c |  4 ++++
 passt.h |  4 +++-
 tcp.c   | 15 ++++-----------
 tcp.h   |  3 +--
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/passt.c b/passt.c
index c7d7f56..c750fad 100644
--- a/passt.c
+++ b/passt.c
@@ -57,6 +57,7 @@ char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
 
 char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_TCP]	= "TCP socket",
+	[EPOLL_TYPE_TCP_TIMER]	= "TCP timer",
 	[EPOLL_TYPE_UDP]	= "UDP socket",
 	[EPOLL_TYPE_ICMP]	= "ICMP socket",
 	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
@@ -324,6 +325,9 @@ loop:
 			if (!c.no_tcp)
 				tcp_sock_handler(&c, ref, eventmask, &now);
 			break;
+		case EPOLL_TYPE_TCP_TIMER:
+			tcp_timer_handler(&c, ref);
+			break;
 		case EPOLL_TYPE_UDP:
 			udp_sock_handler(&c, ref, eventmask, &now);
 			break;
diff --git a/passt.h b/passt.h
index 8878a11..fc1efdb 100644
--- a/passt.h
+++ b/passt.h
@@ -47,8 +47,10 @@ union epoll_ref;
 enum epoll_type {
 	/* Special value to indicate an invalid type */
 	EPOLL_TYPE_NONE = 0,
-	/* Sockets and timerfds for TCP handling */
+	/* TCP sockets */
 	EPOLL_TYPE_TCP,
+	/* timerfds used for TCP timers */
+	EPOLL_TYPE_TCP_TIMER,
 	/* UDP sockets */
 	EPOLL_TYPE_UDP,
 	/* IPv4 ICMP sockets */
diff --git a/tcp.c b/tcp.c
index 18b781a..98761a2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -663,9 +663,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	conn->c.in_epoll = true;
 
 	if (conn->timer != -1) {
-		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP,
+		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
 					  .fd = conn->sock,
-					  .tcp.timer = 1,
 					  .tcp.index = CONN_IDX(conn) };
 		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
 					    .events = EPOLLIN | EPOLLET };
@@ -692,9 +691,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 
 	if (conn->timer == -1) {
-		union epoll_ref ref = { .type = EPOLL_TYPE_TCP,
+		union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
 					.fd = conn->sock,
-					.tcp.timer = 1,
 					.tcp.index = CONN_IDX(conn) };
 		struct epoll_event ev = { .data.u64 = ref.u64,
 					  .events = EPOLLIN | EPOLLET };
@@ -2813,12 +2811,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
  *
  * #syscalls timerfd_gettime
  */
-static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 {
 	struct tcp_tap_conn *conn = conn_at_idx(ref.tcp.index);
 	struct itimerspec check_armed = { { 0 }, { 0 } };
 
-	if (!conn)
+	if (c->no_tcp || !conn)
 		return;
 
 	/* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the
@@ -2935,11 +2933,6 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 {
 	union tcp_conn *conn;
 
-	if (ref.tcp.timer) {
-		tcp_timer_handler(c, ref);
-		return;
-	}
-
 	if (ref.tcp.listen) {
 		tcp_conn_from_sock(c, ref, now);
 		return;
diff --git a/tcp.h b/tcp.h
index 5499127..8eb7782 100644
--- a/tcp.h
+++ b/tcp.h
@@ -13,6 +13,7 @@
 
 struct ctx;
 
+void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
 void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now);
 int tcp_tap_handler(struct ctx *c, int af, const void *addr,
@@ -31,7 +32,6 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
  * union tcp_epoll_ref - epoll reference portion for TCP connections
  * @listen:		Set if this file descriptor is a listening socket
  * @outbound:		Listening socket maps to outbound, spliced connection
- * @timer:		Reference is a timerfd descriptor for connection
  * @index:		Index of connection in table, or port for bound sockets
  * @u32:		Opaque u32 value of reference
  */
@@ -39,7 +39,6 @@ union tcp_epoll_ref {
 	struct {
 		uint32_t	listen:1,
 				outbound:1,
-				timer:1,
 				index:20;
 	};
 	uint32_t u32;
-- 
@@ -13,6 +13,7 @@
 
 struct ctx;
 
+void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
 void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now);
 int tcp_tap_handler(struct ctx *c, int af, const void *addr,
@@ -31,7 +32,6 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
  * union tcp_epoll_ref - epoll reference portion for TCP connections
  * @listen:		Set if this file descriptor is a listening socket
  * @outbound:		Listening socket maps to outbound, spliced connection
- * @timer:		Reference is a timerfd descriptor for connection
  * @index:		Index of connection in table, or port for bound sockets
  * @u32:		Opaque u32 value of reference
  */
@@ -39,7 +39,6 @@ union tcp_epoll_ref {
 	struct {
 		uint32_t	listen:1,
 				outbound:1,
-				timer:1,
 				index:20;
 	};
 	uint32_t u32;
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (5 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 6/9] epoll: Split handling of TCP timerfds into its own handler function David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-09  6:29   ` David Gibson
  2023-08-07 13:46 ` [PATCH 8/9] epoll: Split listening Unix domain socket into its own type David Gibson
  2023-08-07 13:46 ` [PATCH 9/9] epoll: Use different epoll types for passt and pasta tap fds David Gibson
  8 siblings, 1 reply; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

tcp_sock_handler() handles both listening TCP sockets, and connected TCP
sockets, but what it needs to do in those cases has essentially nothing in
common.  Therefore, give listening sockets their own epoll_type value and
dispatch directly to their own handler from the top level.  This also lets
us remove the listen field from tcp_epoll_ref, since the information it
carries is implicit in the epoll_type.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c |  8 ++++++--
 passt.h |  4 +++-
 tcp.c   | 31 +++++++++----------------------
 tcp.h   |  8 ++++----
 util.c  |  2 +-
 5 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/passt.c b/passt.c
index c750fad..c32981d 100644
--- a/passt.c
+++ b/passt.c
@@ -56,7 +56,8 @@
 char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
 
 char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
-	[EPOLL_TYPE_TCP]	= "TCP socket",
+	[EPOLL_TYPE_TCP]	= "connected TCP socket",
+	[EPOLL_TYPE_TCP_LISTEN]	= "listening TCP socket",
 	[EPOLL_TYPE_TCP_TIMER]	= "TCP timer",
 	[EPOLL_TYPE_UDP]	= "UDP socket",
 	[EPOLL_TYPE_ICMP]	= "ICMP socket",
@@ -323,7 +324,10 @@ loop:
 			break;
 		case EPOLL_TYPE_TCP:
 			if (!c.no_tcp)
-				tcp_sock_handler(&c, ref, eventmask, &now);
+				tcp_sock_handler(&c, ref, eventmask);
+			break;
+		case EPOLL_TYPE_TCP_LISTEN:
+			tcp_listen_handler(&c, ref, &now);
 			break;
 		case EPOLL_TYPE_TCP_TIMER:
 			tcp_timer_handler(&c, ref);
diff --git a/passt.h b/passt.h
index fc1efdb..176bc85 100644
--- a/passt.h
+++ b/passt.h
@@ -47,8 +47,10 @@ union epoll_ref;
 enum epoll_type {
 	/* Special value to indicate an invalid type */
 	EPOLL_TYPE_NONE = 0,
-	/* TCP sockets */
+	/* Connected TCP sockets */
 	EPOLL_TYPE_TCP,
+	/* Listening TCP sockets */
+	EPOLL_TYPE_TCP_LISTEN,
 	/* timerfds used for TCP timers */
 	EPOLL_TYPE_TCP_TIMER,
 	/* UDP sockets */
diff --git a/tcp.c b/tcp.c
index 98761a2..c237393 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2765,22 +2765,20 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * tcp_conn_from_sock() - Handle new connection request from listening socket
+ * tcp_listen_handler() - Handle new connection request from listening socket
  * @c:		Execution context
  * @ref:	epoll reference of listening socket
  * @now:	Current timestamp
  */
-static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
-			       const struct timespec *now)
+void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+			const struct timespec *now)
 {
 	struct sockaddr_storage sa;
 	union tcp_conn *conn;
 	socklen_t sl;
 	int s;
 
-	ASSERT(ref.tcp.listen);
-
-	if (c->tcp.conn_count >= TCP_MAX_CONNS)
+	if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
 	sl = sizeof(sa);
@@ -2926,19 +2924,10 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
  * @c:		Execution context
  * @ref:	epoll reference
  * @events:	epoll events bitmap
- * @now:	Current timestamp
  */
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now)
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 {
-	union tcp_conn *conn;
-
-	if (ref.tcp.listen) {
-		tcp_conn_from_sock(c, ref, now);
-		return;
-	}
-
-	conn = tc + ref.tcp.index;
+	union tcp_conn *conn = tc + ref.tcp.index;
 
 	if (conn->c.spliced)
 		tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
@@ -2960,7 +2949,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
 			    const struct in_addr *addr, const char *ifname)
 {
 	in_port_t idx = port + c->tcp.fwd_in.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .index = idx };
+	union tcp_epoll_ref tref = { .index = idx };
 	int s;
 
 	s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32);
@@ -3020,8 +3009,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
 static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 {
 	in_port_t idx = port + c->tcp.fwd_out.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
-				     .index = idx };
+	union tcp_epoll_ref tref = { .outbound = 1, .index = idx };
 	struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
 	int s;
 
@@ -3045,8 +3033,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
 {
 	in_port_t idx = port + c->tcp.fwd_out.delta[port];
-	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
-				     .index = idx };
+	union tcp_epoll_ref tref = { .outbound = 1, .index = idx };
 	int s;
 
 	ASSERT(c->mode == MODE_PASTA);
diff --git a/tcp.h b/tcp.h
index 8eb7782..8189ac0 100644
--- a/tcp.h
+++ b/tcp.h
@@ -14,8 +14,9 @@
 struct ctx;
 
 void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now);
+void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+			const struct timespec *now);
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
 int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 		    const struct pool *p, const struct timespec *now);
 int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
@@ -37,8 +38,7 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
  */
 union tcp_epoll_ref {
 	struct {
-		uint32_t	listen:1,
-				outbound:1,
+		uint32_t	outbound:1,
 				index:20;
 	};
 	uint32_t u32;
diff --git a/util.c b/util.c
index 2cac7ba..d965f48 100644
--- a/util.c
+++ b/util.c
@@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 
 	switch (proto) {
 	case IPPROTO_TCP:
-		ref.type = EPOLL_TYPE_TCP;
+		ref.type = EPOLL_TYPE_TCP_LISTEN;
 		break;
 	case IPPROTO_UDP:
 		ref.type = EPOLL_TYPE_UDP;
-- 
@@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
 
 	switch (proto) {
 	case IPPROTO_TCP:
-		ref.type = EPOLL_TYPE_TCP;
+		ref.type = EPOLL_TYPE_TCP_LISTEN;
 		break;
 	case IPPROTO_UDP:
 		ref.type = EPOLL_TYPE_UDP;
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 8/9] epoll: Split listening Unix domain socket into its own type
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (6 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler David Gibson
@ 2023-08-07 13:46 ` David Gibson
  2023-08-09 19:59   ` Stefano Brivio
  2023-08-07 13:46 ` [PATCH 9/9] epoll: Use different epoll types for passt and pasta tap fds David Gibson
  8 siblings, 1 reply; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

tap_handler() actually handles events on three different types of object:
the /dev/tap character device (pasta), a connected Unix domain socket
(passt) or a listening Unix domain socket (passt).

The last, in particular, really has no handling in common with the others,
so split it into its own epoll type and directly dispatch to the relevant
handler from the top level.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c |  6 +++++-
 passt.h |  6 ++++--
 tap.c   | 20 ++++++++------------
 tap.h   |  4 ++--
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/passt.c b/passt.c
index c32981d..c60f346 100644
--- a/passt.c
+++ b/passt.c
@@ -64,6 +64,7 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
 	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
 	[EPOLL_TYPE_TAP]	= "tap device",
+	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
 };
 
 /**
@@ -317,7 +318,10 @@ loop:
 
 		switch (ref.type) {
 		case EPOLL_TYPE_TAP:
-			tap_handler(&c, ref.fd, events[i].events, &now);
+			tap_handler(&c, events[i].events, &now);
+			break;
+		case EPOLL_TYPE_TAP_LISTEN:
+			tap_listen_handler(&c, eventmask);
 			break;
 		case EPOLL_TYPE_NSQUIT:
 			pasta_netns_quit_handler(&c, quit_fd);
diff --git a/passt.h b/passt.h
index 176bc85..7dae08b 100644
--- a/passt.h
+++ b/passt.h
@@ -61,10 +61,12 @@ enum epoll_type {
 	EPOLL_TYPE_ICMPV6,
 	/* inotify fd watching for end of netns (pasta) */
 	EPOLL_TYPE_NSQUIT,
-	/* tap char device, or qemu socket fd */
+	/* tap char device, or connected qemu socket fd */
 	EPOLL_TYPE_TAP,
+	/* socket listening for qemu socket connections */
+	EPOLL_TYPE_TAP_LISTEN,
 
-	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
+	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN,
 };
 
 /**
diff --git a/tap.c b/tap.c
index ad0decf..8468f86 100644
--- a/tap.c
+++ b/tap.c
@@ -1076,7 +1076,7 @@ restart:
 static void tap_sock_unix_init(struct ctx *c)
 {
 	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
 	struct epoll_event ev = { 0 };
 	struct sockaddr_un addr = {
 		.sun_family = AF_UNIX,
@@ -1142,10 +1142,11 @@ static void tap_sock_unix_init(struct ctx *c)
 }
 
 /**
- * tap_sock_unix_new() - Handle new connection on listening socket
+ * tap_listen_handler() - Handle new connection on listening socket
  * @c:		Execution context
+ * @events:	epoll events on the socket
  */
-static void tap_sock_unix_new(struct ctx *c)
+void tap_listen_handler(struct ctx *c, uint32_t events)
 {
 	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 	struct epoll_event ev = { 0 };
@@ -1153,6 +1154,9 @@ static void tap_sock_unix_new(struct ctx *c)
 	struct ucred ucred;
 	socklen_t len;
 
+	if (events != EPOLLIN)
+		return;
+
 	len = sizeof(ucred);
 
 	/* Another client is already connected: accept and close right away. */
@@ -1286,19 +1290,11 @@ void tap_sock_init(struct ctx *c)
 /**
  * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor
  * @c:		Execution context
- * @fd:		File descriptor where event occurred
  * @events:	epoll events
  * @now:	Current timestamp, can be NULL on EPOLLERR
  */
-void tap_handler(struct ctx *c, int fd, uint32_t events,
-		 const struct timespec *now)
+void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now)
 {
-	if (fd == c->fd_tap_listen) {
-		if (events == EPOLLIN)
-			tap_sock_unix_new(c);
-		return;
-	}
-
 	if (c->mode == MODE_PASST)
 		tap_handler_passt(c, events, now);
 	else if (c->mode == MODE_PASTA)
diff --git a/tap.h b/tap.h
index a1ad115..dd6bfdd 100644
--- a/tap.h
+++ b/tap.h
@@ -76,8 +76,8 @@ int tap_send(const struct ctx *c, const void *data, size_t len);
 void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
 void tap_update_mac(struct tap_hdr *taph,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
-void tap_handler(struct ctx *c, int fd, uint32_t events,
-		 const struct timespec *now);
+void tap_listen_handler(struct ctx *c, uint32_t events);
+void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now);
 void tap_sock_init(struct ctx *c);
 
 #endif /* TAP_H */
-- 
@@ -76,8 +76,8 @@ int tap_send(const struct ctx *c, const void *data, size_t len);
 void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
 void tap_update_mac(struct tap_hdr *taph,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
-void tap_handler(struct ctx *c, int fd, uint32_t events,
-		 const struct timespec *now);
+void tap_listen_handler(struct ctx *c, uint32_t events);
+void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now);
 void tap_sock_init(struct ctx *c);
 
 #endif /* TAP_H */
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 9/9] epoll: Use different epoll types for passt and pasta tap fds
  2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
                   ` (7 preceding siblings ...)
  2023-08-07 13:46 ` [PATCH 8/9] epoll: Split listening Unix domain socket into its own type David Gibson
@ 2023-08-07 13:46 ` David Gibson
  8 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-07 13:46 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Currently we have a single epoll event type for the "tap" fd, which could
be either a handle on a /dev/net/tun device (pasta) or a connected Unix
socket (passt).  However for the two modes we call different handler
functions.  Simplify this a little by using different epoll types and
dispatching directly to the correct handler function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 passt.c | 10 +++++++---
 passt.h |  6 ++++--
 tap.c   | 39 +++++++++++++++------------------------
 tap.h   |  5 ++++-
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/passt.c b/passt.c
index c60f346..c297de6 100644
--- a/passt.c
+++ b/passt.c
@@ -63,7 +63,8 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
 	[EPOLL_TYPE_ICMP]	= "ICMP socket",
 	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
 	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
-	[EPOLL_TYPE_TAP]	= "tap device",
+	[EPOLL_TYPE_TAP_PASTA]	= "/dev/net/tun device",
+	[EPOLL_TYPE_TAP_PASST]	= "connected qemu socket",
 	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
 };
 
@@ -317,8 +318,11 @@ loop:
 		      EPOLL_TYPE_STR(ref.type), ref.fd, events);
 
 		switch (ref.type) {
-		case EPOLL_TYPE_TAP:
-			tap_handler(&c, events[i].events, &now);
+		case EPOLL_TYPE_TAP_PASTA:
+			tap_handler_pasta(&c, eventmask, &now);
+			break;
+		case EPOLL_TYPE_TAP_PASST:
+			tap_handler_passt(&c, eventmask, &now);
 			break;
 		case EPOLL_TYPE_TAP_LISTEN:
 			tap_listen_handler(&c, eventmask);
diff --git a/passt.h b/passt.h
index 7dae08b..e13cf13 100644
--- a/passt.h
+++ b/passt.h
@@ -61,8 +61,10 @@ enum epoll_type {
 	EPOLL_TYPE_ICMPV6,
 	/* inotify fd watching for end of netns (pasta) */
 	EPOLL_TYPE_NSQUIT,
-	/* tap char device, or connected qemu socket fd */
-	EPOLL_TYPE_TAP,
+	/* tuntap character device */
+	EPOLL_TYPE_TAP_PASTA,
+	/* socket connected to qemu  */
+	EPOLL_TYPE_TAP_PASST,
 	/* socket listening for qemu socket connections */
 	EPOLL_TYPE_TAP_LISTEN,
 
diff --git a/tap.c b/tap.c
index 8468f86..3ce7e0b 100644
--- a/tap.c
+++ b/tap.c
@@ -914,8 +914,8 @@ static void tap_sock_reset(struct ctx *c)
  * @events:	epoll events
  * @now:	Current timestamp
  */
-static void tap_handler_passt(struct ctx *c, uint32_t events,
-			      const struct timespec *now)
+void tap_handler_passt(struct ctx *c, uint32_t events,
+		       const struct timespec *now)
 {
 	struct ethhdr *eh;
 	ssize_t n, rem;
@@ -1001,13 +1001,13 @@ next:
 }
 
 /**
- * tap_handler_pasta() - Packet handler for tuntap file descriptor
+ * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
  * @c:		Execution context
  * @events:	epoll events
  * @now:	Current timestamp
  */
-static void tap_handler_pasta(struct ctx *c, uint32_t events,
-			      const struct timespec *now)
+void tap_handler_pasta(struct ctx *c, uint32_t events,
+		       const struct timespec *now)
 {
 	ssize_t n, len;
 	int ret;
@@ -1148,7 +1148,7 @@ static void tap_sock_unix_init(struct ctx *c)
  */
 void tap_listen_handler(struct ctx *c, uint32_t events)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
 	struct epoll_event ev = { 0 };
 	int v = INT_MAX / 2;
 	struct ucred ucred;
@@ -1230,12 +1230,12 @@ static int tap_ns_tun(void *arg)
 }
 
 /**
- * tap_sock_init_tun() - Set up tuntap file descriptor
+ * tap_sock_tun_init() - Set up /dev/net/tun file descriptor
  * @c:		Execution context
  */
 static void tap_sock_tun_init(struct ctx *c)
 {
-	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
+	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA };
 	struct epoll_event ev = { 0 };
 
 	NS_CALL(tap_ns_tun, c);
@@ -1268,11 +1268,16 @@ void tap_sock_init(struct ctx *c)
 	}
 
 	if (c->fd_tap != -1) { /* Passed as --fd */
-		union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
 		struct epoll_event ev = { 0 };
-		ASSERT(c->one_off);
+		union epoll_ref ref;
 
+		ASSERT(c->one_off);
 		ref.fd = c->fd_tap;
+		if (c->mode == MODE_PASST)
+			ref.type = EPOLL_TYPE_TAP_PASST;
+		else
+			ref.type = EPOLL_TYPE_TAP_PASTA;
+
 		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
 		ev.data.u64 = ref.u64;
 		epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
@@ -1286,17 +1291,3 @@ void tap_sock_init(struct ctx *c)
 		tap_sock_tun_init(c);
 	}
 }
-
-/**
- * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor
- * @c:		Execution context
- * @events:	epoll events
- * @now:	Current timestamp, can be NULL on EPOLLERR
- */
-void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now)
-{
-	if (c->mode == MODE_PASST)
-		tap_handler_passt(c, events, now);
-	else if (c->mode == MODE_PASTA)
-		tap_handler_pasta(c, events, now);
-}
diff --git a/tap.h b/tap.h
index dd6bfdd..021fb7c 100644
--- a/tap.h
+++ b/tap.h
@@ -77,7 +77,10 @@ void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
 void tap_update_mac(struct tap_hdr *taph,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
-void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now);
+void tap_handler_pasta(struct ctx *c, uint32_t events,
+		       const struct timespec *now);
+void tap_handler_passt(struct ctx *c, uint32_t events,
+		       const struct timespec *now);
 void tap_sock_init(struct ctx *c);
 
 #endif /* TAP_H */
-- 
@@ -77,7 +77,10 @@ void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
 void tap_update_mac(struct tap_hdr *taph,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
-void tap_handler(struct ctx *c, uint32_t events, const struct timespec *now);
+void tap_handler_pasta(struct ctx *c, uint32_t events,
+		       const struct timespec *now);
+void tap_handler_passt(struct ctx *c, uint32_t events,
+		       const struct timespec *now);
 void tap_sock_init(struct ctx *c);
 
 #endif /* TAP_H */
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler
  2023-08-07 13:46 ` [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler David Gibson
@ 2023-08-09  6:29   ` David Gibson
  0 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-09  6:29 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev

[-- Attachment #1: Type: text/plain, Size: 6834 bytes --]

On Mon, Aug 07, 2023 at 11:46:29PM +1000, David Gibson wrote:
> tcp_sock_handler() handles both listening TCP sockets, and connected TCP
> sockets, but what it needs to do in those cases has essentially nothing in
> common.  Therefore, give listening sockets their own epoll_type value and
> dispatch directly to their own handler from the top level.  This also lets
> us remove the listen field from tcp_epoll_ref, since the information it
> carries is implicit in the epoll_type.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

Stefano,

I've realized an additional change that belongs with this patch, so
I'll be making a second spin.  Go ahead and review, but don't apply
just yet.

> ---
>  passt.c |  8 ++++++--
>  passt.h |  4 +++-
>  tcp.c   | 31 +++++++++----------------------
>  tcp.h   |  8 ++++----
>  util.c  |  2 +-
>  5 files changed, 23 insertions(+), 30 deletions(-)
> 
> diff --git a/passt.c b/passt.c
> index c750fad..c32981d 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -56,7 +56,8 @@
>  char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
>  
>  char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
> -	[EPOLL_TYPE_TCP]	= "TCP socket",
> +	[EPOLL_TYPE_TCP]	= "connected TCP socket",
> +	[EPOLL_TYPE_TCP_LISTEN]	= "listening TCP socket",
>  	[EPOLL_TYPE_TCP_TIMER]	= "TCP timer",
>  	[EPOLL_TYPE_UDP]	= "UDP socket",
>  	[EPOLL_TYPE_ICMP]	= "ICMP socket",
> @@ -323,7 +324,10 @@ loop:
>  			break;
>  		case EPOLL_TYPE_TCP:
>  			if (!c.no_tcp)
> -				tcp_sock_handler(&c, ref, eventmask, &now);
> +				tcp_sock_handler(&c, ref, eventmask);
> +			break;
> +		case EPOLL_TYPE_TCP_LISTEN:
> +			tcp_listen_handler(&c, ref, &now);
>  			break;
>  		case EPOLL_TYPE_TCP_TIMER:
>  			tcp_timer_handler(&c, ref);
> diff --git a/passt.h b/passt.h
> index fc1efdb..176bc85 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -47,8 +47,10 @@ union epoll_ref;
>  enum epoll_type {
>  	/* Special value to indicate an invalid type */
>  	EPOLL_TYPE_NONE = 0,
> -	/* TCP sockets */
> +	/* Connected TCP sockets */
>  	EPOLL_TYPE_TCP,
> +	/* Listening TCP sockets */
> +	EPOLL_TYPE_TCP_LISTEN,
>  	/* timerfds used for TCP timers */
>  	EPOLL_TYPE_TCP_TIMER,
>  	/* UDP sockets */
> diff --git a/tcp.c b/tcp.c
> index 98761a2..c237393 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -2765,22 +2765,20 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  }
>  
>  /**
> - * tcp_conn_from_sock() - Handle new connection request from listening socket
> + * tcp_listen_handler() - Handle new connection request from listening socket
>   * @c:		Execution context
>   * @ref:	epoll reference of listening socket
>   * @now:	Current timestamp
>   */
> -static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> -			       const struct timespec *now)
> +void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
> +			const struct timespec *now)
>  {
>  	struct sockaddr_storage sa;
>  	union tcp_conn *conn;
>  	socklen_t sl;
>  	int s;
>  
> -	ASSERT(ref.tcp.listen);
> -
> -	if (c->tcp.conn_count >= TCP_MAX_CONNS)
> +	if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS)
>  		return;
>  
>  	sl = sizeof(sa);
> @@ -2926,19 +2924,10 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
>   * @c:		Execution context
>   * @ref:	epoll reference
>   * @events:	epoll events bitmap
> - * @now:	Current timestamp
>   */
> -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
> -		      const struct timespec *now)
> +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>  {
> -	union tcp_conn *conn;
> -
> -	if (ref.tcp.listen) {
> -		tcp_conn_from_sock(c, ref, now);
> -		return;
> -	}
> -
> -	conn = tc + ref.tcp.index;
> +	union tcp_conn *conn = tc + ref.tcp.index;
>  
>  	if (conn->c.spliced)
>  		tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
> @@ -2960,7 +2949,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
>  			    const struct in_addr *addr, const char *ifname)
>  {
>  	in_port_t idx = port + c->tcp.fwd_in.delta[port];
> -	union tcp_epoll_ref tref = { .listen = 1, .index = idx };
> +	union tcp_epoll_ref tref = { .index = idx };
>  	int s;
>  
>  	s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32);
> @@ -3020,8 +3009,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
>  static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
>  {
>  	in_port_t idx = port + c->tcp.fwd_out.delta[port];
> -	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
> -				     .index = idx };
> +	union tcp_epoll_ref tref = { .outbound = 1, .index = idx };
>  	struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
>  	int s;
>  
> @@ -3045,8 +3033,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
>  static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
>  {
>  	in_port_t idx = port + c->tcp.fwd_out.delta[port];
> -	union tcp_epoll_ref tref = { .listen = 1, .outbound = 1,
> -				     .index = idx };
> +	union tcp_epoll_ref tref = { .outbound = 1, .index = idx };
>  	int s;
>  
>  	ASSERT(c->mode == MODE_PASTA);
> diff --git a/tcp.h b/tcp.h
> index 8eb7782..8189ac0 100644
> --- a/tcp.h
> +++ b/tcp.h
> @@ -14,8 +14,9 @@
>  struct ctx;
>  
>  void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
> -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
> -		      const struct timespec *now);
> +void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
> +			const struct timespec *now);
> +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
>  int tcp_tap_handler(struct ctx *c, int af, const void *addr,
>  		    const struct pool *p, const struct timespec *now);
>  int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
> @@ -37,8 +38,7 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
>   */
>  union tcp_epoll_ref {
>  	struct {
> -		uint32_t	listen:1,
> -				outbound:1,
> +		uint32_t	outbound:1,
>  				index:20;
>  	};
>  	uint32_t u32;
> diff --git a/util.c b/util.c
> index 2cac7ba..d965f48 100644
> --- a/util.c
> +++ b/util.c
> @@ -120,7 +120,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
>  
>  	switch (proto) {
>  	case IPPROTO_TCP:
> -		ref.type = EPOLL_TYPE_TCP;
> +		ref.type = EPOLL_TYPE_TCP_LISTEN;
>  		break;
>  	case IPPROTO_UDP:
>  		ref.type = EPOLL_TYPE_UDP;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets
  2023-08-07 13:46 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
@ 2023-08-09 19:59   ` Stefano Brivio
  2023-08-10  0:23     ` David Gibson
  0 siblings, 1 reply; 17+ messages in thread
From: Stefano Brivio @ 2023-08-09 19:59 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  7 Aug 2023 23:46:23 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> The epoll_ref type includes fields for the IP protocol of a socket, and the
> socket fd.  However, we already have a few things in the epoll which aren't
> protocol sockets, and we may have more in future.  Rename these fields to
> an abstract "fd type" and file descriptor for more generality.
> 
> Similarly, rather than using existing IP protocol numbers for the type,
> introduce our own number space.  For now these just correspond to the
> supported protocols, but we'll expand on that in future.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  icmp.c       |  6 +++---
>  passt.c      | 25 ++++++++++++-------------
>  passt.h      | 40 +++++++++++++++++++++++++++++-----------
>  tcp.c        | 22 +++++++++++-----------
>  tcp_conn.h   |  4 ++--
>  tcp_splice.c |  4 ++--
>  udp.c        | 14 +++++++-------
>  util.c       | 27 ++++++++++++++++++++-------
>  8 files changed, 86 insertions(+), 56 deletions(-)
> 
> diff --git a/icmp.c b/icmp.c
> index 676fa64..a4b6a47 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -79,7 +79,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
>  	(void)events;
>  	(void)now;
>  
> -	n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
> +	n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
>  	if (n < 0)
>  		return;
>  
> @@ -182,7 +182,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
>  				    bind_if, id, iref.u32);
>  			if (s < 0)
>  				goto fail_sock;
> -			if (s > SOCKET_MAX) {
> +			if (s > FD_REF_MAX) {
>  				close(s);
>  				return 1;
>  			}
> @@ -236,7 +236,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
>  				    bind_if, id, iref.u32);
>  			if (s < 0)
>  				goto fail_sock;
> -			if (s > SOCKET_MAX) {
> +			if (s > FD_REF_MAX) {
>  				close(s);
>  				return 1;
>  			}
> diff --git a/passt.c b/passt.c
> index 9123868..b42f42d 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -55,12 +55,11 @@
>  
>  char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
>  
> -char *ip_proto_str[IPPROTO_SCTP + 1] = {
> -	[IPPROTO_ICMP]		= "ICMP",
> -	[IPPROTO_TCP]		= "TCP",
> -	[IPPROTO_UDP]		= "UDP",
> -	[IPPROTO_ICMPV6]	= "ICMPV6",
> -	[IPPROTO_SCTP]		= "SCTP",
> +char *epoll_type_str[EPOLL_TYPE_MAX+1] = {

For consistency, given you're respinning anyway: [EPOLL_TYPE_MAX + 1]

> +	[EPOLL_TYPE_TCP]	= "TCP socket",
> +	[EPOLL_TYPE_UDP]	= "UDP socket",
> +	[EPOLL_TYPE_ICMP]	= "ICMP socket",
> +	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
>  };
>  
>  /**
> @@ -73,16 +72,16 @@ char *ip_proto_str[IPPROTO_SCTP + 1] = {
>  static void sock_handler(struct ctx *c, union epoll_ref ref,
>  			 uint32_t events, const struct timespec *now)
>  {
> -	trace("%s: %s packet from socket %i (events: 0x%08x)",
> +	trace("%s: packet from %s %i (events: 0x%08x)",
>  	      c->mode == MODE_PASST ? "passt" : "pasta",
> -	      IP_PROTO_STR(ref.proto), ref.s, events);
> +	      EPOLL_TYPE_STR(ref.type), ref.fd, events);
>  
> -	if (!c->no_tcp && ref.proto == IPPROTO_TCP)
> -		tcp_sock_handler( c, ref, events, now);
> -	else if (!c->no_udp && ref.proto == IPPROTO_UDP)
> -		udp_sock_handler( c, ref, events, now);
> +	if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP)
> +		tcp_sock_handler(c, ref, events, now);
> +	else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP)
> +		udp_sock_handler(c, ref, events, now);
>  	else if (!c->no_icmp &&
> -		 (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6))
> +		 (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6))
>  		icmp_sock_handler(c, ref, events, now);
>  }
>  
> diff --git a/passt.h b/passt.h
> index edc4841..2110781 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -42,9 +42,27 @@ union epoll_ref;
>  #include "udp.h"
>  
>  /**
> - * union epoll_ref - Breakdown of reference for epoll socket bookkeeping
> - * @proto:	IP protocol number
> - * @s:		Socket number (implies 2^24-1 limit on number of descriptors)
> + * enum epoll_type - Different types of fds we poll over
> + */
> +enum epoll_type {
> +	/* Special value to indicate an invalid type */
> +	EPOLL_TYPE_NONE = 0,
> +	/* Sockets and timerfds for TCP handling */
> +	EPOLL_TYPE_TCP,
> +	/* UDP sockets */
> +	EPOLL_TYPE_UDP,
> +	/* IPv4 ICMP sockets */
> +	EPOLL_TYPE_ICMP,
> +	/* ICMPv6 sockets */
> +	EPOLL_TYPE_ICMPV6,
> +
> +	EPOLL_TYPE_MAX = EPOLL_TYPE_ICMPV6,
> +};
> +
> +/**
> + * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
> + * @type:	Type of fd (tells us what to do with events)
> + * @fd:		File descriptor number (implies < 2^24 total descriptors)
>   * @tcp:	TCP-specific reference part
>   * @udp:	UDP-specific reference part
>   * @icmp:	ICMP-specific reference part
> @@ -53,10 +71,10 @@ union epoll_ref;
>   */
>  union epoll_ref {
>  	struct {
> -		int32_t		proto:8,
> -#define SOCKET_REF_BITS		24
> -#define SOCKET_MAX		MAX_FROM_BITS(SOCKET_REF_BITS)
> -				s:SOCKET_REF_BITS;
> +		enum epoll_type type:8;
> +#define FD_REF_BITS		24
> +#define FD_REF_MAX		MAX_FROM_BITS(FD_REF_BITS)
> +		int32_t		fd:FD_REF_BITS;
>  		union {
>  			union tcp_epoll_ref tcp;
>  			union udp_epoll_ref udp;
> @@ -78,10 +96,10 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
>  #define PKT_BUF_BYTES		MAX(TAP_BUF_BYTES, 0)
>  extern char pkt_buf		[PKT_BUF_BYTES];
>  
> -extern char *ip_proto_str[];
> -#define IP_PROTO_STR(n)							\
> -	(((uint8_t)(n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ?		\
> -			  ip_proto_str[(n)] : "?")
> +extern char *epoll_type_str[];
> +#define EPOLL_TYPE_STR(n)						\
> +	(((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ?	\
> +	                                    epoll_type_str[(n)] : "?")
>  
>  #include <resolv.h>	/* For MAXNS below */
>  
> diff --git a/tcp.c b/tcp.c
> index d9ecee5..18b781a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -643,7 +643,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
>  static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  {
>  	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> -	union epoll_ref ref = { .proto = IPPROTO_TCP, .s = conn->sock,
> +	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
>  				.tcp.index = CONN_IDX(conn) };
>  	struct epoll_event ev = { .data.u64 = ref.u64 };
>  
> @@ -663,8 +663,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  	conn->c.in_epoll = true;
>  
>  	if (conn->timer != -1) {
> -		union epoll_ref ref_t = { .proto = IPPROTO_TCP,
> -					  .s = conn->sock,
> +		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP,
> +					  .fd = conn->sock,
>  					  .tcp.timer = 1,
>  					  .tcp.index = CONN_IDX(conn) };
>  		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
> @@ -692,8 +692,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  		return;
>  
>  	if (conn->timer == -1) {
> -		union epoll_ref ref = { .proto = IPPROTO_TCP,
> -					.s = conn->sock,
> +		union epoll_ref ref = { .type = EPOLL_TYPE_TCP,
> +					.fd = conn->sock,
>  					.tcp.timer = 1,
>  					.tcp.index = CONN_IDX(conn) };
>  		struct epoll_event ev = { .data.u64 = ref.u64,
> @@ -701,7 +701,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  		int fd;
>  
>  		fd = timerfd_create(CLOCK_MONOTONIC, 0);
> -		if (fd == -1 || fd > SOCKET_MAX) {
> +		if (fd == -1 || fd > FD_REF_MAX) {
>  			debug("TCP: failed to get timer: %s", strerror(errno));
>  			if (fd > -1)
>  				close(fd);
> @@ -1908,7 +1908,7 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
>  
>  	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
>  
> -	if (s > SOCKET_MAX) {
> +	if (s > FD_REF_MAX) {
>  		close(s);
>  		return -EIO;
>  	}
> @@ -2791,7 +2791,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  	 * https://github.com/llvm/llvm-project/issues/58992
>  	 */
>  	memset(&sa, 0, sizeof(struct sockaddr_in6));
> -	s = accept4(ref.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
> +	s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
>  	if (s < 0)
>  		return;
>  
> @@ -2948,7 +2948,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
>  	conn = tc + ref.tcp.index;
>  
>  	if (conn->c.spliced)
> -		tcp_splice_sock_handler(c, &conn->splice, ref.s, events);
> +		tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
>  	else
>  		tcp_tap_sock_handler(c, &conn->tap, events);
>  }
> @@ -2999,7 +2999,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
>  int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
>  		  const char *ifname, in_port_t port)
>  {
> -	int r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
> +	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
>  
>  	if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
>  		/* Attempt to get a dual stack socket */
> @@ -3013,7 +3013,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
>  	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
>  		r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
>  
> -	if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
> +	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
>  		return 0;
>  
>  	return r4 < 0 ? r4 : r6;
> diff --git a/tcp_conn.h b/tcp_conn.h
> index 9e2b1bf..0b36940 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -62,7 +62,7 @@ struct tcp_tap_conn {
>  	unsigned int	ws_to_tap	:TCP_WS_BITS;
>  
>  
> -	int		sock		:SOCKET_REF_BITS;
> +	int		sock		:FD_REF_BITS;
>  
>  	uint8_t		events;
>  #define CLOSED			0
> @@ -80,7 +80,7 @@ struct tcp_tap_conn {
>  	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
>  
>  
> -	int		timer		:SOCKET_REF_BITS;
> +	int		timer		:FD_REF_BITS;
>  
>  	uint8_t		flags;
>  #define STALLED			BIT(0)
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 03e14c1..24995e2 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -173,9 +173,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
>  				struct tcp_splice_conn *conn)
>  {
>  	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> -	union epoll_ref ref_a = { .proto = IPPROTO_TCP, .s = conn->a,
> +	union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a,
>  				  .tcp.index = CONN_IDX(conn) };
> -	union epoll_ref ref_b = { .proto = IPPROTO_TCP, .s = conn->b,
> +	union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b,
>  				  .tcp.index = CONN_IDX(conn) };
>  	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
>  	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
> diff --git a/udp.c b/udp.c
> index 5a852fb..62f8360 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -388,9 +388,9 @@ static void udp_sock6_iov_init(const struct ctx *c)
>  int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
>  {
>  	struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
> -	union epoll_ref ref = { .proto = IPPROTO_UDP,
> +	union epoll_ref ref = { .type = EPOLL_TYPE_UDP,
>  				.udp = { .splice = true, .ns = ns,
> -					     .v6 = v6, .port = src }
> +					 .v6 = v6, .port = src }
>  			      };
>  	struct udp_splice_port *sp;
>  	int act, s;
> @@ -406,7 +406,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
>  	s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK,
>  		   IPPROTO_UDP);
>  
> -	if (s > SOCKET_MAX) {
> +	if (s > FD_REF_MAX) {
>  		close(s);
>  		return -EIO;
>  	}
> @@ -414,7 +414,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
>  	if (s < 0)
>  		return s;
>  
> -	ref.s = s;
> +	ref.fd = s;
>  
>  	if (v6) {
>  		struct sockaddr_in6 addr6 = {
> @@ -767,7 +767,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
>  		udp4_localname.sin_port = htons(dstport);
>  	}
>  
> -	n = recvmmsg(ref.s, mmh_recv, n, 0, NULL);
> +	n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL);
>  	if (n <= 0)
>  		return;
>  
> @@ -980,7 +980,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
>  		  const void *addr, const char *ifname, in_port_t port)
>  {
>  	union udp_epoll_ref uref = { .u32 = 0 };
> -	int s, r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
> +	int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
>  
>  	if (ns) {
>  		uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]);
> @@ -1030,7 +1030,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
>  		}
>  	}
>  
> -	if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
> +	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
>  		return 0;
>  
>  	return r4 < 0 ? r4 : r6;
> diff --git a/util.c b/util.c
> index 019c56c..2cac7ba 100644
> --- a/util.c
> +++ b/util.c
> @@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
>  	    const void *bind_addr, const char *ifname, uint16_t port,
>  	    uint32_t data)
>  {
> -	union epoll_ref ref = { .proto = proto, .data = data };
> +	union epoll_ref ref = { .data = data };
>  	struct sockaddr_in addr4 = {
>  		.sin_family = AF_INET,
>  		.sin_port = htons(port),
> @@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
>  	int fd, sl, y = 1, ret;
>  	struct epoll_event ev;
>  
> -	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
> -	    proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
> +	switch (proto) {
> +	case IPPROTO_TCP:
> +		ref.type = EPOLL_TYPE_TCP;
> +		break;
> +	case IPPROTO_UDP:
> +		ref.type = EPOLL_TYPE_UDP;
> +		break;
> +	case IPPROTO_ICMP:
> +		ref.type = EPOLL_TYPE_ICMP;
> +		break;
> +	case IPPROTO_ICMPV6:
> +		ref.type = EPOLL_TYPE_ICMPV6;
> +		break;
> +	default:
>  		return -EPFNOSUPPORT;	/* Not implemented. */
> +	}
>  
>  	if (af == AF_UNSPEC) {
>  		if (!DUAL_STACK_SOCKETS || bind_addr)
> @@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
>  		return ret;
>  	}
>  
> -	if (fd > SOCKET_MAX) {
> +	if (fd > FD_REF_MAX) {
>  		close(fd);
>  		return -EBADF;
>  	}
>  
> -	ref.s = fd;
> +	ref.fd = fd;
>  
>  	if (af == AF_INET) {
>  		if (bind_addr)
> @@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
>  		if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
>  			       ifname, strlen(ifname))) {
>  			ret = -errno;
> -			warn("Can't bind socket for %s port %u to %s, closing",
> -			     ip_proto_str[proto], port, ifname);
> +			warn("Can't bind %s socket for port %u to %s, closing",
> +			     EPOLL_TYPE_STR(proto), port, ifname);
>  			close(fd);
>  			return ret;
>  		}


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 8/9] epoll: Split listening Unix domain socket into its own type
  2023-08-07 13:46 ` [PATCH 8/9] epoll: Split listening Unix domain socket into its own type David Gibson
@ 2023-08-09 19:59   ` Stefano Brivio
  2023-08-10  1:08     ` David Gibson
  0 siblings, 1 reply; 17+ messages in thread
From: Stefano Brivio @ 2023-08-09 19:59 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  7 Aug 2023 23:46:30 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> tap_handler() actually handles events on three different types of object:
> the /dev/tap character device (pasta), a connected Unix domain socket
> (passt) or a listening Unix domain socket (passt).
> 
> The last, in particular, really has no handling in common with the others,
> so split it into its own epoll type and directly dispatch to the relevant
> handler from the top level.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  passt.c |  6 +++++-
>  passt.h |  6 ++++--
>  tap.c   | 20 ++++++++------------
>  tap.h   |  4 ++--
>  4 files changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/passt.c b/passt.c
> index c32981d..c60f346 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -64,6 +64,7 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
>  	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
>  	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
>  	[EPOLL_TYPE_TAP]	= "tap device",
> +	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
>  };
>  
>  /**
> @@ -317,7 +318,10 @@ loop:
>  
>  		switch (ref.type) {
>  		case EPOLL_TYPE_TAP:
> -			tap_handler(&c, ref.fd, events[i].events, &now);
> +			tap_handler(&c, events[i].events, &now);
> +			break;
> +		case EPOLL_TYPE_TAP_LISTEN:
> +			tap_listen_handler(&c, eventmask);
>  			break;
>  		case EPOLL_TYPE_NSQUIT:
>  			pasta_netns_quit_handler(&c, quit_fd);
> diff --git a/passt.h b/passt.h
> index 176bc85..7dae08b 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -61,10 +61,12 @@ enum epoll_type {
>  	EPOLL_TYPE_ICMPV6,
>  	/* inotify fd watching for end of netns (pasta) */
>  	EPOLL_TYPE_NSQUIT,
> -	/* tap char device, or qemu socket fd */
> +	/* tap char device, or connected qemu socket fd */
>  	EPOLL_TYPE_TAP,
> +	/* socket listening for qemu socket connections */
> +	EPOLL_TYPE_TAP_LISTEN,
>  
> -	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
> +	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN,
>  };
>  
>  /**
> diff --git a/tap.c b/tap.c
> index ad0decf..8468f86 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1076,7 +1076,7 @@ restart:
>  static void tap_sock_unix_init(struct ctx *c)
>  {
>  	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
> -	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> +	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
>  	struct epoll_event ev = { 0 };
>  	struct sockaddr_un addr = {
>  		.sun_family = AF_UNIX,
> @@ -1142,10 +1142,11 @@ static void tap_sock_unix_init(struct ctx *c)
>  }
>  
>  /**
> - * tap_sock_unix_new() - Handle new connection on listening socket
> + * tap_listen_handler() - Handle new connection on listening socket
>   * @c:		Execution context
> + * @events:	epoll events on the socket
>   */
> -static void tap_sock_unix_new(struct ctx *c)
> +void tap_listen_handler(struct ctx *c, uint32_t events)
>  {
>  	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
>  	struct epoll_event ev = { 0 };
> @@ -1153,6 +1154,9 @@ static void tap_sock_unix_new(struct ctx *c)
>  	struct ucred ucred;
>  	socklen_t len;
>  
> +	if (events != EPOLLIN)
> +		return;
> +

This comment actually belongs to 2/4 of the tap reset clean-up series,
but posting it here for clarity: ...wouldn't it be appropriate to
handle errors while at it? Otherwise I guess we'll just spin on
EPOLLERR or EPOLLRDHUP.

I didn't realise before that other series that we didn't actually
handle errors on this path -- that's the reason why error handling is
missing, nothing else.

The rest of the series looks good to me, thanks -- waiting for v2.

-- 
Stefano


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets
  2023-08-09 19:59   ` Stefano Brivio
@ 2023-08-10  0:23     ` David Gibson
  0 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-10  0:23 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 2959 bytes --]

On Wed, Aug 09, 2023 at 09:59:10PM +0200, Stefano Brivio wrote:
> On Mon,  7 Aug 2023 23:46:23 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > The epoll_ref type includes fields for the IP protocol of a socket, and the
> > socket fd.  However, we already have a few things in the epoll which aren't
> > protocol sockets, and we may have more in future.  Rename these fields to
> > an abstract "fd type" and file descriptor for more generality.
> > 
> > Similarly, rather than using existing IP protocol numbers for the type,
> > introduce our own number space.  For now these just correspond to the
> > supported protocols, but we'll expand on that in future.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  icmp.c       |  6 +++---
> >  passt.c      | 25 ++++++++++++-------------
> >  passt.h      | 40 +++++++++++++++++++++++++++++-----------
> >  tcp.c        | 22 +++++++++++-----------
> >  tcp_conn.h   |  4 ++--
> >  tcp_splice.c |  4 ++--
> >  udp.c        | 14 +++++++-------
> >  util.c       | 27 ++++++++++++++++++++-------
> >  8 files changed, 86 insertions(+), 56 deletions(-)
> > 
> > diff --git a/icmp.c b/icmp.c
> > index 676fa64..a4b6a47 100644
> > --- a/icmp.c
> > +++ b/icmp.c
> > @@ -79,7 +79,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
> >  	(void)events;
> >  	(void)now;
> >  
> > -	n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
> > +	n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
> >  	if (n < 0)
> >  		return;
> >  
> > @@ -182,7 +182,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
> >  				    bind_if, id, iref.u32);
> >  			if (s < 0)
> >  				goto fail_sock;
> > -			if (s > SOCKET_MAX) {
> > +			if (s > FD_REF_MAX) {
> >  				close(s);
> >  				return 1;
> >  			}
> > @@ -236,7 +236,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
> >  				    bind_if, id, iref.u32);
> >  			if (s < 0)
> >  				goto fail_sock;
> > -			if (s > SOCKET_MAX) {
> > +			if (s > FD_REF_MAX) {
> >  				close(s);
> >  				return 1;
> >  			}
> > diff --git a/passt.c b/passt.c
> > index 9123868..b42f42d 100644
> > --- a/passt.c
> > +++ b/passt.c
> > @@ -55,12 +55,11 @@
> >  
> >  char pkt_buf[PKT_BUF_BYTES]	__attribute__ ((aligned(PAGE_SIZE)));
> >  
> > -char *ip_proto_str[IPPROTO_SCTP + 1] = {
> > -	[IPPROTO_ICMP]		= "ICMP",
> > -	[IPPROTO_TCP]		= "TCP",
> > -	[IPPROTO_UDP]		= "UDP",
> > -	[IPPROTO_ICMPV6]	= "ICMPV6",
> > -	[IPPROTO_SCTP]		= "SCTP",
> > +char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
> 
> For consistency, given you're respinning anyway: [EPOLL_TYPE_MAX + 1]

Done, thanks.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 8/9] epoll: Split listening Unix domain socket into its own type
  2023-08-09 19:59   ` Stefano Brivio
@ 2023-08-10  1:08     ` David Gibson
  2023-08-10  7:50       ` Stefano Brivio
  0 siblings, 1 reply; 17+ messages in thread
From: David Gibson @ 2023-08-10  1:08 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 4381 bytes --]

On Wed, Aug 09, 2023 at 09:59:27PM +0200, Stefano Brivio wrote:
> On Mon,  7 Aug 2023 23:46:30 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > tap_handler() actually handles events on three different types of object:
> > the /dev/tap character device (pasta), a connected Unix domain socket
> > (passt) or a listening Unix domain socket (passt).
> > 
> > The last, in particular, really has no handling in common with the others,
> > so split it into its own epoll type and directly dispatch to the relevant
> > handler from the top level.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  passt.c |  6 +++++-
> >  passt.h |  6 ++++--
> >  tap.c   | 20 ++++++++------------
> >  tap.h   |  4 ++--
> >  4 files changed, 19 insertions(+), 17 deletions(-)
> > 
> > diff --git a/passt.c b/passt.c
> > index c32981d..c60f346 100644
> > --- a/passt.c
> > +++ b/passt.c
> > @@ -64,6 +64,7 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
> >  	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
> >  	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
> >  	[EPOLL_TYPE_TAP]	= "tap device",
> > +	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
> >  };
> >  
> >  /**
> > @@ -317,7 +318,10 @@ loop:
> >  
> >  		switch (ref.type) {
> >  		case EPOLL_TYPE_TAP:
> > -			tap_handler(&c, ref.fd, events[i].events, &now);
> > +			tap_handler(&c, events[i].events, &now);
> > +			break;
> > +		case EPOLL_TYPE_TAP_LISTEN:
> > +			tap_listen_handler(&c, eventmask);
> >  			break;
> >  		case EPOLL_TYPE_NSQUIT:
> >  			pasta_netns_quit_handler(&c, quit_fd);
> > diff --git a/passt.h b/passt.h
> > index 176bc85..7dae08b 100644
> > --- a/passt.h
> > +++ b/passt.h
> > @@ -61,10 +61,12 @@ enum epoll_type {
> >  	EPOLL_TYPE_ICMPV6,
> >  	/* inotify fd watching for end of netns (pasta) */
> >  	EPOLL_TYPE_NSQUIT,
> > -	/* tap char device, or qemu socket fd */
> > +	/* tap char device, or connected qemu socket fd */
> >  	EPOLL_TYPE_TAP,
> > +	/* socket listening for qemu socket connections */
> > +	EPOLL_TYPE_TAP_LISTEN,
> >  
> > -	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
> > +	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN,
> >  };
> >  
> >  /**
> > diff --git a/tap.c b/tap.c
> > index ad0decf..8468f86 100644
> > --- a/tap.c
> > +++ b/tap.c
> > @@ -1076,7 +1076,7 @@ restart:
> >  static void tap_sock_unix_init(struct ctx *c)
> >  {
> >  	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
> > -	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> > +	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
> >  	struct epoll_event ev = { 0 };
> >  	struct sockaddr_un addr = {
> >  		.sun_family = AF_UNIX,
> > @@ -1142,10 +1142,11 @@ static void tap_sock_unix_init(struct ctx *c)
> >  }
> >  
> >  /**
> > - * tap_sock_unix_new() - Handle new connection on listening socket
> > + * tap_listen_handler() - Handle new connection on listening socket
> >   * @c:		Execution context
> > + * @events:	epoll events on the socket
> >   */
> > -static void tap_sock_unix_new(struct ctx *c)
> > +void tap_listen_handler(struct ctx *c, uint32_t events)
> >  {
> >  	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> >  	struct epoll_event ev = { 0 };
> > @@ -1153,6 +1154,9 @@ static void tap_sock_unix_new(struct ctx *c)
> >  	struct ucred ucred;
> >  	socklen_t len;
> >  
> > +	if (events != EPOLLIN)
> > +		return;
> > +
> 
> This comment actually belongs to 2/4 of the tap reset clean-up series,
> but posting it here for clarity: ...wouldn't it be appropriate to
> handle errors while at it? Otherwise I guess we'll just spin on
> EPOLLERR or EPOLLRDHUP.

So, I don't think we'll spin, because we set EPOLLET (edge
triggered).  That said, EPOLLRDHUP makes no sense on a listening
socket, and we're not subscribed to EPOLLERR

> I didn't realise before that other series that we didn't actually
> handle errors on this path -- that's the reason why error handling is
> missing, nothing else.

Ok.  I've revised the patch in the tap reset series to handle this
more thoroughly.  New spin coming soon.

> The rest of the series looks good to me, thanks -- waiting for v2.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 8/9] epoll: Split listening Unix domain socket into its own type
  2023-08-10  1:08     ` David Gibson
@ 2023-08-10  7:50       ` Stefano Brivio
  2023-08-11  3:17         ` David Gibson
  0 siblings, 1 reply; 17+ messages in thread
From: Stefano Brivio @ 2023-08-10  7:50 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Thu, 10 Aug 2023 11:08:19 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Aug 09, 2023 at 09:59:27PM +0200, Stefano Brivio wrote:
> > On Mon,  7 Aug 2023 23:46:30 +1000
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > tap_handler() actually handles events on three different types of object:
> > > the /dev/tap character device (pasta), a connected Unix domain socket
> > > (passt) or a listening Unix domain socket (passt).
> > > 
> > > The last, in particular, really has no handling in common with the others,
> > > so split it into its own epoll type and directly dispatch to the relevant
> > > handler from the top level.
> > > 
> > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > ---
> > >  passt.c |  6 +++++-
> > >  passt.h |  6 ++++--
> > >  tap.c   | 20 ++++++++------------
> > >  tap.h   |  4 ++--
> > >  4 files changed, 19 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/passt.c b/passt.c
> > > index c32981d..c60f346 100644
> > > --- a/passt.c
> > > +++ b/passt.c
> > > @@ -64,6 +64,7 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
> > >  	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
> > >  	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
> > >  	[EPOLL_TYPE_TAP]	= "tap device",
> > > +	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
> > >  };
> > >  
> > >  /**
> > > @@ -317,7 +318,10 @@ loop:
> > >  
> > >  		switch (ref.type) {
> > >  		case EPOLL_TYPE_TAP:
> > > -			tap_handler(&c, ref.fd, events[i].events, &now);
> > > +			tap_handler(&c, events[i].events, &now);
> > > +			break;
> > > +		case EPOLL_TYPE_TAP_LISTEN:
> > > +			tap_listen_handler(&c, eventmask);
> > >  			break;
> > >  		case EPOLL_TYPE_NSQUIT:
> > >  			pasta_netns_quit_handler(&c, quit_fd);
> > > diff --git a/passt.h b/passt.h
> > > index 176bc85..7dae08b 100644
> > > --- a/passt.h
> > > +++ b/passt.h
> > > @@ -61,10 +61,12 @@ enum epoll_type {
> > >  	EPOLL_TYPE_ICMPV6,
> > >  	/* inotify fd watching for end of netns (pasta) */
> > >  	EPOLL_TYPE_NSQUIT,
> > > -	/* tap char device, or qemu socket fd */
> > > +	/* tap char device, or connected qemu socket fd */
> > >  	EPOLL_TYPE_TAP,
> > > +	/* socket listening for qemu socket connections */
> > > +	EPOLL_TYPE_TAP_LISTEN,
> > >  
> > > -	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
> > > +	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN,
> > >  };
> > >  
> > >  /**
> > > diff --git a/tap.c b/tap.c
> > > index ad0decf..8468f86 100644
> > > --- a/tap.c
> > > +++ b/tap.c
> > > @@ -1076,7 +1076,7 @@ restart:
> > >  static void tap_sock_unix_init(struct ctx *c)
> > >  {
> > >  	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
> > > -	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> > > +	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
> > >  	struct epoll_event ev = { 0 };
> > >  	struct sockaddr_un addr = {
> > >  		.sun_family = AF_UNIX,
> > > @@ -1142,10 +1142,11 @@ static void tap_sock_unix_init(struct ctx *c)
> > >  }
> > >  
> > >  /**
> > > - * tap_sock_unix_new() - Handle new connection on listening socket
> > > + * tap_listen_handler() - Handle new connection on listening socket
> > >   * @c:		Execution context
> > > + * @events:	epoll events on the socket
> > >   */
> > > -static void tap_sock_unix_new(struct ctx *c)
> > > +void tap_listen_handler(struct ctx *c, uint32_t events)
> > >  {
> > >  	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> > >  	struct epoll_event ev = { 0 };
> > > @@ -1153,6 +1154,9 @@ static void tap_sock_unix_new(struct ctx *c)
> > >  	struct ucred ucred;
> > >  	socklen_t len;
> > >  
> > > +	if (events != EPOLLIN)
> > > +		return;
> > > +  
> > 
> > This comment actually belongs to 2/4 of the tap reset clean-up series,
> > but posting it here for clarity: ...wouldn't it be appropriate to
> > handle errors while at it? Otherwise I guess we'll just spin on
> > EPOLLERR or EPOLLRDHUP.  
> 
> So, I don't think we'll spin, because we set EPOLLET (edge
> triggered).  That said, EPOLLRDHUP makes no sense on a listening
> socket, and we're not subscribed to EPOLLERR

There's no need to subscribe to EPOLLERR (and EPOLLHUP): both types of
events are always reported.

I guess it's also fine to indicate it explicitly in 2/13 v2 as you did,
to remark that we handle it, but if you look around we never add
EPOLLERR or EPOLLHUP (except for a stray occurrence in
udp_splice_new()) to the set of events.

-- 
Stefano


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 8/9] epoll: Split listening Unix domain socket into its own type
  2023-08-10  7:50       ` Stefano Brivio
@ 2023-08-11  3:17         ` David Gibson
  0 siblings, 0 replies; 17+ messages in thread
From: David Gibson @ 2023-08-11  3:17 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 5156 bytes --]

On Thu, Aug 10, 2023 at 09:50:55AM +0200, Stefano Brivio wrote:
> On Thu, 10 Aug 2023 11:08:19 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Aug 09, 2023 at 09:59:27PM +0200, Stefano Brivio wrote:
> > > On Mon,  7 Aug 2023 23:46:30 +1000
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > tap_handler() actually handles events on three different types of object:
> > > > the /dev/tap character device (pasta), a connected Unix domain socket
> > > > (passt) or a listening Unix domain socket (passt).
> > > > 
> > > > The last, in particular, really has no handling in common with the others,
> > > > so split it into its own epoll type and directly dispatch to the relevant
> > > > handler from the top level.
> > > > 
> > > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > > ---
> > > >  passt.c |  6 +++++-
> > > >  passt.h |  6 ++++--
> > > >  tap.c   | 20 ++++++++------------
> > > >  tap.h   |  4 ++--
> > > >  4 files changed, 19 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/passt.c b/passt.c
> > > > index c32981d..c60f346 100644
> > > > --- a/passt.c
> > > > +++ b/passt.c
> > > > @@ -64,6 +64,7 @@ char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
> > > >  	[EPOLL_TYPE_ICMPV6]	= "ICMPv6 socket",
> > > >  	[EPOLL_TYPE_NSQUIT]	= "namespace inotify",
> > > >  	[EPOLL_TYPE_TAP]	= "tap device",
> > > > +	[EPOLL_TYPE_TAP_LISTEN]	= "listening qemu socket",
> > > >  };
> > > >  
> > > >  /**
> > > > @@ -317,7 +318,10 @@ loop:
> > > >  
> > > >  		switch (ref.type) {
> > > >  		case EPOLL_TYPE_TAP:
> > > > -			tap_handler(&c, ref.fd, events[i].events, &now);
> > > > +			tap_handler(&c, events[i].events, &now);
> > > > +			break;
> > > > +		case EPOLL_TYPE_TAP_LISTEN:
> > > > +			tap_listen_handler(&c, eventmask);
> > > >  			break;
> > > >  		case EPOLL_TYPE_NSQUIT:
> > > >  			pasta_netns_quit_handler(&c, quit_fd);
> > > > diff --git a/passt.h b/passt.h
> > > > index 176bc85..7dae08b 100644
> > > > --- a/passt.h
> > > > +++ b/passt.h
> > > > @@ -61,10 +61,12 @@ enum epoll_type {
> > > >  	EPOLL_TYPE_ICMPV6,
> > > >  	/* inotify fd watching for end of netns (pasta) */
> > > >  	EPOLL_TYPE_NSQUIT,
> > > > -	/* tap char device, or qemu socket fd */
> > > > +	/* tap char device, or connected qemu socket fd */
> > > >  	EPOLL_TYPE_TAP,
> > > > +	/* socket listening for qemu socket connections */
> > > > +	EPOLL_TYPE_TAP_LISTEN,
> > > >  
> > > > -	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP,
> > > > +	EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN,
> > > >  };
> > > >  
> > > >  /**
> > > > diff --git a/tap.c b/tap.c
> > > > index ad0decf..8468f86 100644
> > > > --- a/tap.c
> > > > +++ b/tap.c
> > > > @@ -1076,7 +1076,7 @@ restart:
> > > >  static void tap_sock_unix_init(struct ctx *c)
> > > >  {
> > > >  	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
> > > > -	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> > > > +	union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
> > > >  	struct epoll_event ev = { 0 };
> > > >  	struct sockaddr_un addr = {
> > > >  		.sun_family = AF_UNIX,
> > > > @@ -1142,10 +1142,11 @@ static void tap_sock_unix_init(struct ctx *c)
> > > >  }
> > > >  
> > > >  /**
> > > > - * tap_sock_unix_new() - Handle new connection on listening socket
> > > > + * tap_listen_handler() - Handle new connection on listening socket
> > > >   * @c:		Execution context
> > > > + * @events:	epoll events on the socket
> > > >   */
> > > > -static void tap_sock_unix_new(struct ctx *c)
> > > > +void tap_listen_handler(struct ctx *c, uint32_t events)
> > > >  {
> > > >  	union epoll_ref ref = { .type = EPOLL_TYPE_TAP };
> > > >  	struct epoll_event ev = { 0 };
> > > > @@ -1153,6 +1154,9 @@ static void tap_sock_unix_new(struct ctx *c)
> > > >  	struct ucred ucred;
> > > >  	socklen_t len;
> > > >  
> > > > +	if (events != EPOLLIN)
> > > > +		return;
> > > > +  
> > > 
> > > This comment actually belongs to 2/4 of the tap reset clean-up series,
> > > but posting it here for clarity: ...wouldn't it be appropriate to
> > > handle errors while at it? Otherwise I guess we'll just spin on
> > > EPOLLERR or EPOLLRDHUP.  
> > 
> > So, I don't think we'll spin, because we set EPOLLET (edge
> > triggered).  That said, EPOLLRDHUP makes no sense on a listening
> > socket, and we're not subscribed to EPOLLERR
> 
> There's no need to subscribe to EPOLLERR (and EPOLLHUP): both types of
> events are always reported.
> 
> I guess it's also fine to indicate it explicitly in 2/13 v2 as you did,
> to remark that we handle it, but if you look around we never add
> EPOLLERR or EPOLLHUP (except for a stray occurrence in
> udp_splice_new()) to the set of events.

Ah, right, I missed that.  Since I have a few other minor reasons to
do a respin anyway, I've removed EPOLLERR for consistency with the
other places, and will send that in v3.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2023-08-11  3:20 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
2023-08-07 13:46 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets David Gibson
2023-08-09 19:59   ` Stefano Brivio
2023-08-10  0:23     ` David Gibson
2023-08-07 13:46 ` [PATCH 2/9] epoll: Always use epoll_ref for the epoll data variable David Gibson
2023-08-07 13:46 ` [PATCH 3/9] epoll: Fold sock_handler into general switch on epoll event fd David Gibson
2023-08-07 13:46 ` [PATCH 4/9] epoll: Split handling of ICMP and ICMPv6 sockets David Gibson
2023-08-07 13:46 ` [PATCH 5/9] epoll: Tiny cleanup to udp_sock_handler() David Gibson
2023-08-07 13:46 ` [PATCH 6/9] epoll: Split handling of TCP timerfds into its own handler function David Gibson
2023-08-07 13:46 ` [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler David Gibson
2023-08-09  6:29   ` David Gibson
2023-08-07 13:46 ` [PATCH 8/9] epoll: Split listening Unix domain socket into its own type David Gibson
2023-08-09 19:59   ` Stefano Brivio
2023-08-10  1:08     ` David Gibson
2023-08-10  7:50       ` Stefano Brivio
2023-08-11  3:17         ` David Gibson
2023-08-07 13:46 ` [PATCH 9/9] epoll: Use different epoll types for passt and pasta tap fds David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).