From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets
Date: Mon, 7 Aug 2023 23:46:23 +1000 [thread overview]
Message-ID: <20230807134631.1400119-2-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20230807134631.1400119-1-david@gibson.dropbear.id.au>
The epoll_ref type includes fields for the IP protocol of a socket, and the
socket fd. However, we already have a few things in the epoll set which aren't
protocol sockets, and we may have more in future. Rename these fields to
an abstract "fd type" and file descriptor for more generality.
Similarly, rather than using existing IP protocol numbers for the type,
introduce our own number space. For now these just correspond to the
supported protocols, but we'll expand on that in future.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
icmp.c | 6 +++---
passt.c | 25 ++++++++++++-------------
passt.h | 40 +++++++++++++++++++++++++++++-----------
tcp.c | 22 +++++++++++-----------
tcp_conn.h | 4 ++--
tcp_splice.c | 4 ++--
udp.c | 14 +++++++-------
util.c | 27 ++++++++++++++++++++-------
8 files changed, 86 insertions(+), 56 deletions(-)
diff --git a/icmp.c b/icmp.c
index 676fa64..a4b6a47 100644
--- a/icmp.c
+++ b/icmp.c
@@ -79,7 +79,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref,
(void)events;
(void)now;
- n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
+ n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
if (n < 0)
return;
@@ -182,7 +182,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
bind_if, id, iref.u32);
if (s < 0)
goto fail_sock;
- if (s > SOCKET_MAX) {
+ if (s > FD_REF_MAX) {
close(s);
return 1;
}
@@ -236,7 +236,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
bind_if, id, iref.u32);
if (s < 0)
goto fail_sock;
- if (s > SOCKET_MAX) {
+ if (s > FD_REF_MAX) {
close(s);
return 1;
}
diff --git a/passt.c b/passt.c
index 9123868..b42f42d 100644
--- a/passt.c
+++ b/passt.c
@@ -55,12 +55,11 @@
char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE)));
-char *ip_proto_str[IPPROTO_SCTP + 1] = {
- [IPPROTO_ICMP] = "ICMP",
- [IPPROTO_TCP] = "TCP",
- [IPPROTO_UDP] = "UDP",
- [IPPROTO_ICMPV6] = "ICMPV6",
- [IPPROTO_SCTP] = "SCTP",
+char *epoll_type_str[EPOLL_TYPE_MAX+1] = {
+ [EPOLL_TYPE_TCP] = "TCP socket",
+ [EPOLL_TYPE_UDP] = "UDP socket",
+ [EPOLL_TYPE_ICMP] = "ICMP socket",
+ [EPOLL_TYPE_ICMPV6] = "ICMPv6 socket",
};
/**
@@ -73,16 +72,16 @@ char *ip_proto_str[IPPROTO_SCTP + 1] = {
static void sock_handler(struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now)
{
- trace("%s: %s packet from socket %i (events: 0x%08x)",
+ trace("%s: packet from %s %i (events: 0x%08x)",
c->mode == MODE_PASST ? "passt" : "pasta",
- IP_PROTO_STR(ref.proto), ref.s, events);
+ EPOLL_TYPE_STR(ref.type), ref.fd, events);
- if (!c->no_tcp && ref.proto == IPPROTO_TCP)
- tcp_sock_handler( c, ref, events, now);
- else if (!c->no_udp && ref.proto == IPPROTO_UDP)
- udp_sock_handler( c, ref, events, now);
+ if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP)
+ tcp_sock_handler(c, ref, events, now);
+ else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP)
+ udp_sock_handler(c, ref, events, now);
else if (!c->no_icmp &&
- (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6))
+ (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6))
icmp_sock_handler(c, ref, events, now);
}
diff --git a/passt.h b/passt.h
index edc4841..2110781 100644
--- a/passt.h
+++ b/passt.h
@@ -42,9 +42,27 @@ union epoll_ref;
#include "udp.h"
/**
- * union epoll_ref - Breakdown of reference for epoll socket bookkeeping
- * @proto: IP protocol number
- * @s: Socket number (implies 2^24-1 limit on number of descriptors)
+ * enum epoll_type - Different types of fds we poll over
+ */
+enum epoll_type {
+ /* Special value to indicate an invalid type */
+ EPOLL_TYPE_NONE = 0,
+ /* Sockets and timerfds for TCP handling */
+ EPOLL_TYPE_TCP,
+ /* UDP sockets */
+ EPOLL_TYPE_UDP,
+ /* IPv4 ICMP sockets */
+ EPOLL_TYPE_ICMP,
+ /* ICMPv6 sockets */
+ EPOLL_TYPE_ICMPV6,
+
+ EPOLL_TYPE_MAX = EPOLL_TYPE_ICMPV6,
+};
+
+/**
+ * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
+ * @type: Type of fd (tells us what to do with events)
+ * @fd: File descriptor number (implies < 2^24 total descriptors)
* @tcp: TCP-specific reference part
* @udp: UDP-specific reference part
* @icmp: ICMP-specific reference part
@@ -53,10 +71,10 @@ union epoll_ref;
*/
union epoll_ref {
struct {
- int32_t proto:8,
-#define SOCKET_REF_BITS 24
-#define SOCKET_MAX MAX_FROM_BITS(SOCKET_REF_BITS)
- s:SOCKET_REF_BITS;
+ enum epoll_type type:8;
+#define FD_REF_BITS 24
+#define FD_REF_MAX MAX_FROM_BITS(FD_REF_BITS)
+ int32_t fd:FD_REF_BITS;
union {
union tcp_epoll_ref tcp;
union udp_epoll_ref udp;
@@ -78,10 +96,10 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
-extern char *ip_proto_str[];
-#define IP_PROTO_STR(n) \
- (((uint8_t)(n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? \
- ip_proto_str[(n)] : "?")
+extern char *epoll_type_str[];
+#define EPOLL_TYPE_STR(n) \
+ (((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ? \
+ epoll_type_str[(n)] : "?")
#include <resolv.h> /* For MAXNS below */
diff --git a/tcp.c b/tcp.c
index d9ecee5..18b781a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -643,7 +643,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- union epoll_ref ref = { .proto = IPPROTO_TCP, .s = conn->sock,
+ union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.tcp.index = CONN_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
@@ -663,8 +663,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
conn->c.in_epoll = true;
if (conn->timer != -1) {
- union epoll_ref ref_t = { .proto = IPPROTO_TCP,
- .s = conn->sock,
+ union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP,
+ .fd = conn->sock,
.tcp.timer = 1,
.tcp.index = CONN_IDX(conn) };
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
@@ -692,8 +692,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
return;
if (conn->timer == -1) {
- union epoll_ref ref = { .proto = IPPROTO_TCP,
- .s = conn->sock,
+ union epoll_ref ref = { .type = EPOLL_TYPE_TCP,
+ .fd = conn->sock,
.tcp.timer = 1,
.tcp.index = CONN_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64,
@@ -701,7 +701,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0);
- if (fd == -1 || fd > SOCKET_MAX) {
+ if (fd == -1 || fd > FD_REF_MAX) {
debug("TCP: failed to get timer: %s", strerror(errno));
if (fd > -1)
close(fd);
@@ -1908,7 +1908,7 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
- if (s > SOCKET_MAX) {
+ if (s > FD_REF_MAX) {
close(s);
return -EIO;
}
@@ -2791,7 +2791,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
* https://github.com/llvm/llvm-project/issues/58992
*/
memset(&sa, 0, sizeof(struct sockaddr_in6));
- s = accept4(ref.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
+ s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
if (s < 0)
return;
@@ -2948,7 +2948,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
conn = tc + ref.tcp.index;
if (conn->c.spliced)
- tcp_splice_sock_handler(c, &conn->splice, ref.s, events);
+ tcp_splice_sock_handler(c, &conn->splice, ref.fd, events);
else
tcp_tap_sock_handler(c, &conn->tap, events);
}
@@ -2999,7 +2999,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port,
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port)
{
- int r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
+ int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
/* Attempt to get a dual stack socket */
@@ -3013,7 +3013,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
- if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
+ if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0;
return r4 < 0 ? r4 : r6;
diff --git a/tcp_conn.h b/tcp_conn.h
index 9e2b1bf..0b36940 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -62,7 +62,7 @@ struct tcp_tap_conn {
unsigned int ws_to_tap :TCP_WS_BITS;
- int sock :SOCKET_REF_BITS;
+ int sock :FD_REF_BITS;
uint8_t events;
#define CLOSED 0
@@ -80,7 +80,7 @@ struct tcp_tap_conn {
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
- int timer :SOCKET_REF_BITS;
+ int timer :FD_REF_BITS;
uint8_t flags;
#define STALLED BIT(0)
diff --git a/tcp_splice.c b/tcp_splice.c
index 03e14c1..24995e2 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -173,9 +173,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- union epoll_ref ref_a = { .proto = IPPROTO_TCP, .s = conn->a,
+ union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a,
.tcp.index = CONN_IDX(conn) };
- union epoll_ref ref_b = { .proto = IPPROTO_TCP, .s = conn->b,
+ union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b,
.tcp.index = CONN_IDX(conn) };
struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
diff --git a/udp.c b/udp.c
index 5a852fb..62f8360 100644
--- a/udp.c
+++ b/udp.c
@@ -388,9 +388,9 @@ static void udp_sock6_iov_init(const struct ctx *c)
int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
{
struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
- union epoll_ref ref = { .proto = IPPROTO_UDP,
+ union epoll_ref ref = { .type = EPOLL_TYPE_UDP,
.udp = { .splice = true, .ns = ns,
- .v6 = v6, .port = src }
+ .v6 = v6, .port = src }
};
struct udp_splice_port *sp;
int act, s;
@@ -406,7 +406,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK,
IPPROTO_UDP);
- if (s > SOCKET_MAX) {
+ if (s > FD_REF_MAX) {
close(s);
return -EIO;
}
@@ -414,7 +414,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
if (s < 0)
return s;
- ref.s = s;
+ ref.fd = s;
if (v6) {
struct sockaddr_in6 addr6 = {
@@ -767,7 +767,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
udp4_localname.sin_port = htons(dstport);
}
- n = recvmmsg(ref.s, mmh_recv, n, 0, NULL);
+ n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL);
if (n <= 0)
return;
@@ -980,7 +980,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port)
{
union udp_epoll_ref uref = { .u32 = 0 };
- int s, r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1;
+ int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
if (ns) {
uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]);
@@ -1030,7 +1030,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
}
}
- if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6))
+ if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0;
return r4 < 0 ? r4 : r6;
diff --git a/util.c b/util.c
index 019c56c..2cac7ba 100644
--- a/util.c
+++ b/util.c
@@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
const void *bind_addr, const char *ifname, uint16_t port,
uint32_t data)
{
- union epoll_ref ref = { .proto = proto, .data = data };
+ union epoll_ref ref = { .data = data };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
@@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
int fd, sl, y = 1, ret;
struct epoll_event ev;
- if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
- proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
+ switch (proto) {
+ case IPPROTO_TCP:
+ ref.type = EPOLL_TYPE_TCP;
+ break;
+ case IPPROTO_UDP:
+ ref.type = EPOLL_TYPE_UDP;
+ break;
+ case IPPROTO_ICMP:
+ ref.type = EPOLL_TYPE_ICMP;
+ break;
+ case IPPROTO_ICMPV6:
+ ref.type = EPOLL_TYPE_ICMPV6;
+ break;
+ default:
return -EPFNOSUPPORT; /* Not implemented. */
+ }
if (af == AF_UNSPEC) {
if (!DUAL_STACK_SOCKETS || bind_addr)
@@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
return ret;
}
- if (fd > SOCKET_MAX) {
+ if (fd > FD_REF_MAX) {
close(fd);
return -EBADF;
}
- ref.s = fd;
+ ref.fd = fd;
if (af == AF_INET) {
if (bind_addr)
@@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname))) {
ret = -errno;
- warn("Can't bind socket for %s port %u to %s, closing",
- ip_proto_str[proto], port, ifname);
+ warn("Can't bind %s socket for port %u to %s, closing",
+ EPOLL_TYPE_STR(proto), port, ifname);
close(fd);
return ret;
}
--
@@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
const void *bind_addr, const char *ifname, uint16_t port,
uint32_t data)
{
- union epoll_ref ref = { .proto = proto, .data = data };
+ union epoll_ref ref = { .data = data };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
@@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
int fd, sl, y = 1, ret;
struct epoll_event ev;
- if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
- proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
+ switch (proto) {
+ case IPPROTO_TCP:
+ ref.type = EPOLL_TYPE_TCP;
+ break;
+ case IPPROTO_UDP:
+ ref.type = EPOLL_TYPE_UDP;
+ break;
+ case IPPROTO_ICMP:
+ ref.type = EPOLL_TYPE_ICMP;
+ break;
+ case IPPROTO_ICMPV6:
+ ref.type = EPOLL_TYPE_ICMPV6;
+ break;
+ default:
return -EPFNOSUPPORT; /* Not implemented. */
+ }
if (af == AF_UNSPEC) {
if (!DUAL_STACK_SOCKETS || bind_addr)
@@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
return ret;
}
- if (fd > SOCKET_MAX) {
+ if (fd > FD_REF_MAX) {
close(fd);
return -EBADF;
}
- ref.s = fd;
+ ref.fd = fd;
if (af == AF_INET) {
if (bind_addr)
@@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto,
if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname))) {
ret = -errno;
- warn("Can't bind socket for %s port %u to %s, closing",
- ip_proto_str[proto], port, ifname);
+ warn("Can't bind %s socket for port %u to %s, closing",
+ EPOLL_TYPE_STR(proto), port, ifname);
close(fd);
return ret;
}
--
2.41.0
next prev parent reply other threads:[~2023-08-07 13:46 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-08-07 13:46 [PATCH 0/9] Clean up to epoll dispatch David Gibson
2023-08-07 13:46 ` David Gibson [this message]
2023-08-09 19:59 ` [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets Stefano Brivio
2023-08-10 0:23 ` David Gibson
2023-08-07 13:46 ` [PATCH 2/9] epoll: Always use epoll_ref for the epoll data variable David Gibson
2023-08-07 13:46 ` [PATCH 3/9] epoll: Fold sock_handler into general switch on epoll event fd David Gibson
2023-08-07 13:46 ` [PATCH 4/9] epoll: Split handling of ICMP and ICMPv6 sockets David Gibson
2023-08-07 13:46 ` [PATCH 5/9] epoll: Tiny cleanup to udp_sock_handler() David Gibson
2023-08-07 13:46 ` [PATCH 6/9] epoll: Split handling of TCP timerfds into its own handler function David Gibson
2023-08-07 13:46 ` [PATCH 7/9] epoll: Split handling of listening TCP sockets into their own handler David Gibson
2023-08-09 6:29 ` David Gibson
2023-08-07 13:46 ` [PATCH 8/9] epoll: Split listening Unix domain socket into its own type David Gibson
2023-08-09 19:59 ` Stefano Brivio
2023-08-10 1:08 ` David Gibson
2023-08-10 7:50 ` Stefano Brivio
2023-08-11 3:17 ` David Gibson
2023-08-07 13:46 ` [PATCH 9/9] epoll: Use different epoll types for passt and pasta tap fds David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230807134631.1400119-2-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).