From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from gandalf.ozlabs.org (gandalf.ozlabs.org [150.107.74.76]) by passt.top (Postfix) with ESMTPS id AB66F5A0275 for ; Mon, 7 Aug 2023 15:46:44 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gibson.dropbear.id.au; s=201602; t=1691415994; bh=gjl7SoxIa6lq2A274kfnuKLUj3jI3SQa2Ds99bbgYAE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=nRm/ns/F4QvQP5socqtWe0vGSVG74Y4dBOhV6eigTCnaUIkREn5b5IofT7o0TZgmM XuQnfKTdmxDrpu5H/tTh2v71B9MiAs4TikfjCtbuwDcnlZV36iJdnWjWmeu03eO3Ux NBpnnU1kZH39qSV/10+m257TMPzZL962Tc3Q5QqU= Received: by gandalf.ozlabs.org (Postfix, from userid 1007) id 4RKHff0QXRz4wZn; Mon, 7 Aug 2023 23:46:34 +1000 (AEST) From: David Gibson To: Stefano Brivio , passt-dev@passt.top Subject: [PATCH 1/9] epoll: Generalize epoll_ref to cover things other than sockets Date: Mon, 7 Aug 2023 23:46:23 +1000 Message-ID: <20230807134631.1400119-2-david@gibson.dropbear.id.au> X-Mailer: git-send-email 2.41.0 In-Reply-To: <20230807134631.1400119-1-david@gibson.dropbear.id.au> References: <20230807134631.1400119-1-david@gibson.dropbear.id.au> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: ZCUBCOKVEZW6GFDTYP6MJTLSEOV3OQCP X-Message-ID-Hash: ZCUBCOKVEZW6GFDTYP6MJTLSEOV3OQCP X-MailFrom: dgibson@gandalf.ozlabs.org X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: The epoll_ref type includes fields for the IP protocol of a socket, and the socket fd. However, we already have a few things in the epoll which aren't protocol sockets, and we may have more in future. Rename these fields to an abstract "fd type" and file descriptor for more generality. Similarly, rather than using existing IP protocol numbers for the type, introduce our own number space. For now these just correspond to the supported protocols, but we'll expand on that in future. Signed-off-by: David Gibson --- icmp.c | 6 +++--- passt.c | 25 ++++++++++++------------- passt.h | 40 +++++++++++++++++++++++++++++----------- tcp.c | 22 +++++++++++----------- tcp_conn.h | 4 ++-- tcp_splice.c | 4 ++-- udp.c | 14 +++++++------- util.c | 27 ++++++++++++++++++++------- 8 files changed, 86 insertions(+), 56 deletions(-) diff --git a/icmp.c b/icmp.c index 676fa64..a4b6a47 100644 --- a/icmp.c +++ b/icmp.c @@ -79,7 +79,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref, (void)events; (void)now; - n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl); + n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl); if (n < 0) return; @@ -182,7 +182,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr, bind_if, id, iref.u32); if (s < 0) goto fail_sock; - if (s > SOCKET_MAX) { + if (s > FD_REF_MAX) { close(s); return 1; } @@ -236,7 +236,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr, bind_if, id, iref.u32); if (s < 0) goto fail_sock; - if (s > SOCKET_MAX) { + if (s > FD_REF_MAX) { close(s); return 1; } diff --git a/passt.c b/passt.c index 9123868..b42f42d 100644 --- a/passt.c +++ b/passt.c @@ -55,12 +55,11 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); -char *ip_proto_str[IPPROTO_SCTP + 1] = { - [IPPROTO_ICMP] = "ICMP", - [IPPROTO_TCP] = "TCP", - [IPPROTO_UDP] = "UDP", - [IPPROTO_ICMPV6] = "ICMPV6", - [IPPROTO_SCTP] = "SCTP", +char *epoll_type_str[EPOLL_TYPE_MAX+1] = { + [EPOLL_TYPE_TCP] = "TCP socket", + [EPOLL_TYPE_UDP] = "UDP socket", + [EPOLL_TYPE_ICMP] = "ICMP socket", + [EPOLL_TYPE_ICMPV6] = "ICMPv6 socket", }; /** @@ -73,16 +72,16 @@ char *ip_proto_str[IPPROTO_SCTP + 1] = { static void sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - trace("%s: %s packet from socket %i (events: 0x%08x)", + trace("%s: packet from %s %i (events: 0x%08x)", c->mode == MODE_PASST ? "passt" : "pasta", - IP_PROTO_STR(ref.proto), ref.s, events); + EPOLL_TYPE_STR(ref.type), ref.fd, events); - if (!c->no_tcp && ref.proto == IPPROTO_TCP) - tcp_sock_handler( c, ref, events, now); - else if (!c->no_udp && ref.proto == IPPROTO_UDP) - udp_sock_handler( c, ref, events, now); + if (!c->no_tcp && ref.type == EPOLL_TYPE_TCP) + tcp_sock_handler(c, ref, events, now); + else if (!c->no_udp && ref.type == EPOLL_TYPE_UDP) + udp_sock_handler(c, ref, events, now); else if (!c->no_icmp && - (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6)) + (ref.type == EPOLL_TYPE_ICMP || ref.type == EPOLL_TYPE_ICMPV6)) icmp_sock_handler(c, ref, events, now); } diff --git a/passt.h b/passt.h index edc4841..2110781 100644 --- a/passt.h +++ b/passt.h @@ -42,9 +42,27 @@ union epoll_ref; #include "udp.h" /** - * union epoll_ref - Breakdown of reference for epoll socket bookkeeping - * @proto: IP protocol number - * @s: Socket number (implies 2^24-1 limit on number of descriptors) + * enum epoll_type - Different types of fds we poll over + */ +enum epoll_type { + /* Special value to indicate an invalid type */ + EPOLL_TYPE_NONE = 0, + /* Sockets and timerfds for TCP handling */ + EPOLL_TYPE_TCP, + /* UDP sockets */ + EPOLL_TYPE_UDP, + /* IPv4 ICMP sockets */ + EPOLL_TYPE_ICMP, + /* ICMPv6 sockets */ + EPOLL_TYPE_ICMPV6, + + EPOLL_TYPE_MAX = EPOLL_TYPE_ICMPV6, +}; + +/** + * union epoll_ref - Breakdown of reference for epoll fd bookkeeping + * @type: Type of fd (tells us what to do with events) + * @fd: File descriptor number (implies < 2^24 total descriptors) * @tcp: TCP-specific reference part * @udp: UDP-specific reference part * @icmp: ICMP-specific reference part @@ -53,10 +71,10 @@ union epoll_ref; */ union epoll_ref { struct { - int32_t proto:8, -#define SOCKET_REF_BITS 24 -#define SOCKET_MAX MAX_FROM_BITS(SOCKET_REF_BITS) - s:SOCKET_REF_BITS; + enum epoll_type type:8; +#define FD_REF_BITS 24 +#define FD_REF_MAX MAX_FROM_BITS(FD_REF_BITS) + int32_t fd:FD_REF_BITS; union { union tcp_epoll_ref tcp; union udp_epoll_ref udp; @@ -78,10 +96,10 @@ static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), #define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; -extern char *ip_proto_str[]; -#define IP_PROTO_STR(n) \ - (((uint8_t)(n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? \ - ip_proto_str[(n)] : "?") +extern char *epoll_type_str[]; +#define EPOLL_TYPE_STR(n) \ + (((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ? \ + epoll_type_str[(n)] : "?") #include /* For MAXNS below */ diff --git a/tcp.c b/tcp.c index d9ecee5..18b781a 100644 --- a/tcp.c +++ b/tcp.c @@ -643,7 +643,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - union epoll_ref ref = { .proto = IPPROTO_TCP, .s = conn->sock, + union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, .tcp.index = CONN_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; @@ -663,8 +663,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) conn->c.in_epoll = true; if (conn->timer != -1) { - union epoll_ref ref_t = { .proto = IPPROTO_TCP, - .s = conn->sock, + union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP, + .fd = conn->sock, .tcp.timer = 1, .tcp.index = CONN_IDX(conn) }; struct epoll_event ev_t = { .data.u64 = ref_t.u64, @@ -692,8 +692,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) return; if (conn->timer == -1) { - union epoll_ref ref = { .proto = IPPROTO_TCP, - .s = conn->sock, + union epoll_ref ref = { .type = EPOLL_TYPE_TCP, + .fd = conn->sock, .tcp.timer = 1, .tcp.index = CONN_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64, @@ -701,7 +701,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) int fd; fd = timerfd_create(CLOCK_MONOTONIC, 0); - if (fd == -1 || fd > SOCKET_MAX) { + if (fd == -1 || fd > FD_REF_MAX) { debug("TCP: failed to get timer: %s", strerror(errno)); if (fd > -1) close(fd); @@ -1908,7 +1908,7 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); - if (s > SOCKET_MAX) { + if (s > FD_REF_MAX) { close(s); return -EIO; } @@ -2791,7 +2791,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, * https://github.com/llvm/llvm-project/issues/58992 */ memset(&sa, 0, sizeof(struct sockaddr_in6)); - s = accept4(ref.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); + s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) return; @@ -2948,7 +2948,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, conn = tc + ref.tcp.index; if (conn->c.spliced) - tcp_splice_sock_handler(c, &conn->splice, ref.s, events); + tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); else tcp_tap_sock_handler(c, &conn->tap, events); } @@ -2999,7 +2999,7 @@ static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { - int r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1; + int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; if (af == AF_UNSPEC && c->ifi4 && c->ifi6) /* Attempt to get a dual stack socket */ @@ -3013,7 +3013,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname); - if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6)) + if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; return r4 < 0 ? r4 : r6; diff --git a/tcp_conn.h b/tcp_conn.h index 9e2b1bf..0b36940 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -62,7 +62,7 @@ struct tcp_tap_conn { unsigned int ws_to_tap :TCP_WS_BITS; - int sock :SOCKET_REF_BITS; + int sock :FD_REF_BITS; uint8_t events; #define CLOSED 0 @@ -80,7 +80,7 @@ struct tcp_tap_conn { (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) - int timer :SOCKET_REF_BITS; + int timer :FD_REF_BITS; uint8_t flags; #define STALLED BIT(0) diff --git a/tcp_splice.c b/tcp_splice.c index 03e14c1..24995e2 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -173,9 +173,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - union epoll_ref ref_a = { .proto = IPPROTO_TCP, .s = conn->a, + union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a, .tcp.index = CONN_IDX(conn) }; - union epoll_ref ref_b = { .proto = IPPROTO_TCP, .s = conn->b, + union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b, .tcp.index = CONN_IDX(conn) }; struct epoll_event ev_a = { .data.u64 = ref_a.u64 }; struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; diff --git a/udp.c b/udp.c index 5a852fb..62f8360 100644 --- a/udp.c +++ b/udp.c @@ -388,9 +388,9 @@ static void udp_sock6_iov_init(const struct ctx *c) int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) { struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; - union epoll_ref ref = { .proto = IPPROTO_UDP, + union epoll_ref ref = { .type = EPOLL_TYPE_UDP, .udp = { .splice = true, .ns = ns, - .v6 = v6, .port = src } + .v6 = v6, .port = src } }; struct udp_splice_port *sp; int act, s; @@ -406,7 +406,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); - if (s > SOCKET_MAX) { + if (s > FD_REF_MAX) { close(s); return -EIO; } @@ -414,7 +414,7 @@ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) if (s < 0) return s; - ref.s = s; + ref.fd = s; if (v6) { struct sockaddr_in6 addr6 = { @@ -767,7 +767,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, udp4_localname.sin_port = htons(dstport); } - n = recvmmsg(ref.s, mmh_recv, n, 0, NULL); + n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); if (n <= 0) return; @@ -980,7 +980,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_epoll_ref uref = { .u32 = 0 }; - int s, r4 = SOCKET_MAX + 1, r6 = SOCKET_MAX + 1; + int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; if (ns) { uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]); @@ -1030,7 +1030,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, } } - if (IN_INTERVAL(0, SOCKET_MAX, r4) || IN_INTERVAL(0, SOCKET_MAX, r6)) + if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; return r4 < 0 ? r4 : r6; diff --git a/util.c b/util.c index 019c56c..2cac7ba 100644 --- a/util.c +++ b/util.c @@ -102,7 +102,7 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data) { - union epoll_ref ref = { .proto = proto, .data = data }; + union epoll_ref ref = { .data = data }; struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = htons(port), @@ -118,9 +118,22 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto, int fd, sl, y = 1, ret; struct epoll_event ev; - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && - proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) + switch (proto) { + case IPPROTO_TCP: + ref.type = EPOLL_TYPE_TCP; + break; + case IPPROTO_UDP: + ref.type = EPOLL_TYPE_UDP; + break; + case IPPROTO_ICMP: + ref.type = EPOLL_TYPE_ICMP; + break; + case IPPROTO_ICMPV6: + ref.type = EPOLL_TYPE_ICMPV6; + break; + default: return -EPFNOSUPPORT; /* Not implemented. */ + } if (af == AF_UNSPEC) { if (!DUAL_STACK_SOCKETS || bind_addr) @@ -140,12 +153,12 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto, return ret; } - if (fd > SOCKET_MAX) { + if (fd > FD_REF_MAX) { close(fd); return -EBADF; } - ref.s = fd; + ref.fd = fd; if (af == AF_INET) { if (bind_addr) @@ -188,8 +201,8 @@ int sock_l4(const struct ctx *c, int af, uint8_t proto, if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname))) { ret = -errno; - warn("Can't bind socket for %s port %u to %s, closing", - ip_proto_str[proto], port, ifname); + warn("Can't bind %s socket for port %u to %s, closing", + EPOLL_TYPE_STR(proto), port, ifname); close(fd); return ret; } -- 2.41.0