* [PATCH 1/5] util: Simplify epoll_del() interface to take epollfd directly
2025-10-03 15:27 [PATCH 0/5] Refactor epoll handling in preparation for multithreading Laurent Vivier
@ 2025-10-03 15:27 ` Laurent Vivier
2025-10-07 5:26 ` David Gibson
2025-10-03 15:27 ` [PATCH 2/5] util: Move epoll registration out of sock_l4_sa() Laurent Vivier
` (3 subsequent siblings)
4 siblings, 1 reply; 10+ messages in thread
From: Laurent Vivier @ 2025-10-03 15:27 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Change epoll_del() to accept the epoll file descriptor directly instead
of the full context structure. This simplifies the interface and aligns
with the threading refactoring by reducing dependency on the context
structure for basic epoll operations as we will manage an epollfd per
thread.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
icmp.c | 2 +-
tap.c | 2 +-
tcp.c | 6 +++---
tcp_splice.c | 4 ++--
udp_flow.c | 4 ++--
util.c | 6 +++---
util.h | 2 +-
vhost_user.c | 6 +++---
8 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/icmp.c b/icmp.c
index 6dffafb0bf54..bd3108a21675 100644
--- a/icmp.c
+++ b/icmp.c
@@ -151,7 +151,7 @@ unexpected:
static void icmp_ping_close(const struct ctx *c,
const struct icmp_ping_flow *pingf)
{
- epoll_del(c, pingf->sock);
+ epoll_del(c->epollfd, pingf->sock);
close(pingf->sock);
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
}
diff --git a/tap.c b/tap.c
index 95d309bd1938..134c37a72979 100644
--- a/tap.c
+++ b/tap.c
@@ -1142,7 +1142,7 @@ void tap_sock_reset(struct ctx *c)
}
/* Close the connected socket, wait for a new connection */
- epoll_del(c, c->fd_tap);
+ epoll_del(c->epollfd, c->fd_tap);
close(c->fd_tap);
c->fd_tap = -1;
if (c->mode == MODE_VU)
diff --git a/tcp.c b/tcp.c
index 48b1ef29bfe8..04725deabb65 100644
--- a/tcp.c
+++ b/tcp.c
@@ -511,9 +511,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED) {
if (conn->in_epoll)
- epoll_del(c, conn->sock);
+ epoll_del(c->epollfd, conn->sock);
if (conn->timer != -1)
- epoll_del(c, conn->timer);
+ epoll_del(c->epollfd, conn->timer);
return 0;
}
@@ -3445,7 +3445,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
if (c->migrate_no_linger)
close(s);
else
- epoll_del(c, s);
+ epoll_del(c->epollfd, s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
diff --git a/tcp_splice.c b/tcp_splice.c
index 26cb63064583..666ee62b738f 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -204,8 +204,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_del(c, conn->s[0]);
- epoll_del(c, conn->s[1]);
+ epoll_del(c->epollfd, conn->s[0]);
+ epoll_del(c->epollfd, conn->s[1]);
}
}
diff --git a/udp_flow.c b/udp_flow.c
index cef3fb588bbe..84973f807167 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -51,7 +51,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
flow_foreach_sidei(sidei) {
flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
if (uflow->s[sidei] >= 0) {
- epoll_del(c, uflow->s[sidei]);
+ epoll_del(c->epollfd, uflow->s[sidei]);
close(uflow->s[sidei]);
uflow->s[sidei] = -1;
}
@@ -88,7 +88,7 @@ static int udp_flow_sock(const struct ctx *c,
if (flowside_connect(c, s, pif, side) < 0) {
int rc = -errno;
- epoll_del(c, s);
+ epoll_del(c->epollfd, s);
close(s);
flow_dbg_perror(uflow, "Couldn't connect flow socket");
diff --git a/util.c b/util.c
index c492f904b3fc..88a91b1100f5 100644
--- a/util.c
+++ b/util.c
@@ -996,12 +996,12 @@ void raw_random(void *buf, size_t buflen)
/**
* epoll_del() - Remove a file descriptor from our passt epoll
- * @c: Execution context
+ * @epollfd: epoll file descriptor to add to
* @fd: File descriptor to remove
*/
-void epoll_del(const struct ctx *c, int fd)
+void epoll_del(int epollfd, int fd)
{
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
+ epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);
}
diff --git a/util.h b/util.h
index 22eaac56e719..c61cbef357aa 100644
--- a/util.h
+++ b/util.h
@@ -300,7 +300,7 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
void raw_random(void *buf, size_t buflen);
-void epoll_del(const struct ctx *c, int fd);
+void epoll_del(int epollfd, int fd);
/*
* Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
diff --git a/vhost_user.c b/vhost_user.c
index fa343a86fac2..1221ac1abcd0 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -733,7 +733,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
vdev->vq[idx].call_fd = -1;
}
if (vdev->vq[idx].kick_fd != -1) {
- epoll_del(vdev->context, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
@@ -801,7 +801,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].kick_fd != -1) {
- epoll_del(vdev->context, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
@@ -1092,7 +1092,7 @@ void vu_cleanup(struct vu_dev *vdev)
vq->err_fd = -1;
}
if (vq->kick_fd != -1) {
- epoll_del(vdev->context, vq->kick_fd);
+ epoll_del(vdev->context->epollfd, vq->kick_fd);
close(vq->kick_fd);
vq->kick_fd = -1;
}
--
2.50.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/5] util: Simplify epoll_del() interface to take epollfd directly
2025-10-03 15:27 ` [PATCH 1/5] util: Simplify epoll_del() interface to take epollfd directly Laurent Vivier
@ 2025-10-07 5:26 ` David Gibson
0 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2025-10-07 5:26 UTC (permalink / raw)
To: Laurent Vivier; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 5979 bytes --]
On Fri, Oct 03, 2025 at 05:27:13PM +0200, Laurent Vivier wrote:
> Change epoll_del() to accept the epoll file descriptor directly instead
> of the full context structure. This simplifies the interface and aligns
> with the threading refactoring by reducing dependency on the context
> structure for basic epoll operations as we will manage an epollfd per
> thread.
>
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> icmp.c | 2 +-
> tap.c | 2 +-
> tcp.c | 6 +++---
> tcp_splice.c | 4 ++--
> udp_flow.c | 4 ++--
> util.c | 6 +++---
> util.h | 2 +-
> vhost_user.c | 6 +++---
> 8 files changed, 16 insertions(+), 16 deletions(-)
>
> diff --git a/icmp.c b/icmp.c
> index 6dffafb0bf54..bd3108a21675 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -151,7 +151,7 @@ unexpected:
> static void icmp_ping_close(const struct ctx *c,
> const struct icmp_ping_flow *pingf)
> {
> - epoll_del(c, pingf->sock);
> + epoll_del(c->epollfd, pingf->sock);
> close(pingf->sock);
> flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
> }
> diff --git a/tap.c b/tap.c
> index 95d309bd1938..134c37a72979 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1142,7 +1142,7 @@ void tap_sock_reset(struct ctx *c)
> }
>
> /* Close the connected socket, wait for a new connection */
> - epoll_del(c, c->fd_tap);
> + epoll_del(c->epollfd, c->fd_tap);
> close(c->fd_tap);
> c->fd_tap = -1;
> if (c->mode == MODE_VU)
> diff --git a/tcp.c b/tcp.c
> index 48b1ef29bfe8..04725deabb65 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -511,9 +511,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>
> if (conn->events == CLOSED) {
> if (conn->in_epoll)
> - epoll_del(c, conn->sock);
> + epoll_del(c->epollfd, conn->sock);
> if (conn->timer != -1)
> - epoll_del(c, conn->timer);
> + epoll_del(c->epollfd, conn->timer);
> return 0;
> }
>
> @@ -3445,7 +3445,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
> if (c->migrate_no_linger)
> close(s);
> else
> - epoll_del(c, s);
> + epoll_del(c->epollfd, s);
>
> /* Adjustments unrelated to FIN segments: sequence numbers we dumped are
> * based on the end of the queues.
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 26cb63064583..666ee62b738f 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -204,8 +204,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
> }
>
> if (flag == CLOSING) {
> - epoll_del(c, conn->s[0]);
> - epoll_del(c, conn->s[1]);
> + epoll_del(c->epollfd, conn->s[0]);
> + epoll_del(c->epollfd, conn->s[1]);
> }
> }
>
> diff --git a/udp_flow.c b/udp_flow.c
> index cef3fb588bbe..84973f807167 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -51,7 +51,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
> flow_foreach_sidei(sidei) {
> flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
> if (uflow->s[sidei] >= 0) {
> - epoll_del(c, uflow->s[sidei]);
> + epoll_del(c->epollfd, uflow->s[sidei]);
> close(uflow->s[sidei]);
> uflow->s[sidei] = -1;
> }
> @@ -88,7 +88,7 @@ static int udp_flow_sock(const struct ctx *c,
> if (flowside_connect(c, s, pif, side) < 0) {
> int rc = -errno;
>
> - epoll_del(c, s);
> + epoll_del(c->epollfd, s);
> close(s);
>
> flow_dbg_perror(uflow, "Couldn't connect flow socket");
> diff --git a/util.c b/util.c
> index c492f904b3fc..88a91b1100f5 100644
> --- a/util.c
> +++ b/util.c
> @@ -996,12 +996,12 @@ void raw_random(void *buf, size_t buflen)
>
> /**
> * epoll_del() - Remove a file descriptor from our passt epoll
> - * @c: Execution context
> + * @epollfd: epoll file descriptor to add to
> * @fd: File descriptor to remove
> */
> -void epoll_del(const struct ctx *c, int fd)
> +void epoll_del(int epollfd, int fd)
> {
> - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
> + epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);
>
> }
>
> diff --git a/util.h b/util.h
> index 22eaac56e719..c61cbef357aa 100644
> --- a/util.h
> +++ b/util.h
> @@ -300,7 +300,7 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
> #define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
>
> void raw_random(void *buf, size_t buflen);
> -void epoll_del(const struct ctx *c, int fd);
> +void epoll_del(int epollfd, int fd);
>
> /*
> * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
> diff --git a/vhost_user.c b/vhost_user.c
> index fa343a86fac2..1221ac1abcd0 100644
> --- a/vhost_user.c
> +++ b/vhost_user.c
> @@ -733,7 +733,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
> vdev->vq[idx].call_fd = -1;
> }
> if (vdev->vq[idx].kick_fd != -1) {
> - epoll_del(vdev->context, vdev->vq[idx].kick_fd);
> + epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
> close(vdev->vq[idx].kick_fd);
> vdev->vq[idx].kick_fd = -1;
> }
> @@ -801,7 +801,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
> vu_check_queue_msg_file(vmsg);
>
> if (vdev->vq[idx].kick_fd != -1) {
> - epoll_del(vdev->context, vdev->vq[idx].kick_fd);
> + epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
> close(vdev->vq[idx].kick_fd);
> vdev->vq[idx].kick_fd = -1;
> }
> @@ -1092,7 +1092,7 @@ void vu_cleanup(struct vu_dev *vdev)
> vq->err_fd = -1;
> }
> if (vq->kick_fd != -1) {
> - epoll_del(vdev->context, vq->kick_fd);
> + epoll_del(vdev->context->epollfd, vq->kick_fd);
> close(vq->kick_fd);
> vq->kick_fd = -1;
> }
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 2/5] util: Move epoll registration out of sock_l4_sa()
2025-10-03 15:27 [PATCH 0/5] Refactor epoll handling in preparation for multithreading Laurent Vivier
2025-10-03 15:27 ` [PATCH 1/5] util: Simplify epoll_del() interface to take epollfd directly Laurent Vivier
@ 2025-10-03 15:27 ` Laurent Vivier
2025-10-07 5:57 ` David Gibson
2025-10-03 15:27 ` [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common Laurent Vivier
` (2 subsequent siblings)
4 siblings, 1 reply; 10+ messages in thread
From: Laurent Vivier @ 2025-10-03 15:27 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Move epoll_ctl() calls from sock_l4_sa() to the protocol-specific code
(icmp.c, pif.c, udp_flow.c) to give callers more control over epoll
registration. This allows sock_l4_sa() to focus solely on socket
creation and binding, while epoll management happens at a higher level.
Remove the data parameter from sock_l4_sa() and flowside_sock_l4() as
it's no longer needed - callers now construct the full epoll_ref and
register the socket themselves after creation.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
flow.c | 10 ++++------
flow.h | 2 +-
icmp.c | 19 +++++++++++++++----
pif.c | 34 ++++++++++++++++++++++++++++------
udp_flow.c | 17 ++++++++++++++++-
util.c | 15 +--------------
util.h | 2 +-
7 files changed, 66 insertions(+), 33 deletions(-)
diff --git a/flow.c b/flow.c
index feefda3ce74e..b14e9d8b63ff 100644
--- a/flow.c
+++ b/flow.c
@@ -163,7 +163,6 @@ static void flowside_from_af(struct flowside *side, sa_family_t af,
* @type: Socket epoll type
* @sa: Socket address
* @sl: Length of @sa
- * @data: epoll reference data
*/
struct flowside_sock_args {
const struct ctx *c;
@@ -173,7 +172,6 @@ struct flowside_sock_args {
const struct sockaddr *sa;
socklen_t sl;
const char *path;
- uint32_t data;
};
/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
@@ -188,7 +186,7 @@ static int flowside_sock_splice(void *arg)
ns_enter(a->c);
a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
- a->sa->sa_family == AF_INET6, a->data);
+ a->sa->sa_family == AF_INET6);
a->err = errno;
return 0;
@@ -205,7 +203,7 @@ static int flowside_sock_splice(void *arg)
* (if specified).
*/
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
- const struct flowside *tgt, uint32_t data)
+ const struct flowside *tgt)
{
const char *ifname = NULL;
union sockaddr_inany sa;
@@ -225,12 +223,12 @@ int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
ifname = c->ip6.ifname_out;
return sock_l4_sa(c, type, &sa, sl, ifname,
- sa.sa_family == AF_INET6, data);
+ sa.sa_family == AF_INET6);
case PIF_SPLICE: {
struct flowside_sock_args args = {
.c = c, .type = type,
- .sa = &sa.sa, .sl = sl, .data = data,
+ .sa = &sa.sa, .sl = sl,
};
NS_CALL(flowside_sock_splice, &args);
errno = args.err;
diff --git a/flow.h b/flow.h
index cac618ad0ca1..ef138b83add8 100644
--- a/flow.h
+++ b/flow.h
@@ -167,7 +167,7 @@ static inline bool flowside_eq(const struct flowside *left,
}
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
- const struct flowside *tgt, uint32_t data);
+ const struct flowside *tgt);
int flowside_connect(const struct ctx *c, int s,
uint8_t pif, const struct flowside *tgt);
diff --git a/icmp.c b/icmp.c
index bd3108a21675..d6f0abe68269 100644
--- a/icmp.c
+++ b/icmp.c
@@ -172,10 +172,11 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
{
uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
- union epoll_ref ref = { .type = EPOLL_TYPE_PING };
union flow *flow = flow_alloc();
struct icmp_ping_flow *pingf;
const struct flowside *tgt;
+ struct epoll_event ev;
+ union epoll_ref ref;
if (!flow)
return NULL;
@@ -196,9 +197,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
pingf->seq = -1;
- ref.flowside = FLOW_SIDX(flow, TGTSIDE);
- pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST,
- tgt, ref.data);
+ pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST, tgt);
if (pingf->sock < 0) {
warn("Cannot open \"ping\" socket. You might need to:");
@@ -210,6 +209,18 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
if (pingf->sock > FD_REF_MAX)
goto cancel;
+ ref.type = EPOLL_TYPE_PING;
+ ref.flowside = FLOW_SIDX(flow, TGTSIDE);
+ ref.fd = pingf->sock;
+
+ ev.events = EPOLLIN;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, pingf->sock, &ev) == -1) {
+ warn_perror("L4 epoll_ctl");
+ close(pingf->sock);
+ goto cancel;
+ }
+
flow_dbg(pingf, "new socket %i for echo ID %"PRIu16, pingf->sock, id);
flow_hash_insert(c, FLOW_SIDX(pingf, INISIDE));
diff --git a/pif.c b/pif.c
index 592fafaab58a..906be4025f58 100644
--- a/pif.c
+++ b/pif.c
@@ -5,9 +5,11 @@
* Passt/pasta interface types and IDs
*/
+#include <errno.h>
#include <stdint.h>
#include <assert.h>
#include <netinet/in.h>
+#include <sys/epoll.h>
#include "util.h"
#include "pif.h"
@@ -83,6 +85,8 @@ int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
.sa6.sin6_addr = in6addr_any,
.sa6.sin6_port = htons(port),
};
+ struct epoll_event ev;
+ union epoll_ref ref;
socklen_t sl;
ASSERT(pif_is_socket(pif));
@@ -93,11 +97,29 @@ int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
ASSERT(addr && inany_is_loopback(addr));
}
- if (!addr)
- return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
- ifname, false, data);
+ if (!addr) {
+ ref.fd = sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
+ ifname, false);
+ } else {
+ pif_sockaddr(c, &sa, &sl, pif, addr, port);
+ ref.fd = sock_l4_sa(c, type, &sa, sl,
+ ifname, sa.sa_family == AF_INET6);
+ }
+
+ if (ref.fd < 0)
+ return ref.fd;
+
+ ref.type = type;
+ ref.data = data;
+
+ ev.events = EPOLLIN;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev) == -1) {
+ int ret = -errno;
+ close(ref.fd);
+ warn("L4 epoll_ctl: %s", strerror_(-ret));
+ return ret;
+ }
- pif_sockaddr(c, &sa, &sl, pif, addr, port);
- return sock_l4_sa(c, type, &sa, sl,
- ifname, sa.sa_family == AF_INET6, data);
+ return ref.fd;
}
diff --git a/udp_flow.c b/udp_flow.c
index 84973f807167..f99994523a6c 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -77,14 +77,29 @@ static int udp_flow_sock(const struct ctx *c,
flow_sidx_t sidx;
uint32_t data;
} fref = { .sidx = FLOW_SIDX(uflow, sidei) };
+ struct epoll_event ev;
+ union epoll_ref ref;
int s;
- s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
+ s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side);
if (s < 0) {
flow_dbg_perror(uflow, "Couldn't open flow specific socket");
return s;
}
+ ref.type = EPOLL_TYPE_UDP;
+ ref.data = fref.data;
+ ref.fd = s;
+
+ ev.events = EPOLLIN;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev) == -1) {
+ int rc = -errno;
+ close(s);
+ warn("L4 epoll_ctl: %s", strerror_(-rc));
+ return rc;
+ }
+
if (flowside_connect(c, s, pif, side) < 0) {
int rc = -errno;
diff --git a/util.c b/util.c
index 88a91b1100f5..17c062c85bf8 100644
--- a/util.c
+++ b/util.c
@@ -47,18 +47,15 @@
* @sl: Length of @sa
* @ifname: Interface for binding, NULL for any
* @v6only: Set IPV6_V6ONLY socket option
- * @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, negative error code on failure
*/
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
- const char *ifname, bool v6only, uint32_t data)
+ const char *ifname, bool v6only)
{
sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
- union epoll_ref ref = { .type = type, .data = data };
bool freebind = false;
- struct epoll_event ev;
int fd, y = 1, ret;
uint8_t proto;
int socktype;
@@ -100,8 +97,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return -EBADF;
}
- ref.fd = fd;
-
if (v6only)
if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y)))
debug("Failed to set IPV6_V6ONLY on socket %i", fd);
@@ -172,14 +167,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return ret;
}
- ev.events = EPOLLIN;
- ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
- ret = -errno;
- warn("L4 epoll_ctl: %s", strerror_(-ret));
- return ret;
- }
-
return fd;
}
diff --git a/util.h b/util.h
index c61cbef357aa..89478de023f9 100644
--- a/util.h
+++ b/util.h
@@ -204,7 +204,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
- const char *ifname, bool v6only, uint32_t data);
+ const char *ifname, bool v6only);
int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
--
2.50.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/5] util: Move epoll registration out of sock_l4_sa()
2025-10-03 15:27 ` [PATCH 2/5] util: Move epoll registration out of sock_l4_sa() Laurent Vivier
@ 2025-10-07 5:57 ` David Gibson
0 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2025-10-07 5:57 UTC (permalink / raw)
To: Laurent Vivier; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 2629 bytes --]
On Fri, Oct 03, 2025 at 05:27:14PM +0200, Laurent Vivier wrote:
> Move epoll_ctl() calls from sock_l4_sa() to the protocol-specific code
> (icmp.c, pif.c, udp_flow.c) to give callers more control over epoll
> registration. This allows sock_l4_sa() to focus solely on socket
> creation and binding, while epoll management happens at a higher level.
>
> Remove the data parameter from sock_l4_sa() and flowside_sock_l4() as
> it's no longer needed - callers now construct the full epoll_ref and
> register the socket themselves after creation.
I like this idea in principle. I've previously thought that doing the
epoll registration in sock_l4_sa() was verging on a layering
violation. However...
[snip]
> diff --git a/icmp.c b/icmp.c
> index bd3108a21675..d6f0abe68269 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -172,10 +172,11 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
> {
> uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
> uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
> - union epoll_ref ref = { .type = EPOLL_TYPE_PING };
> union flow *flow = flow_alloc();
> struct icmp_ping_flow *pingf;
> const struct flowside *tgt;
> + struct epoll_event ev;
> + union epoll_ref ref;
>
> if (!flow)
> return NULL;
> @@ -196,9 +197,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
>
> pingf->seq = -1;
>
> - ref.flowside = FLOW_SIDX(flow, TGTSIDE);
> - pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST,
> - tgt, ref.data);
> + pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST, tgt);
>
> if (pingf->sock < 0) {
> warn("Cannot open \"ping\" socket. You might need to:");
> @@ -210,6 +209,18 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
> if (pingf->sock > FD_REF_MAX)
> goto cancel;
>
> + ref.type = EPOLL_TYPE_PING;
> + ref.flowside = FLOW_SIDX(flow, TGTSIDE);
> + ref.fd = pingf->sock;
> +
> + ev.events = EPOLLIN;
> + ev.data.u64 = ref.u64;
> + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, pingf->sock, &ev) == -1) {
> + warn_perror("L4 epoll_ctl");
> + close(pingf->sock);
> + goto cancel;
> + }
> +
... this is uncomfortably bulky to have in every protocol. Could we
maybe mitigate this with an epoll_add() helper of some sort that does
at least some of the bit shuffling to construct the epoll data?
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common
2025-10-03 15:27 [PATCH 0/5] Refactor epoll handling in preparation for multithreading Laurent Vivier
2025-10-03 15:27 ` [PATCH 1/5] util: Simplify epoll_del() interface to take epollfd directly Laurent Vivier
2025-10-03 15:27 ` [PATCH 2/5] util: Move epoll registration out of sock_l4_sa() Laurent Vivier
@ 2025-10-03 15:27 ` Laurent Vivier
2025-10-07 6:07 ` David Gibson
2025-10-03 15:27 ` [PATCH 4/5] icmp: Use epollfd from flow_common structure Laurent Vivier
2025-10-03 15:27 ` [PATCH 5/5] udp: " Laurent Vivier
4 siblings, 1 reply; 10+ messages in thread
From: Laurent Vivier @ 2025-10-03 15:27 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked
whether a connection was registered with epoll, not which epoll instance.
This limited flexibility for future multi-epoll support.
Replace the boolean with an epollfd field in flow_common that serves dual
purpose: zero indicates not registered (replacing in_epoll=false), non-zero
stores the actual epoll fd (replacing in_epoll=true).
This change also simplifies tcp_timer_ctl() by removing the need to pass
the context 'c', since the epoll fd is now directly accessible from the
connection structure.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
flow.c | 2 +-
flow.h | 2 ++
tcp.c | 36 ++++++++++++++++++------------------
tcp_conn.h | 8 +-------
tcp_splice.c | 23 +++++++++++------------
5 files changed, 33 insertions(+), 38 deletions(-)
diff --git a/flow.c b/flow.c
index b14e9d8b63ff..7c61ee87ae9d 100644
--- a/flow.c
+++ b/flow.c
@@ -827,7 +827,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
case FLOW_TCP_SPLICE:
closed = tcp_splice_flow_defer(&flow->tcp_splice);
if (!closed && timer)
- tcp_splice_timer(c, &flow->tcp_splice);
+ tcp_splice_timer(&flow->tcp_splice);
break;
case FLOW_PING4:
case FLOW_PING6:
diff --git a/flow.h b/flow.h
index ef138b83add8..592d9e3792f6 100644
--- a/flow.h
+++ b/flow.h
@@ -175,6 +175,7 @@ int flowside_connect(const struct ctx *c, int s,
* struct flow_common - Common fields for packet flows
* @state: State of the flow table entry
* @type: Type of packet flow
+ * @epollfd: epoll instance flow is registered with (0 if not registered)
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
*/
@@ -190,6 +191,7 @@ struct flow_common {
static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS,
"Not enough bits for type field");
#endif
+ int epollfd;
uint8_t pif[SIDES];
struct flowside side[SIDES];
};
diff --git a/tcp.c b/tcp.c
index 04725deabb65..c995b40f38f8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -504,25 +504,26 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
struct epoll_event ev = { .data.u64 = ref.u64 };
+ int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
if (conn->events == CLOSED) {
- if (conn->in_epoll)
- epoll_del(c->epollfd, conn->sock);
+ if (conn->f.epollfd)
+ epoll_del(epollfd, conn->sock);
if (conn->timer != -1)
- epoll_del(c->epollfd, conn->timer);
+ epoll_del(epollfd, conn->timer);
return 0;
}
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+ if (epoll_ctl(epollfd, m, conn->sock, &ev))
return -errno;
- conn->in_epoll = true;
+ conn->f.epollfd = epollfd;
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@@ -531,7 +532,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
return -errno;
}
@@ -540,12 +541,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
/**
* tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
- * @c: Execution context
* @conn: Connection pointer
*
* #syscalls timerfd_create timerfd_settime
*/
-static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_timer_ctl(struct tcp_tap_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
@@ -570,7 +570,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
conn->timer = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
+ if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
flow_dbg_perror(conn, "failed to add timer");
close(conn->timer);
conn->timer = -1;
@@ -628,7 +628,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
* flags and factor this into the logic below.
*/
if (flag == ACK_FROM_TAP_DUE)
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
return;
}
@@ -644,7 +644,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
(flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
/**
@@ -699,7 +699,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_epoll_ctl(c, conn);
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
/**
@@ -1732,7 +1732,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
seq, conn->seq_from_tap);
tcp_send_flag(c, conn, ACK);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
if (p->count == 1) {
tcp_tap_window_update(c, conn,
@@ -2375,7 +2375,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
if (conn->flags & ACK_TO_TAP_DUE) {
tcp_send_flag(c, conn, ACK_IF_NEEDED);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) {
flow_dbg(conn, "handshake timeout");
@@ -2397,7 +2397,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
return;
tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -3445,7 +3445,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
if (c->migrate_no_linger)
close(s);
else
- epoll_del(c->epollfd, s);
+ epoll_del(conn->f.epollfd, s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
@@ -3594,7 +3594,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
return rc;
}
- conn->in_epoll = 0;
+ conn->f.epollfd = 0;
conn->timer = -1;
conn->listening_sock = -1;
diff --git a/tcp_conn.h b/tcp_conn.h
index 38b5c541f003..81333122d531 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -12,7 +12,6 @@
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
* @f: Generic flow information
- * @in_epoll: Is the connection in the epoll set?
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
@@ -36,8 +35,6 @@ struct tcp_tap_conn {
/* Must be first element */
struct flow_common f;
- bool in_epoll :1;
-
#define TCP_RETRANS_BITS 3
unsigned int retrans :TCP_RETRANS_BITS;
#define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
@@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext {
* @written: Bytes written (not fully written from one other side read)
* @events: Events observed/actions performed on connection
* @flags: Connection flags (attributes, not events)
- * @in_epoll: Is the connection in the epoll set?
*/
struct tcp_splice_conn {
/* Must be first element */
@@ -220,8 +216,6 @@ struct tcp_splice_conn {
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
#define CLOSING BIT(4)
-
- bool in_epoll :1;
};
/* Socket pools */
@@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
+void tcp_splice_timer(struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_sock(sa_family_t af);
int tcp_sock_refill_pool(int pool[], sa_family_t af);
diff --git a/tcp_splice.c b/tcp_splice.c
index 666ee62b738f..49fb43473de6 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -149,7 +149,7 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
const union epoll_ref ref[SIDES] = {
{ .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
.flowside = FLOW_SIDX(conn, 0) },
@@ -158,28 +158,28 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
};
struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
{ .data.u64 = ref[1].u64 } };
+ int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
tcp_splice_conn_epoll_events(conn->events, ev);
- if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
- epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
+
+ if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
+ epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
int ret = -errno;
flow_perror(conn, "ERROR on epoll_ctl()");
return ret;
}
-
- conn->in_epoll = true;
+ conn->f.epollfd = epollfd;
return 0;
}
/**
* conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
- * @c: Execution context
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
+static void conn_flag_do(struct tcp_splice_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@@ -204,15 +204,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_del(c->epollfd, conn->s[0]);
- epoll_del(c->epollfd, conn->s[1]);
+ epoll_del(conn->f.epollfd, conn->s[0]);
+ epoll_del(conn->f.epollfd, conn->s[1]);
}
}
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
+ conn_flag_do(conn, flag); \
} while (0)
/**
@@ -751,10 +751,9 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
- * @c: Execution context
* @conn: Connection to handle
*/
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
+void tcp_splice_timer(struct tcp_splice_conn *conn)
{
unsigned sidei;
--
2.50.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common
2025-10-03 15:27 ` [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common Laurent Vivier
@ 2025-10-07 6:07 ` David Gibson
2025-10-07 9:51 ` Stefano Brivio
0 siblings, 1 reply; 10+ messages in thread
From: David Gibson @ 2025-10-07 6:07 UTC (permalink / raw)
To: Laurent Vivier; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 12694 bytes --]
On Fri, Oct 03, 2025 at 05:27:15PM +0200, Laurent Vivier wrote:
> The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked
> whether a connection was registered with epoll, not which epoll instance.
> This limited flexibility for future multi-epoll support.
>
> Replace the boolean with an epollfd field in flow_common that serves dual
> purpose: zero indicates not registered (replacing in_epoll=false), non-zero
Don't use 0, since that's a valid fd.
> stores the actual epoll fd (replacing in_epoll=true).
I am a bit nervous about adding 31-bits to every flow, since I think
we're fairly close to a cacheline threshold.
I'm not sure we really can add any less to flow_common, though, given
alignment.
Then again... we probably don't need 8 bites each for TYPE and STATE,
so those could be packed tighter. Then we could use a limited-bits
index into a table of epollfds, rather than a raw fd. Much uglier,
but maybe worth it?
> This change also simplifies tcp_timer_ctl() by removing the need to pass
> the context 'c', since the epoll fd is now directly accessible from the
> connection structure.
>
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
> flow.c | 2 +-
> flow.h | 2 ++
> tcp.c | 36 ++++++++++++++++++------------------
> tcp_conn.h | 8 +-------
> tcp_splice.c | 23 +++++++++++------------
> 5 files changed, 33 insertions(+), 38 deletions(-)
>
> diff --git a/flow.c b/flow.c
> index b14e9d8b63ff..7c61ee87ae9d 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -827,7 +827,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
> case FLOW_TCP_SPLICE:
> closed = tcp_splice_flow_defer(&flow->tcp_splice);
> if (!closed && timer)
> - tcp_splice_timer(c, &flow->tcp_splice);
> + tcp_splice_timer(&flow->tcp_splice);
> break;
> case FLOW_PING4:
> case FLOW_PING6:
> diff --git a/flow.h b/flow.h
> index ef138b83add8..592d9e3792f6 100644
> --- a/flow.h
> +++ b/flow.h
> @@ -175,6 +175,7 @@ int flowside_connect(const struct ctx *c, int s,
> * struct flow_common - Common fields for packet flows
> * @state: State of the flow table entry
> * @type: Type of packet flow
> + * @epollfd: epoll instance flow is registered with (0 if not registered)
> * @pif[]: Interface for each side of the flow
> * @side[]: Information for each side of the flow
> */
> @@ -190,6 +191,7 @@ struct flow_common {
> static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS,
> "Not enough bits for type field");
> #endif
> + int epollfd;
This should go after pif[] - it's a less logical order, but it will
save 2 bytes of alignment padding.
> uint8_t pif[SIDES];
> struct flowside side[SIDES];
> };
> diff --git a/tcp.c b/tcp.c
> index 04725deabb65..c995b40f38f8 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -504,25 +504,26 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
> */
> static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> {
> - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> + int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
> .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
> struct epoll_event ev = { .data.u64 = ref.u64 };
> + int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
>
> if (conn->events == CLOSED) {
> - if (conn->in_epoll)
> - epoll_del(c->epollfd, conn->sock);
> + if (conn->f.epollfd)
> + epoll_del(epollfd, conn->sock);
> if (conn->timer != -1)
> - epoll_del(c->epollfd, conn->timer);
> + epoll_del(epollfd, conn->timer);
> return 0;
> }
>
> ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
>
> - if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
> + if (epoll_ctl(epollfd, m, conn->sock, &ev))
> return -errno;
>
> - conn->in_epoll = true;
> + conn->f.epollfd = epollfd;
>
> if (conn->timer != -1) {
> union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
> @@ -531,7 +532,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> struct epoll_event ev_t = { .data.u64 = ref_t.u64,
> .events = EPOLLIN | EPOLLET };
>
> - if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
> + if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
> return -errno;
> }
>
> @@ -540,12 +541,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>
> /**
> * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
> - * @c: Execution context
> * @conn: Connection pointer
> *
> * #syscalls timerfd_create timerfd_settime
> */
> -static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> +static void tcp_timer_ctl(struct tcp_tap_conn *conn)
> {
> struct itimerspec it = { { 0 }, { 0 } };
>
> @@ -570,7 +570,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> }
> conn->timer = fd;
>
> - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
> + if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
> flow_dbg_perror(conn, "failed to add timer");
> close(conn->timer);
> conn->timer = -1;
> @@ -628,7 +628,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> * flags and factor this into the logic below.
> */
> if (flag == ACK_FROM_TAP_DUE)
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
>
> return;
> }
> @@ -644,7 +644,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
> (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
> (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
> }
>
> /**
> @@ -699,7 +699,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> tcp_epoll_ctl(c, conn);
>
> if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
> }
>
> /**
> @@ -1732,7 +1732,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> seq, conn->seq_from_tap);
>
> tcp_send_flag(c, conn, ACK);
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
>
> if (p->count == 1) {
> tcp_tap_window_update(c, conn,
> @@ -2375,7 +2375,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>
> if (conn->flags & ACK_TO_TAP_DUE) {
> tcp_send_flag(c, conn, ACK_IF_NEEDED);
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
> } else if (conn->flags & ACK_FROM_TAP_DUE) {
> if (!(conn->events & ESTABLISHED)) {
> flow_dbg(conn, "handshake timeout");
> @@ -2397,7 +2397,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> return;
>
> tcp_data_from_sock(c, conn);
> - tcp_timer_ctl(c, conn);
> + tcp_timer_ctl(conn);
> }
> } else {
> struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
> @@ -3445,7 +3445,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
> if (c->migrate_no_linger)
> close(s);
> else
> - epoll_del(c->epollfd, s);
> + epoll_del(conn->f.epollfd, s);
>
> /* Adjustments unrelated to FIN segments: sequence numbers we dumped are
> * based on the end of the queues.
> @@ -3594,7 +3594,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
> return rc;
> }
>
> - conn->in_epoll = 0;
> + conn->f.epollfd = 0;
> conn->timer = -1;
> conn->listening_sock = -1;
>
> diff --git a/tcp_conn.h b/tcp_conn.h
> index 38b5c541f003..81333122d531 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -12,7 +12,6 @@
> /**
> * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
> * @f: Generic flow information
> - * @in_epoll: Is the connection in the epoll set?
> * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
> * @ws_from_tap: Window scaling factor advertised from tap/guest
> * @ws_to_tap: Window scaling factor advertised to tap/guest
> @@ -36,8 +35,6 @@ struct tcp_tap_conn {
> /* Must be first element */
> struct flow_common f;
>
> - bool in_epoll :1;
> -
> #define TCP_RETRANS_BITS 3
> unsigned int retrans :TCP_RETRANS_BITS;
> #define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
> @@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext {
> * @written: Bytes written (not fully written from one other side read)
> * @events: Events observed/actions performed on connection
> * @flags: Connection flags (attributes, not events)
> - * @in_epoll: Is the connection in the epoll set?
> */
> struct tcp_splice_conn {
> /* Must be first element */
> @@ -220,8 +216,6 @@ struct tcp_splice_conn {
> #define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
> #define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
> #define CLOSING BIT(4)
> -
> - bool in_epoll :1;
> };
>
> /* Socket pools */
> @@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
> bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
>
> bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
> -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
> +void tcp_splice_timer(struct tcp_splice_conn *conn);
> int tcp_conn_pool_sock(int pool[]);
> int tcp_conn_sock(sa_family_t af);
> int tcp_sock_refill_pool(int pool[], sa_family_t af);
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 666ee62b738f..49fb43473de6 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -149,7 +149,7 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
> static int tcp_splice_epoll_ctl(const struct ctx *c,
> struct tcp_splice_conn *conn)
> {
> - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> + int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> const union epoll_ref ref[SIDES] = {
> { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
> .flowside = FLOW_SIDX(conn, 0) },
> @@ -158,28 +158,28 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
> };
> struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
> { .data.u64 = ref[1].u64 } };
> + int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
>
> tcp_splice_conn_epoll_events(conn->events, ev);
>
> - if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
> - epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
> +
> + if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
> + epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
> int ret = -errno;
> flow_perror(conn, "ERROR on epoll_ctl()");
> return ret;
> }
> -
> - conn->in_epoll = true;
> + conn->f.epollfd = epollfd;
>
> return 0;
> }
>
> /**
> * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
> - * @c: Execution context
> * @conn: Connection pointer
> * @flag: Flag to set, or ~flag to unset
> */
> -static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
> +static void conn_flag_do(struct tcp_splice_conn *conn,
> unsigned long flag)
> {
> if (flag & (flag - 1)) {
> @@ -204,15 +204,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
> }
>
> if (flag == CLOSING) {
> - epoll_del(c->epollfd, conn->s[0]);
> - epoll_del(c->epollfd, conn->s[1]);
> + epoll_del(conn->f.epollfd, conn->s[0]);
> + epoll_del(conn->f.epollfd, conn->s[1]);
> }
> }
>
> #define conn_flag(c, conn, flag) \
> do { \
> flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
> - conn_flag_do(c, conn, flag); \
> + conn_flag_do(conn, flag); \
> } while (0)
>
> /**
> @@ -751,10 +751,9 @@ void tcp_splice_init(struct ctx *c)
>
> /**
> * tcp_splice_timer() - Timer for spliced connections
> - * @c: Execution context
> * @conn: Connection to handle
> */
> -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
> +void tcp_splice_timer(struct tcp_splice_conn *conn)
> {
> unsigned sidei;
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common
2025-10-07 6:07 ` David Gibson
@ 2025-10-07 9:51 ` Stefano Brivio
0 siblings, 0 replies; 10+ messages in thread
From: Stefano Brivio @ 2025-10-07 9:51 UTC (permalink / raw)
To: David Gibson; +Cc: Laurent Vivier, passt-dev
On Tue, 7 Oct 2025 17:07:37 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:
> On Fri, Oct 03, 2025 at 05:27:15PM +0200, Laurent Vivier wrote:
> > The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked
> > whether a connection was registered with epoll, not which epoll instance.
> > This limited flexibility for future multi-epoll support.
> >
> > Replace the boolean with an epollfd field in flow_common that serves dual
> > purpose: zero indicates not registered (replacing in_epoll=false), non-zero
>
> Don't use 0, since that's a valid fd.
>
> > stores the actual epoll fd (replacing in_epoll=true).
>
> I am a bit nervous about adding 31-bits to every flow, since I think
> we're fairly close to a cacheline threshold.
I mentioned to Laurent in our weekly call on Monday the same concern
and that I would look into it (first step, I would say, actually check
with pahole?) and, if it's an issue, into possible tricks to avoid
making the struct too big. I still plan to do this.
--
Stefano
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 4/5] icmp: Use epollfd from flow_common structure
2025-10-03 15:27 [PATCH 0/5] Refactor epoll handling in preparation for multithreading Laurent Vivier
` (2 preceding siblings ...)
2025-10-03 15:27 ` [PATCH 3/5] tcp, flow: Replace per-connection in_epoll flag with epollfd in flow_common Laurent Vivier
@ 2025-10-03 15:27 ` Laurent Vivier
2025-10-03 15:27 ` [PATCH 5/5] udp: " Laurent Vivier
4 siblings, 0 replies; 10+ messages in thread
From: Laurent Vivier @ 2025-10-03 15:27 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Store epollfd in the flow_common structure for ICMP ping flows and use
it for epoll operations instead of passing c->epollfd directly. This
makes ICMP consistent with the recent TCP changes and follows the
pattern established in previous commit.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
icmp.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/icmp.c b/icmp.c
index d6f0abe68269..2246e1c9ef1d 100644
--- a/icmp.c
+++ b/icmp.c
@@ -151,7 +151,7 @@ unexpected:
static void icmp_ping_close(const struct ctx *c,
const struct icmp_ping_flow *pingf)
{
- epoll_del(c->epollfd, pingf->sock);
+ epoll_del(pingf->f.epollfd, pingf->sock);
close(pingf->sock);
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
}
@@ -213,9 +213,11 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
ref.flowside = FLOW_SIDX(flow, TGTSIDE);
ref.fd = pingf->sock;
+ pingf->f.epollfd = c->epollfd;
+
ev.events = EPOLLIN;
ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, pingf->sock, &ev) == -1) {
+ if (epoll_ctl(pingf->f.epollfd, EPOLL_CTL_ADD, pingf->sock, &ev) == -1) {
warn_perror("L4 epoll_ctl");
close(pingf->sock);
goto cancel;
--
2.50.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 5/5] udp: Use epollfd from flow_common structure
2025-10-03 15:27 [PATCH 0/5] Refactor epoll handling in preparation for multithreading Laurent Vivier
` (3 preceding siblings ...)
2025-10-03 15:27 ` [PATCH 4/5] icmp: Use epollfd from flow_common structure Laurent Vivier
@ 2025-10-03 15:27 ` Laurent Vivier
4 siblings, 0 replies; 10+ messages in thread
From: Laurent Vivier @ 2025-10-03 15:27 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Update UDP flow handling to use the epollfd field from the flow_common
structure instead of accessing it from the context. This aligns UDP with
the recent changes to TCP and ICMP flow management.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
udp_flow.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/udp_flow.c b/udp_flow.c
index f99994523a6c..9843b9f5fb2f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -51,7 +51,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
flow_foreach_sidei(sidei) {
flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
if (uflow->s[sidei] >= 0) {
- epoll_del(c->epollfd, uflow->s[sidei]);
+ epoll_del(uflow->f.epollfd, uflow->s[sidei]);
close(uflow->s[sidei]);
uflow->s[sidei] = -1;
}
@@ -91,9 +91,11 @@ static int udp_flow_sock(const struct ctx *c,
ref.data = fref.data;
ref.fd = s;
+ uflow->f.epollfd = c->epollfd;
+
ev.events = EPOLLIN;
ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev) == -1) {
+ if (epoll_ctl(uflow->f.epollfd, EPOLL_CTL_ADD, s, &ev) == -1) {
int rc = -errno;
close(s);
warn("L4 epoll_ctl: %s", strerror_(-rc));
@@ -103,7 +105,7 @@ static int udp_flow_sock(const struct ctx *c,
if (flowside_connect(c, s, pif, side) < 0) {
int rc = -errno;
- epoll_del(c->epollfd, s);
+ epoll_del(uflow->f.epollfd, s);
close(s);
flow_dbg_perror(uflow, "Couldn't connect flow socket");
--
2.50.1
^ permalink raw reply [flat|nested] 10+ messages in thread