On Fri, Dec 19, 2025 at 05:45:15PM +0100, Laurent Vivier wrote: > Currently, each flow type (TCP, TCP_SPLICE, PING, UDP) has its own > code to add or modify file descriptors in epoll. This leads to > duplicated boilerplate code across icmp.c, tcp.c, tcp_splice.c, and > udp_flow.c, each setting up epoll_ref unions and calling epoll_ctl() > with flow-type-specific details. > > Introduce flow_epoll_set() in flow.c to handle epoll operations for > all flow types in a unified way. The function takes an explicit epoll > type parameter, allowing it to handle not only flow socket types but > also the TCP timer (EPOLL_TYPE_TCP_TIMER). > > This will be needed to migrate queue pair from an epollfd to another. > > Signed-off-by: Laurent Vivier Love the concept. Couple of warts in execution. > --- > flow.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > flow.h | 3 +++ > icmp.c | 9 ++----- > tcp.c | 39 +++++++++-------------------- > tcp_splice.c | 27 +++++++------------- > udp_flow.c | 12 +-------- > 6 files changed, 97 insertions(+), 63 deletions(-) > > diff --git a/flow.c b/flow.c > index 4f53486586cd..9c98102e3616 100644 > --- a/flow.c > +++ b/flow.c > @@ -20,6 +20,7 @@ > #include "flow.h" > #include "flow_table.h" > #include "repair.h" > +#include "epoll_ctl.h" > > const char *flow_state_str[] = { > [FLOW_STATE_FREE] = "FREE", > @@ -390,6 +391,75 @@ void flow_epollid_clear(struct flow_common *f) > f->epollid = EPOLLFD_ID_INVALID; > } > > +/** > + * flow_epoll_set() - Add or modify epoll registration for a flow socket > + * @c: Execution context > + * @type: epoll type > + * @f: Flow to register socket for > + * @events: epoll events to watch for > + * @fd: File descriptor to register > + * @sidei: Side index of the flow > + * > + * Return: 0 on success, -1 on error (from epoll_ctl()) > + */ > +int flow_epoll_set(const struct ctx *c, enum epoll_type type, > + const struct flow_common *f, uint32_t events, int fd, > + unsigned int sidei) > +{ > + struct epoll_event ev; > + union epoll_ref ref; > + int epollfd; > + int m; > + > + ref.fd = fd; > + ref.type = type; > + > + switch (type) { > + case EPOLL_TYPE_TCP_SPLICE: > + case EPOLL_TYPE_TCP: > + ref.flowside = flow_sidx(f, sidei); > + if (flow_in_epoll(f)) { > + m = EPOLL_CTL_MOD; > + epollfd = flow_epollfd(f); > + } else { > + m = EPOLL_CTL_ADD; > + epollfd = c->epollfd; > + } It looks like the handling of flow_in_epoll() / flow_epoll() should be able to be common between all these epoll types. I'm guessing that doesn't quite work, because flow_in_epoll() isn't yet updated properly for all the types. Might this be cleaner if that was fixed first? > + break; > + case EPOLL_TYPE_TCP_TIMER: { > + const struct tcp_tap_conn *conn = &((union flow *)f)->tcp; > + > + ref.flow = flow_idx(f); > + m = conn->timer == -1 ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; > + epollfd = flow_epollfd(f); > + break; > + } > + case EPOLL_TYPE_PING: > + ref.flowside = flow_sidx(f, sidei); > + m = EPOLL_CTL_ADD; > + epollfd = flow_epollfd(f); > + break; That would allow this case to be the same as the TCP ones. > + case EPOLL_TYPE_UDP: { > + union { > + flow_sidx_t sidx; > + uint32_t data; > + } fref = { .sidx = flow_sidx(f, sidei) }; I'm not sure quite what this union is about. EPOLL_TYPE_UDP uses epoll_ref.flowside, same as the non-timer TCP types. epoll_ref.udp is only for udp EPOLL_TYPE_UDP_LISTEN. Arguably we should change the name to reflect that, but I'm working towards unifying the epoll_ref variants for the various listening sockets anyway. > + > + ref.data = fref.data; > + m = EPOLL_CTL_ADD; > + epollfd = flow_epollfd(f); Given the above, this should be identical to the EPOLL_TYPE_PING case already, and apart from the EPOLL_CTL_ADD handling, it's identical to the non-timer TCP cases. > + break; > + } > + default: > + ASSERT(0); > + } > + > + ev.events = events; > + ev.data.u64 = ref.u64; > + > + return epoll_ctl(epollfd, m, fd, &ev); > +} > + > /** > * flow_epollid_register() - Initialize the epoll id -> fd mapping > * @epollid: epoll id to associate to > diff --git a/flow.h b/flow.h > index b43b0b1dd7f2..388fab124238 100644 > --- a/flow.h > +++ b/flow.h > @@ -265,6 +265,9 @@ bool flow_in_epoll(const struct flow_common *f); > int flow_epollfd(const struct flow_common *f); > void flow_epollid_set(struct flow_common *f, int epollid); > void flow_epollid_clear(struct flow_common *f); > +int flow_epoll_set(const struct ctx *c, enum epoll_type type, > + const struct flow_common *f, uint32_t events, int fd, > + unsigned int sidei); > void flow_epollid_register(int epollid, int epollfd); > void flow_defer_handler(const struct ctx *c, const struct timespec *now); > int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, > diff --git a/icmp.c b/icmp.c > index 9564c4963f7b..235759567060 100644 > --- a/icmp.c > +++ b/icmp.c > @@ -177,7 +177,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, > union flow *flow = flow_alloc(); > struct icmp_ping_flow *pingf; > const struct flowside *tgt; > - union epoll_ref ref; > > if (!flow) > return NULL; > @@ -211,12 +210,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, > goto cancel; > > flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT); > - > - ref.type = EPOLL_TYPE_PING; > - ref.flowside = FLOW_SIDX(flow, TGTSIDE); > - ref.fd = pingf->sock; > - > - if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) { > + if (flow_epoll_set(c, EPOLL_TYPE_PING, &pingf->f, EPOLLIN, pingf->sock, > + TGTSIDE) < 0) { > close(pingf->sock); > goto cancel; > } > diff --git a/tcp.c b/tcp.c > index 6e6156098c12..2907af282551 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -530,14 +530,10 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) > */ > static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > { > - int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > - union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, > - .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; > - struct epoll_event ev = { .data.u64 = ref.u64 }; > - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) > - : c->epollfd; > + uint32_t events; > > if (conn->events == CLOSED) { > + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) : c->epollfd; > if (flow_in_epoll(&conn->f)) > epoll_del(epollfd, conn->sock); > if (conn->timer != -1) > @@ -545,22 +541,17 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - ev.events = tcp_conn_epoll_events(conn->events, conn->flags); > + events = tcp_conn_epoll_events(conn->events, conn->flags); > > - if (epoll_ctl(epollfd, m, conn->sock, &ev)) > + if (flow_epoll_set(c, EPOLL_TYPE_TCP, &conn->f, events, conn->sock, > + !TAPSIDE(conn))) > return -errno; > > flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); > > if (conn->timer != -1) { > - union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, > - .fd = conn->timer, > - .flow = FLOW_IDX(conn) }; > - struct epoll_event ev_t = { .data.u64 = ref_t.u64, > - .events = EPOLLIN | EPOLLET }; > - > - if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, > - conn->timer, &ev_t)) > + if (flow_epoll_set(c, EPOLL_TYPE_TCP_TIMER, &conn->f, > + EPOLLIN | EPOLLET, conn->timer, 0)) > return -errno; > } > > @@ -581,11 +572,6 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > return; > > if (conn->timer == -1) { > - union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, > - .flow = FLOW_IDX(conn) }; > - struct epoll_event ev = { .data.u64 = ref.u64, > - .events = EPOLLIN | EPOLLET }; > - int epollfd = flow_epollfd(&conn->f); > int fd; > > fd = timerfd_create(CLOCK_MONOTONIC, 0); > @@ -593,18 +579,17 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > flow_dbg_perror(conn, "failed to get timer"); > if (fd > -1) > close(fd); > - conn->timer = -1; > return; > } > - conn->timer = fd; > - ref.fd = conn->timer; > > - if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { > + if (flow_epoll_set(c, EPOLL_TYPE_TCP_TIMER, &conn->f, > + EPOLLIN | EPOLLET, fd, 0)) { > flow_dbg_perror(conn, "failed to add timer"); > - close(conn->timer); > - conn->timer = -1; > + close(fd); > return; > } > + > + conn->timer = fd; > } > > if (conn->flags & ACK_TO_TAP_DUE) { > diff --git a/tcp_splice.c b/tcp_splice.c > index bf4ff466de07..7367f435367b 100644 > --- a/tcp_splice.c > +++ b/tcp_splice.c > @@ -143,24 +143,15 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) > static int tcp_splice_epoll_ctl(const struct ctx *c, > struct tcp_splice_conn *conn) > { > - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) > - : c->epollfd; > - int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > - const union epoll_ref ref[SIDES] = { > - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], > - .flowside = FLOW_SIDX(conn, 0) }, > - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], > - .flowside = FLOW_SIDX(conn, 1) } > - }; > - struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, > - { .data.u64 = ref[1].u64 } }; > - > - ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0); > - ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1); > - > - > - if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || > - epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { > + uint32_t events[2]; > + > + events[0] = tcp_splice_conn_epoll_events(conn->events, 0); > + events[1] = tcp_splice_conn_epoll_events(conn->events, 1); > + > + if (flow_epoll_set(c, EPOLL_TYPE_TCP_SPLICE, &conn->f, events[0], > + conn->s[0], 0) || > + flow_epoll_set(c, EPOLL_TYPE_TCP_SPLICE, &conn->f, events[1], > + conn->s[1], 1)) { > int ret = -errno; > flow_perror(conn, "ERROR on epoll_ctl()"); > return ret; > diff --git a/udp_flow.c b/udp_flow.c > index 33f29f21e69e..42f3622d621c 100644 > --- a/udp_flow.c > +++ b/udp_flow.c > @@ -74,11 +74,6 @@ static int udp_flow_sock(const struct ctx *c, > { > const struct flowside *side = &uflow->f.side[sidei]; > uint8_t pif = uflow->f.pif[sidei]; > - union { > - flow_sidx_t sidx; > - uint32_t data; > - } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; > - union epoll_ref ref; > int rc; > int s; > > @@ -88,13 +83,8 @@ static int udp_flow_sock(const struct ctx *c, > return s; > } > > - ref.type = EPOLL_TYPE_UDP; > - ref.data = fref.data; > - ref.fd = s; > - > flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT); > - > - rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref); > + rc = flow_epoll_set(c, EPOLL_TYPE_UDP, &uflow->f, EPOLLIN, s, sidei); > if (rc < 0) { > close(s); > return rc; > -- > 2.51.1 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson