On Fri, Oct 17, 2025 at 12:31:26PM +0200, Laurent Vivier wrote: > The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked > whether a connection was registered with epoll, not which epoll instance. > This limited flexibility for future multi-epoll support. > > Replace the boolean with a threadnb field in flow_common that identifies > which thread (and thus which epoll instance) the flow is registered with. > Use FLOW_THREADNB_INVALID to indicate when a flow is not registered with > any epoll instance. A threadnb_to_epollfd[] mapping table translates > thread numbers to their corresponding epoll file descriptors. > > Add helper functions: > - flow_in_epoll() to check if a flow is registered with epoll > - flow_epollfd() to retrieve the epoll fd for a flow's thread > - flow_thread_register() to register an epoll fd with a thread > - flow_thread_set() to set the thread number of a flow > > This change also simplifies tcp_timer_ctl() and conn_flag_do() by removing > the need to pass the context 'c', since the epoll fd is now directly > accessible from the flow structure via flow_epollfd(). > > Signed-off-by: Laurent Vivier Some minor queries only. > --- > flow.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++- > flow.h | 12 ++++++++++++ > passt.c | 1 + > tcp.c | 39 ++++++++++++++++++++------------------ > tcp_conn.h | 8 +------- > tcp_splice.c | 24 ++++++++++++------------ > 6 files changed, 99 insertions(+), 38 deletions(-) > > diff --git a/flow.c b/flow.c > index b14e9d8b63ff..d56bae776239 100644 > --- a/flow.c > +++ b/flow.c > @@ -116,6 +116,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, > unsigned flow_first_free; > union flow flowtab[FLOW_MAX]; > static const union flow *flow_new_entry; /* = NULL */ > +static int threadnb_to_epollfd[FLOW_THREADNB_SIZE]; Same comments as in the previous version about "thread" in the name. > > /* Hash table to index it */ > #define FLOW_HASH_LOAD 70 /* % */ > @@ -347,6 +348,55 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) > flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); > } > > +/** > + * flow_in_epoll() - Check if flow is registered with an epoll instance > + * @f: Flow to check > + * > + * Return: true if flow is registered with epoll, false otherwise > + */ > +bool flow_in_epoll(const struct flow_common *f) > +{ > + return f->threadnb != FLOW_THREADNB_INVALID; > +} > + > +/** > + * flow_epollfd() - Get the epoll file descriptor for a flow > + * @f: Flow to query > + * > + * Return: epoll file descriptor associated with the flow's thread > + */ > +int flow_epollfd(const struct flow_common *f) > +{ > + ASSERT(f->threadnb < FLOW_THREADNB_MAX); > + > + return threadnb_to_epollfd[f->threadnb]; > +} > + > +/** > + * flow_thread_set() - Associate a flow with a thread > + * @f: Flow to update > + * @threadnb: Thread number to associate with this flow > + */ > +void flow_thread_set(struct flow_common *f, int threadnb) > +{ > + ASSERT(threadnb < FLOW_THREADNB_MAX); > + > + f->threadnb = threadnb; > +} > + > +/** > + * flow_thread_register() - Initialize the threadnb -> epollfd mapping > + * @threadnb: Thread number to associate to > + * @epollfd: epoll file descriptor for the thread > + */ > +void flow_thread_register(int threadnb, int epollfd) > +{ > + ASSERT(threadnb < FLOW_THREADNB_MAX); > + ASSERT(epollfd >= 0); Not sure about the second assert. It seems like an error people are unlikely to make by accident, whereas attempting to deliberately clear / delete an entry by setting its fd to -1 seems like a reasonable thing to do. > + > + threadnb_to_epollfd[threadnb] = epollfd; > +} > + > /** > * flow_initiate_() - Move flow to INI, setting pif[INISIDE] > * @flow: Flow to change state > @@ -548,6 +598,7 @@ union flow *flow_alloc(void) > > flow_new_entry = flow; > memset(flow, 0, sizeof(*flow)); > + flow->f.threadnb = FLOW_THREADNB_INVALID; > flow_set_state(&flow->f, FLOW_STATE_NEW); > > return flow; > @@ -827,7 +878,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) > case FLOW_TCP_SPLICE: > closed = tcp_splice_flow_defer(&flow->tcp_splice); > if (!closed && timer) > - tcp_splice_timer(c, &flow->tcp_splice); > + tcp_splice_timer(&flow->tcp_splice); > break; > case FLOW_PING4: > case FLOW_PING6: > diff --git a/flow.h b/flow.h > index ef138b83add8..700d8b32c990 100644 > --- a/flow.h > +++ b/flow.h > @@ -177,6 +177,8 @@ int flowside_connect(const struct ctx *c, int s, > * @type: Type of packet flow > * @pif[]: Interface for each side of the flow > * @side[]: Information for each side of the flow > + * @threadnb: Thread number flow is registered with > + * (FLOW_THREADNB_INVALID if not) > */ > struct flow_common { > #ifdef __GNUC__ > @@ -192,8 +194,14 @@ struct flow_common { > #endif > uint8_t pif[SIDES]; > struct flowside side[SIDES]; > +#define FLOW_THREADNB_BITS 8 > + unsigned int threadnb:FLOW_THREADNB_BITS; > }; > > +#define FLOW_THREADNB_SIZE (1 << FLOW_THREADNB_BITS) > +#define FLOW_THREADNB_MAX (FLOW_THREADNB_SIZE - 1) > +#define FLOW_THREADNB_INVALID FLOW_THREADNB_MAX > + > #define FLOW_INDEX_BITS 17 /* 128k - 1 */ > #define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS) > > @@ -249,6 +257,10 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, > union flow; > > void flow_init(void); > +bool flow_in_epoll(const struct flow_common *f); > +int flow_epollfd(const struct flow_common *f); > +void flow_thread_set(struct flow_common *f, int threadnb); > +void flow_thread_register(int threadnb, int epollfd); > void flow_defer_handler(const struct ctx *c, const struct timespec *now); > int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, > int fd); > diff --git a/passt.c b/passt.c > index af928111786b..37f2c897be84 100644 > --- a/passt.c > +++ b/passt.c > @@ -285,6 +285,7 @@ int main(int argc, char **argv) > c.epollfd = epoll_create1(EPOLL_CLOEXEC); > if (c.epollfd == -1) > die_perror("Failed to create epoll file descriptor"); > + flow_thread_register(0, c.epollfd); > > if (getrlimit(RLIMIT_NOFILE, &limit)) > die_perror("Failed to get maximum value of open files limit"); > diff --git a/tcp.c b/tcp.c > index db9f17c0622f..8c49852b8454 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -504,25 +504,27 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) > */ > static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > { > - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > + int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, > .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; > struct epoll_event ev = { .data.u64 = ref.u64 }; > + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) > + : c->epollfd; > > if (conn->events == CLOSED) { > - if (conn->in_epoll) > - epoll_del(c->epollfd, conn->sock); > + if (flow_in_epoll(&conn->f)) > + epoll_del(epollfd, conn->sock); > if (conn->timer != -1) > - epoll_del(c->epollfd, conn->timer); > + epoll_del(epollfd, conn->timer); > return 0; > } > > ev.events = tcp_conn_epoll_events(conn->events, conn->flags); > > - if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) > + if (epoll_ctl(epollfd, m, conn->sock, &ev)) > return -errno; > > - conn->in_epoll = true; > + flow_thread_set(&conn->f, 0); > > if (conn->timer != -1) { > union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, > @@ -531,7 +533,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > struct epoll_event ev_t = { .data.u64 = ref_t.u64, > .events = EPOLLIN | EPOLLET }; > > - if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) > + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, > + conn->timer, &ev_t)) > return -errno; > } > > @@ -540,12 +543,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > > /** > * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed > - * @c: Execution context > * @conn: Connection pointer > * > * #syscalls timerfd_create timerfd_settime > */ > -static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > +static void tcp_timer_ctl(struct tcp_tap_conn *conn) > { > struct itimerspec it = { { 0 }, { 0 } }; > > @@ -570,7 +572,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) > } > conn->timer = fd; > > - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { > + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_ADD, > + conn->timer, &ev)) { Possibly a question for an earlier patch, but is there a reason we can't use epoll_add() here? > flow_dbg_perror(conn, "failed to add timer"); > close(conn->timer); > conn->timer = -1; > @@ -628,7 +631,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, > * flags and factor this into the logic below. > */ > if (flag == ACK_FROM_TAP_DUE) > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > > return; > } > @@ -644,7 +647,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, > if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || > (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || > (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > } > > /** > @@ -699,7 +702,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > tcp_epoll_ctl(c, conn); > > if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > } > > /** > @@ -1757,7 +1760,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, > seq, conn->seq_from_tap); > > tcp_send_flag(c, conn, ACK); > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > > if (p->count == 1) { > tcp_tap_window_update(c, conn, > @@ -2406,7 +2409,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) > > if (conn->flags & ACK_TO_TAP_DUE) { > tcp_send_flag(c, conn, ACK_IF_NEEDED); > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > } else if (conn->flags & ACK_FROM_TAP_DUE) { > if (!(conn->events & ESTABLISHED)) { > flow_dbg(conn, "handshake timeout"); > @@ -2428,7 +2431,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) > return; > > tcp_data_from_sock(c, conn); > - tcp_timer_ctl(c, conn); > + tcp_timer_ctl(conn); > } > } else { > struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; > @@ -3476,7 +3479,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c, > if (c->migrate_no_linger) > close(s); > else > - epoll_del(c->epollfd, s); > + epoll_del(flow_epollfd(&conn->f), s); > > /* Adjustments unrelated to FIN segments: sequence numbers we dumped are > * based on the end of the queues. > @@ -3625,7 +3628,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, > return rc; > } > > - conn->in_epoll = 0; > + conn->f.threadnb = FLOW_THREADNB_INVALID; > conn->timer = -1; > conn->listening_sock = -1; > > diff --git a/tcp_conn.h b/tcp_conn.h > index 38b5c541f003..81333122d531 100644 > --- a/tcp_conn.h > +++ b/tcp_conn.h > @@ -12,7 +12,6 @@ > /** > * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) > * @f: Generic flow information > - * @in_epoll: Is the connection in the epoll set? > * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT > * @ws_from_tap: Window scaling factor advertised from tap/guest > * @ws_to_tap: Window scaling factor advertised to tap/guest > @@ -36,8 +35,6 @@ struct tcp_tap_conn { > /* Must be first element */ > struct flow_common f; > > - bool in_epoll :1; > - > #define TCP_RETRANS_BITS 3 > unsigned int retrans :TCP_RETRANS_BITS; > #define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS) > @@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext { > * @written: Bytes written (not fully written from one other side read) > * @events: Events observed/actions performed on connection > * @flags: Connection flags (attributes, not events) > - * @in_epoll: Is the connection in the epoll set? > */ > struct tcp_splice_conn { > /* Must be first element */ > @@ -220,8 +216,6 @@ struct tcp_splice_conn { > #define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0)) > #define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2)) > #define CLOSING BIT(4) > - > - bool in_epoll :1; > }; > > /* Socket pools */ > @@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd > bool tcp_flow_is_established(const struct tcp_tap_conn *conn); > > bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); > -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); > +void tcp_splice_timer(struct tcp_splice_conn *conn); > int tcp_conn_pool_sock(int pool[]); > int tcp_conn_sock(sa_family_t af); > int tcp_sock_refill_pool(int pool[], sa_family_t af); > diff --git a/tcp_splice.c b/tcp_splice.c > index 6f21184bdc55..703bd7610890 100644 > --- a/tcp_splice.c > +++ b/tcp_splice.c > @@ -149,7 +149,9 @@ static void tcp_splice_conn_epoll_events(uint16_t events, > static int tcp_splice_epoll_ctl(const struct ctx *c, > struct tcp_splice_conn *conn) > { > - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) > + : c->epollfd; > + int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; > const union epoll_ref ref[SIDES] = { > { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], > .flowside = FLOW_SIDX(conn, 0) }, > @@ -161,25 +163,24 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, > > tcp_splice_conn_epoll_events(conn->events, ev); > > - if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || > - epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { > + > + if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || > + epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { > int ret = -errno; > flow_perror(conn, "ERROR on epoll_ctl()"); > return ret; > } > - > - conn->in_epoll = true; > + flow_thread_set(&conn->f, 0); > > return 0; > } > > /** > * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag > - * @c: Execution context > * @conn: Connection pointer > * @flag: Flag to set, or ~flag to unset > */ > -static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, > +static void conn_flag_do(struct tcp_splice_conn *conn, > unsigned long flag) > { > if (flag & (flag - 1)) { > @@ -204,15 +205,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, > } > > if (flag == CLOSING) { > - epoll_del(c->epollfd, conn->s[0]); > - epoll_del(c->epollfd, conn->s[1]); > + epoll_del(flow_epollfd(&conn->f), conn->s[0]); > + epoll_del(flow_epollfd(&conn->f), conn->s[1]); > } > } > > #define conn_flag(c, conn, flag) \ > do { \ > flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ > - conn_flag_do(c, conn, flag); \ > + conn_flag_do(conn, flag); \ > } while (0) > > /** > @@ -751,10 +752,9 @@ void tcp_splice_init(struct ctx *c) > > /** > * tcp_splice_timer() - Timer for spliced connections > - * @c: Execution context > * @conn: Connection to handle > */ > -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) > +void tcp_splice_timer(struct tcp_splice_conn *conn) > { > unsigned sidei; > > -- > 2.51.0 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson