From: David Gibson <david@gibson.dropbear.id.au>
To: Laurent Vivier <lvivier@redhat.com>
Cc: passt-dev@passt.top
Subject: Re: [PATCH v5 12/12] flow: Derive epoll fd from queue pair, removing epollid field
Date: Fri, 19 Jun 2026 16:52:12 +1000 [thread overview]
Message-ID: <ajTnHHoW3xw0apib@zatzit> (raw)
In-Reply-To: <20260616125130.1324274-13-lvivier@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 15082 bytes --]
On Tue, Jun 16, 2026 at 02:51:30PM +0200, Laurent Vivier wrote:
> Since each queue pair maps to exactly one epoll instance, the epoll
> file descriptor can be looked up directly from the qpair field. This
> makes the separate epollid field in flow_common redundant.
>
> Replace epoll_id_to_fd[] with qpair_to_fd[], remove
> flow_epollid_set(), flow_epollid_register(), flow_qp()/FLOW_QP(),
> and the epollid field from flow_common. FLOW_QPAIR_INVALID is no
> longer needed: newly allocated flows get qpair 0 from memset.
It's not obvious to me why it was needed before, but not now.
> For new flows, FLOW_SETQP() sets the queue pair during creation, and
> the socket is registered with epoll separately via flow_epoll_set().
>
> For existing flows that may move between queue pairs, add
> flow_migrate()/FLOW_MIGRATE(), which removes the socket from the old
> epoll instance and re-registers it on the new one.
>
> TCP timers are migrated lazily: tcp_timer_handler() detects a qpair
> mismatch when a timer fires, moves the timerfd to the correct epoll
> instance, and returns without further processing.
>
> flow_init() now takes the execution context to initialise
> qpair_to_fd[].
>
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
> flow.c | 63 ++++++++++++++++++++--------------------------------
> flow.h | 25 ++++++++-------------
> icmp.c | 3 +--
> passt.c | 3 +--
> tcp.c | 42 +++++++++++++++++++++++++----------
> tcp_splice.c | 1 -
> udp_flow.c | 4 +---
> 7 files changed, 66 insertions(+), 75 deletions(-)
>
> diff --git a/flow.c b/flow.c
> index bf855fe0dfaf..787a7139cfc1 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -130,7 +130,7 @@ static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES,
> unsigned flow_first_free;
> union flow flowtab[FLOW_MAX];
> static const union flow *flow_new_entry; /* = NULL */
> -static int epoll_id_to_fd[EPOLLFD_ID_SIZE];
> +int qpair_to_fd[FLOW_QPAIR_SIZE];
>
> /* Hash table to index it */
> #define FLOW_HASH_LOAD 70 /* % */
> @@ -362,19 +362,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
> */
> int flow_epollfd(const struct flow_common *f)
> {
> - return epoll_id_to_fd[f->epollid];
> -}
> -
> -/**
> - * flow_epollid_set() - Associate a flow with an epoll id
> - * @f: Flow to update
> - * @epollid: epoll id to associate with this flow
> - */
> -void flow_epollid_set(struct flow_common *f, int epollid)
> -{
> - assert(epollid < EPOLLFD_ID_SIZE);
> -
> - f->epollid = epollid;
> + return qpair_to_fd[f->qpair];
> }
>
> /**
> @@ -404,40 +392,31 @@ int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
> }
>
> /**
> - * flow_epollid_register() - Initialize the epoll id -> fd mapping
> - * @epollid: epoll id to associate to
> - * @epollfd: epoll file descriptor for this epoll id
> + * flow_setqp() - Set queue pair assignment for a flow
> + * @f: Flow to update
> + * @qpair: Queue pair number to assign
> */
> -void flow_epollid_register(int epollid, int epollfd)
> +void flow_setqp(struct flow_common *f, unsigned int qpair)
> {
> - assert(epollid < EPOLLFD_ID_SIZE);
> + assert(qpair < FLOW_QPAIR_SIZE);
> - epoll_id_to_fd[epollid] = epollfd;
> -}
> + flow_trace((union flow *)f, "setting queue pair to %d", qpair);
>
> -/**
> - * flow_qp() - Get the queue pair for a flow
> - * @f: Flow to query (may be NULL)
> - *
> - * Return: queue pair number for the flow, or 0 if flow is NULL or has no
> - * valid queue pair assignment
> - */
> -/* cppcheck-suppress unusedFunction */
> -unsigned int flow_qp(const struct flow_common *f)
> -{
> - if (f == NULL || f->qpair == FLOW_QPAIR_INVALID)
> - return QPAIR_DEFAULT;
> - return f->qpair;
> + f->qpair = qpair;
> }
>
> /**
> - * flow_setqp() - Set queue pair assignment for a flow
> + * flow_migrate() - Migrate a flow to a different queue pair
> * @f: Flow to update
> * @qpair: Queue pair number to assign
> + * @events: epoll events to watch for
> + * @fd: File descriptor to register
> + * @sidei: Side index of the flow
> */
> -void flow_setqp(struct flow_common *f, unsigned int qpair)
> +void flow_migrate(struct flow_common *f, unsigned int qpair, uint32_t events,
> + int fd, unsigned int sidei)
> {
> - assert(qpair < FLOW_QPAIR_MAX);
> + assert(qpair < FLOW_QPAIR_SIZE);
>
> if (f->qpair == qpair)
> return;
> @@ -445,7 +424,10 @@ void flow_setqp(struct flow_common *f, unsigned int qpair)
> flow_trace((union flow *)f, "updating queue pair from %d to %d",
> f->qpair, qpair);
>
> + epoll_del(qpair_to_fd[f->qpair], fd);
> +
> f->qpair = qpair;
> + flow_epoll_set(f, EPOLL_CTL_ADD, events, fd, sidei);
> }
>
> /**
> @@ -669,7 +651,6 @@ union flow *flow_alloc(void)
>
> flow_new_entry = flow;
> memset(flow, 0, sizeof(*flow));
> - flow->f.qpair = FLOW_QPAIR_INVALID;
> flow_set_state(&flow->f, FLOW_STATE_NEW);
>
> return flow;
> @@ -1295,8 +1276,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
>
> /**
> * flow_init() - Initialise flow related data structures
> + * @c: Execution context
> */
> -void flow_init(void)
> +void flow_init(const struct ctx *c)
> {
> unsigned b;
>
> @@ -1306,4 +1288,7 @@ void flow_init(void)
>
> for (b = 0; b < FLOW_HASH_SIZE; b++)
> flow_hashtab[b] = FLOW_SIDX_NONE;
> +
> + for (b = 0; b < FLOW_QPAIR_SIZE; b++)
> + qpair_to_fd[b] = c->epollfd;
> }
> diff --git a/flow.h b/flow.h
> index 3c74dcbd95c4..53e0408a9ee5 100644
> --- a/flow.h
> +++ b/flow.h
> @@ -157,6 +157,8 @@ struct flowside {
> in_port_t eport;
> };
>
> +extern int qpair_to_fd[];
> +
> /**
> * flowside_eq() - Check if two flowsides are equal
> * @left, @right: Flowsides to compare
> @@ -204,20 +206,12 @@ struct flow_common {
>
> uint8_t tap_omac[6];
>
> -#define EPOLLFD_ID_BITS 8
> - unsigned int epollid:EPOLLFD_ID_BITS;
> #define FLOW_QPAIR_BITS 5
> unsigned int qpair:FLOW_QPAIR_BITS;
> };
>
> -#define EPOLLFD_ID_DEFAULT 0
> -#define EPOLLFD_ID_SIZE (1 << EPOLLFD_ID_BITS)
> -
> -#define FLOW_QPAIR_NUM (1 << FLOW_QPAIR_BITS)
> -#define FLOW_QPAIR_MAX (FLOW_QPAIR_NUM - 1)
> -#define FLOW_QPAIR_INVALID FLOW_QPAIR_MAX
> -
> -static_assert(VHOST_USER_MAX_VQS <= FLOW_QPAIR_MAX * 2);
> +#define FLOW_QPAIR_SIZE (1 << FLOW_QPAIR_BITS)
> +static_assert(VHOST_USER_MAX_VQS <= FLOW_QPAIR_SIZE * 2);
>
> #define FLOW_INDEX_BITS 17 /* 128k - 1 */
> #define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS)
> @@ -273,18 +267,17 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
>
> union flow;
>
> -void flow_init(void);
> +void flow_init(const struct ctx *c);
> int flow_epollfd(const struct flow_common *f);
> -void flow_epollid_set(struct flow_common *f, int epollid);
> int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
> int fd, unsigned int sidei);
> -void flow_epollid_register(int epollid, int epollfd);
> -unsigned int flow_qp(const struct flow_common *f);
> -#define FLOW_QP(flow_) \
> - (flow_qp(&(flow_)->f))
> void flow_setqp(struct flow_common *f, unsigned int qpair);
> #define FLOW_SETQP(flow_, _qpair) \
> (flow_setqp(&(flow_)->f, _qpair))
> +void flow_migrate(struct flow_common *f, unsigned int qpair, uint32_t events,
> + int fd, unsigned int sidei);
> +#define FLOW_MIGRATE(flow_, qpair_, events_, fd_, sidei_) \
> + (flow_migrate(&(flow_)->f, qpair_, events_, fd_, sidei_))
>
> void flow_defer_handler(const struct ctx *c, const struct timespec *now,
> unsigned int qpair);
> diff --git a/icmp.c b/icmp.c
> index 2558fe5beaab..98ce55a8aff0 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -216,7 +216,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
> goto cancel;
>
> FLOW_SETQP(pingf, qpair);
Having both FLOW_SETQP() and FLOW_MIGRATE() seems dangerous to me. I
think setting the initial value in flow_initiate_() or similar, then
only changing it via FLOW_MIGRATE() seems like a better idea.
> - flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT);
> if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock,
> TGTSIDE) < 0) {
> close(pingf->sock);
> @@ -307,7 +306,7 @@ int icmp_tap_handler(const struct ctx *c, unsigned int qpair, uint8_t pif,
>
> if (flow) {
> pingf = &flow->ping;
> - FLOW_SETQP(pingf, qpair); /* XXX if qpair change, update epollfd */
> + FLOW_MIGRATE(pingf, qpair, EPOLLIN, pingf->sock, TGTSIDE);
> } else if (!(pingf = icmp_ping_new(c, qpair, af, id, saddr, daddr))) {
> return 1;
> }
> diff --git a/passt.c b/passt.c
> index c9e456641e85..3afc59b19120 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -365,7 +365,6 @@ int main(int argc, char **argv)
> c.epollfd = epoll_create1(EPOLL_CLOEXEC);
> if (c.epollfd == -1)
> die_perror("Failed to create epoll file descriptor");
> - flow_epollid_register(EPOLLFD_ID_DEFAULT, c.epollfd);
>
> if (getrlimit(RLIMIT_NOFILE, &limit))
> die_perror("Failed to get maximum value of open files limit");
> @@ -388,7 +387,7 @@ int main(int argc, char **argv)
> if (clock_gettime(CLOCK_MONOTONIC, &now))
> die_perror("Failed to get CLOCK_MONOTONIC time");
>
> - flow_init();
> + flow_init(&c);
> fwd_scan_ports_init(&c);
>
> if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
> diff --git a/tcp.c b/tcp.c
> index c0a4de33f068..1549e14adaf4 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -541,6 +541,21 @@ static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
> return 0;
> }
>
> +static int tcp_timer_epoll_add(struct tcp_tap_conn *conn, int fd)
> +{
> + union epoll_ref ref;
> +
> + ref.type = EPOLL_TYPE_TCP_TIMER;
> + ref.flow = FLOW_IDX(conn);
> + ref.fd = fd;
> + if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET, ref) < 0) {
> + flow_dbg(conn, "failed to add timer");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> /**
> * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
> * @c: Execution context
> @@ -555,7 +570,6 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> return;
>
> if (conn->timer == -1) {
> - union epoll_ref ref;
> int fd;
>
> fd = timerfd_create(CLOCK_MONOTONIC, 0);
> @@ -570,12 +584,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> return;
> }
>
> - ref.type = EPOLL_TYPE_TCP_TIMER;
> - ref.flow = FLOW_IDX(conn);
> - ref.fd = fd;
> - if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET,
> - ref) < 0) {
> - flow_dbg(conn, "failed to add timer");
> + if (tcp_timer_epoll_add(conn, fd) < 0) {
> close(fd);
> return;
> }
> @@ -1736,7 +1745,6 @@ static void tcp_conn_from_tap(const struct ctx *c, unsigned int qpair,
> conn->sock = s;
> conn->timer = -1;
> FLOW_SETQP(conn, qpair);
> - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
> if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
> flow_perror(flow, "Can't register with epoll");
> goto cancel;
> @@ -2315,8 +2323,9 @@ int tcp_tap_handler(const struct ctx *c, unsigned int qpair, uint8_t pif,
> assert(pif_at_sidx(sidx) == PIF_TAP);
> conn = &flow->tcp;
>
> - /* update queue pair */
> - FLOW_SETQP(flow, qpair);
> + FLOW_MIGRATE(flow, qpair,
> + tcp_conn_epoll_events(conn->events, conn->flags),
> + conn->sock, !TAPSIDE(conn));
Doesn't the timer fd also need to be migrated to the matching
qpair/epollfd?
>
> flow_trace(conn, "packet length %zu from tap", l4len);
>
> @@ -2523,7 +2532,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
> conn->ws_to_tap = conn->ws_from_tap = 0;
>
> FLOW_SETQP(conn, QPAIR_DEFAULT);
> - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
> if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) {
> flow_perror(flow, "Can't register with epoll");
> conn_flag(c, conn, CLOSING);
> @@ -2646,6 +2654,17 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref,
> assert(!c->no_tcp);
> assert(conn->f.type == FLOW_TCP);
>
> + if (conn->f.qpair != qpair) {
> + int old_epollfd = qpair_to_fd[qpair];
> +
> + epoll_del(old_epollfd, conn->timer);
> + if (tcp_timer_epoll_add(conn, conn->timer) < 0) {
> + close(conn->timer);
> + conn->timer = -1;
> + }
Oh, I see, it's done lazily. Why?
Also, don't you still need to actually process the current timer event
as well as migrate it for future events?
> + return;
> + }
> +
> /* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the
> * timer is currently armed, this event came from a previous setting,
> * and we just set the timer to a new point in the future: discard it.
> @@ -3853,7 +3872,6 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
> }
>
> FLOW_SETQP(conn, QPAIR_DEFAULT);
> - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
> if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock,
> !TAPSIDE(conn)))
> goto out; /* tcp_flow_migrate_target_ext() will clean this up */
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 1a77ac2e8a18..3215337d3e62 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -378,7 +378,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
> pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
>
> FLOW_SETQP(conn, QPAIR_DEFAULT);
> - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
> if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[0], 0) ||
> flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[1], 1)) {
> int ret = -errno;
> diff --git a/udp_flow.c b/udp_flow.c
> index 44e0c4c50ca9..27dc24ffb2ae 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -154,7 +154,6 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, unsigned int qpair,
> uflow->activity[INISIDE] = 1;
> uflow->activity[TGTSIDE] = 0;
> FLOW_SETQP(uflow, qpair);
> - flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
>
> flow_foreach_sidei(sidei) {
> if (pif_is_socket(uflow->f.pif[sidei]))
> @@ -292,8 +291,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, unsigned int qpair,
> srcport, dstport);
> if ((uflow = udp_at_sidx(sidx))) {
> udp_flow_activity(uflow, sidx.sidei, now);
> - /* update qpair */
> - FLOW_SETQP(uflow, qpair); /* if qpair changes, update epollfd */
> + FLOW_MIGRATE(uflow, qpair, EPOLLIN, uflow->s[TGTSIDE], TGTSIDE);
> return flow_sidx_opposite(sidx);
> }
>
> --
> 2.54.0
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
prev parent reply other threads:[~2026-06-19 6:52 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-16 12:51 [PATCH v5 00/12] vhost-user: Add multiqueue support Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 01/12] tap: Remove pool parameter from tap4_handler() and tap6_handler() Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 02/12] vhost-user: Advertise multiqueue support Laurent Vivier
2026-06-19 5:17 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 03/12] test: Add multiqueue support to vhost-user test infrastructure Laurent Vivier
2026-06-19 5:06 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 04/12] tap: Thread queue pair through all remaining tap paths Laurent Vivier
2026-06-19 5:37 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 05/12] arp: Pass queue pair explicitly through ARP send path Laurent Vivier
2026-06-19 5:40 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 06/12] tcp: Pass queue pair explicitly through TCP " Laurent Vivier
2026-06-19 6:00 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 07/12] udp: Pass queue pair explicitly through UDP " Laurent Vivier
2026-06-19 6:08 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 08/12] dhcp/dhcpv6: Pass queue pair explicitly through DHCP " Laurent Vivier
2026-06-19 6:10 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 09/12] icmp: Pass queue pair explicitly through ICMP " Laurent Vivier
2026-06-19 6:12 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 10/12] ndp: Pass queue pair explicitly through NDP " Laurent Vivier
2026-06-19 6:17 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 11/12] flow: Add queue pair tracking to flow management Laurent Vivier
2026-06-19 6:36 ` David Gibson
2026-06-16 12:51 ` [PATCH v5 12/12] flow: Derive epoll fd from queue pair, removing epollid field Laurent Vivier
2026-06-19 6:52 ` David Gibson [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ajTnHHoW3xw0apib@zatzit \
--to=david@gibson.dropbear.id.au \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).