public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Laurent Vivier <lvivier@redhat.com>
To: passt-dev@passt.top
Cc: Laurent Vivier <lvivier@redhat.com>
Subject: [PATCH v5 12/12] flow: Derive epoll fd from queue pair, removing epollid field
Date: Tue, 16 Jun 2026 14:51:30 +0200	[thread overview]
Message-ID: <20260616125130.1324274-13-lvivier@redhat.com> (raw)
In-Reply-To: <20260616125130.1324274-1-lvivier@redhat.com>

Since each queue pair maps to exactly one epoll instance, the epoll
file descriptor can be looked up directly from the qpair field.  This
makes the separate epollid field in flow_common redundant.

Replace epoll_id_to_fd[] with qpair_to_fd[], remove
flow_epollid_set(), flow_epollid_register(), flow_qp()/FLOW_QP(),
and the epollid field from flow_common.  FLOW_QPAIR_INVALID is no
longer needed: newly allocated flows get qpair 0 from memset.

For new flows, FLOW_SETQP() sets the queue pair during creation, and
the socket is registered with epoll separately via flow_epoll_set().

For existing flows that may move between queue pairs, add
flow_migrate()/FLOW_MIGRATE(), which removes the socket from the old
epoll instance and re-registers it on the new one.

TCP timers are migrated lazily: tcp_timer_handler() detects a qpair
mismatch when a timer fires, moves the timerfd to the correct epoll
instance, and returns without further processing.

flow_init() now takes the execution context to initialise
qpair_to_fd[].

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 flow.c       | 63 ++++++++++++++++++++--------------------------------
 flow.h       | 25 ++++++++-------------
 icmp.c       |  3 +--
 passt.c      |  3 +--
 tcp.c        | 42 +++++++++++++++++++++++++----------
 tcp_splice.c |  1 -
 udp_flow.c   |  4 +---
 7 files changed, 66 insertions(+), 75 deletions(-)

diff --git a/flow.c b/flow.c
index bf855fe0dfaf..787a7139cfc1 100644
--- a/flow.c
+++ b/flow.c
@@ -130,7 +130,7 @@ static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES,
 unsigned flow_first_free;
 union flow flowtab[FLOW_MAX];
 static const union flow *flow_new_entry; /* = NULL */
-static int epoll_id_to_fd[EPOLLFD_ID_SIZE];
+int qpair_to_fd[FLOW_QPAIR_SIZE];
 
 /* Hash table to index it */
 #define FLOW_HASH_LOAD		70		/* % */
@@ -362,19 +362,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
  */
 int flow_epollfd(const struct flow_common *f)
 {
-	return epoll_id_to_fd[f->epollid];
-}
-
-/**
- * flow_epollid_set() - Associate a flow with an epoll id
- * @f:		Flow to update
- * @epollid:	epoll id to associate with this flow
- */
-void flow_epollid_set(struct flow_common *f, int epollid)
-{
-	assert(epollid < EPOLLFD_ID_SIZE);
-
-	f->epollid = epollid;
+	return qpair_to_fd[f->qpair];
 }
 
 /**
@@ -404,40 +392,31 @@ int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
 }
 
 /**
- * flow_epollid_register() - Initialize the epoll id -> fd mapping
- * @epollid:	epoll id to associate to
- * @epollfd:	epoll file descriptor for this epoll id
+ * flow_setqp() - Set queue pair assignment for a flow
+ * @f:		Flow to update
+ * @qpair:	Queue pair number to assign
  */
-void flow_epollid_register(int epollid, int epollfd)
+void flow_setqp(struct flow_common *f, unsigned int qpair)
 {
-	assert(epollid < EPOLLFD_ID_SIZE);
+	assert(qpair < FLOW_QPAIR_SIZE);
 
-	epoll_id_to_fd[epollid] = epollfd;
-}
+	flow_trace((union flow *)f, "setting queue pair to %d", qpair);
 
-/**
- * flow_qp() - Get the queue pair for a flow
- * @f:		Flow to query (may be NULL)
- *
- * Return: queue pair number for the flow, or 0 if flow is NULL or has no
- *         valid queue pair assignment
- */
-/* cppcheck-suppress unusedFunction */
-unsigned int flow_qp(const struct flow_common *f)
-{
-	if (f == NULL || f->qpair == FLOW_QPAIR_INVALID)
-		return QPAIR_DEFAULT;
-	return f->qpair;
+	f->qpair = qpair;
 }
 
 /**
- * flow_setqp() - Set queue pair assignment for a flow
+ * flow_migrate() - Migrate a flow to a different queue pair
  * @f:		Flow to update
  * @qpair:	Queue pair number to assign
+ * @events:	epoll events to watch for
+ * @fd:		File descriptor to register
+ * @sidei:	Side index of the flow
  */
-void flow_setqp(struct flow_common *f, unsigned int qpair)
+void flow_migrate(struct flow_common *f, unsigned int qpair, uint32_t events,
+		  int fd, unsigned int sidei)
 {
-	assert(qpair < FLOW_QPAIR_MAX);
+	assert(qpair < FLOW_QPAIR_SIZE);
 
 	if (f->qpair == qpair)
 		return;
@@ -445,7 +424,10 @@ void flow_setqp(struct flow_common *f, unsigned int qpair)
 	flow_trace((union flow *)f, "updating queue pair from %d to %d",
 		   f->qpair, qpair);
 
+	epoll_del(qpair_to_fd[f->qpair], fd);
+
 	f->qpair = qpair;
+	flow_epoll_set(f, EPOLL_CTL_ADD, events, fd, sidei);
 }
 
 /**
@@ -669,7 +651,6 @@ union flow *flow_alloc(void)
 
 	flow_new_entry = flow;
 	memset(flow, 0, sizeof(*flow));
-	flow->f.qpair = FLOW_QPAIR_INVALID;
 	flow_set_state(&flow->f, FLOW_STATE_NEW);
 
 	return flow;
@@ -1295,8 +1276,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 
 /**
  * flow_init() - Initialise flow related data structures
+ * @c:		Execution context
  */
-void flow_init(void)
+void flow_init(const struct ctx *c)
 {
 	unsigned b;
 
@@ -1306,4 +1288,7 @@ void flow_init(void)
 
 	for (b = 0; b < FLOW_HASH_SIZE; b++)
 		flow_hashtab[b] = FLOW_SIDX_NONE;
+
+	for (b = 0; b < FLOW_QPAIR_SIZE; b++)
+		qpair_to_fd[b] = c->epollfd;
 }
diff --git a/flow.h b/flow.h
index 3c74dcbd95c4..53e0408a9ee5 100644
--- a/flow.h
+++ b/flow.h
@@ -157,6 +157,8 @@ struct flowside {
 	in_port_t		eport;
 };
 
+extern int qpair_to_fd[];
+
 /**
  * flowside_eq() - Check if two flowsides are equal
  * @left, @right:	Flowsides to compare
@@ -204,20 +206,12 @@ struct flow_common {
 
 	uint8_t		tap_omac[6];
 
-#define EPOLLFD_ID_BITS 8
-	unsigned int	epollid:EPOLLFD_ID_BITS;
 #define FLOW_QPAIR_BITS 5
 	unsigned int	qpair:FLOW_QPAIR_BITS;
 };
 
-#define EPOLLFD_ID_DEFAULT	0
-#define EPOLLFD_ID_SIZE		(1 << EPOLLFD_ID_BITS)
-
-#define FLOW_QPAIR_NUM		(1 << FLOW_QPAIR_BITS)
-#define FLOW_QPAIR_MAX		(FLOW_QPAIR_NUM - 1)
-#define FLOW_QPAIR_INVALID	FLOW_QPAIR_MAX
-
-static_assert(VHOST_USER_MAX_VQS <= FLOW_QPAIR_MAX * 2);
+#define FLOW_QPAIR_SIZE		(1 << FLOW_QPAIR_BITS)
+static_assert(VHOST_USER_MAX_VQS <= FLOW_QPAIR_SIZE * 2);
 
 #define FLOW_INDEX_BITS		17	/* 128k - 1 */
 #define FLOW_MAX		MAX_FROM_BITS(FLOW_INDEX_BITS)
@@ -273,18 +267,17 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
 
 union flow;
 
-void flow_init(void);
+void flow_init(const struct ctx *c);
 int flow_epollfd(const struct flow_common *f);
-void flow_epollid_set(struct flow_common *f, int epollid);
 int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
 		   int fd, unsigned int sidei);
-void flow_epollid_register(int epollid, int epollfd);
-unsigned int flow_qp(const struct flow_common *f);
-#define FLOW_QP(flow_)				\
-	(flow_qp(&(flow_)->f))
 void flow_setqp(struct flow_common *f, unsigned int qpair);
 #define FLOW_SETQP(flow_, _qpair)		\
 	(flow_setqp(&(flow_)->f, _qpair))
+void flow_migrate(struct flow_common *f, unsigned int qpair, uint32_t events,
+		  int fd, unsigned int sidei);
+#define FLOW_MIGRATE(flow_, qpair_, events_, fd_, sidei_)	\
+	(flow_migrate(&(flow_)->f, qpair_, events_, fd_, sidei_))
 
 void flow_defer_handler(const struct ctx *c, const struct timespec *now,
 			unsigned int qpair);
diff --git a/icmp.c b/icmp.c
index 2558fe5beaab..98ce55a8aff0 100644
--- a/icmp.c
+++ b/icmp.c
@@ -216,7 +216,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
 		goto cancel;
 
 	FLOW_SETQP(pingf, qpair);
-	flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT);
 	if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock,
 			   TGTSIDE) < 0) {
 		close(pingf->sock);
@@ -307,7 +306,7 @@ int icmp_tap_handler(const struct ctx *c, unsigned int qpair, uint8_t pif,
 
 	if (flow) {
 		pingf = &flow->ping;
-		FLOW_SETQP(pingf, qpair); /* XXX if qpair change, update epollfd */
+		FLOW_MIGRATE(pingf, qpair, EPOLLIN, pingf->sock, TGTSIDE);
 	} else if (!(pingf = icmp_ping_new(c, qpair, af, id, saddr, daddr))) {
 		return 1;
 	}
diff --git a/passt.c b/passt.c
index c9e456641e85..3afc59b19120 100644
--- a/passt.c
+++ b/passt.c
@@ -365,7 +365,6 @@ int main(int argc, char **argv)
 	c.epollfd = epoll_create1(EPOLL_CLOEXEC);
 	if (c.epollfd == -1)
 		die_perror("Failed to create epoll file descriptor");
-	flow_epollid_register(EPOLLFD_ID_DEFAULT, c.epollfd);
 
 	if (getrlimit(RLIMIT_NOFILE, &limit))
 		die_perror("Failed to get maximum value of open files limit");
@@ -388,7 +387,7 @@ int main(int argc, char **argv)
 	if (clock_gettime(CLOCK_MONOTONIC, &now))
 		die_perror("Failed to get CLOCK_MONOTONIC time");
 
-	flow_init();
+	flow_init(&c);
 	fwd_scan_ports_init(&c);
 
 	if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
diff --git a/tcp.c b/tcp.c
index c0a4de33f068..1549e14adaf4 100644
--- a/tcp.c
+++ b/tcp.c
@@ -541,6 +541,21 @@ static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
 	return 0;
 }
 
+static int tcp_timer_epoll_add(struct tcp_tap_conn *conn, int fd)
+{
+	union epoll_ref ref;
+
+	ref.type = EPOLL_TYPE_TCP_TIMER;
+	ref.flow = FLOW_IDX(conn);
+	ref.fd = fd;
+	if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET, ref) < 0) {
+		flow_dbg(conn, "failed to add timer");
+		return -1;
+	}
+
+	return 0;
+}
+
 /**
  * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
  * @c:		Execution context
@@ -555,7 +570,6 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 
 	if (conn->timer == -1) {
-		union epoll_ref ref;
 		int fd;
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
@@ -570,12 +584,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 			return;
 		}
 
-		ref.type = EPOLL_TYPE_TCP_TIMER;
-		ref.flow = FLOW_IDX(conn);
-		ref.fd = fd;
-		if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET,
-			      ref) < 0) {
-			flow_dbg(conn, "failed to add timer");
+		if (tcp_timer_epoll_add(conn, fd) < 0) {
 			close(fd);
 			return;
 		}
@@ -1736,7 +1745,6 @@ static void tcp_conn_from_tap(const struct ctx *c, unsigned int qpair,
 	conn->sock = s;
 	conn->timer = -1;
 	FLOW_SETQP(conn, qpair);
-	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
 	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
 		flow_perror(flow, "Can't register with epoll");
 		goto cancel;
@@ -2315,8 +2323,9 @@ int tcp_tap_handler(const struct ctx *c, unsigned int qpair, uint8_t pif,
 	assert(pif_at_sidx(sidx) == PIF_TAP);
 	conn = &flow->tcp;
 
-	/* update queue pair */
-	FLOW_SETQP(flow, qpair);
+	FLOW_MIGRATE(flow, qpair,
+		     tcp_conn_epoll_events(conn->events, conn->flags),
+		     conn->sock, !TAPSIDE(conn));
 
 	flow_trace(conn, "packet length %zu from tap", l4len);
 
@@ -2523,7 +2532,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 	conn->ws_to_tap = conn->ws_from_tap = 0;
 
 	FLOW_SETQP(conn, QPAIR_DEFAULT);
-	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
 	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) {
 		flow_perror(flow, "Can't register with epoll");
 		conn_flag(c, conn, CLOSING);
@@ -2646,6 +2654,17 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref,
 	assert(!c->no_tcp);
 	assert(conn->f.type == FLOW_TCP);
 
+	if (conn->f.qpair != qpair) {
+		int old_epollfd = qpair_to_fd[qpair];
+
+		epoll_del(old_epollfd, conn->timer);
+		if (tcp_timer_epoll_add(conn, conn->timer) < 0) {
+			close(conn->timer);
+			conn->timer = -1;
+		}
+		return;
+	}
+
 	/* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the
 	 * timer is currently armed, this event came from a previous setting,
 	 * and we just set the timer to a new point in the future: discard it.
@@ -3853,7 +3872,6 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	}
 
 	FLOW_SETQP(conn, QPAIR_DEFAULT);
-	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
 	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock,
 			   !TAPSIDE(conn)))
 		goto out; /* tcp_flow_migrate_target_ext() will clean this up */
diff --git a/tcp_splice.c b/tcp_splice.c
index 1a77ac2e8a18..3215337d3e62 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -378,7 +378,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
 
 	FLOW_SETQP(conn, QPAIR_DEFAULT);
-	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
 	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[0], 0) ||
 	    flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[1], 1)) {
 		int ret = -errno;
diff --git a/udp_flow.c b/udp_flow.c
index 44e0c4c50ca9..27dc24ffb2ae 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -154,7 +154,6 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, unsigned int qpair,
 	uflow->activity[INISIDE] = 1;
 	uflow->activity[TGTSIDE] = 0;
 	FLOW_SETQP(uflow, qpair);
-	flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
 
 	flow_foreach_sidei(sidei) {
 		if (pif_is_socket(uflow->f.pif[sidei]))
@@ -292,8 +291,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, unsigned int qpair,
 			      srcport, dstport);
 	if ((uflow = udp_at_sidx(sidx))) {
 		udp_flow_activity(uflow, sidx.sidei, now);
-		/* update qpair */
-		FLOW_SETQP(uflow, qpair); /* if qpair changes, update epollfd */
+		FLOW_MIGRATE(uflow, qpair, EPOLLIN, uflow->s[TGTSIDE], TGTSIDE);
 		return flow_sidx_opposite(sidx);
 	}
 
-- 
2.54.0


      parent reply	other threads:[~2026-06-16 12:51 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-16 12:51 [PATCH v5 00/12] vhost-user: Add multiqueue support Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 01/12] tap: Remove pool parameter from tap4_handler() and tap6_handler() Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 02/12] vhost-user: Advertise multiqueue support Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 03/12] test: Add multiqueue support to vhost-user test infrastructure Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 04/12] tap: Thread queue pair through all remaining tap paths Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 05/12] arp: Pass queue pair explicitly through ARP send path Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 06/12] tcp: Pass queue pair explicitly through TCP " Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 07/12] udp: Pass queue pair explicitly through UDP " Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 08/12] dhcp/dhcpv6: Pass queue pair explicitly through DHCP " Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 09/12] icmp: Pass queue pair explicitly through ICMP " Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 10/12] ndp: Pass queue pair explicitly through NDP " Laurent Vivier
2026-06-16 12:51 ` [PATCH v5 11/12] flow: Add queue pair tracking to flow management Laurent Vivier
2026-06-16 12:51 ` Laurent Vivier [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260616125130.1324274-13-lvivier@redhat.com \
    --to=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).