public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Laurent Vivier <lvivier@redhat.com>
To: passt-dev@passt.top
Cc: Laurent Vivier <lvivier@redhat.com>
Subject: [PATCH v3 4/6] tcp, flow: Replace per-connection in_epoll flag with threadnb in flow_common
Date: Thu,  9 Oct 2025 15:04:07 +0200	[thread overview]
Message-ID: <20251009130409.3931795-5-lvivier@redhat.com> (raw)
In-Reply-To: <20251009130409.3931795-1-lvivier@redhat.com>

The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked
whether a connection was registered with epoll, not which epoll instance.
This limited flexibility for future multi-epoll support.

Replace the boolean with a threadnb field in flow_common that identifies
which thread (and thus which epoll instance) the flow is registered with.
Use FLOW_THREADNB_INVALID to indicate when a flow is not registered with
any epoll instance. A threadnb_to_epollfd[] mapping table translates
thread numbers to their corresponding epoll file descriptors.

Add helper functions:
- flow_epollfd_valid() to check if a flow is registered with epoll
- flow_epollfd_get() to retrieve the epoll fd for a flow's thread
- flow_epollfd_set() to register a flow with a thread's epoll instance

This change also simplifies tcp_timer_ctl() and conn_flag_do() by removing
the need to pass the context 'c', since the epoll fd is now directly
accessible from the flow structure via flow_epollfd_get().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 flow.c       | 43 ++++++++++++++++++++++++++++++++++++++++++-
 flow.h       | 11 +++++++++++
 tcp.c        | 39 +++++++++++++++++++++------------------
 tcp_conn.h   |  8 +-------
 tcp_splice.c | 24 ++++++++++++------------
 5 files changed, 87 insertions(+), 38 deletions(-)

diff --git a/flow.c b/flow.c
index b14e9d8b63ff..6298bbafcd51 100644
--- a/flow.c
+++ b/flow.c
@@ -116,6 +116,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 unsigned flow_first_free;
 union flow flowtab[FLOW_MAX];
 static const union flow *flow_new_entry; /* = NULL */
+static int threadnb_to_epollfd[FLOW_THREADNB_SIZE];
 
 /* Hash table to index it */
 #define FLOW_HASH_LOAD		70		/* % */
@@ -347,6 +348,45 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
 	flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
 }
 
+/**
+ * flow_epollfd_valid() - Check if flow is registered with an epoll instance
+ * @f:		Flow to check
+ *
+ * Return: true if flow is registered with epoll, false otherwise
+ */
+bool flow_epollfd_valid(const struct flow_common *f)
+{
+	return f->threadnb != FLOW_THREADNB_INVALID;
+}
+
+/**
+ * flow_epollfd_get() - Get the epoll file descriptor for a flow
+ * @f:		Flow to query
+ *
+ * Return: epoll file descriptor associated with the flow's thread
+ */
+int flow_epollfd_get(const struct flow_common *f)
+{
+	ASSERT(f->threadnb < FLOW_THREADNB_MAX);
+
+	return threadnb_to_epollfd[f->threadnb];
+}
+
+/**
+ * flow_epollfd_set() - Set the epoll instance for a flow
+ * @f:		Flow to update
+ * @threadnb:	Thread number to associate with this flow
+ * @epollfd:	epoll file descriptor for the thread
+ */
+void flow_epollfd_set(struct flow_common *f, int threadnb, int epollfd)
+{
+	ASSERT(threadnb < FLOW_THREADNB_MAX);
+	ASSERT(epollfd >= 0);
+
+	threadnb_to_epollfd[threadnb] = epollfd;
+	f->threadnb = threadnb;
+}
+
 /**
  * flow_initiate_() - Move flow to INI, setting pif[INISIDE]
  * @flow:	Flow to change state
@@ -548,6 +588,7 @@ union flow *flow_alloc(void)
 
 	flow_new_entry = flow;
 	memset(flow, 0, sizeof(*flow));
+	flow->f.threadnb = FLOW_THREADNB_INVALID;
 	flow_set_state(&flow->f, FLOW_STATE_NEW);
 
 	return flow;
@@ -827,7 +868,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 		case FLOW_TCP_SPLICE:
 			closed = tcp_splice_flow_defer(&flow->tcp_splice);
 			if (!closed && timer)
-				tcp_splice_timer(c, &flow->tcp_splice);
+				tcp_splice_timer(&flow->tcp_splice);
 			break;
 		case FLOW_PING4:
 		case FLOW_PING6:
diff --git a/flow.h b/flow.h
index ef138b83add8..18ae45d67720 100644
--- a/flow.h
+++ b/flow.h
@@ -177,6 +177,8 @@ int flowside_connect(const struct ctx *c, int s,
  * @type:	Type of packet flow
  * @pif[]:	Interface for each side of the flow
  * @side[]:	Information for each side of the flow
+ * @threadnb:	Thread number flow is registered with
+ *		(FLOW_THREADNB_INVALID if not)
  */
 struct flow_common {
 #ifdef __GNUC__
@@ -192,8 +194,14 @@ struct flow_common {
 #endif
 	uint8_t		pif[SIDES];
 	struct flowside	side[SIDES];
+#define FLOW_THREADNB_BITS 8
+	unsigned int	threadnb:FLOW_THREADNB_BITS;
 };
 
+#define FLOW_THREADNB_SIZE	(1 << FLOW_THREADNB_BITS)
+#define FLOW_THREADNB_MAX	(FLOW_THREADNB_SIZE - 1)
+#define FLOW_THREADNB_INVALID	FLOW_THREADNB_MAX
+
 #define FLOW_INDEX_BITS		17	/* 128k - 1 */
 #define FLOW_MAX		MAX_FROM_BITS(FLOW_INDEX_BITS)
 
@@ -249,6 +257,9 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
 union flow;
 
 void flow_init(void);
+bool flow_epollfd_valid(const struct flow_common *f);
+int flow_epollfd_get(const struct flow_common *f);
+void flow_epollfd_set(struct flow_common *f, int threadnb, int epollfd);
 void flow_defer_handler(const struct ctx *c, const struct timespec *now);
 int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
 			      int fd);
diff --git a/tcp.c b/tcp.c
index c2d842bbdf4f..f9c67e2b006d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -504,25 +504,27 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
  */
 static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 {
-	int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+	int m = flow_epollfd_valid(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
 		                .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
 	struct epoll_event ev = { .data.u64 = ref.u64 };
+	int epollfd = flow_epollfd_valid(&conn->f) ? flow_epollfd_get(&conn->f)
+						   : c->epollfd;
 
 	if (conn->events == CLOSED) {
-		if (conn->in_epoll)
-			epoll_del(c->epollfd, conn->sock);
+		if (flow_epollfd_valid(&conn->f))
+			epoll_del(epollfd, conn->sock);
 		if (conn->timer != -1)
-			epoll_del(c->epollfd, conn->timer);
+			epoll_del(epollfd, conn->timer);
 		return 0;
 	}
 
 	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
 
-	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+	if (epoll_ctl(epollfd, m, conn->sock, &ev))
 		return -errno;
 
-	conn->in_epoll = true;
+	flow_epollfd_set(&conn->f, 0, epollfd);
 
 	if (conn->timer != -1) {
 		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@@ -531,7 +533,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
 					    .events = EPOLLIN | EPOLLET };
 
-		if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+		if (epoll_ctl(flow_epollfd_get(&conn->f), EPOLL_CTL_MOD,
+			      conn->timer, &ev_t))
 			return -errno;
 	}
 
@@ -540,12 +543,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 
 /**
  * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
- * @c:		Execution context
  * @conn:	Connection pointer
  *
  * #syscalls timerfd_create timerfd_settime
  */
-static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_timer_ctl(struct tcp_tap_conn *conn)
 {
 	struct itimerspec it = { { 0 }, { 0 } };
 
@@ -570,7 +572,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		}
 		conn->timer = fd;
 
-		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
+		if (epoll_ctl(flow_epollfd_get(&conn->f), EPOLL_CTL_ADD,
+			      conn->timer, &ev)) {
 			flow_dbg_perror(conn, "failed to add timer");
 			close(conn->timer);
 			conn->timer = -1;
@@ -628,7 +631,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 			 * flags and factor this into the logic below.
 			 */
 			if (flag == ACK_FROM_TAP_DUE)
-				tcp_timer_ctl(c, conn);
+				tcp_timer_ctl(conn);
 
 			return;
 		}
@@ -644,7 +647,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE		  ||
 	    (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
 	    (flag == ~ACK_TO_TAP_DUE   && (conn->flags & ACK_FROM_TAP_DUE)))
-		tcp_timer_ctl(c, conn);
+		tcp_timer_ctl(conn);
 }
 
 /**
@@ -699,7 +702,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 		tcp_epoll_ctl(c, conn);
 
 	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
-		tcp_timer_ctl(c, conn);
+		tcp_timer_ctl(conn);
 }
 
 /**
@@ -1737,7 +1740,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 				   seq, conn->seq_from_tap);
 
 			tcp_send_flag(c, conn, ACK);
-			tcp_timer_ctl(c, conn);
+			tcp_timer_ctl(conn);
 
 			if (p->count == 1) {
 				tcp_tap_window_update(c, conn,
@@ -2386,7 +2389,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
 		tcp_send_flag(c, conn, ACK_IF_NEEDED);
-		tcp_timer_ctl(c, conn);
+		tcp_timer_ctl(conn);
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
 			flow_dbg(conn, "handshake timeout");
@@ -2408,7 +2411,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 				return;
 
 			tcp_data_from_sock(c, conn);
-			tcp_timer_ctl(c, conn);
+			tcp_timer_ctl(conn);
 		}
 	} else {
 		struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -3456,7 +3459,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
 	if (c->migrate_no_linger)
 		close(s);
 	else
-		epoll_del(c->epollfd, s);
+		epoll_del(flow_epollfd_get(&conn->f), s);
 
 	/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
 	 * based on the end of the queues.
@@ -3605,7 +3608,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
 		return rc;
 	}
 
-	conn->in_epoll = 0;
+	conn->f.threadnb = FLOW_THREADNB_INVALID;
 	conn->timer = -1;
 	conn->listening_sock = -1;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 38b5c541f003..81333122d531 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -12,7 +12,6 @@
 /**
  * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
  * @f:			Generic flow information
- * @in_epoll:		Is the connection in the epoll set?
  * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
  * @ws_from_tap:	Window scaling factor advertised from tap/guest
  * @ws_to_tap:		Window scaling factor advertised to tap/guest
@@ -36,8 +35,6 @@ struct tcp_tap_conn {
 	/* Must be first element */
 	struct flow_common f;
 
-	bool		in_epoll	:1;
-
 #define TCP_RETRANS_BITS		3
 	unsigned int	retrans		:TCP_RETRANS_BITS;
 #define TCP_MAX_RETRANS			MAX_FROM_BITS(TCP_RETRANS_BITS)
@@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext {
  * @written:		Bytes written (not fully written from one other side read)
  * @events:		Events observed/actions performed on connection
  * @flags:		Connection flags (attributes, not events)
- * @in_epoll:		Is the connection in the epoll set?
  */
 struct tcp_splice_conn {
 	/* Must be first element */
@@ -220,8 +216,6 @@ struct tcp_splice_conn {
 #define RCVLOWAT_SET(sidei_)		((sidei_) ? BIT(1) : BIT(0))
 #define RCVLOWAT_ACT(sidei_)		((sidei_) ? BIT(3) : BIT(2))
 #define CLOSING				BIT(4)
-
-	bool in_epoll	:1;
 };
 
 /* Socket pools */
@@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
 
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
+void tcp_splice_timer(struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
 int tcp_conn_sock(sa_family_t af);
 int tcp_sock_refill_pool(int pool[], sa_family_t af);
diff --git a/tcp_splice.c b/tcp_splice.c
index 6f21184bdc55..8842feef0684 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -149,7 +149,9 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
 static int tcp_splice_epoll_ctl(const struct ctx *c,
 				struct tcp_splice_conn *conn)
 {
-	int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+	int epollfd = flow_epollfd_valid(&conn->f) ? flow_epollfd_get(&conn->f)
+						   : c->epollfd;
+	int m = flow_epollfd_valid(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	const union epoll_ref ref[SIDES] = {
 		{ .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
 		  .flowside = FLOW_SIDX(conn, 0) },
@@ -161,25 +163,24 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 
 	tcp_splice_conn_epoll_events(conn->events, ev);
 
-	if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
-	    epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
+
+	if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
+	    epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
 		int ret = -errno;
 		flow_perror(conn, "ERROR on epoll_ctl()");
 		return ret;
 	}
-
-	conn->in_epoll = true;
+	flow_epollfd_set(&conn->f, 0, epollfd);
 
 	return 0;
 }
 
 /**
  * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
- * @c:		Execution context
  * @conn:	Connection pointer
  * @flag:	Flag to set, or ~flag to unset
  */
-static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
+static void conn_flag_do(struct tcp_splice_conn *conn,
 			 unsigned long flag)
 {
 	if (flag & (flag - 1)) {
@@ -204,15 +205,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
 	}
 
 	if (flag == CLOSING) {
-		epoll_del(c->epollfd, conn->s[0]);
-		epoll_del(c->epollfd, conn->s[1]);
+		epoll_del(flow_epollfd_get(&conn->f), conn->s[0]);
+		epoll_del(flow_epollfd_get(&conn->f), conn->s[1]);
 	}
 }
 
 #define conn_flag(c, conn, flag)					\
 	do {								\
 		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
-		conn_flag_do(c, conn, flag);				\
+		conn_flag_do(conn, flag);				\
 	} while (0)
 
 /**
@@ -751,10 +752,9 @@ void tcp_splice_init(struct ctx *c)
 
 /**
  * tcp_splice_timer() - Timer for spliced connections
- * @c:		Execution context
  * @conn:	Connection to handle
  */
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
+void tcp_splice_timer(struct tcp_splice_conn *conn)
 {
 	unsigned sidei;
 
-- 
2.50.1


  parent reply	other threads:[~2025-10-09 13:04 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-09 13:04 [PATCH v3 0/6] Refactor epoll handling in preparation for multithreading Laurent Vivier
2025-10-09 13:04 ` [PATCH v3 1/6] util: Simplify epoll_del() interface to take epollfd directly Laurent Vivier
2025-10-09 13:04 ` [PATCH v3 2/6] epoll_ctl: Extract epoll operations Laurent Vivier
2025-10-09 13:04 ` [PATCH v3 3/6] util: Move epoll registration out of sock_l4_sa() Laurent Vivier
2025-10-09 13:04 ` Laurent Vivier [this message]
2025-10-09 13:04 ` [PATCH v3 5/6] icmp: Use thread-based epoll management for ICMP flows Laurent Vivier
2025-10-09 13:04 ` [PATCH v3 6/6] udp: Use thread-based epoll management for UDP flows Laurent Vivier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251009130409.3931795-5-lvivier@redhat.com \
    --to=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).