[PATCH v3 0/4] Retry SYNs for inbound connections

public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed

* [PATCH v3 0/4] Retry SYNs for inbound connections
@ 2025-10-14  7:38 Yumei Huang
  2025-10-14  7:38 ` [PATCH v3 1/4] tcp: Rename "retrans" to "retries" Yumei Huang
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Yumei Huang @ 2025-10-14  7:38 UTC (permalink / raw)
  To: passt-dev, sbrivio; +Cc: david, yuhuang

When a client connects, the SYN is sent to the guest only once. If the
guest is not connected or ready at that time, the connection will be
reset after 10 seconds. These patches introduce a SYN retry mechanism
using a backoff timeout similar to the Linux kernel's. They also update
the data retransmission timeout to use the same backoff timeout.

Yumei Huang (4):
  tcp: Rename "retrans" to "retries"
  util: Introduce read_file() and read_file_long() function
  tcp: Resend SYN for inbound connections
  tcp: Update data retransmission timeout

 tcp.c      | 79 ++++++++++++++++++++++++++++++++++------------
 tcp.h      |  2 ++
 tcp_conn.h | 12 +++----
 util.c     | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 util.h     |  2 ++
 5 files changed, 161 insertions(+), 26 deletions(-)

-- 
2.47.0


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v3 1/4] tcp: Rename "retrans" to "retries"
  2025-10-14  7:38 [PATCH v3 0/4] Retry SYNs for inbound connections Yumei Huang
@ 2025-10-14  7:38 ` Yumei Huang
  2025-10-14 22:50   ` David Gibson
  2025-10-14  7:38 ` [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function Yumei Huang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-14  7:38 UTC (permalink / raw)
  To: passt-dev, sbrivio; +Cc: david, yuhuang

Rename "retrans" to "retries" so it can be used for SYN retries.

Signed-off-by: Yumei Huang <yuhuang@redhat.com>
---
 tcp.c      | 12 ++++++------
 tcp_conn.h | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tcp.c b/tcp.c
index 0f9e9b3..2ec4b0c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -186,7 +186,7 @@
  * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
  *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
  *   socket and reset sequence to what was acknowledged. If this persists for
- *   more than TCP_MAX_RETRANS times in a row, reset the connection
+ *   more than TCP_MAX_RETRIES times in a row, reset the connection
  *
  * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
  *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
@@ -1127,7 +1127,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
 		if (SEQ_LT(seq, conn->seq_to_tap))
 			conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
-		conn->retrans = 0;
+		conn->retries = 0;
 		conn->seq_ack_from_tap = seq;
 	}
 }
@@ -2414,7 +2414,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
 			flow_dbg(conn, "FIN timeout");
 			tcp_rst(c, conn);
-		} else if (conn->retrans == TCP_MAX_RETRANS) {
+		} else if (conn->retries == TCP_MAX_RETRIES) {
 			flow_dbg(conn, "retransmissions count exceeded");
 			tcp_rst(c, conn);
 		} else {
@@ -2423,7 +2423,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 			if (!conn->wnd_from_tap)
 				conn->wnd_from_tap = 1; /* Zero-window probe */
 
-			conn->retrans++;
+			conn->retries++;
 			if (tcp_rewind_seq(c, conn))
 				return;
 
@@ -3382,7 +3382,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 {
 	struct tcp_tap_transfer t = {
-		.retrans		= conn->retrans,
+		.retries		= conn->retries,
 		.ws_from_tap		= conn->ws_from_tap,
 		.ws_to_tap		= conn->ws_to_tap,
 		.events			= conn->events,
@@ -3662,7 +3662,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
 	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 
-	conn->retrans			= t.retrans;
+	conn->retries			= t.retries;
 	conn->ws_from_tap		= t.ws_from_tap;
 	conn->ws_to_tap			= t.ws_to_tap;
 	conn->events			= t.events;
diff --git a/tcp_conn.h b/tcp_conn.h
index 38b5c54..e5c8146 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -13,7 +13,7 @@
  * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
  * @f:			Generic flow information
  * @in_epoll:		Is the connection in the epoll set?
- * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
+ * @retries:		Number of retries occurred due to timeouts
  * @ws_from_tap:	Window scaling factor advertised from tap/guest
  * @ws_to_tap:		Window scaling factor advertised to tap/guest
  * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
@@ -38,9 +38,9 @@ struct tcp_tap_conn {
 
 	bool		in_epoll	:1;
 
-#define TCP_RETRANS_BITS		3
-	unsigned int	retrans		:TCP_RETRANS_BITS;
-#define TCP_MAX_RETRANS			MAX_FROM_BITS(TCP_RETRANS_BITS)
+#define TCP_RETRIES_BITS		3
+	unsigned int	retries		:TCP_RETRIES_BITS;
+#define TCP_MAX_RETRIES			MAX_FROM_BITS(TCP_RETRIES_BITS)
 
 #define TCP_WS_BITS			4	/* RFC 7323 */
 #define TCP_WS_MAX			14
@@ -102,7 +102,7 @@ struct tcp_tap_conn {
  * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
  * @pif:		Interfaces for each side of the flow
  * @side:		Addresses and ports for each side of the flow
- * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
+ * @retries:		Number of retries occurred due to timeouts
  * @ws_from_tap:	Window scaling factor advertised from tap/guest
  * @ws_to_tap:		Window scaling factor advertised to tap/guest
  * @events:		Connection events, implying connection states
@@ -122,7 +122,7 @@ struct tcp_tap_transfer {
 	uint8_t		pif[SIDES];
 	struct flowside	side[SIDES];
 
-	uint8_t		retrans;
+	uint8_t		retries;
 	uint8_t		ws_from_tap;
 	uint8_t		ws_to_tap;
 	uint8_t		events;
-- 
2.47.0


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-14  7:38 [PATCH v3 0/4] Retry SYNs for inbound connections Yumei Huang
  2025-10-14  7:38 ` [PATCH v3 1/4] tcp: Rename "retrans" to "retries" Yumei Huang
@ 2025-10-14  7:38 ` Yumei Huang
  2025-10-14 23:27   ` David Gibson
  2025-10-14  7:38 ` [PATCH v3 3/4] tcp: Resend SYN for inbound connections Yumei Huang
  2025-10-14  7:38 ` [PATCH v3 4/4] tcp: Update data retransmission timeout Yumei Huang
  3 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-14  7:38 UTC (permalink / raw)
  To: passt-dev, sbrivio; +Cc: david, yuhuang

Signed-off-by: Yumei Huang <yuhuang@redhat.com>
---
 util.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 util.h |  2 ++
 2 files changed, 94 insertions(+)

diff --git a/util.c b/util.c
index c492f90..d331f08 100644
--- a/util.c
+++ b/util.c
@@ -579,6 +579,98 @@ int write_file(const char *path, const char *buf)
 	return len == 0 ? 0 : -1;
 }
 
+/**
+ * read_file() - Read contents of file into a buffer
+ * @path:	File to read
+ * @buf:	Buffer to store file contents
+ * @buf_size:	Size of buffer
+ *
+ * Return: number of bytes read on success, -1 on any error, -2 on truncation
+*/
+int read_file(const char *path, char *buf, size_t buf_size)
+{
+	int fd = open(path, O_RDONLY | O_CLOEXEC);
+	size_t total_read = 0;
+	ssize_t rc;
+	bool truncated = false;
+
+	if (fd < 0) {
+		warn_perror("Could not open %s", path);
+		return -1;
+	}
+
+	while (total_read < buf_size - 1) {
+		rc = read(fd, buf + total_read, buf_size - 1 - total_read);
+
+		if (rc < 0 ) {
+			warn_perror("Couldn't read from %s", path);
+			close(fd);
+			return -1;
+		}
+
+		if (rc == 0) {
+			break;
+		}
+
+		total_read += rc;
+
+		if (total_read == buf_size - 1) {
+			char test_byte;
+			rc = read(fd, &test_byte, 1);
+			if (rc >0) {
+				truncated = true;
+				warn_perror("File %s truncated, buffer too small", path);
+			}
+		}
+	}
+
+	close(fd);
+
+	if (total_read < buf_size){
+		buf[total_read] = '\0';
+	}
+
+	return truncated ? -2 : (int)total_read;
+}
+
+/**
+ * read_file_long() - Read a long integer value from a file
+ * @path: Path to the sysctl file
+ * @fallback: Default value if file can't be read
+ *
+ * Return: Parameter value, fallback on failure
+*/
+long read_file_long(const char *path, long fallback)
+{
+        char buf[32];
+        char *end;
+        long value;
+        int bytes_read;
+
+        bytes_read = read_file(path, buf, sizeof(buf));
+        if (bytes_read < 0) {
+                debug("Unable to read %s", path);
+                return fallback;
+        }
+
+        if (bytes_read == 0) {
+                debug("Empty file %s", path);
+                return fallback;
+        }
+
+        errno = 0;
+        value = strtol(buf, &end, 10);
+        if (*end && *end != '\n') {
+                debug("Invalid format in %s", path);
+                return fallback;
+        }
+        if (errno || value < 0 || value > LONG_MAX) {
+                debug("Invalid value in %s: %ld", path, value);
+                return fallback;
+        }
+        return value;
+}
+
 #ifdef __ia64__
 /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(),
  * use the description from clone(2).
diff --git a/util.h b/util.h
index 22eaac5..e509bec 100644
--- a/util.h
+++ b/util.h
@@ -222,6 +222,8 @@ void pidfile_write(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
+int read_file(const char *path, char *buf, size_t buf_size);
+long read_file_long(const char *path, long fallback);
 int write_all_buf(int fd, const void *buf, size_t len);
 int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
 int read_all_buf(int fd, void *buf, size_t len);
-- 
2.47.0


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v3 3/4] tcp: Resend SYN for inbound connections
  2025-10-14  7:38 [PATCH v3 0/4] Retry SYNs for inbound connections Yumei Huang
  2025-10-14  7:38 ` [PATCH v3 1/4] tcp: Rename "retrans" to "retries" Yumei Huang
  2025-10-14  7:38 ` [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function Yumei Huang
@ 2025-10-14  7:38 ` Yumei Huang
  2025-10-14 23:40   ` David Gibson
  2025-10-14  7:38 ` [PATCH v3 4/4] tcp: Update data retransmission timeout Yumei Huang
  3 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-14  7:38 UTC (permalink / raw)
  To: passt-dev, sbrivio; +Cc: david, yuhuang

If a client connects while guest is not connected or ready yet,
resend SYN instead of just resetting connection after 10 seconds.

Use the same backoff calculation for the timeout as linux kernel.

Signed-off-by: Yumei Huang <yuhuang@redhat.com>
---
 tcp.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++--------
 tcp.h |  2 ++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/tcp.c b/tcp.c
index 2ec4b0c..3ce3991 100644
--- a/tcp.c
+++ b/tcp.c
@@ -179,9 +179,11 @@
  *
  * Timeouts are implemented by means of timerfd timers, set based on flags:
  *
- * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
- *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
- *   connection
+ * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
+ *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
+ *   SYN. It's the starting timeout for the first SYN retry. If this persists
+ *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
+ *   tcp_syn_linear_timeouts) times in a row, reset the connection
  *
  * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
  *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
@@ -340,7 +342,7 @@ enum {
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 
 #define ACK_INTERVAL			10		/* ms */
-#define SYN_TIMEOUT			10		/* s */
+#define SYN_TIMEOUT_INIT		1		/* s */
 #define ACK_TIMEOUT			2
 #define FIN_TIMEOUT			60
 #define ACT_TIMEOUT			7200
@@ -365,6 +367,10 @@ uint8_t tcp_migrate_rcv_queue		[TCP_MIGRATE_RCV_QUEUE_MAX];
 
 #define TCP_MIGRATE_RESTORE_CHUNK_MIN	1024 /* Try smaller when above this */
 
+#define TCP_SYN_RETRIES_SYSCTL		"/proc/sys/net/ipv4/tcp_syn_retries"
+#define TCP_SYN_LINEAR_TIMEOUTS_SYSCTL						\
+	"/proc/sys/net/ipv4/tcp_syn_linear_timeouts"
+
 /* "Extended" data (not stored in the flow table) for TCP flow migration */
 static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
 
@@ -581,8 +587,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	if (conn->flags & ACK_TO_TAP_DUE) {
 		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
-		if (!(conn->events & ESTABLISHED))
-			it.it_value.tv_sec = SYN_TIMEOUT;
+		if (!(conn->events & ESTABLISHED)) {
+			if (conn->retries < c->tcp.syn_linear_timeouts)
+				it.it_value.tv_sec = SYN_TIMEOUT_INIT;
+			else
+				it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
+					(conn->retries - c->tcp.syn_linear_timeouts);
+		}
 		else
 			it.it_value.tv_sec = ACK_TIMEOUT;
 	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
@@ -2409,8 +2420,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		tcp_timer_ctl(c, conn);
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
-			flow_dbg(conn, "handshake timeout");
-			tcp_rst(c, conn);
+			if (conn->retries >= MIN(TCP_MAX_RETRIES,
+				(c->tcp.tcp_syn_retries + c->tcp.syn_linear_timeouts))) {
+				flow_dbg(conn, "handshake timeout");
+				tcp_rst(c, conn);
+			} else {
+				flow_dbg(conn, "SYN timeout, retry");
+				tcp_send_flag(c, conn, SYN);
+				conn->retries++;
+				tcp_timer_ctl(c, conn);
+			}
 		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
 			flow_dbg(conn, "FIN timeout");
 			tcp_rst(c, conn);
@@ -2766,6 +2785,24 @@ static socklen_t tcp_probe_tcp_info(void)
 	return sl;
 }
 
+/**
+ * tcp_syn_params_init() - Get initial syn params for inbound connection
+ * @c:		Execution context
+*/
+void tcp_syn_params_init(struct ctx *c)
+{
+	long tcp_syn_retries, syn_linear_timeouts;
+
+	tcp_syn_retries = read_file_long(TCP_SYN_RETRIES_SYSCTL, 8);
+	syn_linear_timeouts = read_file_long(TCP_SYN_LINEAR_TIMEOUTS_SYSCTL, 1);
+
+	c->tcp.tcp_syn_retries = (uint8_t)MIN(tcp_syn_retries, UINT8_MAX);
+	c->tcp.syn_linear_timeouts = (uint8_t)MIN(syn_linear_timeouts, UINT8_MAX);
+
+	debug("TCP SYN parameters: retries=%d, linear_timeouts=%d",
+		  c->tcp.tcp_syn_retries, c->tcp.syn_linear_timeouts);
+}
+
 /**
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
@@ -2776,6 +2813,8 @@ int tcp_init(struct ctx *c)
 {
 	ASSERT(!c->no_tcp);
 
+	tcp_syn_params_init(c);
+
 	tcp_sock_iov_init(c);
 
 	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
diff --git a/tcp.h b/tcp.h
index 234a803..df699a4 100644
--- a/tcp.h
+++ b/tcp.h
@@ -65,6 +65,8 @@ struct tcp_ctx {
 	struct fwd_ports fwd_out;
 	struct timespec timer_run;
 	size_t pipe_size;
+	uint8_t tcp_syn_retries;
+	uint8_t syn_linear_timeouts;
 };
 
 #endif /* TCP_H */
-- 
2.47.0


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-14  7:38 [PATCH v3 0/4] Retry SYNs for inbound connections Yumei Huang
                   ` (2 preceding siblings ...)
  2025-10-14  7:38 ` [PATCH v3 3/4] tcp: Resend SYN for inbound connections Yumei Huang
@ 2025-10-14  7:38 ` Yumei Huang
  2025-10-15  0:05   ` David Gibson
  3 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-14  7:38 UTC (permalink / raw)
  To: passt-dev, sbrivio; +Cc: david, yuhuang

According to RFC 2988 and RFC 6298, we should use an exponential
backoff timeout for data retransmission starting from one second
(see Appendix A in RFC 6298), and limit it to about 60 seconds
as allowed by the same RFC:

   (2.5) A maximum value MAY be placed on RTO provided it is at
         least 60 seconds.

Combine the macros defining the initial timeout for both SYN and ACK.
And add a macro ACK_RETRIES to limit the total timeout to about 60s.

Signed-off-by: Yumei Huang <yuhuang@redhat.com>
---
 tcp.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3ce3991..84da069 100644
--- a/tcp.c
+++ b/tcp.c
@@ -179,16 +179,12 @@
  *
  * Timeouts are implemented by means of timerfd timers, set based on flags:
  *
- * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
- *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
- *   SYN. It's the starting timeout for the first SYN retry. If this persists
- *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
- *   tcp_syn_linear_timeouts) times in a row, reset the connection
- *
- * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
- *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
- *   socket and reset sequence to what was acknowledged. If this persists for
- *   more than TCP_MAX_RETRIES times in a row, reset the connection
+ * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
+ *   during handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
+ *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
+ *   from the socket and reset sequence to what was acknowledged. It's the
+ *   starting timeout for the first retry. If this persists for more than the
+ *   allowed number of times in a row, reset the connection
  *
  * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
  *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
@@ -342,8 +338,7 @@ enum {
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 
 #define ACK_INTERVAL			10		/* ms */
-#define SYN_TIMEOUT_INIT		1		/* s */
-#define ACK_TIMEOUT			2
+#define ACK_TIMEOUT_INIT		1		/* s, RFC 6298 */
 #define FIN_TIMEOUT			60
 #define ACT_TIMEOUT			7200
 
@@ -352,6 +347,11 @@ enum {
 
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
 
+/* Number of retries calculated from the exponential backoff formula, limited
+ * by a total timeout of about 60 seconds.
+ */
+#define ACK_RETRIES		5
+
 #define CONN_IS_CLOSING(conn)						\
 	(((conn)->events & ESTABLISHED) &&				\
 	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -589,13 +589,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
 			if (conn->retries < c->tcp.syn_linear_timeouts)
-				it.it_value.tv_sec = SYN_TIMEOUT_INIT;
+				it.it_value.tv_sec = ACK_TIMEOUT_INIT;
 			else
-				it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
+				it.it_value.tv_sec = ACK_TIMEOUT_INIT <<
 					(conn->retries - c->tcp.syn_linear_timeouts);
 		}
 		else
-			it.it_value.tv_sec = ACK_TIMEOUT;
+			it.it_value.tv_sec = ACK_TIMEOUT_INIT << conn->retries;
 	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
 		it.it_value.tv_sec = FIN_TIMEOUT;
 	} else {
@@ -2433,7 +2433,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
 			flow_dbg(conn, "FIN timeout");
 			tcp_rst(c, conn);
-		} else if (conn->retries == TCP_MAX_RETRIES) {
+		} else if (conn->retries >= ACK_RETRIES) {
 			flow_dbg(conn, "retransmissions count exceeded");
 			tcp_rst(c, conn);
 		} else {
-- 
2.47.0


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/4] tcp: Rename "retrans" to "retries"
  2025-10-14  7:38 ` [PATCH v3 1/4] tcp: Rename "retrans" to "retries" Yumei Huang
@ 2025-10-14 22:50   ` David Gibson
  2025-10-15  2:17     ` Yumei Huang
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-14 22:50 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 4997 bytes --]

On Tue, Oct 14, 2025 at 03:38:33PM +0800, Yumei Huang wrote:
> Rename "retrans" to "retries" so it can be used for SYN retries.
> 
> Signed-off-by: Yumei Huang <yuhuang@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

Btw, if the patch hasn't changed, you can keep the Reviewed-by line
from previous versions of the series.

> ---
>  tcp.c      | 12 ++++++------
>  tcp_conn.h | 12 ++++++------
>  2 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 0f9e9b3..2ec4b0c 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -186,7 +186,7 @@
>   * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
>   *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
>   *   socket and reset sequence to what was acknowledged. If this persists for
> - *   more than TCP_MAX_RETRANS times in a row, reset the connection
> + *   more than TCP_MAX_RETRIES times in a row, reset the connection
>   *
>   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
>   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> @@ -1127,7 +1127,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
>  		if (SEQ_LT(seq, conn->seq_to_tap))
>  			conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
> -		conn->retrans = 0;
> +		conn->retries = 0;
>  		conn->seq_ack_from_tap = seq;
>  	}
>  }
> @@ -2414,7 +2414,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
>  			flow_dbg(conn, "FIN timeout");
>  			tcp_rst(c, conn);
> -		} else if (conn->retrans == TCP_MAX_RETRANS) {
> +		} else if (conn->retries == TCP_MAX_RETRIES) {
>  			flow_dbg(conn, "retransmissions count exceeded");
>  			tcp_rst(c, conn);
>  		} else {
> @@ -2423,7 +2423,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  			if (!conn->wnd_from_tap)
>  				conn->wnd_from_tap = 1; /* Zero-window probe */
>  
> -			conn->retrans++;
> +			conn->retries++;
>  			if (tcp_rewind_seq(c, conn))
>  				return;
>  
> @@ -3382,7 +3382,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
>  int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
>  {
>  	struct tcp_tap_transfer t = {
> -		.retrans		= conn->retrans,
> +		.retries		= conn->retries,
>  		.ws_from_tap		= conn->ws_from_tap,
>  		.ws_to_tap		= conn->ws_to_tap,
>  		.events			= conn->events,
> @@ -3662,7 +3662,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
>  	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
>  	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
>  
> -	conn->retrans			= t.retrans;
> +	conn->retries			= t.retries;
>  	conn->ws_from_tap		= t.ws_from_tap;
>  	conn->ws_to_tap			= t.ws_to_tap;
>  	conn->events			= t.events;
> diff --git a/tcp_conn.h b/tcp_conn.h
> index 38b5c54..e5c8146 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -13,7 +13,7 @@
>   * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
>   * @f:			Generic flow information
>   * @in_epoll:		Is the connection in the epoll set?
> - * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
> + * @retries:		Number of retries occurred due to timeouts
>   * @ws_from_tap:	Window scaling factor advertised from tap/guest
>   * @ws_to_tap:		Window scaling factor advertised to tap/guest
>   * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
> @@ -38,9 +38,9 @@ struct tcp_tap_conn {
>  
>  	bool		in_epoll	:1;
>  
> -#define TCP_RETRANS_BITS		3
> -	unsigned int	retrans		:TCP_RETRANS_BITS;
> -#define TCP_MAX_RETRANS			MAX_FROM_BITS(TCP_RETRANS_BITS)
> +#define TCP_RETRIES_BITS		3
> +	unsigned int	retries		:TCP_RETRIES_BITS;
> +#define TCP_MAX_RETRIES			MAX_FROM_BITS(TCP_RETRIES_BITS)
>  
>  #define TCP_WS_BITS			4	/* RFC 7323 */
>  #define TCP_WS_MAX			14
> @@ -102,7 +102,7 @@ struct tcp_tap_conn {
>   * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
>   * @pif:		Interfaces for each side of the flow
>   * @side:		Addresses and ports for each side of the flow
> - * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
> + * @retries:		Number of retries occurred due to timeouts
>   * @ws_from_tap:	Window scaling factor advertised from tap/guest
>   * @ws_to_tap:		Window scaling factor advertised to tap/guest
>   * @events:		Connection events, implying connection states
> @@ -122,7 +122,7 @@ struct tcp_tap_transfer {
>  	uint8_t		pif[SIDES];
>  	struct flowside	side[SIDES];
>  
> -	uint8_t		retrans;
> +	uint8_t		retries;
>  	uint8_t		ws_from_tap;
>  	uint8_t		ws_to_tap;
>  	uint8_t		events;
> -- 
> 2.47.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-14  7:38 ` [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function Yumei Huang
@ 2025-10-14 23:27   ` David Gibson
  2025-10-15  3:50     ` Yumei Huang
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-14 23:27 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 5454 bytes --]

On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:
> Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> ---
>  util.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  util.h |  2 ++
>  2 files changed, 94 insertions(+)
> 
> diff --git a/util.c b/util.c
> index c492f90..d331f08 100644
> --- a/util.c
> +++ b/util.c
> @@ -579,6 +579,98 @@ int write_file(const char *path, const char *buf)
>  	return len == 0 ? 0 : -1;
>  }
>  
> +/**
> + * read_file() - Read contents of file into a buffer
> + * @path:	File to read
> + * @buf:	Buffer to store file contents
> + * @buf_size:	Size of buffer
> + *
> + * Return: number of bytes read on success, -1 on any error, -2 on truncation
> +*/

Looks ok, but I think there's a simpler way.

> +int read_file(const char *path, char *buf, size_t buf_size)
> +{
> +	int fd = open(path, O_RDONLY | O_CLOEXEC);
> +	size_t total_read = 0;
> +	ssize_t rc;
> +	bool truncated = false;
> +
> +	if (fd < 0) {
> +		warn_perror("Could not open %s", path);
> +		return -1;
> +	}
> +
> +	while (total_read < buf_size - 1) {
> +		rc = read(fd, buf + total_read, buf_size - 1 - total_read);

The '- 1' is to leave space for the \0, but if you instead attempt to
read the entire buffer...

> +
> +		if (rc < 0 ) {

(nit: extra space before ')')

> +			warn_perror("Couldn't read from %s", path);
> +			close(fd);
> +			return -1;
> +		}
> +
> +		if (rc == 0) {
> +			break;
> +		}
> +
> +		total_read += rc;
> +
> +		if (total_read == buf_size - 1) {
> +			char test_byte;
> +			rc = read(fd, &test_byte, 1);
> +			if (rc >0) {
> +				truncated = true;
> +				warn_perror("File %s truncated, buffer too small", path);
> +			}
> +		}

...then you can tell if you have to truncate by finishing the loop
then checking if (total_read < buf_size).  If it is, there's space for
the \0, otherwise there isn't and you report truncation.  No need for
test_byte.

> +	}
> +
> +	close(fd);
> +
> +	if (total_read < buf_size){
> +		buf[total_read] = '\0';


And if you test for truncation and exit early, you can make this
unconditional.

> +	}
> +
> +	return truncated ? -2 : (int)total_read;
> +}
> +
> +/**
> + * read_file_long() - Read a long integer value from a file

When I first read this name I thought it was for reading a long file,
rather than reading a long (int) from a file.  Not immediately sure
how to clarify that.  read_file_long_int() is clear, but awkward.

A better choice might be to change this to use strtoimax() and call it
read_file_integer().

> + * @path: Path to the sysctl file
> + * @fallback: Default value if file can't be read
> + *
> + * Return: Parameter value, fallback on failure
> +*/
> +long read_file_long(const char *path, long fallback)
> +{
> +        char buf[32];

Rather than just using a semi-arbitrary 32 here, I'd suggest defining
a new constant similar to UINT16_STRLEN.  Except that's trickier for a
type that doesn't have a known fixed width.  Pity the C library
doesn't have constants for these AFAICT.

> +        char *end;
> +        long value;
> +        int bytes_read;
> +
> +        bytes_read = read_file(path, buf, sizeof(buf));
> +        if (bytes_read < 0) {
> +                debug("Unable to read %s", path);

If there's an error on open() or read(), this will produce two very
similar error messages in a row, which isn't ideal.

> +                return fallback;
> +        }
> +
> +        if (bytes_read == 0) {
> +                debug("Empty file %s", path);
> +                return fallback;
> +        }

Might be worth checking strtol()'s behaviour on an empty string to see
if this special case would already be handled below.

> +
> +        errno = 0;
> +        value = strtol(buf, &end, 10);
> +        if (*end && *end != '\n') {
> +                debug("Invalid format in %s", path);
> +                return fallback;
> +        }
> +        if (errno || value < 0 || value > LONG_MAX) {

No need to exclude negative values here.  (value > LONG_MAX) can never
be true since value is a long.

> +                debug("Invalid value in %s: %ld", path, value);

If errno != 0, value might be uninitialised here, and certainly won't
have something useful.  Better to print the contents as a string.

> +                return fallback;
> +        }
> +        return value;
> +}
> +
>  #ifdef __ia64__
>  /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(),
>   * use the description from clone(2).
> diff --git a/util.h b/util.h
> index 22eaac5..e509bec 100644
> --- a/util.h
> +++ b/util.h
> @@ -222,6 +222,8 @@ void pidfile_write(int fd, pid_t pid);
>  int __daemon(int pidfile_fd, int devnull_fd);
>  int fls(unsigned long x);
>  int write_file(const char *path, const char *buf);
> +int read_file(const char *path, char *buf, size_t buf_size);
> +long read_file_long(const char *path, long fallback);
>  int write_all_buf(int fd, const void *buf, size_t len);
>  int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
>  int read_all_buf(int fd, void *buf, size_t len);
> -- 
> 2.47.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 3/4] tcp: Resend SYN for inbound connections
  2025-10-14  7:38 ` [PATCH v3 3/4] tcp: Resend SYN for inbound connections Yumei Huang
@ 2025-10-14 23:40   ` David Gibson
  0 siblings, 0 replies; 31+ messages in thread
From: David Gibson @ 2025-10-14 23:40 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 5792 bytes --]

On Tue, Oct 14, 2025 at 03:38:35PM +0800, Yumei Huang wrote:
> If a client connects while the guest is not connected or ready yet,
> resend SYN instead of just resetting connection after 10 seconds.
> 
> Use the same backoff calculation for the timeout as the Linux kernel.
> 
> Signed-off-by: Yumei Huang <yuhuang@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

A few cosmetic nits noted below.

> ---
>  tcp.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++--------
>  tcp.h |  2 ++
>  2 files changed, 49 insertions(+), 8 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 2ec4b0c..3ce3991 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -179,9 +179,11 @@
>   *
>   * Timeouts are implemented by means of timerfd timers, set based on flags:
>   *
> - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
> - *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
> - *   connection
> + * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> + *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> + *   SYN. It's the starting timeout for the first SYN retry. If this persists
> + *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> + *   tcp_syn_linear_timeouts) times in a row, reset the connection
>   *
>   * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
>   *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> @@ -340,7 +342,7 @@ enum {
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  
>  #define ACK_INTERVAL			10		/* ms */
> -#define SYN_TIMEOUT			10		/* s */
> +#define SYN_TIMEOUT_INIT		1		/* s */
>  #define ACK_TIMEOUT			2
>  #define FIN_TIMEOUT			60
>  #define ACT_TIMEOUT			7200
> @@ -365,6 +367,10 @@ uint8_t tcp_migrate_rcv_queue		[TCP_MIGRATE_RCV_QUEUE_MAX];
>  
>  #define TCP_MIGRATE_RESTORE_CHUNK_MIN	1024 /* Try smaller when above this */
>  
> +#define TCP_SYN_RETRIES_SYSCTL		"/proc/sys/net/ipv4/tcp_syn_retries"
> +#define TCP_SYN_LINEAR_TIMEOUTS_SYSCTL						\
> +	"/proc/sys/net/ipv4/tcp_syn_linear_timeouts"
> +
>  /* "Extended" data (not stored in the flow table) for TCP flow migration */
>  static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
>  
> @@ -581,8 +587,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  	if (conn->flags & ACK_TO_TAP_DUE) {
>  		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
> -		if (!(conn->events & ESTABLISHED))
> -			it.it_value.tv_sec = SYN_TIMEOUT;
> +		if (!(conn->events & ESTABLISHED)) {
> +			if (conn->retries < c->tcp.syn_linear_timeouts)
> +				it.it_value.tv_sec = SYN_TIMEOUT_INIT;
> +			else
> +				it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
> +					(conn->retries - c->tcp.syn_linear_timeouts);
> +		}
>  		else
>  			it.it_value.tv_sec = ACK_TIMEOUT;
>  	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> @@ -2409,8 +2420,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  		tcp_timer_ctl(c, conn);
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
> -			flow_dbg(conn, "handshake timeout");
> -			tcp_rst(c, conn);
> +			if (conn->retries >= MIN(TCP_MAX_RETRIES,
> +				(c->tcp.tcp_syn_retries + c->tcp.syn_linear_timeouts))) {
> +				flow_dbg(conn, "handshake timeout");
> +				tcp_rst(c, conn);
> +			} else {
> +				flow_dbg(conn, "SYN timeout, retry");

I'd suggest demoting this to flow_trace() since it can occur on a
perfectly ok connection.

> +				tcp_send_flag(c, conn, SYN);
> +				conn->retries++;
> +				tcp_timer_ctl(c, conn);
> +			}
>  		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
>  			flow_dbg(conn, "FIN timeout");
>  			tcp_rst(c, conn);
> @@ -2766,6 +2785,24 @@ static socklen_t tcp_probe_tcp_info(void)
>  	return sl;
>  }
>  
> +/**
> + * tcp_syn_params_init() - Get initial syn params for inbound connection
> + * @c:		Execution context
> +*/
> +void tcp_syn_params_init(struct ctx *c)
> +{
> +	long tcp_syn_retries, syn_linear_timeouts;
> +
> +	tcp_syn_retries = read_file_long(TCP_SYN_RETRIES_SYSCTL, 8);
> +	syn_linear_timeouts = read_file_long(TCP_SYN_LINEAR_TIMEOUTS_SYSCTL, 1);
> +
> +	c->tcp.tcp_syn_retries = (uint8_t)MIN(tcp_syn_retries, UINT8_MAX);
> +	c->tcp.syn_linear_timeouts = (uint8_t)MIN(syn_linear_timeouts, UINT8_MAX);
> +
> +	debug("TCP SYN parameters: retries=%d, linear_timeouts=%d",
> +		  c->tcp.tcp_syn_retries, c->tcp.syn_linear_timeouts);

This will work because of the (somewhat arcane) promotion rules for
variadic functions.  It would be more correct to use the PRIu8 define
from inttypes.h, since the parameters are uint8_t (grep for "PRIu" to
see examples of how these defines are used).

> +}
> +
>  /**
>   * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
>   * @c:		Execution context
> @@ -2776,6 +2813,8 @@ int tcp_init(struct ctx *c)
>  {
>  	ASSERT(!c->no_tcp);
>  
> +	tcp_syn_params_init(c);
> +
>  	tcp_sock_iov_init(c);
>  
>  	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
> diff --git a/tcp.h b/tcp.h
> index 234a803..df699a4 100644
> --- a/tcp.h
> +++ b/tcp.h
> @@ -65,6 +65,8 @@ struct tcp_ctx {
>  	struct fwd_ports fwd_out;
>  	struct timespec timer_run;
>  	size_t pipe_size;
> +	uint8_t tcp_syn_retries;
> +	uint8_t syn_linear_timeouts;
>  };
>  
>  #endif /* TCP_H */
> -- 
> 2.47.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-14  7:38 ` [PATCH v3 4/4] tcp: Update data retransmission timeout Yumei Huang
@ 2025-10-15  0:05   ` David Gibson
  2025-10-15  6:31     ` Yumei Huang
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-15  0:05 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 5128 bytes --]

On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:
> According to RFC 2988 and RFC 6298, we should use an exponential
> backoff timeout for data retransmission starting from one second
> (see Appendix A in RFC 6298), and limit it to about 60 seconds
> as allowed by the same RFC:
> 
>    (2.5) A maximum value MAY be placed on RTO provided it is at
>          least 60 seconds.

The interpretation of this isn't entirely clear to me.  Does it mean
if the total retransmit delay exceeds 60s we give up and RST (what
this patch implements)?  Or does it mean that if the retransmit delay
reaches 60s we keep retransmitting, but don't increase the delay any
further?

Looking at tcp_bound_rto() and related code in the kernel suggests the
second interpretation.

> Combine the macros defining the initial timeout for both SYN and ACK.
> And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> 
> Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> ---
>  tcp.c | 32 ++++++++++++++++----------------
>  1 file changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 3ce3991..84da069 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -179,16 +179,12 @@
>   *
>   * Timeouts are implemented by means of timerfd timers, set based on flags:
>   *
> - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> - *
> - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> - *   socket and reset sequence to what was acknowledged. If this persists for
> - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
> + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> + *   from the socket and reset sequence to what was acknowledged. It's the
> + *   starting timeout for the first retry. If this persists for more than 
> + *   allowed times in a row, reset the connection
>   *
>   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
>   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> @@ -342,8 +338,7 @@ enum {
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  
>  #define ACK_INTERVAL			10		/* ms */
> -#define SYN_TIMEOUT_INIT		1		/* s */
> -#define ACK_TIMEOUT			2
> +#define ACK_TIMEOUT_INIT		1		/* s, RFC 6298 */

I'd suggest calling this RTO_INIT to match the terminology used in the
RFCs.

>  #define FIN_TIMEOUT			60
>  #define ACT_TIMEOUT			7200
>  
> @@ -352,6 +347,11 @@ enum {
>  
>  #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
>  
> +/* Number of retries calculated from the exponential backoff formula, limited
> + * by a total timeout of about 60 seconds.
> + */
> +#define ACK_RETRIES		5
> +

As noted above, I think this is based on a misunderstanding of what
the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
We could implement the clamping of the RTO, but it's a "MAY" in the
RFC, so we don't have to, and I don't really see a strong reason to do
so.

>  #define CONN_IS_CLOSING(conn)						\
>  	(((conn)->events & ESTABLISHED) &&				\
>  	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> @@ -589,13 +589,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
>  			if (conn->retries < c->tcp.syn_linear_timeouts)
> -				it.it_value.tv_sec = SYN_TIMEOUT_INIT;
> +				it.it_value.tv_sec = ACK_TIMEOUT_INIT;
>  			else
> -				it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
> +				it.it_value.tv_sec = ACK_TIMEOUT_INIT <<
>  					(conn->retries - c->tcp.syn_linear_timeouts);
>  		}
>  		else
> -			it.it_value.tv_sec = ACK_TIMEOUT;
> +			it.it_value.tv_sec = ACK_TIMEOUT_INIT << conn->retries;
>  	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
>  		it.it_value.tv_sec = FIN_TIMEOUT;
>  	} else {
> @@ -2433,7 +2433,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
>  			flow_dbg(conn, "FIN timeout");
>  			tcp_rst(c, conn);
> -		} else if (conn->retries == TCP_MAX_RETRIES) {
> +		} else if (conn->retries >= ACK_RETRIES) {
>  			flow_dbg(conn, "retransmissions count exceeded");
>  			tcp_rst(c, conn);
>  		} else {
> -- 
> 2.47.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/4] tcp: Rename "retrans" to "retries"
  2025-10-14 22:50   ` David Gibson
@ 2025-10-15  2:17     ` Yumei Huang
  0 siblings, 0 replies; 31+ messages in thread
From: Yumei Huang @ 2025-10-15  2:17 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio

On Wed, Oct 15, 2025 at 7:28 AM David Gibson
<david@gibson.dropbear.id.au> wrote:
>
> On Tue, Oct 14, 2025 at 03:38:33PM +0800, Yumei Huang wrote:
> > Rename "retrans" to "retries" so it can be used for SYN retries.
> >
> > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Btw, if the patch hasn't changed, you can keep the Reviewed-by line
> from previous versions of the series.

Got it, will add it in the next version.
>
> > ---
> >  tcp.c      | 12 ++++++------
> >  tcp_conn.h | 12 ++++++------
> >  2 files changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/tcp.c b/tcp.c
> > index 0f9e9b3..2ec4b0c 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -186,7 +186,7 @@
> >   * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> >   *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> >   *   socket and reset sequence to what was acknowledged. If this persists for
> > - *   more than TCP_MAX_RETRANS times in a row, reset the connection
> > + *   more than TCP_MAX_RETRIES times in a row, reset the connection
> >   *
> >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > @@ -1127,7 +1127,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
> >               if (SEQ_LT(seq, conn->seq_to_tap))
> >                       conn_flag(c, conn, ACK_FROM_TAP_DUE);
> >
> > -             conn->retrans = 0;
> > +             conn->retries = 0;
> >               conn->seq_ack_from_tap = seq;
> >       }
> >  }
> > @@ -2414,7 +2414,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> >               } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> >                       flow_dbg(conn, "FIN timeout");
> >                       tcp_rst(c, conn);
> > -             } else if (conn->retrans == TCP_MAX_RETRANS) {
> > +             } else if (conn->retries == TCP_MAX_RETRIES) {
> >                       flow_dbg(conn, "retransmissions count exceeded");
> >                       tcp_rst(c, conn);
> >               } else {
> > @@ -2423,7 +2423,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> >                       if (!conn->wnd_from_tap)
> >                               conn->wnd_from_tap = 1; /* Zero-window probe */
> >
> > -                     conn->retrans++;
> > +                     conn->retries++;
> >                       if (tcp_rewind_seq(c, conn))
> >                               return;
> >
> > @@ -3382,7 +3382,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
> >  int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
> >  {
> >       struct tcp_tap_transfer t = {
> > -             .retrans                = conn->retrans,
> > +             .retries                = conn->retries,
> >               .ws_from_tap            = conn->ws_from_tap,
> >               .ws_to_tap              = conn->ws_to_tap,
> >               .events                 = conn->events,
> > @@ -3662,7 +3662,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
> >       memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
> >       conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
> >
> > -     conn->retrans                   = t.retrans;
> > +     conn->retries                   = t.retries;
> >       conn->ws_from_tap               = t.ws_from_tap;
> >       conn->ws_to_tap                 = t.ws_to_tap;
> >       conn->events                    = t.events;
> > diff --git a/tcp_conn.h b/tcp_conn.h
> > index 38b5c54..e5c8146 100644
> > --- a/tcp_conn.h
> > +++ b/tcp_conn.h
> > @@ -13,7 +13,7 @@
> >   * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
> >   * @f:                       Generic flow information
> >   * @in_epoll:                Is the connection in the epoll set?
> > - * @retrans:         Number of retransmissions occurred due to ACK_TIMEOUT
> > + * @retries:         Number of retries occurred due to timeouts
> >   * @ws_from_tap:     Window scaling factor advertised from tap/guest
> >   * @ws_to_tap:               Window scaling factor advertised to tap/guest
> >   * @tap_mss:         MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
> > @@ -38,9 +38,9 @@ struct tcp_tap_conn {
> >
> >       bool            in_epoll        :1;
> >
> > -#define TCP_RETRANS_BITS             3
> > -     unsigned int    retrans         :TCP_RETRANS_BITS;
> > -#define TCP_MAX_RETRANS                      MAX_FROM_BITS(TCP_RETRANS_BITS)
> > +#define TCP_RETRIES_BITS             3
> > +     unsigned int    retries         :TCP_RETRIES_BITS;
> > +#define TCP_MAX_RETRIES                      MAX_FROM_BITS(TCP_RETRIES_BITS)
> >
> >  #define TCP_WS_BITS                  4       /* RFC 7323 */
> >  #define TCP_WS_MAX                   14
> > @@ -102,7 +102,7 @@ struct tcp_tap_conn {
> >   * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
> >   * @pif:             Interfaces for each side of the flow
> >   * @side:            Addresses and ports for each side of the flow
> > - * @retrans:         Number of retransmissions occurred due to ACK_TIMEOUT
> > + * @retries:         Number of retries occurred due to timeouts
> >   * @ws_from_tap:     Window scaling factor advertised from tap/guest
> >   * @ws_to_tap:               Window scaling factor advertised to tap/guest
> >   * @events:          Connection events, implying connection states
> > @@ -122,7 +122,7 @@ struct tcp_tap_transfer {
> >       uint8_t         pif[SIDES];
> >       struct flowside side[SIDES];
> >
> > -     uint8_t         retrans;
> > +     uint8_t         retries;
> >       uint8_t         ws_from_tap;
> >       uint8_t         ws_to_tap;
> >       uint8_t         events;
> > --
> > 2.47.0
> >
>
> --
> David Gibson (he or they)       | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
>                                 | around.
> http://www.ozlabs.org/~dgibson



-- 
Thanks,

Yumei Huang


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-14 23:27   ` David Gibson
@ 2025-10-15  3:50     ` Yumei Huang
  2025-10-15  4:46       ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-15  3:50 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio

On Wed, Oct 15, 2025 at 7:28 AM David Gibson
<david@gibson.dropbear.id.au> wrote:
>
> On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:
> > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > ---
> >  util.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  util.h |  2 ++
> >  2 files changed, 94 insertions(+)
> >
> > diff --git a/util.c b/util.c
> > index c492f90..d331f08 100644
> > --- a/util.c
> > +++ b/util.c
> > @@ -579,6 +579,98 @@ int write_file(const char *path, const char *buf)
> >       return len == 0 ? 0 : -1;
> >  }
> >
> > +/**
> > + * read_file() - Read contents of file into a buffer
> > + * @path:    File to read
> > + * @buf:     Buffer to store file contents
> > + * @buf_size:        Size of buffer
> > + *
> > + * Return: number of bytes read on success, -1 on any error, -2 on truncation
> > +*/
>
> Looks ok, but I think there's a simpler way.
>
> > +int read_file(const char *path, char *buf, size_t buf_size)
> > +{
> > +     int fd = open(path, O_RDONLY | O_CLOEXEC);
> > +     size_t total_read = 0;
> > +     ssize_t rc;
> > +     bool truncated = false;
> > +
> > +     if (fd < 0) {
> > +             warn_perror("Could not open %s", path);
> > +             return -1;
> > +     }
> > +
> > +     while (total_read < buf_size - 1) {
> > +             rc = read(fd, buf + total_read, buf_size - 1 - total_read);
>
> The '- 1' is to leave space for the \0, but if you instead attempt to
> read the entire buffer...
>
> > +
> > +             if (rc < 0 ) {
>
> (nit: extra space before ')')
>
> > +                     warn_perror("Couldn't read from %s", path);
> > +                     close(fd);
> > +                     return -1;
> > +             }
> > +
> > +             if (rc == 0) {
> > +                     break;
> > +             }
> > +
> > +             total_read += rc;
> > +
> > +             if (total_read == buf_size - 1) {
> > +                     char test_byte;
> > +                     rc = read(fd, &test_byte, 1);
> > +                     if (rc >0) {
> > +                             truncated = true;
> > +                             warn_perror("File %s truncated, buffer too small", path);
> > +                     }
> > +             }
>
> ...then you can tell if you have to truncate by finishing the loop
> then checking if (total_read < buf_size).  If it is, there's space for
> the \0, otherwise there isn't and you report truncation.  No need for
> test_byte.

Yeah, that's much simpler. Will update in v4.
>
> > +     }
> > +
> > +     close(fd);
> > +
> > +     if (total_read < buf_size){
> > +             buf[total_read] = '\0';
>
>
> > And if you test for truncation and exit early, you can make this
> unconditional.

Agree.
>
> > +     }
> > +
> > +     return truncated ? -2 : (int)total_read;
> > +}
> > +
> > +/**
> > + * read_file_long() - Read a long integer value from a file
>
> When I first read this name I thought it was for reading a long file,
> rather than reading a long (int) from a file.  Not immediately sure
> how to clarify that.  read_file_long_int() is clear, but awkward.
>
> A better choice might be to change this to use strtoimax() and call it
> read_file_integer().

Good point. Will use strtoimax() and return intmax_t.
>
> > + * @path: Path to the sysctl file
> > + * @fallback: Default value if file can't be read
> > + *
> > + * Return: Parameter value, fallback on failure
> > +*/
> > +long read_file_long(const char *path, long fallback)
> > +{
> > +        char buf[32];
>
> Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> type that doesn't have a known fixed width.  Pity the C library
> doesn't have constants for these AFAICT.

I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).
>
> > +        char *end;
> > +        long value;
> > +        int bytes_read;
> > +
> > +        bytes_read = read_file(path, buf, sizeof(buf));
> > +        if (bytes_read < 0) {
> > +                debug("Unable to read %s", path);
>
> > If there's an error on open() or read(), this will produce two very
> similar error messages in a row, which isn't ideal.
>
> > +                return fallback;
> > +        }
> > +
> > +        if (bytes_read == 0) {
> > +                debug("Empty file %s", path);
> > +                return fallback;
> > +        }
>
> Might be worth checking strtol()'s behaviour on an empty string to see
> if this special case would already be handled below.

Checked both strtol() and strtoimax(): if the string is empty, they
return 0 and set end to buf, which points at the \0, and errno remains
unchanged. So it's not handled below.
>
> > +
> > +        errno = 0;
> > +        value = strtol(buf, &end, 10);
> > +        if (*end && *end != '\n') {
> > +                debug("Invalid format in %s", path);
> > +                return fallback;
> > +        }
> > +        if (errno || value < 0 || value > LONG_MAX) {
>
> No need to exclude negative values here.  (value > LONG_MAX) can never
> be true since value is a long.
>
> > +                debug("Invalid value in %s: %ld", path, value);
>
> If errno != 0, value might be uninitialised here, and certainly won't
> have something useful.  Better to print the contents as a string.

Right.
>
> > +                return fallback;
> > +        }
> > +        return value;
> > +}
> > +
> >  #ifdef __ia64__
> >  /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(),
> >   * use the description from clone(2).
> > diff --git a/util.h b/util.h
> > index 22eaac5..e509bec 100644
> > --- a/util.h
> > +++ b/util.h
> > @@ -222,6 +222,8 @@ void pidfile_write(int fd, pid_t pid);
> >  int __daemon(int pidfile_fd, int devnull_fd);
> >  int fls(unsigned long x);
> >  int write_file(const char *path, const char *buf);
> > +int read_file(const char *path, char *buf, size_t buf_size);
> > +long read_file_long(const char *path, long fallback);
> >  int write_all_buf(int fd, const void *buf, size_t len);
> >  int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
> >  int read_all_buf(int fd, void *buf, size_t len);
> > --
> > 2.47.0
> >
>
> --
> David Gibson (he or they)       | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
>                                 | around.
> http://www.ozlabs.org/~dgibson



-- 
Thanks,

Yumei Huang


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-15  3:50     ` Yumei Huang
@ 2025-10-15  4:46       ` David Gibson
  2025-10-15  5:46         ` Yumei Huang
  2025-10-28 23:12         ` Stefano Brivio
  0 siblings, 2 replies; 31+ messages in thread
From: David Gibson @ 2025-10-15  4:46 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 1471 bytes --]

On Wed, Oct 15, 2025 at 11:50:53AM +0800, Yumei Huang wrote:
> On Wed, Oct 15, 2025 at 7:28 AM David Gibson
> <david@gibson.dropbear.id.au> wrote:
> > On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:
> > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
[snip]
> > > + * @path: Path to the sysctl file
> > > + * @fallback: Default value if file can't be read
> > > + *
> > > + * Return: Parameter value, fallback on failure
> > > +*/
> > > +long read_file_long(const char *path, long fallback)
> > > +{
> > > +        char buf[32];
> >
> > Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> > a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> > type that doesn't have a known fixed width.  Pity the C library
> > doesn't have constants for these AFAICT.
> 
> I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).

That's not quite right.
 - It should be INTMAX_STRLEN (signed), UINTMAX would be for the
   unsigned version
 - That assumes intmax_t is 32-bit which is probably not the case (it
   will be 64-bit, maybe even 128-bit on modern systems)
 - For signed cases, it's the minimum (negative) value that gives the
   longest possible string (for 32-bit, "-2147483648")

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-15  4:46       ` David Gibson
@ 2025-10-15  5:46         ` Yumei Huang
  2025-10-28 23:12         ` Stefano Brivio
  1 sibling, 0 replies; 31+ messages in thread
From: Yumei Huang @ 2025-10-15  5:46 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio

On Wed, Oct 15, 2025 at 12:46 PM David Gibson
<david@gibson.dropbear.id.au> wrote:
>
> On Wed, Oct 15, 2025 at 11:50:53AM +0800, Yumei Huang wrote:
> > On Wed, Oct 15, 2025 at 7:28 AM David Gibson
> > <david@gibson.dropbear.id.au> wrote:
> > > On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:
> > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> [snip]
> > > > + * @path: Path to the sysctl file
> > > > + * @fallback: Default value if file can't be read
> > > > + *
> > > > + * Return: Parameter value, fallback on failure
> > > > +*/
> > > > +long read_file_long(const char *path, long fallback)
> > > > +{
> > > > +        char buf[32];
> > >
> > > Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> > > a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> > > type that doesn't have a known fixed width.  Pity the C library
> > > doesn't have constants for these AFAICT.
> >
> > I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).
>
> That's not quite right.
>  - It should be INTMAX_STRLEN (signed), UINTMAX would be for the
>    unsigned version
>  - That assumes intmax_t is 32-bit which is probably not the case (it
>    will be 64-bit, maybe even 128-bit on modern systems)
>  - For signed cases, it's the minimum (negative) value that gives the
>    longest possible string (for 32-bit, "-2147483648")

Maybe we could take it as 64 bits for now? I'm not sure under which
circumstance we would read such a large number.

If you agree, the define could be:

   #define INTMAX_STRLEN (sizeof("-9223372036854775808"))

Or maybe define it to 20 directly:

   #define INTMAX_STRLEN 20

What do you think?
>
> --
> David Gibson (he or they)       | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
>                                 | around.
> http://www.ozlabs.org/~dgibson



-- 
Thanks,

Yumei Huang


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-15  0:05   ` David Gibson
@ 2025-10-15  6:31     ` Yumei Huang
  2025-10-15 22:54       ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-15  6:31 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio

On Wed, Oct 15, 2025 at 8:05 AM David Gibson
<david@gibson.dropbear.id.au> wrote:
>
> On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:
> > According to RFC 2988 and RFC 6298, we should use an exponential
> > backoff timeout for data retransmission starting from one second
> > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > as allowed by the same RFC:
> >
> >    (2.5) A maximum value MAY be placed on RTO provided it is at
> >          least 60 seconds.
>
> The interpretation of this isn't entirely clear to me.  Does it mean
> if the total retransmit delay exceeds 60s we give up and RST (what
> this patch implements)?  Or does it mean that if the retransmit delay
> reaches 60s we keep retransmitting, but don't increase the delay any
> further?
>
> Looking at tcp_bound_rto() and related code in the kernel suggests the
> second interpretation.
>
> > Combine the macros defining the initial timeout for both SYN and ACK.
> > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> >
> > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > ---
> >  tcp.c | 32 ++++++++++++++++----------------
> >  1 file changed, 16 insertions(+), 16 deletions(-)
> >
> > diff --git a/tcp.c b/tcp.c
> > index 3ce3991..84da069 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -179,16 +179,12 @@
> >   *
> >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> >   *
> > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > - *
> > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > - *   socket and reset sequence to what was acknowledged. If this persists for
> > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
> > + *   during handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > + *   from the socket and reset sequence to what was acknowledged. It's the
> > + *   starting timeout for the first retry. If this persists for more than
> > + *   allowed times in a row, reset the connection
> >   *
> >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > @@ -342,8 +338,7 @@ enum {
> >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> >
> >  #define ACK_INTERVAL                 10              /* ms */
> > -#define SYN_TIMEOUT_INIT             1               /* s */
> > -#define ACK_TIMEOUT                  2
> > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */
>
> I'd suggest calling this RTO_INIT to match the terminology used in the
> RFCs.

Sure.
>
> >  #define FIN_TIMEOUT                  60
> >  #define ACT_TIMEOUT                  7200
> >
> > @@ -352,6 +347,11 @@ enum {
> >
> >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> >
> > +/* Number of retries calculated from the exponential backoff formula, limited
> > + * by a total timeout of about 60 seconds.
> > + */
> > +#define ACK_RETRIES          5
> > +
>
> As noted above, I think this is based on a misunderstanding of what
> the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> We could implement the clamping of the RTO, but it's a "MAY" in the
> RFC, so we don't have to, and I don't really see a strong reason to do
> so.

If we use TCP_MAX_RETRIES and don't clamp the RTO, the total timeout
could be 255 seconds.

Stefano mentioned "Retransmitting data after 256 seconds doesn't make
a lot of sense to me" in the previous comment.
Not sure what the reasonable timeout should be.

BTW, clamping the RTO to limit the delay to 60s should be easy to
implement, and it leads to 183s for the total timeout.

I'm okay with either approach. Please let me know your thoughts. Thanks.

>
> >  #define CONN_IS_CLOSING(conn)                                                \
> >       (((conn)->events & ESTABLISHED) &&                              \
> >        ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> > @@ -589,13 +589,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> >       } else if (conn->flags & ACK_FROM_TAP_DUE) {
> >               if (!(conn->events & ESTABLISHED)) {
> >                       if (conn->retries < c->tcp.syn_linear_timeouts)
> > -                             it.it_value.tv_sec = SYN_TIMEOUT_INIT;
> > +                             it.it_value.tv_sec = ACK_TIMEOUT_INIT;
> >                       else
> > -                             it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
> > +                             it.it_value.tv_sec = ACK_TIMEOUT_INIT <<
> >                                       (conn->retries - c->tcp.syn_linear_timeouts);
> >               }
> >               else
> > -                     it.it_value.tv_sec = ACK_TIMEOUT;
> > +                     it.it_value.tv_sec = ACK_TIMEOUT_INIT << conn->retries;
> >       } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> >               it.it_value.tv_sec = FIN_TIMEOUT;
> >       } else {
> > @@ -2433,7 +2433,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> >               } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> >                       flow_dbg(conn, "FIN timeout");
> >                       tcp_rst(c, conn);
> > -             } else if (conn->retries == TCP_MAX_RETRIES) {
> > +             } else if (conn->retries >= ACK_RETRIES) {
> >                       flow_dbg(conn, "retransmissions count exceeded");
> >                       tcp_rst(c, conn);
> >               } else {
> > --
> > 2.47.0
> >
>
> --
> David Gibson (he or they)       | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
>                                 | around.
> http://www.ozlabs.org/~dgibson



-- 
Thanks,

Yumei Huang


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-15  6:31     ` Yumei Huang
@ 2025-10-15 22:54       ` David Gibson
  2025-10-17 18:28         ` Stefano Brivio
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-15 22:54 UTC (permalink / raw)
  To: Yumei Huang; +Cc: passt-dev, sbrivio

[-- Attachment #1: Type: text/plain, Size: 8028 bytes --]

On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:
> On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> <david@gibson.dropbear.id.au> wrote:
> >
> > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:
> > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > backoff timeout for data retransmission starting from one second
> > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > as allowed by the same RFC:
> > >
> > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > >          least 60 seconds.
> >
> > The interpretation of this isn't entirely clear to me.  Does it mean
> > if the total retransmit delay exceeds 60s we give up and RST (what
> > this patch implements)?  Or does it mean that if the retransmit delay
> > reaches 60s we keep retransmitting, but don't increase the delay any
> > further?
> >
> > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > second interpretation.
> >
> > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > >
> > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > ---
> > >  tcp.c | 32 ++++++++++++++++----------------
> > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/tcp.c b/tcp.c
> > > index 3ce3991..84da069 100644
> > > --- a/tcp.c
> > > +++ b/tcp.c
> > > @@ -179,16 +179,12 @@
> > >   *
> > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > >   *
> > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > - *
> > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, eiher
> > > + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > + *   starting timeout for the first retry. If this persists for more than
> > > + *   allowed times in a row, reset the connection
> > >   *
> > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > @@ -342,8 +338,7 @@ enum {
> > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > >
> > >  #define ACK_INTERVAL                 10              /* ms */
> > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > -#define ACK_TIMEOUT                  2
> > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */
> >
> > I'd suggest calling this RTO_INIT to match the terminology used in the
> > RFCs.
> 
> Sure.
> >
> > >  #define FIN_TIMEOUT                  60
> > >  #define ACT_TIMEOUT                  7200
> > >
> > > @@ -352,6 +347,11 @@ enum {
> > >
> > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > >
> > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > + * by a total timeout of about 60 seconds.
> > > + */
> > > +#define ACK_RETRIES          5
> > > +
> >
> > As noted above, I think this is based on a misunderstanding of what
> > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > We could implement the clamping of the RTO, but it's a "MAY" in the
> > RFC, so we don't have to, and I don't really see a strong reason to do
> > so.
> 
> If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> could be 255 seconds.
> 
> Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> a lot of sense to me" in the previous comment.

That's true, but it's pretty much true for 60s as well.  For the local
link we usually have between passt and guest, even 1s is an eternity.

Basically I see no harm, but also no advantage to clamping or limiting
the RTO, so I'm suggesting going with the simplest code.

Note that there are (rare) situations where we could get a response
after minutes.
 - The interface on the guest was disabled for a while
 - An error in guest firewall configuration blocked packets for a while
 - A bug on the guest caused the kernel to wedge for a while
 - The user manually suspended the guest for a while (VM/passt only)

These generally indicate something has gone fairly badly wrong, but a
long RTO gives the user a bit more time to realise their mistake and
fix things.  These are niche cases, but given the cost of implementing
it is "do nothing"...

> Not sure what the reasonable timeout should be.
> 
> BTW, clamping the RTO to limit the delay to 60s should be easy to
> implement, and it leads to 183s for the total timeout.
> 
> I'm okay with either approach. Please let me know your thoughts. Thanks.
> 
> >
> > >  #define CONN_IS_CLOSING(conn)                                                \
> > >       (((conn)->events & ESTABLISHED) &&                              \
> > >        ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> > > @@ -589,13 +589,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> > >       } else if (conn->flags & ACK_FROM_TAP_DUE) {
> > >               if (!(conn->events & ESTABLISHED)) {
> > >                       if (conn->retries < c->tcp.syn_linear_timeouts)
> > > -                             it.it_value.tv_sec = SYN_TIMEOUT_INIT;
> > > +                             it.it_value.tv_sec = ACK_TIMEOUT_INIT;
> > >                       else
> > > -                             it.it_value.tv_sec = SYN_TIMEOUT_INIT <<
> > > +                             it.it_value.tv_sec = ACK_TIMEOUT_INIT <<
> > >                                       (conn->retries - c->tcp.syn_linear_timeouts);
> > >               }
> > >               else
> > > -                     it.it_value.tv_sec = ACK_TIMEOUT;
> > > +                     it.it_value.tv_sec = ACK_TIMEOUT_INIT << conn->retries;
> > >       } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> > >               it.it_value.tv_sec = FIN_TIMEOUT;
> > >       } else {
> > > @@ -2433,7 +2433,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> > >               } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
> > >                       flow_dbg(conn, "FIN timeout");
> > >                       tcp_rst(c, conn);
> > > -             } else if (conn->retries == TCP_MAX_RETRIES) {
> > > +             } else if (conn->retries >= ACK_RETRIES) {
> > >                       flow_dbg(conn, "retransmissions count exceeded");
> > >                       tcp_rst(c, conn);
> > >               } else {
> > > --
> > > 2.47.0
> > >
> >
> > --
> > David Gibson (he or they)       | I'll have my music baroque, and my code
> > david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
> >                                 | around.
> > http://www.ozlabs.org/~dgibson
> 
> 
> 
> -- 
> Thanks,
> 
> Yumei Huang
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-15 22:54       ` David Gibson
@ 2025-10-17 18:28         ` Stefano Brivio
  2025-10-20  0:20           ` David Gibson
  2025-10-20 10:57           ` Yumei Huang
  0 siblings, 2 replies; 31+ messages in thread
From: Stefano Brivio @ 2025-10-17 18:28 UTC (permalink / raw)
  To: David Gibson, Yumei Huang; +Cc: passt-dev

On Thu, 16 Oct 2025 09:54:25 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:
> > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > <david@gibson.dropbear.id.au> wrote:  
> > >
> > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:  
> > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > backoff timeout for data retransmission starting from one second
> > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > as allowed by the same RFC:
> > > >
> > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > >          least 60 seconds.  
> > >
> > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > this patch implements)?  Or does it mean that if the retransmit delay
> > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > further?
> > >
> > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > second interpretation.
> > >  
> > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > >
> > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > ---
> > > >  tcp.c | 32 ++++++++++++++++----------------
> > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/tcp.c b/tcp.c
> > > > index 3ce3991..84da069 100644
> > > > --- a/tcp.c
> > > > +++ b/tcp.c
> > > > @@ -179,16 +179,12 @@
> > > >   *
> > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > >   *
> > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > - *
> > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, eiher
> > > > + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > + *   starting timeout for the first retry. If this persists for more than
> > > > + *   allowed times in a row, reset the connection
> > > >   *
> > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > @@ -342,8 +338,7 @@ enum {
> > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > >
> > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > -#define ACK_TIMEOUT                  2
> > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */  
> > >
> > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > RFCs.  
> > 
> > Sure.  
> > >  
> > > >  #define FIN_TIMEOUT                  60
> > > >  #define ACT_TIMEOUT                  7200
> > > >
> > > > @@ -352,6 +347,11 @@ enum {
> > > >
> > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > >
> > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > + * by a total timeout of about 60 seconds.
> > > > + */
> > > > +#define ACK_RETRIES          5
> > > > +  
> > >
> > > As noted above, I think this is based on a misunderstanding of what
> > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > so.  
> > 
> > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > could be 255 seconds.
> > 
> > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > a lot of sense to me" in the previous comment.  
> 
> That's true, but it's pretty much true for 60s as well.  For the local
> link we usually have between passt and guest, even 1s is an eternity.

Rather than the local link I was thinking of whatever monitor or
liveness probe in KubeVirt which might have a 60-second period, or some
firewall agent, or how long it typically takes for guests to stop and
resume again in KubeVirt.

It's usually seconds or maybe minutes but not five minutes.

> Basically I see no harm, but also no advantage to clamping or limiting
> the RTO, so I'm suggesting going with the simplest code.

The advantage I see is that we'll recover significantly faster in case
something went wrong.

> Note that there are (rare) situations where we could get a response
> after minutes.
>  - The interface on the guest was disabled for a while
>  - An error in guest firewall configuration blocked packets for a while
>  - A bug on the guest cause the kernel to wedge for a while
>  - The user manually suspended the guest for a while (VM/passt only)
> 
> These generally indicate something has gone fairly badly wrong, but a
> long RTO gives the user a bit more time to realise their mistake and
> fix things.

True, it's just that to me five minutes sounds like "broken beyond
repair", while one minute sounds like "oh we tried again and it worked".

> These are niche cases, but given the cost of implementing
> it is "do nothing"...

...anyway, it's not a strong preference from my side. It's mostly about
experience but I won't be able to really come up with obvious evidence
(at least not quickly), so if the code is significantly simpler...
whatever. It's not provable so I won't insist.

Note: the comments I'm replying to are from yesterday / Thursday, on
v3, and today / Friday we're at v6. I don't expect a week grace period
as you would on the kernel:

  https://docs.kernel.org/process/submitting-patches.html#don-t-get-discouraged-or-impatient

because we can surely move faster than that, but three versions in a
day, obviously before I get any chance to have a look, means
substantial overhead for me, and I might miss the meaning and context of
comments from other reviewers (David in this case). There are no
changelogs in the cover letters either.

I plan to skip to v6 but don't expect a review soon, because of that
overhead I just mentioned.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-17 18:28         ` Stefano Brivio
@ 2025-10-20  0:20           ` David Gibson
  2025-10-20  5:11             ` Stefano Brivio
  2025-10-20 10:57           ` Yumei Huang
  1 sibling, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-20  0:20 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 8091 bytes --]

On Fri, Oct 17, 2025 at 08:28:12PM +0200, Stefano Brivio wrote:
> On Thu, 16 Oct 2025 09:54:25 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:
> > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > <david@gibson.dropbear.id.au> wrote:  
> > > >
> > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:  
> > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > backoff timeout for data retransmission starting from one second
> > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > as allowed by the same RFC:
> > > > >
> > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > >          least 60 seconds.  
> > > >
> > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > further?
> > > >
> > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > second interpretation.
> > > >  
> > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > >
> > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > ---
> > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/tcp.c b/tcp.c
> > > > > index 3ce3991..84da069 100644
> > > > > --- a/tcp.c
> > > > > +++ b/tcp.c
> > > > > @@ -179,16 +179,12 @@
> > > > >   *
> > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > >   *
> > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > - *
> > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, eiher
> > > > > + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > + *   allowed times in a row, reset the connection
> > > > >   *
> > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > @@ -342,8 +338,7 @@ enum {
> > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > >
> > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > -#define ACK_TIMEOUT                  2
> > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */  
> > > >
> > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > RFCs.  
> > > 
> > > Sure.  
> > > >  
> > > > >  #define FIN_TIMEOUT                  60
> > > > >  #define ACT_TIMEOUT                  7200
> > > > >
> > > > > @@ -352,6 +347,11 @@ enum {
> > > > >
> > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > >
> > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > + * by a total timeout of about 60 seconds.
> > > > > + */
> > > > > +#define ACK_RETRIES          5
> > > > > +  
> > > >
> > > > As noted above, I think this is based on a misunderstanding of what
> > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > so.  
> > > 
> > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > could be 255 seconds.
> > > 
> > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > a lot of sense to me" in the previous comment.  
> > 
> > That's true, but it's pretty much true for 60s as well.  For the local
> > link we usually have between passt and guest, even 1s is an eternity.
> 
> Rather than the local link I was thinking of whatever monitor or
> liveness probe in KubeVirt which might have a 60-second period, or some
> firewall agent, or how long it typically takes for guests to stop and
> resume again in KubeVirt.

Right, I hadn't considered those.  Although... do those actually re-use
a single connection?  I would have guessed they use a new connection
each time, making the timeouts here irrelevant.

> It's usually seconds or maybe minutes but not five minutes.
> 
> > Basically I see no harm, but also no advantage to clamping or limiting
> > the RTO, so I'm suggesting going with the simplest code.
> 
> The advantage I see is that we'll recover significantly faster in case
> something went wrong.

That's a fair point in a more general case.

> > Note that there are (rare) situations where we could get a response
> > after minutes.
> >  - The interface on the guest was disabled for a while
> >  - An error in guest firewall configuration blocked packets for a while
> >  - A bug on the guest cause the kernel to wedge for a while
> >  - The user manually suspended the guest for a while (VM/passt only)
> > 
> > These generally indicate something has gone fairly badly wrong, but a
> > long RTO gives the user a bit more time to realise their mistake and
> > fix things.
> 
> True, it's just that to me five minutes sounds like "broken beyond
> repair", while one minute sounds like "oh we tried again and it worked".

Eh, maybe.  By nature it's always going to be a bit arbitrary.

> > These are niche cases, but given the cost of implementing
> > it is "do nothing"...
> 
> ...anyway, it's not a strong preference from my side. It's mostly about
> experience but I won't be able to really come up with obvious evidence
> (at least not quickly), so if the code is significantly simpler...
> whatever. It's not provable so I won't insist.

It's a bit simpler, I'm not sure I'd go so far as "significantly".

> Note: the comments I'm replying to are from yesterday / Thursday, on
> v3, and today / Friday we're at v6. I don't expect a week grace period
> as you would on the kernel:
> 
>   https://docs.kernel.org/process/submitting-patches.html#don-t-get-discouraged-or-impatient
> 
> because we can surely move faster than that, but three versions in a
> day obviously before I get any chance to have a look means a
> substantial overhead for me, and I might miss the meaning and context of
> comments of other reviewers (David in this case). There are no
> changelogs in cover letters either.
> 
> I plan to skip to v6 but don't expect a review soon, because of that
> overhead I just mentioned.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-20  0:20           ` David Gibson
@ 2025-10-20  5:11             ` Stefano Brivio
  2025-10-20  9:17               ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-20  5:11 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Mon, 20 Oct 2025 11:20:19 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Oct 17, 2025 at 08:28:12PM +0200, Stefano Brivio wrote:
> > On Thu, 16 Oct 2025 09:54:25 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:  
> > > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > > <david@gibson.dropbear.id.au> wrote:    
> > > > >
> > > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:    
> > > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > > backoff timeout for data retransmission starting from one second
> > > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > > as allowed by the same RFC:
> > > > > >
> > > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > > >          least 60 seconds.    
> > > > >
> > > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > > further?
> > > > >
> > > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > > second interpretation.
> > > > >    
> > > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > > >
> > > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > > ---
> > > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > > >
> > > > > > diff --git a/tcp.c b/tcp.c
> > > > > > index 3ce3991..84da069 100644
> > > > > > --- a/tcp.c
> > > > > > +++ b/tcp.c
> > > > > > @@ -179,16 +179,12 @@
> > > > > >   *
> > > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > > >   *
> > > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > > - *
> > > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
> > > > > > > + *   during handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > > + *   allowed times in a row, reset the connection
> > > > > >   *
> > > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > > @@ -342,8 +338,7 @@ enum {
> > > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > > >
> > > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > > -#define ACK_TIMEOUT                  2
> > > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */    
> > > > >
> > > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > > RFCs.    
> > > > 
> > > > Sure.    
> > > > >    
> > > > > >  #define FIN_TIMEOUT                  60
> > > > > >  #define ACT_TIMEOUT                  7200
> > > > > >
> > > > > > @@ -352,6 +347,11 @@ enum {
> > > > > >
> > > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > > >
> > > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > > + * by a total timeout of about 60 seconds.
> > > > > > + */
> > > > > > +#define ACK_RETRIES          5
> > > > > > +    
> > > > >
> > > > > As noted above, I think this is based on a misunderstanding of what
> > > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > > so.    
> > > > 
> > > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > > could be 255 seconds.
> > > > 
> > > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > > a lot of sense to me" in the previous comment.    
> > > 
> > > That's true, but it's pretty much true for 60s as well.  For the local
> > > link we usually have between passt and guest, even 1s is an eternity.  
> > 
> > Rather than the local link I was thinking of whatever monitor or
> > liveness probe in KubeVirt which might have a 60-second period, or some
> > firewall agent, or how long it typically takes for guests to stop and
> > resume again in KubeVirt.  
> 
> Right, I hadn't considered those.  Although.. do those actually re-use
> a single connection?  I would have guessed they use a new connection
> each time, making the timeouts here irrelevant.

It depends on the definition of "each time", because we don't time out
host-side connections immediately.

Pretending passt isn't there, the timeout would come from the default
values for TCP connections. It looks like there's no specific
SO_SNDTIMEO value set for those probes, and you can't configure the
timeout, at least according to:

  https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-tcp-liveness-probe

and for tcp_syn_retries, tcp(7) says:

  The default value is 6, which corresponds to retrying for up to
  approximately 127 seconds.

In this series, to make things transparent, we read out those values,
so that part is fine. But does the Linux kernel clamp the RTO?

It turns out that yes, it does, TCP_RTO_MAX_SEC is 120 seconds (before
1280c26228bd ("tcp: add tcp_rto_max_ms sysctl") that was TCP_RTO_MAX,
same value), and it's used by tcp_retransmit_timer() via tcp_rto_max().
That change makes it configurable.

I'm tempted to suggest that we should read out that value as well
(with a 120-second fallback for older kernels) to make our behaviour
as transparent as possible.

It's slightly more complicated and perhaps not strictly needed, but
we've been bitten a few times by cases where applications and users
expect us to behave like the Linux kernel, and we didn't... so maybe
we could do this as well while at it? Given the rest of this series,
it looks like a relatively small addition to it.

> > It's usually seconds or maybe minutes but not five minutes.
> >   
> > > Basically I see no harm, but also no advantage to clamping or limiting
> > > the RTO, so I'm suggesting going with the simplest code.  
> > 
> > The advantage I see is that we'll recover significantly faster in case
> > something went wrong.  
> 
> That's a fair point in a more general case.
> 
> > > Note that there are (rare) situations where we could get a response
> > > after minutes.
> > >  - The interface on the guest was disabled for a while
> > >  - An error in guest firewall configuration blocked packets for a while
> > > >  - A bug on the guest caused the kernel to wedge for a while
> > >  - The user manually suspended the guest for a while (VM/passt only)
> > > 
> > > These generally indicate something has gone fairly badly wrong, but a
> > > long RTO gives the user a bit more time to realise their mistake and
> > > fix things.  
> > 
> > True, it's just that to me five minutes sounds like "broken beyond
> > repair", while one minute sounds like "oh we tried again and it worked".  
> 
> Eh, maybe.  By nature it's always going to be a bit arbitrary.
> 
> > > These are niche cases, but given the cost of implementing
> > > it is "do nothing"...  
> > 
> > ...anyway, it's not a strong preference from my side. It's mostly about
> > experience but I won't be able to really come up with obvious evidence
> > (at least not quickly), so if the code is significantly simpler...
> > whatever. It's not provable so I won't insist.  
> 
> It's a bit simpler, I'm not sure I'd go so far as "significantly".
> 
> > Note: the comments I'm replying to are from yesterday / Thursday, on
> > v3, and today / Friday we're at v6. I don't expect a week grace period
> > as you would on the kernel:
> > 
> >   https://docs.kernel.org/process/submitting-patches.html#don-t-get-discouraged-or-impatient
> > 
> > because we can surely move faster than that, but three versions in a
> > day obviously before I get any chance to have a look means a
> > substantial overhead for me, and I might miss the meaning and context of
> > comments of other reviewers (David in this case). There are no
> > changelogs in cover letters either.
> > 
> > I plan to skip to v6 but don't expect a review soon, because of that
> > overhead I just mentioned.  

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-20  5:11             ` Stefano Brivio
@ 2025-10-20  9:17               ` David Gibson
  2025-10-28 23:13                 ` Stefano Brivio
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-20  9:17 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 8450 bytes --]

On Mon, Oct 20, 2025 at 07:11:07AM +0200, Stefano Brivio wrote:
> On Mon, 20 Oct 2025 11:20:19 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Fri, Oct 17, 2025 at 08:28:12PM +0200, Stefano Brivio wrote:
> > > On Thu, 16 Oct 2025 09:54:25 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:  
> > > > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > > > <david@gibson.dropbear.id.au> wrote:    
> > > > > >
> > > > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:    
> > > > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > > > backoff timeout for data retransmission starting from one second
> > > > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > > > as allowed by the same RFC:
> > > > > > >
> > > > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > > > >          least 60 seconds.    
> > > > > >
> > > > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > > > further?
> > > > > >
> > > > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > > > second interpretation.
> > > > > >    
> > > > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > > > >
> > > > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > > > ---
> > > > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > > > >
> > > > > > > diff --git a/tcp.c b/tcp.c
> > > > > > > index 3ce3991..84da069 100644
> > > > > > > --- a/tcp.c
> > > > > > > +++ b/tcp.c
> > > > > > > @@ -179,16 +179,12 @@
> > > > > > >   *
> > > > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > > > >   *
> > > > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > > > - *
> > > > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
> > > > > > > > + *   during handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > > > + *   allowed times in a row, reset the connection
> > > > > > >   *
> > > > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > > > @@ -342,8 +338,7 @@ enum {
> > > > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > > > >
> > > > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > > > -#define ACK_TIMEOUT                  2
> > > > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */    
> > > > > >
> > > > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > > > RFCs.    
> > > > > 
> > > > > Sure.    
> > > > > >    
> > > > > > >  #define FIN_TIMEOUT                  60
> > > > > > >  #define ACT_TIMEOUT                  7200
> > > > > > >
> > > > > > > @@ -352,6 +347,11 @@ enum {
> > > > > > >
> > > > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > > > >
> > > > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > > > + * by a total timeout of about 60 seconds.
> > > > > > > + */
> > > > > > > +#define ACK_RETRIES          5
> > > > > > > +    
> > > > > >
> > > > > > As noted above, I think this is based on a misunderstanding of what
> > > > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > > > so.    
> > > > > 
> > > > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > > > could be 255 seconds.
> > > > > 
> > > > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > > > a lot of sense to me" in the previous comment.    
> > > > 
> > > > That's true, but it's pretty much true for 60s as well.  For the local
> > > > link we usually have between passt and guest, even 1s is an eternity.  
> > > 
> > > Rather than the local link I was thinking of whatever monitor or
> > > liveness probe in KubeVirt which might have a 60-second period, or some
> > > firewall agent, or how long it typically takes for guests to stop and
> > > resume again in KubeVirt.  
> > 
> > Right, I hadn't considered those.  Although.. do those actually re-use
> > a single connection?  I would have guessed they use a new connection
> > each time, making the timeouts here irrelevant.
> 
> It depends on the definition of "each time", because we don't time out
> host-side connections immediately.

Hm, ok.  Is your concern that getting a negative answer from the probe
will take too long?

> Pretending passt isn't there, the timeout would come from the default
> values for TCP connections. It looks like there's no specific
> SO_SNDTIMEO value set for those probes, and you can't configure the
> timeout, at least according to:
> 
>   https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-tcp-liveness-probe

My guess would be that the probe would probably time out at the
application level long before the TCP layer times out, but I don't
know for sure.

> and for tcp_syn_retries, tcp(7) says:
> 
>   The default value is 6, which corresponds to retrying for up to
>   approximately 127 seconds.
> 
> In this series, to make things transparent, we read out those values,
> so that part is fine. But does the Linux kernel clamp the RTO?
> 
> It turns out that yes, it does, TCP_RTO_MAX_SEC is 120 seconds (before
> 1280c26228bd ("tcp: add tcp_rto_max_ms sysctl") that was TCP_RTO_MAX,
> same value), and it's used by tcp_retransmit_timer() via tcp_rto_max().
> That change makes it configurable.
> 
> I'm tempted to suggest that we should read out that value as well
> (with a 120-second fallback for older kernels) to make our behaviour
> as transparent as possible.
> 
> It's slightly more complicated and perhaps not strictly needed, but
> we've been bitten a few times by cases where applications and users
> expect us to behave like the Linux kernel, and we didn't... so maybe
> we could do this as well while at it? Given the rest of this series,
> it looks like a relatively small addition to it.

I think that's a good idea.  It's a bit more work, but it doesn't
greatly increase the conceptual complexity and will more closely match
the kernel's behaviour.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-17 18:28         ` Stefano Brivio
  2025-10-20  0:20           ` David Gibson
@ 2025-10-20 10:57           ` Yumei Huang
  2025-10-20 23:20             ` Stefano Brivio
  1 sibling, 1 reply; 31+ messages in thread
From: Yumei Huang @ 2025-10-20 10:57 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: David Gibson, passt-dev

On Sat, Oct 18, 2025 at 2:28 AM Stefano Brivio <sbrivio@redhat.com> wrote:
>
> On Thu, 16 Oct 2025 09:54:25 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
>
> > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:
> > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > <david@gibson.dropbear.id.au> wrote:
> > > >
> > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:
> > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > backoff timeout for data retransmission starting from one second
> > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > as allowed by the same RFC:
> > > > >
> > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > >          least 60 seconds.
> > > >
> > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > further?
> > > >
> > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > second interpretation.
> > > >
> > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > >
> > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > ---
> > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/tcp.c b/tcp.c
> > > > > index 3ce3991..84da069 100644
> > > > > --- a/tcp.c
> > > > > +++ b/tcp.c
> > > > > @@ -179,16 +179,12 @@
> > > > >   *
> > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > >   *
> > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > - *
> > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, either
> > > > > > + *   during handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > + *   allowed times in a row, reset the connection
> > > > >   *
> > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > @@ -342,8 +338,7 @@ enum {
> > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > >
> > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > -#define ACK_TIMEOUT                  2
> > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */
> > > >
> > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > RFCs.
> > >
> > > Sure.
> > > >
> > > > >  #define FIN_TIMEOUT                  60
> > > > >  #define ACT_TIMEOUT                  7200
> > > > >
> > > > > @@ -352,6 +347,11 @@ enum {
> > > > >
> > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > >
> > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > + * by a total timeout of about 60 seconds.
> > > > > + */
> > > > > +#define ACK_RETRIES          5
> > > > > +
> > > >
> > > > As noted above, I think this is based on a misunderstanding of what
> > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > so.
> > >
> > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > could be 255 seconds.
> > >
> > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > a lot of sense to me" in the previous comment.
> >
> > That's true, but it's pretty much true for 60s as well.  For the local
> > link we usually have between passt and guest, even 1s is an eternity.
>
> Rather than the local link I was thinking of whatever monitor or
> liveness probe in KubeVirt which might have a 60-second period, or some
> firewall agent, or how long it typically takes for guests to stop and
> resume again in KubeVirt.
>
> It's usually seconds or maybe minutes but not five minutes.
>
> > Basically I see no harm, but also no advantage to clamping or limiting
> > the RTO, so I'm suggesting going with the simplest code.
>
> The advantage I see is that we'll recover significantly faster in case
> something went wrong.
>
> > Note that there are (rare) situations where we could get a response
> > after minutes.
> >  - The interface on the guest was disabled for a while
> >  - An error in guest firewall configuration blocked packets for a while
> > >  - A bug on the guest caused the kernel to wedge for a while
> >  - The user manually suspended the guest for a while (VM/passt only)
> >
> > These generally indicate something has gone fairly badly wrong, but a
> > long RTO gives the user a bit more time to realise their mistake and
> > fix things.
>
> True, it's just that to me five minutes sounds like "broken beyond
> repair", while one minute sounds like "oh we tried again and it worked".
>
> > These are niche cases, but given the cost of implementing
> > it is "do nothing"...
>
> ...anyway, it's not a strong preference from my side. It's mostly about
> experience but I won't be able to really come up with obvious evidence
> (at least not quickly), so if the code is significantly simpler...
> whatever. It's not provable so I won't insist.
>
> Note: the comments I'm replying to are from yesterday / Thursday, on
> v3, and today / Friday we're at v6. I don't expect a week grace period
> as you would on the kernel:
>
>   https://docs.kernel.org/process/submitting-patches.html#don-t-get-discouraged-or-impatient
>
> because we can surely move faster than that, but three versions in a
> day obviously before I get any chance to have a look means a
> substantial overhead for me, and I might miss the meaning and context of
> comments of other reviewers (David in this case). There are no
> changelogs in cover letters either.
>
> I plan to skip to v6 but don't expect a review soon, because of that
> overhead I just mentioned.

Sorry for the overhead I caused. It's just so different from what we
do with MRs or PRs (at least within our team), which we are supposed to
update as soon as possible, so that reviewers can review again whenever
they are available. There, it's always the latest code (with less
"problematic" code) that is up for review, not outdated versions. I
thought it was the same with patches sent by email, that is, that
outdated versions are no longer useful. Apparently I got it wrong. I
will keep it in mind, avoid sending too many versions in a short time,
and add changelogs to cover letters when necessary.
>
> --
> Stefano
>


-- 
Thanks,

Yumei Huang


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-20 10:57           ` Yumei Huang
@ 2025-10-20 23:20             ` Stefano Brivio
  2025-10-22  2:23               ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-20 23:20 UTC (permalink / raw)
  To: Yumei Huang; +Cc: David Gibson, passt-dev

On Mon, 20 Oct 2025 18:57:45 +0800
Yumei Huang <yuhuang@redhat.com> wrote:

> On Sat, Oct 18, 2025 at 2:28 AM Stefano Brivio <sbrivio@redhat.com> wrote:
> >
> > On Thu, 16 Oct 2025 09:54:25 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >  
> > > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:  
> > > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > > <david@gibson.dropbear.id.au> wrote:  
> > > > >
> > > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:  
> > > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > > backoff timeout for data retransmission starting from one second
> > > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > > as allowed by the same RFC:
> > > > > >
> > > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > > >          least 60 seconds.  
> > > > >
> > > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > > further?
> > > > >
> > > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > > second interpretation.
> > > > >  
> > > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > > >
> > > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > > ---
> > > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > > >
> > > > > > diff --git a/tcp.c b/tcp.c
> > > > > > index 3ce3991..84da069 100644
> > > > > > --- a/tcp.c
> > > > > > +++ b/tcp.c
> > > > > > @@ -179,16 +179,12 @@
> > > > > >   *
> > > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > > >   *
> > > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > > - *
> > > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, eiher
> > > > > > + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > > + *   allowed times in a row, reset the connection
> > > > > >   *
> > > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > > @@ -342,8 +338,7 @@ enum {
> > > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > > >
> > > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > > -#define ACK_TIMEOUT                  2
> > > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */  
> > > > >
> > > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > > RFCs.  
> > > >
> > > > Sure.  
> > > > >  
> > > > > >  #define FIN_TIMEOUT                  60
> > > > > >  #define ACT_TIMEOUT                  7200
> > > > > >
> > > > > > @@ -352,6 +347,11 @@ enum {
> > > > > >
> > > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > > >
> > > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > > + * by a total timeout of about 60 seconds.
> > > > > > + */
> > > > > > +#define ACK_RETRIES          5
> > > > > > +  
> > > > >
> > > > > As noted above, I think this is based on a misunderstanding of what
> > > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > > so.  
> > > >
> > > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > > could be 255 seconds.
> > > >
> > > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > > a lot of sense to me" in the previous comment.  
> > >
> > > That's true, but it's pretty much true for 60s as well.  For the local
> > > link we usually have between passt and guest, even 1s is an eternity.  
> >
> > Rather than the local link I was thinking of whatever monitor or
> > liveness probe in KubeVirt which might have a 60-second period, or some
> > firewall agent, or how long it typically takes for guests to stop and
> > resume again in KubeVirt.
> >
> > It's usually seconds or maybe minutes but not five minutes.
> >  
> > > Basically I see no harm, but also no advantage to clamping or limiting
> > > the RTO, so I'm suggesting going with the simplest code.  
> >
> > The advantage I see is that we'll recover significantly faster in case
> > something went wrong.
> >  
> > > Note that there are (rare) situations where we could get a response
> > > after minutes.
> > >  - The interface on the guest was disabled for a while
> > >  - An error in guest firewall configuration blocked packets for a while
> > >  - A bug on the guest cause the kernel to wedge for a while
> > >  - The user manually suspended the guest for a while (VM/passt only)
> > >
> > > These generally indicate something has gone fairly badly wrong, but a
> > > long RTO gives the user a bit more time to realise their mistake and
> > > fix things.  
> >
> > True, it's just that to me five minutes sounds like "broken beyond
> > repair", while one minute sounds like "oh we tried again and it worked".
> >  
> > > These are niche cases, but given the cost of implementing
> > > it is "do nothing"...  
> >
> > ...anyway, it's not a strong preference from my side. It's mostly about
> > experience but I won't be able to really come up with obvious evidence
> > (at least not quickly), so if the code is significantly simpler...
> > whatever. It's not provable so I won't insist.
> >
> > Note: the comments I'm replying to are from yesterday / Thursday, on
> > v3, and today / Friday we're at v6. I don't expect a week grace period
> > as you would on the kernel:
> >
> >   https://docs.kernel.org/process/submitting-patches.html#don-t-get-discouraged-or-impatient
> >
> > because we can surely move faster than that, but three versions in a
> > day obviously before I get any chance to have a look means a
> > substantial overhead for me, and I might miss the meaning and context of
> > comments of other reviewers (David in this case). There are no
> > changelogs in cover letters either.
> >
> > I plan to skip to v6 but don't expect a review soon, because of that
> > overhead I just mentioned.  
> 
> Sorry for the overhead I brought. It's just so different from what we
> do with MRs or PRs(at least within our team), which we are supposed to
> update as soon as possible, so reviewers could review again at any
> time they are available. And it's always the latest code (with less
> "problematic" code) there for review, not the outdated ones.

Oh, I see now.

I also have some experience with contributing via git forges, and I
think it's a serious limitation (at least on GitHub) coming from the
fact that you don't have (proper) threading. You have it on discussions
and issues/tickets but not on code reviews.

You lose one dimension of discussion there, because it becomes entirely
"linear", and while you can see differences between revisions, it's not
really practical to review or discuss them. There's also no space to
record and describe changes, if you just force push a branch.

I think code quality suffers because if the author of the change and
just one reviewer are fast enough, the point of view of everybody else
will be ignored.

Other points of view can be re-evaluated later, but in this case you'll
waste more time writing yet another revision, which might now ignore a
previous comment (that you addressed, previously) because it's not
visible anymore.

 * * *

Let's pick this practical example here: we were in the middle of a
discussion about whether we need to properly size a buffer to read out
sysctl values (David's idea), or if we can go for a larger buffer in any
case to keep things simpler (my proposal).

Before I had the chance to follow up with the discussion, you posted
another revision. And then another one.

On GitHub, it would be impossible for me to re-open that discussion, so
I would start a new one, and now David might miss the fact it's the
same discussion. Maybe he was right, but it doesn't matter anymore.

With email, I can do that because we have threading and persistence, but
if the outcome of the discussion now changes, you wasted time with
another revision.

Or maybe I see that you're at v7 now and I forget that that discussion
was still open, so my previous point, even if valid, is now effectively
ignored and forgotten by everybody.

The workflow you have on GitHub works well if you have one author and
one reviewer, or more reviewers who are always right and always agree
with each other, but that's a quite unrealistic expectation.

I guess it also works well if code quantity is more important than
quality, because it's merged faster that way, and because it's harder
to discuss it (no real threading). But here we're trying to have
less code and fewer bugs, not more.

> I thought
> it's the same with patches in emails, that outdated versions are no
> longer useful.

They are, but they're not so practical to have a discussion about, so
not so useful as the current one, which is why discussions should have
a chance to complete.

You'll just be busy writing new revisions otherwise, instead of having
time for something else in parallel.

And reviewers have other stuff to review too, so we don't really gain
time if you re-post fast.

It's different if we have a critical issue affecting many users and we
want to fix it fast for them. But usually it's a small patch/series in
that case and we don't care so much about discussing the best approach
as long as it's fixed and released quickly.

> Apparently I got it wrong. I will keep it in mind and
> not send too many versions in a short time, and add changelogs in
> cover letters when necessary.

It's not always necessary I think, and sometimes you can keep things
short if they're obvious to everybody. These are the biggest series
ever posted for passt, in terms of number of patches:

  [PATCH v2 00/32] Use dual stack sockets to listen for inbound
  TCP connections
  https://archives.passt.top/passt-dev/20221117055908.2782981-1-david@gibson.dropbear.id.au/

  [PATCH v11 00/30] Introduce discontiguous frames management
  https://archives.passt.top/passt-dev/20250902075253.990038-1-lvivier@redhat.com/

...you'll see that, for some revisions, changes are very briefly
summarised. That's enough, especially if there was a single reviewer
for a given revision.

But with this series it's doable and there are a few specific changes
between each revision, so I think you should, because it helps
reviewers to understand what you're doing.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-20 23:20             ` Stefano Brivio
@ 2025-10-22  2:23               ` David Gibson
  0 siblings, 0 replies; 31+ messages in thread
From: David Gibson @ 2025-10-22  2:23 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 7548 bytes --]

On Tue, Oct 21, 2025 at 01:20:46AM +0200, Stefano Brivio wrote:
> On Mon, 20 Oct 2025 18:57:45 +0800
> Yumei Huang <yuhuang@redhat.com> wrote:
> > On Sat, Oct 18, 2025 at 2:28 AM Stefano Brivio <sbrivio@redhat.com> wrote:
[snip]
> > > because we can surely move faster than that, but three versions in a
> > > day obviously before I get any chance to have a look means a
> > > substantial overhead for me, and I might miss the meaning and context of
> > > comments of other reviewers (David in this case). There are no
> > > changelogs in cover letters either.
> > >
> > > I plan to skip to v6 but don't expect a review soon, because of that
> > > overhead I just mentioned.  
> > 
> > Sorry for the overhead I brought. It's just so different from what we
> > do with MRs or PRs(at least within our team), which we are supposed to
> > update as soon as possible, so reviewers could review again at any
> > time they are available. And it's always the latest code (with less
> > "problematic" code) there for review, not the outdated ones.

A lot of the differences from the workflow on a forge are a question
of degree, rather than qualitative.  If I see v4, v5 & v6 in my inbox
and I haven't looked at any yet, I'll delete v4 & v5 and go straight
to v6.


It gets trickier when there's already a discussion thread started on
one of the earlier versions (by me or someone else), or if I've
already started reviewing an earlier version.

The higher latencies of an email flow make those sorts of collision
more likely.  Although not as much as you might think, because most of
the latency is not from the technology, but from when people are awake
/ available.

> Oh, I see now.
> 
> I also have some experience with contributing via git forges, and I
> think it's a serious limitation (at least on GitHub) coming from the
> fact that you don't have (proper) threading. You have it on discussions
> and issues/tickets but not on code reviews.
> 
> You lose one dimension of discussion there, because it becomes entirely
> "linear", and while you can see differences between revisions, it's not
> really practical to review or discuss them. There's also no space to
> record and describe changes, if you just force push a branch.
> 
> I think code quality suffers because if the author of the change and
> just one reviewer are fast enough, the point of view of everybody else
> will be ignored.
> 
> Other points of view can be re-evaluated later, but in this case you'll
> waste more time writing yet another revision, which might now ignore a
> previous comment (that you addressed, previously) because it's not
> visible anymore.

I largely agree.  I find there are things to like about the forge
method.  Tracking comments according to the source line they are
relevant to can be nice in terms of automatically invalidating them
once the code is updated.  But it can also be a trap if the comment is
a broader point / discussion that was just attached to that source
line for want of a better place.

I find the forges sometimes encourage review at the level of the
total diff of the MR, rather than patch-by-patch.  Patch-by-patch
often makes it easier to follow - or at least gives the contributor
more scope to make the change clearer by the way they split it up.



>  * * *
> 
> Let's pick this practical example here: we were in the middle of a
> discussion about whether we need to properly size a buffer to read out
> sysctl values (David's idea), or if we can go for a larger buffer in any
> case to keep things simpler (my proposal).
> 
> Before I had the chance to follow up with the discussion, you posted
> another revision. And then another one.
> 
> On GitHub, it would be impossible for me to re-open that discussion, so
> I would start a new one, and now David might miss the fact it's the
> same discussion. Maybe he was right, but it doesn't matter anymore.
> 
> With email, I can do that because we have threading and persistence, but
> if the outcome of the discussion now changes, you wasted time with
> another revision.
> 
> Or maybe I see that you're at v7 now and I forget that that discussion
> was still open, so my previous point, even if valid, is now effectively
> ignored and forgotten by everybody.
> 
> The workflow you have on GitHub works well if you have one author and
> one reviewer, or more reviewers who are always right and always agree
> between each other, but that's a quite unrealistic expectation.
> 
> I guess it also works well if code quantity is more important than
> quality, because it's merged faster that way, and because it's harder
> to discuss about it (no real threading). But here we're trying to have
> less code and less bugs, not more.
> 
> > I thought
> > it's the same with patches in emails, that outdated versions are no
> > longer useful.
> 
> They are, but they're not so practical to have a discussion about, so
> not so useful as the current one, which is why discussions should have
> a chance to complete.
> 
> You'll just be busy writing new revisions otherwise, instead of having
> time for something else in parallel.

Btw, it's perfectly ok to apply some fixes from review in your local
tree for things that are simple, but wait a while before re-posting
for longer discussions to complete or for additional reviewers to
contribute.  I do this all the time.

There aren't a lot of hard and fast rules here, it's a question of
weighing the reasons to post early against the reasons to wait for the
next spin.  Taking a while to tune that is expected for any new
developer.

> And reviewers have other stuff to review too, so we don't really gain
> time if you re-post fast.
> 
> It's different if we have a critical issue affecting many users and we
> want to fix it fast for them. But usually it's a small patch/series in
> that case and we don't care so much about discussing the best approach
> as long as it's fixed and released quickly.
> 
> > Apparently I got it wrong. I will keep it in mind and
> > not send too many versions in a short time, and add changelogs in
> > cover letters when necessary.
> 
> It's not always necessary I think, and sometimes you can keep things
> short if they're obvious to everybody.

Right.  As a reviewer, a detailed changelog is very useful.  But as a
developer I know that maintaining that changelog can be a lot of work.
So again, it's a matter of finding a balance.

> These are the biggest series
> ever posted for passt, in terms of number of patches:
> 
>   [PATCH v2 00/32] Use dual stack sockets to listen for inbound
>   TCP connections
>   https://archives.passt.top/passt-dev/20221117055908.2782981-1-david@gibson.dropbear.id.au/
> 
>   [PATCH v11 00/30] Introduce discontiguous frames management
>   https://archives.passt.top/passt-dev/20250902075253.990038-1-lvivier@redhat.com/
> 
> ...you'll see that, for some revisions, changes are very briefly
> summarised. That's enough, especially if there was a single reviewer
> for a given revision.
> 
> But with this series it's doable and there are a few specific changes
> between each revision, so I think you should, because it helps
> reviewers to understand what you're doing.
> 
> -- 
> Stefano
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-15  4:46       ` David Gibson
  2025-10-15  5:46         ` Yumei Huang
@ 2025-10-28 23:12         ` Stefano Brivio
  2025-10-29  0:43           ` David Gibson
  1 sibling, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-28 23:12 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Wed, 15 Oct 2025 15:46:12 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Oct 15, 2025 at 11:50:53AM +0800, Yumei Huang wrote:
> > On Wed, Oct 15, 2025 at 7:28 AM David Gibson
> > <david@gibson.dropbear.id.au> wrote:  
> > > On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:  
> > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>  
> [snip]
> > > > + * @path: Path to the sysctl file
> > > > + * @fallback: Default value if file can't be read
> > > > + *
> > > > + * Return: Parameter value, fallback on failure
> > > > +*/
> > > > +long read_file_long(const char *path, long fallback)
> > > > +{
> > > > +        char buf[32];  
> > >
> > > Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> > > a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> > > type that doesn't have a known fixed width.  Pity the C library
> > > doesn't have constants for these AFAICT.  
> > 
> > I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).  
> 
> That's not quite right.
>  - It should be INTMAX_STRLEN (signed), UINTMAX would be for the
>    unsigned version
>  - That assumes intmax_t is 32-bit which is probably not the case (it
>    will be 64-bit, maybe even 128-bit on modern systems)
>  - For signed cases, it's the minimum (negative) value that gives the
>    longest possible string (for 32-bit, "-2147483648")

By the way, while it doesn't cover intmax_t explicitly, I think this is
a pretty good resource as it covers most architectures supported by the
Linux kernel (hence, most architectures we support):

  https://wiki.debian.org/ArchitectureSpecificsMemo#Summary

and judging from intmax_t(3type) I'd say that the sizeof(long double)
column tells you how big intmax_t is.

Well, at least, that's the page I use to know which architectures I can
use to check things when I suspect a type portability bug.

That's because 'long double' should always be the biggest "native" data
type, that is, excluding __int128 or vectorised / SIMD types such as
__m256i.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-20  9:17               ` David Gibson
@ 2025-10-28 23:13                 ` Stefano Brivio
  2025-10-29  0:35                   ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-28 23:13 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Mon, 20 Oct 2025 20:17:10 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Mon, Oct 20, 2025 at 07:11:07AM +0200, Stefano Brivio wrote:
> > On Mon, 20 Oct 2025 11:20:19 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Fri, Oct 17, 2025 at 08:28:12PM +0200, Stefano Brivio wrote:  
> > > > On Thu, 16 Oct 2025 09:54:25 +1100
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > >     
> > > > > On Wed, Oct 15, 2025 at 02:31:27PM +0800, Yumei Huang wrote:    
> > > > > > On Wed, Oct 15, 2025 at 8:05 AM David Gibson
> > > > > > <david@gibson.dropbear.id.au> wrote:      
> > > > > > >
> > > > > > > On Tue, Oct 14, 2025 at 03:38:36PM +0800, Yumei Huang wrote:      
> > > > > > > > According to RFC 2988 and RFC 6298, we should use an exponential
> > > > > > > > backoff timeout for data retransmission starting from one second
> > > > > > > > (see Appendix A in RFC 6298), and limit it to about 60 seconds
> > > > > > > > as allowed by the same RFC:
> > > > > > > >
> > > > > > > >    (2.5) A maximum value MAY be placed on RTO provided it is at
> > > > > > > >          least 60 seconds.      
> > > > > > >
> > > > > > > The interpretation of this isn't entirely clear to me.  Does it mean
> > > > > > > if the total retransmit delay exceeds 60s we give up and RST (what
> > > > > > > this patch implements)?  Or does it mean that if the retransmit delay
> > > > > > > reaches 60s we keep retransmitting, but don't increase the delay any
> > > > > > > further?
> > > > > > >
> > > > > > > Looking at tcp_bound_rto() and related code in the kernel suggests the
> > > > > > > second interpretation.
> > > > > > >      
> > > > > > > > Combine the macros defining the initial timeout for both SYN and ACK.
> > > > > > > > And add a macro ACK_RETRIES to limit the total timeout to about 60s.
> > > > > > > >
> > > > > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>
> > > > > > > > ---
> > > > > > > >  tcp.c | 32 ++++++++++++++++----------------
> > > > > > > >  1 file changed, 16 insertions(+), 16 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/tcp.c b/tcp.c
> > > > > > > > index 3ce3991..84da069 100644
> > > > > > > > --- a/tcp.c
> > > > > > > > +++ b/tcp.c
> > > > > > > > @@ -179,16 +179,12 @@
> > > > > > > >   *
> > > > > > > >   * Timeouts are implemented by means of timerfd timers, set based on flags:
> > > > > > > >   *
> > > > > > > > - * - SYN_TIMEOUT_INIT: if no ACK is received from tap/guest during handshake
> > > > > > > > - *   (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, resend
> > > > > > > > - *   SYN. It's the starting timeout for the first SYN retry. If this persists
> > > > > > > > - *   for more than TCP_MAX_RETRIES or (tcp_syn_retries +
> > > > > > > > - *   tcp_syn_linear_timeouts) times in a row, reset the connection
> > > > > > > > - *
> > > > > > > > - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
> > > > > > > > - *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
> > > > > > > > - *   socket and reset sequence to what was acknowledged. If this persists for
> > > > > > > > - *   more than TCP_MAX_RETRIES times in a row, reset the connection
> > > > > > > > + * - ACK_TIMEOUT_INIT: if no ACK segment was received from tap/guest, eiher
> > > > > > > > + *   during handshake(flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
> > > > > > > > + *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
> > > > > > > > + *   from the socket and reset sequence to what was acknowledged. It's the
> > > > > > > > + *   starting timeout for the first retry. If this persists for more than
> > > > > > > > + *   allowed times in a row, reset the connection
> > > > > > > >   *
> > > > > > > >   * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
> > > > > > > >   *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
> > > > > > > > @@ -342,8 +338,7 @@ enum {
> > > > > > > >  #define WINDOW_DEFAULT                       14600           /* RFC 6928 */
> > > > > > > >
> > > > > > > >  #define ACK_INTERVAL                 10              /* ms */
> > > > > > > > -#define SYN_TIMEOUT_INIT             1               /* s */
> > > > > > > > -#define ACK_TIMEOUT                  2
> > > > > > > > +#define ACK_TIMEOUT_INIT             1               /* s, RFC 6298 */      
> > > > > > >
> > > > > > > I'd suggest calling this RTO_INIT to match the terminology used in the
> > > > > > > RFCs.      
> > > > > > 
> > > > > > Sure.      
> > > > > > >      
> > > > > > > >  #define FIN_TIMEOUT                  60
> > > > > > > >  #define ACT_TIMEOUT                  7200
> > > > > > > >
> > > > > > > > @@ -352,6 +347,11 @@ enum {
> > > > > > > >
> > > > > > > >  #define ACK_IF_NEEDED        0               /* See tcp_send_flag() */
> > > > > > > >
> > > > > > > > +/* Number of retries calculated from the exponential backoff formula, limited
> > > > > > > > + * by a total timeout of about 60 seconds.
> > > > > > > > + */
> > > > > > > > +#define ACK_RETRIES          5
> > > > > > > > +      
> > > > > > >
> > > > > > > As noted above, I think this is based on a misunderstanding of what
> > > > > > > the RFC is saying.  TCP_MAX_RETRIES should be fine as it is, I think.
> > > > > > > We could implement the clamping of the RTO, but it's a "MAY" in the
> > > > > > > RFC, so we don't have to, and I don't really see a strong reason to do
> > > > > > > so.      
> > > > > > 
> > > > > > If we use TCP_MAX_RETRIES and not clamping RTO, the total timeout
> > > > > > could be 255 seconds.
> > > > > > 
> > > > > > Stefano mentioned "Retransmitting data after 256 seconds doesn't make
> > > > > > a lot of sense to me" in the previous comment.      
> > > > > 
> > > > > That's true, but it's pretty much true for 60s as well.  For the local
> > > > > link we usually have between passt and guest, even 1s is an eternity.    
> > > > 
> > > > Rather than the local link I was thinking of whatever monitor or
> > > > liveness probe in KubeVirt which might have a 60-second period, or some
> > > > firewall agent, or how long it typically takes for guests to stop and
> > > > resume again in KubeVirt.    
> > > 
> > > Right, I hadn't considered those.  Although.. do those actually re-use
> > > a single connection?  I would have guessed they use a new connection
> > > each time, making the timeouts here irrelevant.  
> > 
> > It depends on the definition of "each time", because we don't time out
> > host-side connections immediately.  
> 
> Hm, ok.  Is your concern that getting a negative answer from the probe
> will take too long?

More like getting a positive answer taking too long, because we retry
so infrequently.

> > Pretending passt isn't there, the timeout would come from the default
> > values for TCP connections. It looks like there's no specific
> > SO_SNDTIMEO value set for those probes, and you can't configure the
> > timeout, at least according to:
> > 
> >   https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-tcp-liveness-probe  
> 
> My guess would be that the probe would probably time out at the
> application level long before the TCP layer times out, but I don't
> know for sure.

I don't think so. What I was pointing out is that I couldn't find any
place in the implementation of those probes where a particular
*handshake timeout* (not probe interval) is set on top of Linux's
defaults, so timeouts at TCP layer and application level should be the
same (no additional timeout in application logic).

> > and for tcp_syn_retries, tcp(7) says:
> > 
> >   The default value is 6, which corresponds to retrying for up to
> >   approximately 127 seconds.
> > 
> > In this series, to make things transparent, we read out those values,
> > so that part is fine. But does the Linux kernel clamp the RTO?
> > 
> > It turns out that yes, it does, TCP_RTO_MAX_SEC is 120 seconds (before
> > 1280c26228bd ("tcp: add tcp_rto_max_ms sysctl") that was TCP_RTO_MAX,
> > same value), and it's used by tcp_retransmit_timer() via tcp_rto_max().
> > That change makes it configurable.
> > 
> > I'm tempted to suggest that we should read out that value as well
> > (with a 120-second fallback for older kernels) to make our behaviour
> > as transparent as possible.
> > 
> > It's slightly more complicated and perhaps not strictly needed, but
> > we've been bitten a few times by cases where applications and users
> > expect us to behave like the Linux kernel, and we didn't... so maybe
> > we could do this as well while at it? Given the rest of this series,
> > it looks like a relatively small addition to it.  
> 
> I think that's a good idea.  It's a bit more work, but it doesn't
> greatly increase the conceptual complexity and will more closely match
> the kernel's behaviour.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-28 23:13                 ` Stefano Brivio
@ 2025-10-29  0:35                   ` David Gibson
  2025-10-29  4:52                     ` Stefano Brivio
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-29  0:35 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 2425 bytes --]

On Wed, Oct 29, 2025 at 12:13:30AM +0100, Stefano Brivio wrote:
> On Mon, 20 Oct 2025 20:17:10 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Mon, Oct 20, 2025 at 07:11:07AM +0200, Stefano Brivio wrote:
> > > On Mon, 20 Oct 2025 11:20:19 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
[snip]
> > > > > Rather than the local link I was thinking of whatever monitor or
> > > > > liveness probe in KubeVirt which might have a 60-second period, or some
> > > > > firewall agent, or how long it typically takes for guests to stop and
> > > > > resume again in KubeVirt.    
> > > > 
> > > > Right, I hadn't considered those.  Although.. do those actually re-use
> > > > a single connection?  I would have guessed they use a new connection
> > > > each time, making the timeouts here irrelevant.  
> > > 
> > > It depends on the definition of "each time", because we don't time out
> > > host-side connections immediately.  
> > 
> > Hm, ok.  Is your concern that getting a negative answer from the probe
> > will take too long?
> 
> More like getting a positive answer taking too long, because we retry
> so infrequently.

Right, but it will only be slow if we lose the first probe, which
should be very rare.

> > > Pretending passt isn't there, the timeout would come from the default
> > > values for TCP connections. It looks like there's no specific
> > > SO_SNDTIMEO value set for those probes, and you can't configure the
> > > timeout, at least according to:
> > > 
> > >   https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-tcp-liveness-probe  
> > 
> > My guess would be that the probe would probably time out at the
> > application level long before the TCP layer times out, but I don't
> > know for sure.
> 
> I don't think so. What I was pointing out is that I couldn't find any
> place in the implementation of those probes where a particular
> *handshake timeout* (not probe interval) is set on top of Linux's
> defaults, so timeouts at TCP layer and application level should be the
> same (no additional timeout in application logic).

Huh, that's mildly surprising to me.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-28 23:12         ` Stefano Brivio
@ 2025-10-29  0:43           ` David Gibson
  2025-10-29  4:43             ` Stefano Brivio
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-29  0:43 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 2924 bytes --]

On Wed, Oct 29, 2025 at 12:12:48AM +0100, Stefano Brivio wrote:
> On Wed, 15 Oct 2025 15:46:12 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Oct 15, 2025 at 11:50:53AM +0800, Yumei Huang wrote:
> > > On Wed, Oct 15, 2025 at 7:28 AM David Gibson
> > > <david@gibson.dropbear.id.au> wrote:  
> > > > On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:  
> > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>  
> > [snip]
> > > > > + * @path: Path to the sysctl file
> > > > > + * @fallback: Default value if file can't be read
> > > > > + *
> > > > > + * Return: Parameter value, fallback on failure
> > > > > +*/
> > > > > +long read_file_long(const char *path, long fallback)
> > > > > +{
> > > > > +        char buf[32];  
> > > >
> > > > Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> > > > a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> > > > type that doesn't have a known fixed width.  Pity the C library
> > > > doesn't have constants for these AFAICT.  
> > > 
> > > I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).  
> > 
> > That's not quite right.
> >  - It should be INTMAX_STRLEN (signed), UINTMAX would be for the
> >    unsigned version
> >  - That assumes intmax_t is 32-bit which is probably not the case (it
> >    will be 64-bit, maybe even 128-bit on modern systems)
> >  - For signed cases, it's the minimum (negative) value that gives the
> >    longest possible string (for 32-bit, "-2147483648")
> 
> By the way, while it doesn't cover intmax_t explicitly, I think this is
> a pretty good resource as it covers most architectures supported by the
> Linux kernel (hence, most architectures we support):
> 
>   https://wiki.debian.org/ArchitectureSpecificsMemo#Summary

Oh, nice, that is a very handy resource.

> and judging from intmax_t(3type) I'd say that the sizeof(long double)
> column tells you how big intmax_t is.

> Well, at least, that's the page I use to know which architectures I can
> use to check things when I suspect a type portability bug.
> 
> That's because 'long double' should always be the biggest "native" data
> type, that is, excluding __int128 or vectorised / SIMD types such as
> __m256i.

This part isn't true, alas.  Theoretically speaking there's not
necessarily any relation between the largest native integer type and
the largest native float type.

But more importantly, it's not true in practice: according to the
table sizeof(long double) is 16 for amd64, but sizeof(intmax_t) is 8
empirically.

I think sizeof(long long) is more likely to match sizeof(intmax_t),
but I don't love relying on it.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-29  0:43           ` David Gibson
@ 2025-10-29  4:43             ` Stefano Brivio
  2025-10-29  9:35               ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-29  4:43 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Wed, 29 Oct 2025 11:43:00 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Oct 29, 2025 at 12:12:48AM +0100, Stefano Brivio wrote:
> > On Wed, 15 Oct 2025 15:46:12 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Wed, Oct 15, 2025 at 11:50:53AM +0800, Yumei Huang wrote:  
> > > > On Wed, Oct 15, 2025 at 7:28 AM David Gibson
> > > > <david@gibson.dropbear.id.au> wrote:    
> > > > > On Tue, Oct 14, 2025 at 03:38:34PM +0800, Yumei Huang wrote:    
> > > > > > Signed-off-by: Yumei Huang <yuhuang@redhat.com>    
> > > [snip]  
> > > > > > + * @path: Path to the sysctl file
> > > > > > + * @fallback: Default value if file can't be read
> > > > > > + *
> > > > > > + * Return: Parameter value, fallback on failure
> > > > > > +*/
> > > > > > +long read_file_long(const char *path, long fallback)
> > > > > > +{
> > > > > > +        char buf[32];    
> > > > >
> > > > > Rather than just using a semi-arbitrary 32 here, I'd suggest defining
> > > > > a new constant similar to UINT16_STRLEN.  Except that's trickier for a
> > > > > type that doesn't have a known fixed width.  Pity the C library
> > > > > doesn't have constants for these AFAICT.    
> > > > 
> > > > I will just define a UINTMAX_STRLEN with (sizeof("2147483647")).    
> > > 
> > > That's not quite right.
> > >  - It should be INTMAX_STRLEN (signed), UINTMAX would be for the
> > >    unsigned version
> > >  - That assumes intmax_t is 32-bit which is probably not the case (it
> > >    will be 64-bit, maybe even 128-bit on modern systems)
> > >  - For signed cases, it's the minimum (negative) value that gives the
> > >    longest possible string (for 32-bit, "-2147483648")  
> > 
> > By the way, while it doesn't cover intmax_t explicitly, I think this is
> > a pretty good resource as it covers most architectures supported by the
> > Linux kernel (hence, most architectures we support):
> > 
> >   https://wiki.debian.org/ArchitectureSpecificsMemo#Summary  
> 
> Oh, nice, that is a very handy resource.
> 
> > and judging from intmax_t(3type) I'd say that the sizeof(long double)
> > column tells you how big intmax_t is.  
> 
> > Well, at least, that's the page I use to know which architectures I can
> > use to check things when I suspect a type portability bug.
> > 
> > That's because 'long double' should always be the biggest "native" data
> > type, that is, excluding __int128 or vectorised / SIMD types such as
> > __m256i.  
> 
> This part isn't true, alas.  Theoretically speaking there's not
> necessarily any relation between the largest native integer type and
> the largest native float type.

Oops, yes, I misread intmax_t(3type), that's *integer* only (of course,
the name says it). So probably it has to match sizeof(long long)?

> But more importantly, it's not true in practice: according to the
> table sizeof(long double) is 16 for amd64, but sizeof(intmax_t) is 8
> empirically.
> 
> I think sizeof(long long) is more likely to match sizeof(intmax_t),
> but I don't love relying on it.

Right... well, about relying on it, without a change in the C11
standard, can it ever differ? I don't think so. We could have a look at
C17 / C23 and if long long is still the largest integer type, we know
we're fine for quite a few years / pretty much forever.

By the way, just as a reminder (also to self): we don't actually need
this here.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-29  0:35                   ` David Gibson
@ 2025-10-29  4:52                     ` Stefano Brivio
  2025-10-29  9:37                       ` David Gibson
  0 siblings, 1 reply; 31+ messages in thread
From: Stefano Brivio @ 2025-10-29  4:52 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Wed, 29 Oct 2025 11:35:29 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Oct 29, 2025 at 12:13:30AM +0100, Stefano Brivio wrote:
> > On Mon, 20 Oct 2025 20:17:10 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Mon, Oct 20, 2025 at 07:11:07AM +0200, Stefano Brivio wrote:  
> > > > On Mon, 20 Oct 2025 11:20:19 +1100
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:  
> [snip]
> > > > > > Rather than the local link I was thinking of whatever monitor or
> > > > > > liveness probe in KubeVirt which might have a 60-second period, or some
> > > > > > firewall agent, or how long it typically takes for guests to stop and
> > > > > > resume again in KubeVirt.      
> > > > > 
> > > > > Right, I hadn't considered those.  Although.. do those actually re-use
> > > > > a single connection?  I would have guessed they use a new connection
> > > > > each time, making the timeouts here irrelevant.    
> > > > 
> > > > It depends on the definition of "each time", because we don't time out
> > > > host-side connections immediately.    
> > > 
> > > Hm, ok.  Is your concern that getting a negative answer from the probe
> > > will take too long?  
> > 
> > More like getting a positive answer taking too long, because we retry
> > so infrequently.  
> 
> Right, but it will only be slow if we lose the first probe, which
> should be very rare.

No, because again, that might be due to the guest doing something with
its firewall or stopping/resuming/getting online etc. It's not
necessarily rare.

If that situation persists for at least 1 + 2 + 4 + 8 + 16 + 32 = 63
seconds, without a clamp, the next retries will come 64 and then 128
seconds later (that is, 127 and 255 seconds after the first attempt).
In this case, to me, it looks more reasonable to retry every minute
instead.

> > > > Pretending passt isn't there, the timeout would come from the default
> > > > values for TCP connections. It looks like there's no specific
> > > > SO_SNDTIMEO value set for those probes, and you can't configure the
> > > > timeout, at least according to:
> > > > 
> > > >   https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-tcp-liveness-probe    
> > > 
> > > My guess would be that the probe would probably time out at the
> > > application level long before the TCP layer times out, but I don't
> > > know for sure.  
> > 
> > I don't think so. What I was pointing out is that I couldn't find any
> > place in the implementation of those probes where a particular
> > *handshake timeout* (not probe interval) is set on top of Linux's
> > defaults, so timeouts at TCP layer and application level should be the
> > same (no additional timeout in application logic).  
> 
> Huh, that's mildly surprising to me.

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-29  4:43             ` Stefano Brivio
@ 2025-10-29  9:35               ` David Gibson
  2025-10-29 16:23                 ` Stefano Brivio
  0 siblings, 1 reply; 31+ messages in thread
From: David Gibson @ 2025-10-29  9:35 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 2525 bytes --]

On Wed, Oct 29, 2025 at 05:43:16AM +0100, Stefano Brivio wrote:
> On Wed, 29 Oct 2025 11:43:00 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Oct 29, 2025 at 12:12:48AM +0100, Stefano Brivio wrote:
[snip]
> > > By the way, while it doesn't cover intmax_t explicitly, I think this is
> > > a pretty good resource as it covers most architectures supported by the
> > > Linux kernel (hence, most architectures we support):
> > > 
> > >   https://wiki.debian.org/ArchitectureSpecificsMemo#Summary  
> > 
> > Oh, nice, that is a very handy resource.
> > 
> > > and judging from intmax_t(3type) I'd say that the sizeof(long double)
> > > column tells you how big intmax_t is.  
> > 
> > > Well, at least, that's the page I use to know which architectures I can
> > > use to check things when I suspect a type portability bug.
> > > 
> > > That's because 'long double' should always be the biggest "native" data
> > > type, that is, excluding __int128 or vectorised / SIMD types such as
> > > __m256i.  
> > 
> > This part isn't true, alas.  Theoretically speaking there's not
> > necessarily any relation between the largest native integer type and
> > the largest native float type.
> 
> Oops, yes, I misread intmax_t(3type), that's *integer* only (of course,
> the name says it). So probably it has to match sizeof(long long)?
> 
> > But more importantly, it's not true in practice: according to the
> > table sizeof(long double) is 16 for amd64, but sizeof(intmax_t) is 8
> > empirically.
> > 
> > I think sizeof(long long) is more likely to match sizeof(intmax_t),
> > but I don't love relying on it.
> 
> Right... well, about relying on it, without a change in the C11
> standard, can it ever differ? I don't think so. We could have a look at
> C17 / C23 and if long long is still the largest integer type, we know
> we're fine for quite a few years / pretty much forever.

Uh.. maybe?  I'm never clear on what's guaranteed by the C standard
and what's left to the platform / ABI.  AIUI the reason for intmax_t's
existence is because an awful lot is not pinned down by the standard.

__int128 does appear to be a thing that is longer than long long.
Maybe there's a rule that doesn't allow intmax_t to be __int128, but
I'm not sure where we'd find it.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 4/4] tcp: Update data retransmission timeout
  2025-10-29  4:52                     ` Stefano Brivio
@ 2025-10-29  9:37                       ` David Gibson
  0 siblings, 0 replies; 31+ messages in thread
From: David Gibson @ 2025-10-29  9:37 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yumei Huang, passt-dev

[-- Attachment #1: Type: text/plain, Size: 2369 bytes --]

On Wed, Oct 29, 2025 at 05:52:59AM +0100, Stefano Brivio wrote:
> On Wed, 29 Oct 2025 11:35:29 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Oct 29, 2025 at 12:13:30AM +0100, Stefano Brivio wrote:
> > > On Mon, 20 Oct 2025 20:17:10 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Mon, Oct 20, 2025 at 07:11:07AM +0200, Stefano Brivio wrote:  
> > > > > On Mon, 20 Oct 2025 11:20:19 +1100
> > > > > David Gibson <david@gibson.dropbear.id.au> wrote:  
> > [snip]
> > > > > > > Rather than the local link I was thinking of whatever monitor or
> > > > > > > liveness probe in KubeVirt which might have a 60-second period, or some
> > > > > > > firewall agent, or how long it typically takes for guests to stop and
> > > > > > > resume again in KubeVirt.      
> > > > > > 
> > > > > > Right, I hadn't considered those.  Although.. do those actually re-use
> > > > > > a single connection?  I would have guessed they use a new connection
> > > > > > each time, making the timeouts here irrelevant.    
> > > > > 
> > > > > It depends on the definition of "each time", because we don't time out
> > > > > host-side connections immediately.    
> > > > 
> > > > Hm, ok.  Is your concern that getting a negative answer from the probe
> > > > will take too long?  
> > > 
> > > More like getting a positive answer taking too long, because we retry
> > > so infrequently.  
> > 
> > Right, but it will only be slow if we lose the first probe, which
> > should be very rare.
> 
> No, because again, that might be due to the guest doing something with
> its firewall or stopping/resuming/getting online etc. It's not
> necessarily rare.

Hmmm... I'd think if interruption due to coming up / firewall frobbing
/ whatever is *not* rare, then that constitutes flaky availability
that arguably the probe *should* fail on.

> If that situation persists for at least 1 + 2 + 4 + 8 + 16 + 32 = 63
> seconds, without a clamp, the next retries will come 64 and then 128
> seconds later (that is, 127 and 255 seconds after the first attempt).
> In this case, to me, it looks more reasonable to retry every minute
> instead.

Yeah, I guess so.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function
  2025-10-29  9:35               ` David Gibson
@ 2025-10-29 16:23                 ` Stefano Brivio
  0 siblings, 0 replies; 31+ messages in thread
From: Stefano Brivio @ 2025-10-29 16:23 UTC (permalink / raw)
  To: David Gibson; +Cc: Yumei Huang, passt-dev

On Wed, 29 Oct 2025 20:35:08 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Oct 29, 2025 at 05:43:16AM +0100, Stefano Brivio wrote:
> > On Wed, 29 Oct 2025 11:43:00 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Wed, Oct 29, 2025 at 12:12:48AM +0100, Stefano Brivio wrote:  
> [snip]
> > > > By the way, while it doesn't cover intmax_t explicitly, I think this is
> > > > a pretty good resource as it covers most architectures supported by the
> > > > Linux kernel (hence, most architectures we support):
> > > > 
> > > >   https://wiki.debian.org/ArchitectureSpecificsMemo#Summary    
> > > 
> > > Oh, nice, that is a very handy resource.
> > >   
> > > > and judging from intmax_t(3type) I'd say that the sizeof(long double)
> > > > column tells you how big intmax_t is.    
> > >   
> > > > Well, at least, that's the page I use to know which architectures I can
> > > > use to check things when I suspect a type portability bug.
> > > > 
> > > > That's because 'long double' should always be the biggest "native" data
> > > > type, that is, excluding __int128 or vectorised / SIMD types such as
> > > > __m256i.    
> > > 
> > > This part isn't true, alas.  Theoretically speaking there's not
> > > necessarily any relation between the largest native integer type and
> > > the largest native float type.  
> > 
> > Oops, yes, I misread intmax_t(3type), that's *integer* only (of course,
> > the name says it). So probably it has to match sizeof(long long)?
> >   
> > > But more importantly, it's not true in practice: according to the
> > > table sizeof(long double) is 16 for amd64, but sizeof(intmax_t) is 8
> > > empirically.
> > > 
> > > I think sizeof(long long) is more likely to match sizeof(intmax_t),
> > > but I don't love relying on it.  
> > 
> > Right... well, about relying on it, without a change in the C11
> > standard, can it ever differ? I don't think so. We could have a look at
> > C17 / C23 and if long long is still the largest integer type, we know
> > we're fine for quite a few years / pretty much forever.  
> 
> Uh.. maybe?  I'm never clear on what's guaranteed by the C standard
> and what's left to the platform / ABI.  AIUI the reason for intmax_t's
> existence is because an awful lot is not pinned down by the standard.

The standard specifies long long, without a width, and not long long
long, so we know that long long is the longest we can have at the
moment.

> __int128 does appear to be a thing that is longer than long long.

Yes but that's what I meant by "native" type, for lack of a better
name. It looks like they're more commonly called "main" types instead.

__int128 is a GNU extension and specifies a 128-bit width just like
__m256i specifies a 256-bit width. Those can be bigger than intmax_t.

> Maybe there's a rule that doesn't allow intmax_t to be __int128, but
> I'm not sure where we'd find it.

No real rule, but the BUGS section of intmax_t(3type) reports that
intmax_t doesn't cover __int128 (even though it doesn't look like a bug
to me).

-- 
Stefano


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2025-10-29 16:23 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-14  7:38 [PATCH v3 0/4] Retry SYNs for inbound connections Yumei Huang
2025-10-14  7:38 ` [PATCH v3 1/4] tcp: Rename "retrans" to "retries" Yumei Huang
2025-10-14 22:50   ` David Gibson
2025-10-15  2:17     ` Yumei Huang
2025-10-14  7:38 ` [PATCH v3 2/4] util: Introduce read_file() and read_file_long() function Yumei Huang
2025-10-14 23:27   ` David Gibson
2025-10-15  3:50     ` Yumei Huang
2025-10-15  4:46       ` David Gibson
2025-10-15  5:46         ` Yumei Huang
2025-10-28 23:12         ` Stefano Brivio
2025-10-29  0:43           ` David Gibson
2025-10-29  4:43             ` Stefano Brivio
2025-10-29  9:35               ` David Gibson
2025-10-29 16:23                 ` Stefano Brivio
2025-10-14  7:38 ` [PATCH v3 3/4] tcp: Resend SYN for inbound connections Yumei Huang
2025-10-14 23:40   ` David Gibson
2025-10-14  7:38 ` [PATCH v3 4/4] tcp: Update data retransmission timeout Yumei Huang
2025-10-15  0:05   ` David Gibson
2025-10-15  6:31     ` Yumei Huang
2025-10-15 22:54       ` David Gibson
2025-10-17 18:28         ` Stefano Brivio
2025-10-20  0:20           ` David Gibson
2025-10-20  5:11             ` Stefano Brivio
2025-10-20  9:17               ` David Gibson
2025-10-28 23:13                 ` Stefano Brivio
2025-10-29  0:35                   ` David Gibson
2025-10-29  4:52                     ` Stefano Brivio
2025-10-29  9:37                       ` David Gibson
2025-10-20 10:57           ` Yumei Huang
2025-10-20 23:20             ` Stefano Brivio
2025-10-22  2:23               ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).