* [PATCH 1/6] tcp: FIN flags have to be retransmitted as well
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
2025-08-15 16:10 ` [PATCH 2/6] tcp: Factor sequence rewind for retransmissions into a new function Stefano Brivio
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
If we're retransmitting any data, and we sent a FIN segment to our
peer, regardless of whether it was received, we obviously have to
retransmit it as well, given that it can only come with the last data
segment, or after it.
Unconditionally clear the internal TAP_FIN_SENT flag whenever we
re-transmit, so that we know we have to send it again, if needed.
Reported-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tcp.c b/tcp.c
index 957b498..7c1f237 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1759,6 +1759,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
max_ack_seq, conn->seq_to_tap);
conn->seq_to_tap = max_ack_seq;
+ conn->events &= ~TAP_FIN_SENT;
if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
@@ -2286,6 +2287,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
flow_dbg(conn, "ACK timeout, retry");
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
if (tcp_set_peek_offset(conn, 0)) {
--
@@ -1759,6 +1759,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
max_ack_seq, conn->seq_to_tap);
conn->seq_to_tap = max_ack_seq;
+ conn->events &= ~TAP_FIN_SENT;
if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
@@ -2286,6 +2287,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
flow_dbg(conn, "ACK timeout, retry");
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
if (tcp_set_peek_offset(conn, 0)) {
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/6] tcp: Factor sequence rewind for retransmissions into a new function
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
2025-08-15 16:10 ` [PATCH 1/6] tcp: FIN flags have to be retransmitted as well Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
2025-08-15 16:10 ` [PATCH 3/6] tcp: Rewind sequence when guest shrinks window to zero Stefano Brivio
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
...as I'm going to need a third occurrence of this in the next change.
This introduces a small functional change in tcp_data_from_tap(): the
sequence was previously rewound to the highest ACK number we found in
the current packet batch, and not to the current value of
seq_ack_from_tap.
The two might differ in case tcp_sock_consume() failed, because in
that case we're ignoring that ACK altogether. But if we're ignoring
it, it looks more correct to me to start retransmitting from an
earlier sequence anyway.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 47 ++++++++++++++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 15 deletions(-)
diff --git a/tcp.c b/tcp.c
index 7c1f237..1402ca2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1097,6 +1097,26 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
}
+/**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
+
+ if (tcp_set_peek_offset(conn, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
+
+ return 0;
+}
+
/**
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
@@ -1757,13 +1777,11 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (retr) {
flow_trace(conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
- max_ack_seq, conn->seq_to_tap);
- conn->seq_to_tap = max_ack_seq;
- conn->events &= ~TAP_FIN_SENT;
- if (tcp_set_peek_offset(conn, 0)) {
- tcp_rst(c, conn);
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
return -1;
- }
+
tcp_data_from_sock(c, conn);
}
@@ -2285,17 +2303,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_rst(c, conn);
} else {
flow_dbg(conn, "ACK timeout, retry");
- conn->retrans++;
- conn->seq_to_tap = conn->seq_ack_from_tap;
- conn->events &= ~TAP_FIN_SENT;
+
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
- if (tcp_set_peek_offset(conn, 0)) {
- tcp_rst(c, conn);
- } else {
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
- }
+
+ conn->retrans++;
+ if (tcp_rewind_seq(c, conn))
+ return;
+
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
--
@@ -1097,6 +1097,26 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
}
+/**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
+
+ if (tcp_set_peek_offset(conn, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
+
+ return 0;
+}
+
/**
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
@@ -1757,13 +1777,11 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (retr) {
flow_trace(conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
- max_ack_seq, conn->seq_to_tap);
- conn->seq_to_tap = max_ack_seq;
- conn->events &= ~TAP_FIN_SENT;
- if (tcp_set_peek_offset(conn, 0)) {
- tcp_rst(c, conn);
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
return -1;
- }
+
tcp_data_from_sock(c, conn);
}
@@ -2285,17 +2303,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_rst(c, conn);
} else {
flow_dbg(conn, "ACK timeout, retry");
- conn->retrans++;
- conn->seq_to_tap = conn->seq_ack_from_tap;
- conn->events &= ~TAP_FIN_SENT;
+
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
- if (tcp_set_peek_offset(conn, 0)) {
- tcp_rst(c, conn);
- } else {
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
- }
+
+ conn->retrans++;
+ if (tcp_rewind_seq(c, conn))
+ return;
+
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/6] tcp: Rewind sequence when guest shrinks window to zero
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
2025-08-15 16:10 ` [PATCH 1/6] tcp: FIN flags have to be retransmitted as well Stefano Brivio
2025-08-15 16:10 ` [PATCH 2/6] tcp: Factor sequence rewind for retransmissions into a new function Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
2025-08-15 16:10 ` [PATCH 4/6] tcp: Fix closing logic for half-closed connections Stefano Brivio
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
A window shrunk to zero means by definition that anything else that
might be in flight is now out of window. Restart from the currently
acknowledged sequence.
Suggested-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/tcp.c b/tcp.c
index 1402ca2..dda0a2e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
/**
* tcp_tap_window_update() - Process an updated window from tap side
+ * @c: Execution context
* @conn: Connection pointer
* @wnd: Window value, host order, unscaled
*/
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static void tcp_tap_window_update(const struct ctx *c,
+ struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
/* Work-around for bug introduced in peer kernel code, commit
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
- * We don't update if window shrank to zero.
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+ * update the window if it shrank to zero, so that we'll eventually
+ * retry to send data, but rewind the sequence as that obviously implies
+ * that no data beyond the updated window will ever be acknowledged.
*/
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+ tcp_rewind_seq(c, conn);
return;
+ }
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
@@ -1694,7 +1700,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
if (p->count == 1) {
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn,
+ ntohs(th->window));
return 1;
}
@@ -1772,7 +1779,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (ack && !tcp_sock_consume(conn, max_ack_seq))
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
- tcp_tap_window_update(conn, max_ack_seq_wnd);
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
if (retr) {
flow_trace(conn,
@@ -1861,7 +1868,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
@@ -2059,7 +2066,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
@@ -2071,7 +2078,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (conn->events & TAP_FIN_RCVD) {
tcp_sock_consume(conn, ntohl(th->ack_seq));
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
if (conn->events & SOCK_FIN_RCVD &&
--
@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
/**
* tcp_tap_window_update() - Process an updated window from tap side
+ * @c: Execution context
* @conn: Connection pointer
* @wnd: Window value, host order, unscaled
*/
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static void tcp_tap_window_update(const struct ctx *c,
+ struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
/* Work-around for bug introduced in peer kernel code, commit
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
- * We don't update if window shrank to zero.
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+ * update the window if it shrank to zero, so that we'll eventually
+ * retry to send data, but rewind the sequence as that obviously implies
+ * that no data beyond the updated window will ever be acknowledged.
*/
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+ tcp_rewind_seq(c, conn);
return;
+ }
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
@@ -1694,7 +1700,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
if (p->count == 1) {
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn,
+ ntohs(th->window));
return 1;
}
@@ -1772,7 +1779,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (ack && !tcp_sock_consume(conn, max_ack_seq))
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
- tcp_tap_window_update(conn, max_ack_seq_wnd);
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
if (retr) {
flow_trace(conn,
@@ -1861,7 +1868,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
@@ -2059,7 +2066,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
@@ -2071,7 +2078,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (conn->events & TAP_FIN_RCVD) {
tcp_sock_consume(conn, ntohl(th->ack_seq));
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
if (conn->events & SOCK_FIN_RCVD &&
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 4/6] tcp: Fix closing logic for half-closed connections
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
` (2 preceding siblings ...)
2025-08-15 16:10 ` [PATCH 3/6] tcp: Rewind sequence when guest shrinks window to zero Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
2025-08-15 16:10 ` [PATCH 5/6] tcp: Don't try to transmit right after the peer shrank the window to zero Stefano Brivio
2025-08-15 16:10 ` [PATCH 6/6] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD path consistent Stefano Brivio
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
First off, don't close connections half-closed by the guest before
our own FIN is acknowledged by the guest itself.
That is, after we receive a FIN from the guest (TAP_FIN_RCVD), if we
don't have any data left to send from the socket (SOCK_FIN_RCVD, or
EPOLLHUP), we send a FIN segment to the guest (TAP_FIN_SENT), but we
need to actually have it acknowledged (and have no pending
retransmissions) before we can close the connection: check for
TAP_FIN_ACKED, first.
Then, if we set TAP_FIN_SENT, and we receive an ACK segment from the
guest, set TAP_FIN_ACKED. This was entirely missing for the
TAP_FIN_RCVD case, and as we fix the problem described above, this
becomes relevant as well.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/tcp.c b/tcp.c
index dda0a2e..aed25a9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2081,9 +2081,14 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
- if (conn->events & SOCK_FIN_RCVD &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- conn_event(c, conn, CLOSED);
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+ if (th->ack && conn->events & TAP_FIN_SENT)
+ conn_event(c, conn, TAP_FIN_ACKED);
+
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->events & TAP_FIN_ACKED)
+ conn_event(c, conn, CLOSED);
+ }
return 1;
}
@@ -2363,7 +2368,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
}
--
@@ -2081,9 +2081,14 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_data_from_sock(c, conn);
- if (conn->events & SOCK_FIN_RCVD &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- conn_event(c, conn, CLOSED);
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+ if (th->ack && conn->events & TAP_FIN_SENT)
+ conn_event(c, conn, TAP_FIN_ACKED);
+
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->events & TAP_FIN_ACKED)
+ conn_event(c, conn, CLOSED);
+ }
return 1;
}
@@ -2363,7 +2368,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 5/6] tcp: Don't try to transmit right after the peer shrank the window to zero
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
` (3 preceding siblings ...)
2025-08-15 16:10 ` [PATCH 4/6] tcp: Fix closing logic for half-closed connections Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
2025-08-15 16:10 ` [PATCH 6/6] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD path consistent Stefano Brivio
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
If the peer shrinks the window to zero, we'll skip storing the new
window, as a convenient way to cause window probes (which exceed any
zero-sized window, strictly speaking) if we don't get window updates
in a while.
As we do so, though, we need to ensure we don't try to queue more data
from the socket right after we process this window update, as the
entire point of a zero-window advertisement is to keep us from sending
more data.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/tcp.c b/tcp.c
index aed25a9..624e7f4 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1260,8 +1260,10 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
* @c: Execution context
* @conn: Connection pointer
* @wnd: Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
*/
-static void tcp_tap_window_update(const struct ctx *c,
+static bool tcp_tap_window_update(const struct ctx *c,
struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
@@ -1274,13 +1276,14 @@ static void tcp_tap_window_update(const struct ctx *c,
*/
if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
tcp_rewind_seq(c, conn);
- return;
+ return false;
}
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
* sender by adjusting SO_RCVBUF? */
+ return true;
}
/**
@@ -2066,9 +2069,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(c, conn, ntohs(th->window));
-
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -2078,8 +2080,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (conn->events & TAP_FIN_RCVD) {
tcp_sock_consume(conn, ntohl(th->ack_seq));
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(c, conn, ntohs(th->window));
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
if (th->ack && conn->events & TAP_FIN_SENT)
--
@@ -1260,8 +1260,10 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
* @c: Execution context
* @conn: Connection pointer
* @wnd: Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
*/
-static void tcp_tap_window_update(const struct ctx *c,
+static bool tcp_tap_window_update(const struct ctx *c,
struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
@@ -1274,13 +1276,14 @@ static void tcp_tap_window_update(const struct ctx *c,
*/
if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
tcp_rewind_seq(c, conn);
- return;
+ return false;
}
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
* sender by adjusting SO_RCVBUF? */
+ return true;
}
/**
@@ -2066,9 +2069,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(c, conn, ntohs(th->window));
-
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -2078,8 +2080,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (conn->events & TAP_FIN_RCVD) {
tcp_sock_consume(conn, ntohl(th->ack_seq));
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(c, conn, ntohs(th->window));
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
if (th->ack && conn->events & TAP_FIN_SENT)
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 6/6] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD path consistent
2025-08-15 16:10 [PATCH 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
` (4 preceding siblings ...)
2025-08-15 16:10 ` [PATCH 5/6] tcp: Don't try to transmit right after the peer shrank the window to zero Stefano Brivio
@ 2025-08-15 16:10 ` Stefano Brivio
5 siblings, 0 replies; 7+ messages in thread
From: Stefano Brivio @ 2025-08-15 16:10 UTC (permalink / raw)
To: passt-dev; +Cc: Jon Maloy, Paul Holzinger
We currently have a number of discrepancies in the tcp_tap_handler()
path between the half-closed connection path and the regular one, and
they are mostly a result of code duplication, which comes in turn from
the fact that tcp_data_from_tap() deals with data transfers as well as
general connection bookkeeping, so we can't use it for half-closed
connections.
This suggests that we should probably rework it into two or more
functions, in the long term, but for the time being I'm just fixing
one obvious issue, which is the lack of fast retransmissions in the
TAP_FIN_RCVD path, and a potential one, which is the fact we don't
handle socket flush failures.
Add fast re-transmit for half-closed connections, and extract the
logic to determine the TCP payload length from tcp_data_from_tap()
into the new tcp_packet_data_len() helper to slightly reduce the amount
of resulting code duplication.
Handle socket flush (tcp_sock_consume()) failure in the same way as
tcp_data_from_tap() handles it.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 79 ++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 56 insertions(+), 23 deletions(-)
diff --git a/tcp.c b/tcp.c
index 624e7f4..163f820 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1640,6 +1640,22 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
return tcp_buf_data_from_sock(c, conn);
}
+/**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th: Pointer to TCP header
+ * @l4len: TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+ size_t off = th->doff * 4UL;
+
+ if (off < sizeof(*th) || off > l4len)
+ return -1;
+
+ return l4len - off;
+}
/**
* tcp_data_from_tap() - tap/guest data for established connection
@@ -1671,27 +1687,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
for (i = idx, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
const struct tcphdr *th;
+ ssize_t dlen;
char *data;
- size_t off;
th = packet_get(p, i, 0, sizeof(*th), &len);
if (!th)
return -1;
len += sizeof(*th);
- off = th->doff * 4UL;
- if (off < sizeof(*th) || off > len)
- return -1;
-
if (th->rst) {
conn_event(c, conn, CLOSED);
return 1;
}
- len -= off;
- data = packet_get(p, i, off, len, NULL);
- if (!data)
- continue;
+ if ((dlen = tcp_packet_data_len(th, len)) < 0)
+ return -1;
seq = ntohl(th->seq);
if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
@@ -1719,7 +1729,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) &&
SEQ_GE(ack_seq, max_ack_seq)) {
/* Fast re-transmit */
- retr = !len && !th->fin &&
+ retr = !dlen && !th->fin &&
ack_seq == max_ack_seq &&
ntohs(th->window) == max_ack_seq_wnd;
@@ -1731,33 +1741,37 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (th->fin)
fin = 1;
- if (!len)
+ if (!dlen)
+ continue;
+
+ data = packet_get(p, i, th->doff * 4UL, dlen, NULL);
+ if (!data)
continue;
seq_offset = seq_from_tap - seq;
/* Use data from this buffer only in these two cases:
*
- * , seq_from_tap , seq_from_tap
- * |--------| <-- len |--------| <-- len
+ * , seq_from_tap , seq_from_tap
+ * |--------| <-- dlen |--------| <-- dlen
* '----' <-- offset ' <-- offset
* ^ seq ^ seq
- * (offset >= 0, seq + len > seq_from_tap)
+ * (offset >= 0, seq + dlen > seq_from_tap)
*
* discard in these two cases:
- * , seq_from_tap , seq_from_tap
- * |--------| <-- len |--------| <-- len
+ * , seq_from_tap , seq_from_tap
+ * |--------| <-- dlen |--------| <-- dlen
* '--------' <-- offset '-----| <- offset
* ^ seq ^ seq
- * (offset >= 0, seq + len <= seq_from_tap)
+ * (offset >= 0, seq + dlen <= seq_from_tap)
*
* keep, look for another buffer, then go back, in this case:
* , seq_from_tap
- * |--------| <-- len
+ * |--------| <-- dlen
* '===' <-- offset
* ^ seq
* (offset < 0)
*/
- if (SEQ_GE(seq_offset, 0) && SEQ_LE(seq + len, seq_from_tap))
+ if (SEQ_GE(seq_offset, 0) && SEQ_LE(seq + dlen, seq_from_tap))
continue;
if (SEQ_LT(seq_offset, 0)) {
@@ -1767,7 +1781,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
tcp_iov[iov_i].iov_base = data + seq_offset;
- tcp_iov[iov_i].iov_len = len - seq_offset;
+ tcp_iov[iov_i].iov_len = dlen - seq_offset;
seq_from_tap += tcp_iov[iov_i].iov_len;
iov_i++;
@@ -2078,9 +2092,28 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
/* Established connections not accepting data from tap */
if (conn->events & TAP_FIN_RCVD) {
- tcp_sock_consume(conn, ntohl(th->ack_seq));
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ bool retr;
+
+ retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin &&
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+ ntohs(th->window) == conn->wnd_from_tap;
+
+ /* On socket flush failure, pretend there was no ACK, try again
+ * later
+ */
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+ if (retr) {
+ flow_trace(conn,
+ "fast re-transmit, ACK: %u, previous sequence: %u",
+ ntohl(th->ack_seq), conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
+ return -1;
+ }
+
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
tcp_data_from_sock(c, conn);
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
--
@@ -1640,6 +1640,22 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
return tcp_buf_data_from_sock(c, conn);
}
+/**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th: Pointer to TCP header
+ * @l4len: TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+ size_t off = th->doff * 4UL;
+
+ if (off < sizeof(*th) || off > l4len)
+ return -1;
+
+ return l4len - off;
+}
/**
* tcp_data_from_tap() - tap/guest data for established connection
@@ -1671,27 +1687,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
for (i = idx, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
const struct tcphdr *th;
+ ssize_t dlen;
char *data;
- size_t off;
th = packet_get(p, i, 0, sizeof(*th), &len);
if (!th)
return -1;
len += sizeof(*th);
- off = th->doff * 4UL;
- if (off < sizeof(*th) || off > len)
- return -1;
-
if (th->rst) {
conn_event(c, conn, CLOSED);
return 1;
}
- len -= off;
- data = packet_get(p, i, off, len, NULL);
- if (!data)
- continue;
+ if ((dlen = tcp_packet_data_len(th, len)) < 0)
+ return -1;
seq = ntohl(th->seq);
if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
@@ -1719,7 +1729,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) &&
SEQ_GE(ack_seq, max_ack_seq)) {
/* Fast re-transmit */
- retr = !len && !th->fin &&
+ retr = !dlen && !th->fin &&
ack_seq == max_ack_seq &&
ntohs(th->window) == max_ack_seq_wnd;
@@ -1731,33 +1741,37 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (th->fin)
fin = 1;
- if (!len)
+ if (!dlen)
+ continue;
+
+ data = packet_get(p, i, th->doff * 4UL, dlen, NULL);
+ if (!data)
continue;
seq_offset = seq_from_tap - seq;
/* Use data from this buffer only in these two cases:
*
- * , seq_from_tap , seq_from_tap
- * |--------| <-- len |--------| <-- len
+ * , seq_from_tap , seq_from_tap
+ * |--------| <-- dlen |--------| <-- dlen
* '----' <-- offset ' <-- offset
* ^ seq ^ seq
- * (offset >= 0, seq + len > seq_from_tap)
+ * (offset >= 0, seq + dlen > seq_from_tap)
*
* discard in these two cases:
- * , seq_from_tap , seq_from_tap
- * |--------| <-- len |--------| <-- len
+ * , seq_from_tap , seq_from_tap
+ * |--------| <-- dlen |--------| <-- dlen
* '--------' <-- offset '-----| <- offset
* ^ seq ^ seq
- * (offset >= 0, seq + len <= seq_from_tap)
+ * (offset >= 0, seq + dlen <= seq_from_tap)
*
* keep, look for another buffer, then go back, in this case:
* , seq_from_tap
- * |--------| <-- len
+ * |--------| <-- dlen
* '===' <-- offset
* ^ seq
* (offset < 0)
*/
- if (SEQ_GE(seq_offset, 0) && SEQ_LE(seq + len, seq_from_tap))
+ if (SEQ_GE(seq_offset, 0) && SEQ_LE(seq + dlen, seq_from_tap))
continue;
if (SEQ_LT(seq_offset, 0)) {
@@ -1767,7 +1781,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
tcp_iov[iov_i].iov_base = data + seq_offset;
- tcp_iov[iov_i].iov_len = len - seq_offset;
+ tcp_iov[iov_i].iov_len = dlen - seq_offset;
seq_from_tap += tcp_iov[iov_i].iov_len;
iov_i++;
@@ -2078,9 +2092,28 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
/* Established connections not accepting data from tap */
if (conn->events & TAP_FIN_RCVD) {
- tcp_sock_consume(conn, ntohl(th->ack_seq));
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ bool retr;
+
+ retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin &&
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+ ntohs(th->window) == conn->wnd_from_tap;
+
+ /* On socket flush failure, pretend there was no ACK, try again
+ * later
+ */
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+ if (retr) {
+ flow_trace(conn,
+ "fast re-transmit, ACK: %u, previous sequence: %u",
+ ntohl(th->ack_seq), conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
+ return -1;
+ }
+
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
tcp_data_from_sock(c, conn);
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread