public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Cc: Jon Maloy <jmaloy@redhat.com>,
	Paul Holzinger <pholzing@redhat.com>,
	David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v2 3/6] tcp: Rewind sequence when guest shrinks window to zero
Date: Wed, 20 Aug 2025 18:51:34 +0200	[thread overview]
Message-ID: <20250820165137.2004897-4-sbrivio@redhat.com> (raw)
In-Reply-To: <20250820165137.2004897-1-sbrivio@redhat.com>

A window shrunk to zero means by definition that anything else that
might be in flight is now out of window. Restart from the currently
acknowledged sequence.

We need to do that both in tcp_tap_window_update(), where we already
check for zero-window updates, and in tcp_data_from_tap(), because we
might get one of those updates in a batch of packets that also
contains a non-zero window update.

Suggested-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/tcp.c b/tcp.c
index 1402ca2..11c9c84 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
 
 /**
  * tcp_tap_window_update() - Process an updated window from tap side
+ * @c:		Execution context
  * @conn:	Connection pointer
  * @wnd:	Window value, host order, unscaled
  */
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static void tcp_tap_window_update(const struct ctx *c,
+				  struct tcp_tap_conn *conn, unsigned wnd)
 {
 	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
 
 	/* Work-around for bug introduced in peer kernel code, commit
-	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
-	 * We don't update if window shrank to zero.
+	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+	 * update the window if it shrank to zero, so that we'll eventually
+	 * retry to send data, but rewind the sequence as that obviously implies
+	 * that no data beyond the updated window will ever be acknowledged.
 	 */
-	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
+	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+		tcp_rewind_seq(c, conn);
 		return;
+	}
 
 	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
 
@@ -1694,7 +1700,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_timer_ctl(c, conn);
 
 			if (p->count == 1) {
-				tcp_tap_window_update(conn, ntohs(th->window));
+				tcp_tap_window_update(c, conn,
+						      ntohs(th->window));
 				return 1;
 			}
 
@@ -1713,6 +1720,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 				       ack_seq == max_ack_seq &&
 				       ntohs(th->window) == max_ack_seq_wnd;
 
+				/* See tcp_tap_window_update() for details. On
+				 * top of that, we also need to check here if a
+				 * zero-window update is contained in a batch of
+				 * packets that includes a non-zero window as
+				 * well.
+				 */
+				if (!ntohs(th->window))
+					tcp_rewind_seq(c, conn);
+
 				max_ack_seq_wnd = ntohs(th->window);
 				max_ack_seq = ack_seq;
 			}
@@ -1772,7 +1788,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (ack && !tcp_sock_consume(conn, max_ack_seq))
 		tcp_update_seqack_from_tap(c, conn, max_ack_seq);
 
-	tcp_tap_window_update(conn, max_ack_seq_wnd);
+	tcp_tap_window_update(c, conn, max_ack_seq_wnd);
 
 	if (retr) {
 		flow_trace(conn,
@@ -1861,7 +1877,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 				      const struct tcphdr *th,
 				      const char *opts, size_t optlen)
 {
-	tcp_tap_window_update(conn, ntohs(th->window));
+	tcp_tap_window_update(c, conn, ntohs(th->window));
 	tcp_get_tap_ws(conn, opts, optlen);
 
 	/* First value is not scaled */
@@ -2059,7 +2075,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		if (!th->ack)
 			goto reset;
 
-		tcp_tap_window_update(conn, ntohs(th->window));
+		tcp_tap_window_update(c, conn, ntohs(th->window));
 
 		tcp_data_from_sock(c, conn);
 
@@ -2071,7 +2087,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	if (conn->events & TAP_FIN_RCVD) {
 		tcp_sock_consume(conn, ntohl(th->ack_seq));
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
-		tcp_tap_window_update(conn, ntohs(th->window));
+		tcp_tap_window_update(c, conn, ntohs(th->window));
 		tcp_data_from_sock(c, conn);
 
 		if (conn->events & SOCK_FIN_RCVD &&
-- 
@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
 
 /**
  * tcp_tap_window_update() - Process an updated window from tap side
+ * @c:		Execution context
  * @conn:	Connection pointer
  * @wnd:	Window value, host order, unscaled
  */
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static void tcp_tap_window_update(const struct ctx *c,
+				  struct tcp_tap_conn *conn, unsigned wnd)
 {
 	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
 
 	/* Work-around for bug introduced in peer kernel code, commit
-	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
-	 * We don't update if window shrank to zero.
+	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+	 * update the window if it shrank to zero, so that we'll eventually
+	 * retry to send data, but rewind the sequence as that obviously implies
+	 * that no data beyond the updated window will ever be acknowledged.
 	 */
-	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
+	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+		tcp_rewind_seq(c, conn);
 		return;
+	}
 
 	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
 
@@ -1694,7 +1700,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_timer_ctl(c, conn);
 
 			if (p->count == 1) {
-				tcp_tap_window_update(conn, ntohs(th->window));
+				tcp_tap_window_update(c, conn,
+						      ntohs(th->window));
 				return 1;
 			}
 
@@ -1713,6 +1720,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 				       ack_seq == max_ack_seq &&
 				       ntohs(th->window) == max_ack_seq_wnd;
 
+				/* See tcp_tap_window_update() for details. On
+				 * top of that, we also need to check here if a
+				 * zero-window update is contained in a batch of
+				 * packets that includes a non-zero window as
+				 * well.
+				 */
+				if (!ntohs(th->window))
+					tcp_rewind_seq(c, conn);
+
 				max_ack_seq_wnd = ntohs(th->window);
 				max_ack_seq = ack_seq;
 			}
@@ -1772,7 +1788,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (ack && !tcp_sock_consume(conn, max_ack_seq))
 		tcp_update_seqack_from_tap(c, conn, max_ack_seq);
 
-	tcp_tap_window_update(conn, max_ack_seq_wnd);
+	tcp_tap_window_update(c, conn, max_ack_seq_wnd);
 
 	if (retr) {
 		flow_trace(conn,
@@ -1861,7 +1877,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 				      const struct tcphdr *th,
 				      const char *opts, size_t optlen)
 {
-	tcp_tap_window_update(conn, ntohs(th->window));
+	tcp_tap_window_update(c, conn, ntohs(th->window));
 	tcp_get_tap_ws(conn, opts, optlen);
 
 	/* First value is not scaled */
@@ -2059,7 +2075,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		if (!th->ack)
 			goto reset;
 
-		tcp_tap_window_update(conn, ntohs(th->window));
+		tcp_tap_window_update(c, conn, ntohs(th->window));
 
 		tcp_data_from_sock(c, conn);
 
@@ -2071,7 +2087,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	if (conn->events & TAP_FIN_RCVD) {
 		tcp_sock_consume(conn, ntohl(th->ack_seq));
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
-		tcp_tap_window_update(conn, ntohs(th->window));
+		tcp_tap_window_update(c, conn, ntohs(th->window));
 		tcp_data_from_sock(c, conn);
 
 		if (conn->events & SOCK_FIN_RCVD &&
-- 
2.43.0


  parent reply	other threads:[~2025-08-20 16:51 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-20 16:51 [PATCH v2 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio
2025-08-20 16:51 ` [PATCH v2 1/6] tcp: FIN flags have to be retransmitted as well Stefano Brivio
2025-08-20 16:51 ` [PATCH v2 2/6] tcp: Factor sequence rewind for retransmissions into a new function Stefano Brivio
2025-08-20 16:51 ` Stefano Brivio [this message]
2025-08-20 16:51 ` [PATCH v2 4/6] tcp: Fix closing logic for half-closed connections Stefano Brivio
2025-08-20 16:51 ` [PATCH v2 5/6] tcp: Don't try to transmit right after the peer shrank the window to zero Stefano Brivio
2025-08-20 16:51 ` [PATCH v2 6/6] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD path consistent Stefano Brivio
2025-08-20 19:09 ` [PATCH v2 0/6] tcp: Fixes for issues uncovered by tests with 6.17-rc1 kernels Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250820165137.2004897-4-sbrivio@redhat.com \
    --to=sbrivio@redhat.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=jmaloy@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=pholzing@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).