public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH] tcp: Reset ACK_FROM_TAP_DUE flag only as needed, update timer
@ 2023-02-13  1:12 Stefano Brivio
  0 siblings, 0 replies; only message in thread
From: Stefano Brivio @ 2023-02-13  1:12 UTC (permalink / raw)
  To: passt-dev; +Cc: David Gibson

David reports that TCP transfers might stall, especially with smaller
socket buffer sizes, because we reset the ACK_FROM_TAP_DUE flag, in
tcp_tap_handler(), whenever we receive an ACK segment, regardless of
its sequence number, even if we are still waiting for an ACK covering
data we already sent. This way, we might fail to re-transmit frames on
ACK timeouts.

We need, instead, to:

- indicate with the @retrans field only re-transmissions for the same
  data sequences. If we make progress, it should be reset, given that
  it's used to abort a connection when we exceed a given number of
  re-transmissions for the same data

- unset the ACK_FROM_TAP_DUE flag if and only if the acknowledged
  sequence is the same as the last one we sent, as suggested by David

- keep it set otherwise, if progress was made but not all the data we
  sent was acknowledged, and update the expiration of the ACK timeout

Add a new helper for these purposes, tcp_update_seqack_from_tap().

To extend the ACK timeout, the new helper sets the ACK_FROM_TAP_DUE
flag, even if it was already set, and conn_flag_do() triggers a timer
update. This part should be revisited at a later time, because,
strictly speaking, ACK_FROM_TAP_DUE isn't a flag anymore. One
possibility might be to introduce another connection attribute for
events affecting timer deadlines.

Reported-by: David Gibson <david@gibson.dropbear.id.au>
Link: https://bugs.passt.top/show_bug.cgi?id=41
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Fixes: be5bbb9b0681 ("tcp: Rework timers to use timerfd instead of periodic bitmap scan")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
This is already applied, as I needed to unblock tests. Posting anyway
for review.

 tcp.c | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/tcp.c b/tcp.c
index 1e0a338..dfa8b6d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -757,8 +757,18 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 			      tcp_flag_str[fls(~flag)]);
 		}
 	} else {
-		if (conn->flags & flag)
+		if (conn->flags & flag) {
+			/* Special case: setting ACK_FROM_TAP_DUE on a
+			 * connection where it's already set is used to
+			 * re-schedule the existing timer.
+			 * TODO: define clearer semantics for timer-related
+			 * flags and factor this into the logic below.
+			 */
+			if (flag == ACK_FROM_TAP_DUE)
+				tcp_timer_ctl(c, conn);
+
 			return;
+		}
 
 		conn->flags |= flag;
 		if (fls(flag) >= 0) {
@@ -1591,6 +1601,26 @@ out:
 	       conn->seq_ack_to_tap != prev_ack_to_tap;
 }
 
+/**
+ * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @seq:	Current ACK sequence, host order
+ */
+static void tcp_update_seqack_from_tap(const struct ctx *c,
+				       struct tcp_tap_conn *conn, uint32_t seq)
+{
+	if (SEQ_GT(seq, conn->seq_ack_from_tap)) {
+		if (seq == conn->seq_to_tap)
+			conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
+		else
+			conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+		conn->retrans = 0;
+		conn->seq_ack_from_tap = seq;
+	}
+}
+
 /**
  * tcp_send_flag() - Send segment with flags to tap (no payload)
  * @c:		Execution context
@@ -2041,7 +2071,6 @@ static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
 		 MSG_DONTWAIT | MSG_TRUNC) < 0)
 		return -errno;
 
-	conn->seq_ack_from_tap = ack_seq;
 	return 0;
 }
 
@@ -2333,14 +2362,9 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 
 	tcp_clamp_window(c, conn, max_ack_seq_wnd);
 
-	if (ack) {
-		if (max_ack_seq == conn->seq_to_tap) {
-			conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
-			conn->retrans = 0;
-		}
-
-		tcp_sock_consume(conn, max_ack_seq);
-	}
+	/* On socket flush failure, pretend there was no ACK, try again later */
+	if (ack && !tcp_sock_consume(conn, max_ack_seq))
+		tcp_update_seqack_from_tap(c, conn, max_ack_seq);
 
 	if (retr) {
 		trace("TCP: fast re-transmit, ACK: %u, previous sequence: %u",
@@ -2492,10 +2516,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 		return p->count;
 	}
 
-	if (th->ack) {
-		conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
-		conn->retrans = 0;
-	}
+	if (th->ack && !(conn->events & ESTABLISHED))
+		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 
 	conn_flag(c, conn, ~STALLED);
 
@@ -2543,6 +2565,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
+		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
 		if (conn->events & SOCK_FIN_RCVD &&
 		    conn->seq_ack_from_tap == conn->seq_to_tap)
 			conn_event(c, conn, CLOSED);
-- 
@@ -757,8 +757,18 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 			      tcp_flag_str[fls(~flag)]);
 		}
 	} else {
-		if (conn->flags & flag)
+		if (conn->flags & flag) {
+			/* Special case: setting ACK_FROM_TAP_DUE on a
+			 * connection where it's already set is used to
+			 * re-schedule the existing timer.
+			 * TODO: define clearer semantics for timer-related
+			 * flags and factor this into the logic below.
+			 */
+			if (flag == ACK_FROM_TAP_DUE)
+				tcp_timer_ctl(c, conn);
+
 			return;
+		}
 
 		conn->flags |= flag;
 		if (fls(flag) >= 0) {
@@ -1591,6 +1601,26 @@ out:
 	       conn->seq_ack_to_tap != prev_ack_to_tap;
 }
 
+/**
+ * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @seq:	Current ACK sequence, host order
+ */
+static void tcp_update_seqack_from_tap(const struct ctx *c,
+				       struct tcp_tap_conn *conn, uint32_t seq)
+{
+	if (SEQ_GT(seq, conn->seq_ack_from_tap)) {
+		if (seq == conn->seq_to_tap)
+			conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
+		else
+			conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+		conn->retrans = 0;
+		conn->seq_ack_from_tap = seq;
+	}
+}
+
 /**
  * tcp_send_flag() - Send segment with flags to tap (no payload)
  * @c:		Execution context
@@ -2041,7 +2071,6 @@ static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
 		 MSG_DONTWAIT | MSG_TRUNC) < 0)
 		return -errno;
 
-	conn->seq_ack_from_tap = ack_seq;
 	return 0;
 }
 
@@ -2333,14 +2362,9 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 
 	tcp_clamp_window(c, conn, max_ack_seq_wnd);
 
-	if (ack) {
-		if (max_ack_seq == conn->seq_to_tap) {
-			conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
-			conn->retrans = 0;
-		}
-
-		tcp_sock_consume(conn, max_ack_seq);
-	}
+	/* On socket flush failure, pretend there was no ACK, try again later */
+	if (ack && !tcp_sock_consume(conn, max_ack_seq))
+		tcp_update_seqack_from_tap(c, conn, max_ack_seq);
 
 	if (retr) {
 		trace("TCP: fast re-transmit, ACK: %u, previous sequence: %u",
@@ -2492,10 +2516,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 		return p->count;
 	}
 
-	if (th->ack) {
-		conn_flag(c, conn, ~ACK_FROM_TAP_DUE);
-		conn->retrans = 0;
-	}
+	if (th->ack && !(conn->events & ESTABLISHED))
+		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 
 	conn_flag(c, conn, ~STALLED);
 
@@ -2543,6 +2565,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
+		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
 		if (conn->events & SOCK_FIN_RCVD &&
 		    conn->seq_ack_from_tap == conn->seq_to_tap)
 			conn_event(c, conn, CLOSED);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2023-02-13  1:12 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-13  1:12 [PATCH] tcp: Reset ACK_FROM_TAP_DUE flag only as needed, update timer Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).