public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v4] tcp: move seq_to_tap update to when frame is queued
@ 2024-05-14 19:44 Jon Maloy
  2024-05-14 19:57 ` Jon Maloy
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2024-05-14 19:44 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
delayed update of conn->seq_to_tap until the moment the corresponding
frame has been successfully pushed out. This has the advantage that we
immediately can make a new attempt to transmit a frame after a failed
transmit, rather than waiting for the peer to later discover a gap and
trigger the fast retransmit mechanism to solve the problem.

This approach has turned out to cause a problem with spurious sequence
number updates during peer-initiated retransmits, and we have realized
it may not be the best way to solve the above issue.

We now restore the previous method, by updating the said field at the
moment a frame is added to the outqueue. To retain the advantage of
having a quick re-attempt based on local failure detection, we now scan
through the part of the outqueue that had to be dropped, and restore the
sequence counter for each affected connection to the most appropriate
value.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
      feedback from Stefano Brivio.
    - Added paranoid test to avoid that seq_to_tap becomes lower than
      seq_ack_from_tap.

v3: - Identical to v2. Called v3 because it was embedded in a series
      with that version.

v4: - In tcp_revert_seq(), we read the sequence number from the TCP
      header instead of keeping a copy in struct tcp_buf_seq_update.
    - Since the only remaining field in struct tcp_buf_seq_update is
      a pointer to struct tcp_tap_conn, we eliminate the struct
      altogether, and make the tcp6/tcp4_seq_update arrays into
      arrays of said pointer.
    - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
      is not fatal, and will be caught by other code anyway.
    - Separated from the series again.
---
 tcp.c | 59 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/tcp.c b/tcp.c
index 21d0af0..976dba8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
- * @len:	TCP payload length
- */
-struct tcp_buf_seq_update {
-	uint32_t *seq;
-	uint16_t len;
-};
-
 /* Static buffers */
 /**
  * struct tcp_payload_t - TCP header and data to send segments with payload
@@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
 static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
 static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -1261,25 +1253,49 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec *frames,
+			   int num_frames)
+{
+	int c, f;
+
+	for (c = 0, f = 0; c < num_frames; c++, f += TCP_NUM_IOVS) {
+		struct tcp_tap_conn *conn = conns[c];
+		struct tcphdr *th = frames[f + TCP_IOV_PAYLOAD].iov_base;
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used)
+		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m][0],
+			       tcp6_payload_used - m);
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used)
+		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m][0],
+			       tcp4_payload_used - m);
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2145,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq + dlen;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,8 +2159,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
-		tcp4_seq_update[tcp4_payload_used].len = dlen;
+		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
@@ -2151,8 +2167,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
-		tcp6_seq_update[tcp6_payload_used].len = dlen;
+		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-- 
@@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
- * @len:	TCP payload length
- */
-struct tcp_buf_seq_update {
-	uint32_t *seq;
-	uint16_t len;
-};
-
 /* Static buffers */
 /**
  * struct tcp_payload_t - TCP header and data to send segments with payload
@@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
 static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
 static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -1261,25 +1253,49 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec *frames,
+			   int num_frames)
+{
+	int c, f;
+
+	for (c = 0, f = 0; c < num_frames; c++, f += TCP_NUM_IOVS) {
+		struct tcp_tap_conn *conn = conns[c];
+		struct tcphdr *th = frames[f + TCP_IOV_PAYLOAD].iov_base;
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used)
+		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m][0],
+			       tcp6_payload_used - m);
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used)
+		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m][0],
+			       tcp4_payload_used - m);
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2145,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq + dlen;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,8 +2159,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
-		tcp4_seq_update[tcp4_payload_used].len = dlen;
+		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
@@ -2151,8 +2167,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
-		tcp6_seq_update[tcp6_payload_used].len = dlen;
+		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] tcp: move seq_to_tap update to when frame is queued
  2024-05-14 19:44 [PATCH v4] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-14 19:57 ` Jon Maloy
  2024-05-15  4:00   ` David Gibson
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2024-05-14 19:57 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson

Hi,
I did a little more than suggested by David/Stefano here, but I am 
always happy
when I see simplifications that can reduce the code amount.
However, being able to eliminate struct tcp_buf_seq_update led me to another 
idea.

Could we just do:

struct tcp_frame {
      struct tcp_tap_conn *conn;
      struct iov headers[TCP_NUM_IOVS];
};
static struct tcp_frame   tcp4_l2_iov [TCP_FRAMES_MEM];
static struct tcp_frame   tcp6_l2_iov [TCP_FRAMES_MEM];

We could even add a v4/v6 field to the struct, and possibly
eliminate the need for separate v4/v6 queues altogether, with
all its entailing extra code.

What are your thoughts?

///jon


On 2024-05-14 15:44, Jon Maloy wrote:
> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> delayed update of conn->seq_to_tap until the moment the corresponding
> frame has been successfully pushed out. This has the advantage that we
> immediately can make a new attempt to transmit a frame after a failed
> transmit, rather than waiting for the peer to later discover a gap and
> trigger the fast retransmit mechanism to solve the problem.
>
> This approach has turned out to cause a problem with spurious sequence
> number updates during peer-initiated retransmits, and we have realized
> it may not be the best way to solve the above issue.
>
> We now restore the previous method, by updating the said field at the
> moment a frame is added to the outqueue. To retain the advantage of
> having a quick re-attempt based on local failure detection, we now scan
> through the part of the outqueue that had to be dropped, and restore the
> sequence counter for each affected connection to the most appropriate
> value.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
>        feedback from Stefano Brivio.
>      - Added paranoid test to avoid that seq_to_tap becomes lower than
>        seq_ack_from_tap.
>
> v3: - Identical to v2. Called v3 because it was embedded in a series
>        with that version.
>
> v4: - In tcp_revert_seq(), we read the sequence number from the TCP
>        header instead of keeping a copy in struct tcp_buf_seq_update.
>      - Since the only remaining field in struct tcp_buf_seq_update is
>        a pointer to struct tcp_tap_conn, we eliminate the struct
>        altogether, and make the tcp6/tcp4_seq_update arrays into
>        arrays of said pointer.
>      - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
>        is not fatal, and will be caught by other code anyway.
>      - Separated from the series again.
> ---
>   tcp.c | 59 +++++++++++++++++++++++++++++++++++++----------------------
>   1 file changed, 37 insertions(+), 22 deletions(-)
>
> diff --git a/tcp.c b/tcp.c
> index 21d0af0..976dba8 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
>    */
>   static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>   
> -/**
> - * tcp_buf_seq_update - Sequences to update with length of frames once sent
> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> - * @len:	TCP payload length
> - */
> -struct tcp_buf_seq_update {
> -	uint32_t *seq;
> -	uint16_t len;
> -};
> -
>   /* Static buffers */
>   /**
>    * struct tcp_payload_t - TCP header and data to send segments with payload
> @@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
>   
>   static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
>   
> -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
>   static unsigned int tcp4_payload_used;
>   
>   static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> @@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
>   
>   static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
>   
> -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
>   static unsigned int tcp6_payload_used;
>   
>   static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> @@ -1261,25 +1253,49 @@ static void tcp_flags_flush(const struct ctx *c)
>   	tcp4_flags_used = 0;
>   }
>   
> +/**
> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> + * @conns:       Array of connection pointers corresponding to queued frames
> + * @frames:      Two-dimensional array containing queued frames with sub-iovs
> + * @num_frames:  Number of entries in the two arrays to be compared
> + */
> +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec *frames,
> +			   int num_frames)
> +{
> +	int c, f;
> +
> +	for (c = 0, f = 0; c < num_frames; c++, f += TCP_NUM_IOVS) {
> +		struct tcp_tap_conn *conn = conns[c];
> +		struct tcphdr *th = frames[f + TCP_IOV_PAYLOAD].iov_base;
> +		uint32_t seq = ntohl(th->seq);
> +
> +		if (SEQ_LE(conn->seq_to_tap, seq))
> +			continue;
> +
> +		conn->seq_to_tap = seq;
> +	}
> +}
> +
>   /**
>    * tcp_payload_flush() - Send out buffers for segments with data
>    * @c:		Execution context
>    */
>   static void tcp_payload_flush(const struct ctx *c)
>   {
> -	unsigned i;
>   	size_t m;
>   
>   	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>   			    tcp6_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	if (m != tcp6_payload_used)
> +		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m][0],
> +			       tcp6_payload_used - m);
>   	tcp6_payload_used = 0;
>   
>   	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
>   			    tcp4_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	if (m != tcp4_payload_used)
> +		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m][0],
> +			       tcp4_payload_used - m);
>   	tcp4_payload_used = 0;
>   }
>   
> @@ -2129,10 +2145,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>   static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>   			    ssize_t dlen, int no_csum, uint32_t seq)
>   {
> -	uint32_t *seq_update = &conn->seq_to_tap;
>   	struct iovec *iov;
>   	size_t l4len;
>   
> +	conn->seq_to_tap = seq + dlen;
> +
>   	if (CONN_V4(conn)) {
>   		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
>   		const uint16_t *check = NULL;
> @@ -2142,8 +2159,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>   			check = &iph->check;
>   		}
>   
> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> -		tcp4_seq_update[tcp4_payload_used].len = dlen;
> +		tcp4_frame_conns[tcp4_payload_used] = conn;
>   
>   		iov = tcp4_l2_iov[tcp4_payload_used++];
>   		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
> @@ -2151,8 +2167,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>   		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>   			tcp_payload_flush(c);
>   	} else if (CONN_V6(conn)) {
> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> -		tcp6_seq_update[tcp6_payload_used].len = dlen;
> +		tcp6_frame_conns[tcp6_payload_used] = conn;
>   
>   		iov = tcp6_l2_iov[tcp6_payload_used++];
>   		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] tcp: move seq_to_tap update to when frame is queued
  2024-05-14 19:57 ` Jon Maloy
@ 2024-05-15  4:00   ` David Gibson
  2024-05-15 15:19     ` Jon Maloy
  0 siblings, 1 reply; 4+ messages in thread
From: David Gibson @ 2024-05-15  4:00 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 9338 bytes --]

On Tue, May 14, 2024 at 03:57:43PM -0400, Jon Maloy wrote:
> Hi,
> I did a little more than suggested by David/Stefano here, but I am always
> happy
> when I see simplifications that can reduce the code amount.
> However, being able to eliminate struct tcp_buf_update led me to another
> idea.
> 
> Could we just do:
> 
> struct tcp_frame {
>      struct tcp_tap_conn *conn;
>      struct iov headers[TCP_NUM_IOVS];
> };
> static struct tcp_frame   tcp4_l2_iov [TCP_FRAMES_MEM];
> static struct tcp_frame   tcp6_l2_iov [TCP_FRAMES_MEM];

I don't think this works exactly as is.  At least in the qemu socket
case we actually rely on the iovec arrays for multiple frames being
contiguous so we can send them all in one operation.  We might be able
to do something a bit like the udp_meta[] structures I have in UDP, so
rather than having parallel arrays for each of the various header
components we have an array of structures which contains space for all
the header variants a frame might want, along with any additional
per-frame information we need, which could include the connection.
The iov arrays then point to the various buffers within those "meta"
structures.

I did have a look at implementing this a while back, but ran into some
complications.  Nothing particularly bad, I don't think, just more
than I wanted to deal with at the time.

> We could even add a v4/v6 field to the struct, and possibly
> eliminate the need for separate v4/v6 queues altogether, with
> all its entailing extra code.

It's absolutely my aim to unify most or all of the v4/v6 structures.
We'll need this for things we want like v4<->v6 forwarding.  I
certainly think it's doable, but as above, I did hit some fiddliness
when I've tried in the past.

> On 2024-05-14 15:44, Jon Maloy wrote:
> > commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> > delayed update of conn->seq_to_tap until the moment the corresponding
> > frame has been successfully pushed out. This has the advantage that we
> > immediately can make a new attempt to transmit a frame after a failed
> > transmit, rather than waiting for the peer to later discover a gap and
> > trigger the fast retransmit mechanism to solve the problem.
> > 
> > This approach has turned out to cause a problem with spurious sequence
> > number updates during peer-initiated retransmits, and we have realized
> > it may not be the best way to solve the above issue.
> > 
> > We now restore the previous method, by updating the said field at the
> > moment a frame is added to the outqueue. To retain the advantage of
> > having a quick re-attempt based on local failure detection, we now scan
> > through the part of the outqueue that had to be dropped, and restore the
> > sequence counter for each affected connection to the most appropriate
> > value.
> > 
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> > 
> > ---
> > v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
> >        feedback from Stefano Brivio.
> >      - Added paranoid test to avoid that seq_to_tap becomes lower than
> >        seq_ack_from_tap.
> > 
> > v3: - Identical to v2. Called v3 because it was embedded in a series
> >        with that version.
> > 
> > v4: - In tcp_revert_seq(), we read the sequence number from the TCP
> >        header instead of keeping a copy in struct tcp_buf_seq_update.
> >      - Since the only remaining field in struct tcp_buf_seq_update is
> >        a pointer to struct tcp_tap_conn, we eliminate the struct
> >        altogether, and make the tcp6/tcp4_seq_update arrays into
> >        arrays of said pointer.
> >      - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
> >        is not fatal, and will be caught by other code anyway.
> >      - Separated from the series again.
> > ---
> >   tcp.c | 59 +++++++++++++++++++++++++++++++++++++----------------------
> >   1 file changed, 37 insertions(+), 22 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 21d0af0..976dba8 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
> >    */
> >   static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
> > -/**
> > - * tcp_buf_seq_update - Sequences to update with length of frames once sent
> > - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> > - * @len:	TCP payload length
> > - */
> > -struct tcp_buf_seq_update {
> > -	uint32_t *seq;
> > -	uint16_t len;
> > -};
> > -
> >   /* Static buffers */
> >   /**
> >    * struct tcp_payload_t - TCP header and data to send segments with payload
> > @@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
> >   static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> > -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> > +/* References tracking the owner connection of frames in the tap outqueue */
> > +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
> >   static unsigned int tcp4_payload_used;
> >   static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> > @@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
> >   static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> > -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> > +/* References tracking the owner connection of frames in the tap outqueue */
> > +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
> >   static unsigned int tcp6_payload_used;
> >   static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> > @@ -1261,25 +1253,49 @@ static void tcp_flags_flush(const struct ctx *c)
> >   	tcp4_flags_used = 0;
> >   }
> > +/**
> > + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> > + * @conns:       Array of connection pointers corresponding to queued frames
> > + * @frames:      Two-dimensional array containing queued frames with sub-iovs
> > + * @num_frames:  Number of entries in the two arrays to be compared
> > + */
> > +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec *frames,
> > +			   int num_frames)
> > +{
> > +	int c, f;
> > +
> > +	for (c = 0, f = 0; c < num_frames; c++, f += TCP_NUM_IOVS) {
> > +		struct tcp_tap_conn *conn = conns[c];
> > +		struct tcphdr *th = frames[f + TCP_IOV_PAYLOAD].iov_base;
> > +		uint32_t seq = ntohl(th->seq);
> > +
> > +		if (SEQ_LE(conn->seq_to_tap, seq))
> > +			continue;
> > +
> > +		conn->seq_to_tap = seq;
> > +	}
> > +}
> > +
> >   /**
> >    * tcp_payload_flush() - Send out buffers for segments with data
> >    * @c:		Execution context
> >    */
> >   static void tcp_payload_flush(const struct ctx *c)
> >   {
> > -	unsigned i;
> >   	size_t m;
> >   	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> >   			    tcp6_payload_used);
> > -	for (i = 0; i < m; i++)
> > -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> > +	if (m != tcp6_payload_used)
> > +		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m][0],
> > +			       tcp6_payload_used - m);
> >   	tcp6_payload_used = 0;
> >   	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> >   			    tcp4_payload_used);
> > -	for (i = 0; i < m; i++)
> > -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> > +	if (m != tcp4_payload_used)
> > +		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m][0],
> > +			       tcp4_payload_used - m);
> >   	tcp4_payload_used = 0;
> >   }
> > @@ -2129,10 +2145,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
> >   static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >   			    ssize_t dlen, int no_csum, uint32_t seq)
> >   {
> > -	uint32_t *seq_update = &conn->seq_to_tap;
> >   	struct iovec *iov;
> >   	size_t l4len;
> > +	conn->seq_to_tap = seq + dlen;
> > +
> >   	if (CONN_V4(conn)) {
> >   		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> >   		const uint16_t *check = NULL;
> > @@ -2142,8 +2159,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >   			check = &iph->check;
> >   		}
> > -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> > -		tcp4_seq_update[tcp4_payload_used].len = dlen;
> > +		tcp4_frame_conns[tcp4_payload_used] = conn;
> >   		iov = tcp4_l2_iov[tcp4_payload_used++];
> >   		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
> > @@ -2151,8 +2167,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >   		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> >   			tcp_payload_flush(c);
> >   	} else if (CONN_V6(conn)) {
> > -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> > -		tcp6_seq_update[tcp6_payload_used].len = dlen;
> > +		tcp6_frame_conns[tcp6_payload_used] = conn;
> >   		iov = tcp6_l2_iov[tcp6_payload_used++];
> >   		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] tcp: move seq_to_tap update to when frame is queued
  2024-05-15  4:00   ` David Gibson
@ 2024-05-15 15:19     ` Jon Maloy
  0 siblings, 0 replies; 4+ messages in thread
From: Jon Maloy @ 2024-05-15 15:19 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson



On 2024-05-15 00:00, David Gibson wrote:
> On Tue, May 14, 2024 at 03:57:43PM -0400, Jon Maloy wrote:
>> Hi,
>> I did a little more than suggested by David/Stefano here, but I am always
>> happy
>> when I see simplifications that can reduce the code amount.
>> However, being able to eliminate struct tcp_buf_update led me to another
>> idea.
>>
>> Could we just do:
>>
>> struct tcp_frame {
>>       struct tcp_tap_conn *conn;
>>       struct iov headers[TCP_NUM_IOVS];
>> };
>> static struct tcp_frame   tcp4_l2_iov [TCP_FRAMES_MEM];
>> static struct tcp_frame   tcp6_l2_iov [TCP_FRAMES_MEM];
> I don't think this works exactly as is.  At least in the qemu socket
> case we actually rely on the iovec arrays for multiple frames being
> contiguous so we can send them all in one operation.
I suspected something like that, but didn't really check the code for it.
> We might be able
> to do something a bit like the udp_meta[] structures I have in UDP, so
> rather than having parallel arrays for each of the various header
> components we have an array of structures which contains space for all
> the header variants a frame might want, along with any additional
> per-frame information we need, which could include the connection.
> The iov arrays then point to the various buffers within those "meta"
> structures.
>
> I did have a look at implementing this a while back, but ran into some
> complications.  Nothing particularly bad, I don't think, just more
> than I wanted to deal with at the time.
This sounds like a cool task I could do after this series has been applied.
>
>> We could even add a v4/v6 field to the struct, and possibly
>> eliminate the need for separate v4/v6 queues altogether, with
>> all its entailing extra code.
> It's absolutely my aim to unify most or all of the v4/v6 structures.
> We'll need this for things we want like v4<->v6 forwarding.  I
> certainly think it's doable, but as above, I did hit some fiddliness
> when I've tried in the past.
Yeah, unifying those two structures looks very desirable.

///jon
>
>> On 2024-05-14 15:44, Jon Maloy wrote:
>>> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
>>> delayed update of conn->seq_to_tap until the moment the corresponding
>>> frame has been successfully pushed out. This has the advantage that we
>>> immediately can make a new attempt to transmit a frame after a failed
>>> transmit, rather than waiting for the peer to later discover a gap and
>>> trigger the fast retransmit mechanism to solve the problem.
>>>
>>> This approach has turned out to cause a problem with spurious sequence
>>> number updates during peer-initiated retransmits, and we have realized
>>> it may not be the best way to solve the above issue.
>>>
>>> We now restore the previous method, by updating the said field at the
>>> moment a frame is added to the outqueue. To retain the advantage of
>>> having a quick re-attempt based on local failure detection, we now scan
>>> through the part of the outqueue that had to be dropped, and restore the
>>> sequence counter for each affected connection to the most appropriate
>>> value.
>>>
>>> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>>>
>>> ---
>>> v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
>>>         feedback from Stefano Brivio.
>>>       - Added paranoid test to avoid that seq_to_tap becomes lower than
>>>         seq_ack_from_tap.
>>>
>>> v3: - Identical to v2. Called v3 because it was embedded in a series
>>>         with that version.
>>>
>>> v4: - In tcp_revert_seq(), we read the sequence number from the TCP
>>>         header instead of keeping a copy in struct tcp_buf_seq_update.
>>>       - Since the only remaining field in struct tcp_buf_seq_update is
>>>         a pointer to struct tcp_tap_conn, we eliminate the struct
>>>         altogether, and make the tcp6/tcp4_seq_update arrays into
>>>         arrays of said pointer.
>>>       - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
>>>         is not fatal, and will be caught by other code anyway.
>>>       - Separated from the series again.
>>> ---
>>>    tcp.c | 59 +++++++++++++++++++++++++++++++++++++----------------------
>>>    1 file changed, 37 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/tcp.c b/tcp.c
>>> index 21d0af0..976dba8 100644
>>> --- a/tcp.c
>>> +++ b/tcp.c
>>> @@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
>>>     */
>>>    static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>>> -/**
>>> - * tcp_buf_seq_update - Sequences to update with length of frames once sent
>>> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
>>> - * @len:	TCP payload length
>>> - */
>>> -struct tcp_buf_seq_update {
>>> -	uint32_t *seq;
>>> -	uint16_t len;
>>> -};
>>> -
>>>    /* Static buffers */
>>>    /**
>>>     * struct tcp_payload_t - TCP header and data to send segments with payload
>>> @@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
>>>    static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
>>> -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
>>> +/* References tracking the owner connection of frames in the tap outqueue */
>>> +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
>>>    static unsigned int tcp4_payload_used;
>>>    static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
>>> @@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
>>>    static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
>>> -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
>>> +/* References tracking the owner connection of frames in the tap outqueue */
>>> +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
>>>    static unsigned int tcp6_payload_used;
>>>    static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
>>> @@ -1261,25 +1253,49 @@ static void tcp_flags_flush(const struct ctx *c)
>>>    	tcp4_flags_used = 0;
>>>    }
>>> +/**
>>> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
>>> + * @conns:       Array of connection pointers corresponding to queued frames
>>> + * @frames:      Two-dimensional array containing queued frames with sub-iovs
>>> + * @num_frames:  Number of entries in the two arrays to be compared
>>> + */
>>> +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec *frames,
>>> +			   int num_frames)
>>> +{
>>> +	int c, f;
>>> +
>>> +	for (c = 0, f = 0; c < num_frames; c++, f += TCP_NUM_IOVS) {
>>> +		struct tcp_tap_conn *conn = conns[c];
>>> +		struct tcphdr *th = frames[f + TCP_IOV_PAYLOAD].iov_base;
>>> +		uint32_t seq = ntohl(th->seq);
>>> +
>>> +		if (SEQ_LE(conn->seq_to_tap, seq))
>>> +			continue;
>>> +
>>> +		conn->seq_to_tap = seq;
>>> +	}
>>> +}
>>> +
>>>    /**
>>>     * tcp_payload_flush() - Send out buffers for segments with data
>>>     * @c:		Execution context
>>>     */
>>>    static void tcp_payload_flush(const struct ctx *c)
>>>    {
>>> -	unsigned i;
>>>    	size_t m;
>>>    	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>>>    			    tcp6_payload_used);
>>> -	for (i = 0; i < m; i++)
>>> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
>>> +	if (m != tcp6_payload_used)
>>> +		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m][0],
>>> +			       tcp6_payload_used - m);
>>>    	tcp6_payload_used = 0;
>>>    	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
>>>    			    tcp4_payload_used);
>>> -	for (i = 0; i < m; i++)
>>> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
>>> +	if (m != tcp4_payload_used)
>>> +		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m][0],
>>> +			       tcp4_payload_used - m);
>>>    	tcp4_payload_used = 0;
>>>    }
>>> @@ -2129,10 +2145,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>>>    static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>>    			    ssize_t dlen, int no_csum, uint32_t seq)
>>>    {
>>> -	uint32_t *seq_update = &conn->seq_to_tap;
>>>    	struct iovec *iov;
>>>    	size_t l4len;
>>> +	conn->seq_to_tap = seq + dlen;
>>> +
>>>    	if (CONN_V4(conn)) {
>>>    		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
>>>    		const uint16_t *check = NULL;
>>> @@ -2142,8 +2159,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>>    			check = &iph->check;
>>>    		}
>>> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
>>> -		tcp4_seq_update[tcp4_payload_used].len = dlen;
>>> +		tcp4_frame_conns[tcp4_payload_used] = conn;
>>>    		iov = tcp4_l2_iov[tcp4_payload_used++];
>>>    		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
>>> @@ -2151,8 +2167,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>>    		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>>>    			tcp_payload_flush(c);
>>>    	} else if (CONN_V6(conn)) {
>>> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
>>> -		tcp6_seq_update[tcp6_payload_used].len = dlen;
>>> +		tcp6_frame_conns[tcp6_payload_used] = conn;
>>>    		iov = tcp6_l2_iov[tcp6_payload_used++];
>>>    		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-05-15 15:19 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-14 19:44 [PATCH v4] tcp: move seq_to_tap update to when frame is queued Jon Maloy
2024-05-14 19:57 ` Jon Maloy
2024-05-15  4:00   ` David Gibson
2024-05-15 15:19     ` Jon Maloy

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).