public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH] tcp: move seq_to_tap update to when frame is queued
@ 2024-05-09  3:00 Jon Maloy
  2024-05-10 16:40 ` Stefano Brivio
  0 siblings, 1 reply; 5+ messages in thread
From: Jon Maloy @ 2024-05-09  3:00 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
delayed update of conn->seq_to_tap until the moment the corresponding
frame has been successfully pushed out. This has the advantage that we
immediately can retransmit a buffer that we fail to transmit, rather
than waiting for the peer side to discover the loss and initiate fast
retransmit.

This approach has turned out to cause a problem with spurious sequence
number updates during peer-initiated retransmits, and we have realized
it may not be the best way to solve the above issue.

We now restore the previous method, by updating the said field at the
moment a frame is added to the outqueue. To retain the advantage of fast
retransmit based on local failure detection, we now scan through the part
of the outqueue that had to be dropped, and restore the sequence counter
for each affected connection to the most appropriate value.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/tcp.c b/tcp.c
index 21d0af0..58fdbc9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -412,11 +412,13 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
 /**
  * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
+ * @conn:       Pointer to connection corresponding to frame. May need update
+ * @seq:	Sequence number of the corresponding frame
  * @len:	TCP payload length
  */
 struct tcp_buf_seq_update {
-	uint32_t *seq;
+	struct tcp_tap_conn *conn;
+	uint32_t seq;
 	uint16_t len;
 };
 
@@ -1261,25 +1263,52 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @seq_update: Array with connection and sequence number data
+ * @s:          Entry corresponding to first dropped frame
+ * @e:          Entry corresponding to last dropped frame
+ */
+static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
+{
+	struct tcp_tap_conn *conn;
+	uint32_t lowest_seq;
+	int i, ii;
+
+	for (i = s; i < e; i++) {
+		conn = seq_update[i].conn;
+		lowest_seq = seq_update[i].seq;
+
+		for (ii = i + 1; ii < e; ii++) {
+			if (seq_update[ii].conn != conn)
+				continue;
+			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
+				lowest_seq = seq_update[ii].seq;
+		}
+
+		if (SEQ_GT(conn->seq_to_tap, lowest_seq))
+			conn->seq_to_tap = lowest_seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used)
+		tcp_revert_seq(tcp6_seq_update, m, tcp6_payload_used);
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used)
+		tcp_revert_seq(tcp4_seq_update, m, tcp4_payload_used);
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2158,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,7 +2172,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
+		tcp4_seq_update[tcp4_payload_used].conn = conn;
+		tcp4_seq_update[tcp4_payload_used].seq = seq;
 		tcp4_seq_update[tcp4_payload_used].len = dlen;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
@@ -2151,7 +2182,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
+		tcp6_seq_update[tcp6_payload_used].conn = conn;
+		tcp6_seq_update[tcp6_payload_used].seq = seq;
 		tcp6_seq_update[tcp6_payload_used].len = dlen;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
-- 
@@ -412,11 +412,13 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
 /**
  * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
+ * @conn:       Pointer to connection corresponding to frame. May need update
+ * @seq:	Sequence number of the corresponding frame
  * @len:	TCP payload length
  */
 struct tcp_buf_seq_update {
-	uint32_t *seq;
+	struct tcp_tap_conn *conn;
+	uint32_t seq;
 	uint16_t len;
 };
 
@@ -1261,25 +1263,52 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @seq_update: Array with connection and sequence number data
+ * @s:          Entry corresponding to first dropped frame
+ * @e:          Entry corresponding to last dropped frame
+ */
+static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
+{
+	struct tcp_tap_conn *conn;
+	uint32_t lowest_seq;
+	int i, ii;
+
+	for (i = s; i < e; i++) {
+		conn = seq_update[i].conn;
+		lowest_seq = seq_update[i].seq;
+
+		for (ii = i + 1; ii < e; ii++) {
+			if (seq_update[ii].conn != conn)
+				continue;
+			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
+				lowest_seq = seq_update[ii].seq;
+		}
+
+		if (SEQ_GT(conn->seq_to_tap, lowest_seq))
+			conn->seq_to_tap = lowest_seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used)
+		tcp_revert_seq(tcp6_seq_update, m, tcp6_payload_used);
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used)
+		tcp_revert_seq(tcp4_seq_update, m, tcp4_payload_used);
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2158,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,7 +2172,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
+		tcp4_seq_update[tcp4_payload_used].conn = conn;
+		tcp4_seq_update[tcp4_payload_used].seq = seq;
 		tcp4_seq_update[tcp4_payload_used].len = dlen;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
@@ -2151,7 +2182,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
+		tcp6_seq_update[tcp6_payload_used].conn = conn;
+		tcp6_seq_update[tcp6_payload_used].seq = seq;
 		tcp6_seq_update[tcp6_payload_used].len = dlen;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] tcp: move seq_to_tap update to when frame is queued
  2024-05-09  3:00 [PATCH] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-10 16:40 ` Stefano Brivio
  2024-05-10 19:40   ` Jon Maloy
  2024-05-13  1:03   ` David Gibson
  0 siblings, 2 replies; 5+ messages in thread
From: Stefano Brivio @ 2024-05-10 16:40 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson

On Wed,  8 May 2024 23:00:23 -0400
Jon Maloy <jmaloy@redhat.com> wrote:

> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> delayed update of conn->seq_to_tap until the moment the corresponding
> frame has been successfully pushed out. This has the advantage that we
> immediately can retransmit a buffer that we fail to trasnmit, rather
> than waiting for the peer side to discover the loss and initiate fast
> retransmit.

It's not really fast retransmit, it's a simple retry of the operation
that didn't succeed. We didn't even transmit.

> 
> This approach has turned out to cause a problem with spurious sequence
> number updates during peer-initiated retransmits, and we have realized
> it may not be the best way to solve te above issue.
> 
> We now restore the previous method, by updating the said field at the
> moment a frame is added to the outqueue. To retain the advantage of fast
> retansmit

Same here.

> based on local failure detection, we now scan through the part
> of the outqueue that had do be dropped, and restore the sequence counter
> for each affected connection to the most appropriate value.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 42 insertions(+), 10 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 21d0af0..58fdbc9 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -412,11 +412,13 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>  
>  /**
>   * tcp_buf_seq_update - Sequences to update with length of frames once sent

This is not the case anymore, maybe:

 * tcp_conn_old_seq() - Old sequence numbers for connections with pending frames

> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> + * @conn:       Pointer to connection corresponding to frame. May need update

Mixed whitespace and tabs. It looks like the connection pointer might
need to be updated... what about:

 * @conn:	Pointer to connection for this frame

?

> + * @seq:	Sequence number of the corresponding frame
>   * @len:	TCP payload length

The length is not needed anymore.

>   */
>  struct tcp_buf_seq_update {
> -	uint32_t *seq;
> +	struct tcp_tap_conn *conn;
> +	uint32_t seq;
>  	uint16_t len;
>  };
>  
> @@ -1261,25 +1263,52 @@ static void tcp_flags_flush(const struct ctx *c)
>  	tcp4_flags_used = 0;
>  }
>  
> +/**
> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> + * @seq_update: Array with connection and sequence number data
> + * @s:          Entry corresponding to first dropped frame
> + * @e:          Entry corresponding to last dropped frame

These are not pointers to the entries, though. They are indices of the
queued frames.

> + */
> +static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
> +{
> +	struct tcp_tap_conn *conn;
> +	uint32_t lowest_seq;
> +	int i, ii;
> +
> +	for (i = s; i < e; i++) {
> +		conn = seq_update[i].conn;
> +		lowest_seq = seq_update[i].seq;
> +
> +		for (ii = i + 1; ii < e; ii++) {
> +			if (seq_update[ii].conn != conn)
> +				continue;
> +			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
> +				lowest_seq = seq_update[ii].seq;
> +		}

If I recall correctly, David suggested a simpler approach that avoids
this O(n^2) scan, based on the observation that 1. the first entry you
find in the table also has the lowest sequence number (we don't send
frames out-of-order), and that 2. you'll never revert to a higher
sequence number (the two lines below take care of that).

That is, you could just scan the table once, and if you find a sequence
number that's lower than the current sequence stored for the connection,
store it.

> +
> +		if (SEQ_GT(conn->seq_to_tap, lowest_seq))
> +			conn->seq_to_tap = lowest_seq;
> +	}
> +}
> +
>  /**
>   * tcp_payload_flush() - Send out buffers for segments with data
>   * @c:		Execution context
>   */
>  static void tcp_payload_flush(const struct ctx *c)
>  {
> -	unsigned i;
>  	size_t m;
>  
>  	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>  			    tcp6_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	if (m != tcp6_payload_used)
> +		tcp_revert_seq(tcp6_seq_update, m, tcp6_payload_used);
>  	tcp6_payload_used = 0;
>  
>  	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
>  			    tcp4_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	if (m != tcp4_payload_used)
> +		tcp_revert_seq(tcp4_seq_update, m, tcp4_payload_used);
>  	tcp4_payload_used = 0;
>  }
>  
> @@ -2129,10 +2158,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>  static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  			    ssize_t dlen, int no_csum, uint32_t seq)
>  {
> -	uint32_t *seq_update = &conn->seq_to_tap;
>  	struct iovec *iov;
>  	size_t l4len;
>  
> +	conn->seq_to_tap = seq;

This is the sequence number for the frame we're sending (start of this
frame), but not the current byte sequence sent to the "tap" (end of
this frame), which would be seq + dlen, I think.

> +
>  	if (CONN_V4(conn)) {
>  		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
>  		const uint16_t *check = NULL;
> @@ -2142,7 +2172,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  			check = &iph->check;
>  		}
>  
> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> +		tcp4_seq_update[tcp4_payload_used].conn = conn;
> +		tcp4_seq_update[tcp4_payload_used].seq = seq;
>  		tcp4_seq_update[tcp4_payload_used].len = dlen;
>  
>  		iov = tcp4_l2_iov[tcp4_payload_used++];
> @@ -2151,7 +2182,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>  			tcp_payload_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> +		tcp6_seq_update[tcp6_payload_used].conn = conn;
> +		tcp6_seq_update[tcp6_payload_used].seq = seq;
>  		tcp6_seq_update[tcp6_payload_used].len = dlen;
>  
>  		iov = tcp6_l2_iov[tcp6_payload_used++];

-- 
Stefano


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] tcp: move seq_to_tap update to when frame is queued
  2024-05-10 16:40 ` Stefano Brivio
@ 2024-05-10 19:40   ` Jon Maloy
  2024-05-13  1:32     ` David Gibson
  2024-05-13  1:03   ` David Gibson
  1 sibling, 1 reply; 5+ messages in thread
From: Jon Maloy @ 2024-05-10 19:40 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev, lvivier, dgibson



On 2024-05-10 12:40, Stefano Brivio wrote:
> On Wed,  8 May 2024 23:00:23 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
>
>> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
>> delayed update of conn->seq_to_tap until the moment the corresponding
>> frame has been successfully pushed out. This has the advantage that we
>> immediately can retransmit a buffer that we fail to trasnmit, rather
>> than waiting for the peer side to discover the loss and initiate fast
>> retransmit.
> It's not really fast retransmit, it's a simple retry of the operation
> that didn't succeed. We didn't even transmit.
Ok
>> This approach has turned out to cause a problem with spurious sequence
>> number updates during peer-initiated retransmits, and we have realized
>> it may not be the best way to solve te above issue.
>>
>> We now restore the previous method, by updating the said field at the
>> moment a frame is added to the outqueue. To retain the advantage of fast
>> retansmit
> Same here.
>
>> based on local failure detection, we now scan through the part
>> of the outqueue that had do be dropped, and restore the sequence counter
>> for each affected connection to the most appropriate value.
>>
>> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>> ---
>>   tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++----------
>>   1 file changed, 42 insertions(+), 10 deletions(-)
>>
>> diff --git a/tcp.c b/tcp.c
>> index 21d0af0..58fdbc9 100644
>> --- a/tcp.c
>> +++ b/tcp.c
>> @@ -412,11 +412,13 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>>   
>>   /**
>>    * tcp_buf_seq_update - Sequences to update with length of frames once sent
> This is not the case anymore, maybe:
>
>   * tcp_conn_old_seq() - Old sequence numbers for connections with pending frames
ok
>> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
>> + * @conn:       Pointer to connection corresponding to frame. May need update
> Mixed whitespace and tabs. It looks like the connection pointer might
> need to be updated... what about:
>
>   * @conn:	Pointer to connection for this frame
>
> ?
>
>> + * @seq:	Sequence number of the corresponding frame
>>    * @len:	TCP payload length
> The length is not needed anymore.
Yes. Of course ;-(
>>    */
>>   struct tcp_buf_seq_update {
>> -	uint32_t *seq;
>> +	struct tcp_tap_conn *conn;
>> +	uint32_t seq;
>>   	uint16_t len;
>>   };
>>   
>> @@ -1261,25 +1263,52 @@ static void tcp_flags_flush(const struct ctx *c)
>>   	tcp4_flags_used = 0;
>>   }
>>   
>> +/**
>> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
>> + * @seq_update: Array with connection and sequence number data
>> + * @s:          Entry corresponding to first dropped frame
>> + * @e:          Entry corresponding to last dropped frame
> These are not pointer to the entries, though. They are indices of the
> queued frames.
I had already fixed that.
>
>> + */
>> +static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
>> +{
>> +	struct tcp_tap_conn *conn;
>> +	uint32_t lowest_seq;
>> +	int i, ii;
>> +
>> +	for (i = s; i < e; i++) {
>> +		conn = seq_update[i].conn;
>> +		lowest_seq = seq_update[i].seq;
>> +
>> +		for (ii = i + 1; ii < e; ii++) {
>> +			if (seq_update[ii].conn != conn)
>> +				continue;
>> +			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
>> +				lowest_seq = seq_update[ii].seq;
>> +		}
> If I recall correctly, David suggested a simpler approach that avoids
> this O(n^2) scan, based on the observation that 1. the first entry you
> find in the table also has the lowest sequence number (we don't send
> frames out-of-order),
Not so sure about that. We can be in the middle of retransmit.
Of course, if I continue to flush the queue just before retransmit,
which I didn't intend to do, this will be true.


> and that 2. you'll never revert to a higher
> sequence number (the two lines below take care of that).
>
> That is, you could just scan the table once, and if you find a sequence
> number that's lower than the current sequence stored for the connection,
> store it.
Yes, I can do that, and it will work even without flushing the queue.
I missed that aspect of David's description.

>
>> +
>> +		if (SEQ_GT(conn->seq_to_tap, lowest_seq))
>> +			conn->seq_to_tap = lowest_seq;
>> +	}
>> +}
>> +
>>   /**
>>    * tcp_payload_flush() - Send out buffers for segments with data
>>    * @c:		Execution context
>>    */
>>   static void tcp_payload_flush(const struct ctx *c)
>>   {
>> -	unsigned i;
>>   	size_t m;
>>   
>>   	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>>   			    tcp6_payload_used);
>> -	for (i = 0; i < m; i++)
>> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
>> +	if (m != tcp6_payload_used)
>> +		tcp_revert_seq(tcp6_seq_update, m, tcp6_payload_used);
>>   	tcp6_payload_used = 0;
>>   
>>   	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
>>   			    tcp4_payload_used);
>> -	for (i = 0; i < m; i++)
>> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
>> +	if (m != tcp4_payload_used)
>> +		tcp_revert_seq(tcp4_seq_update, m, tcp4_payload_used);
>>   	tcp4_payload_used = 0;
>>   }
>>   
>> @@ -2129,10 +2158,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>>   static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>   			    ssize_t dlen, int no_csum, uint32_t seq)
>>   {
>> -	uint32_t *seq_update = &conn->seq_to_tap;
>>   	struct iovec *iov;
>>   	size_t l4len;
>>   
>> +	conn->seq_to_tap = seq;
> This is the sequence number for the frame we're sending (start of this
> frame), but not the current byte sequence sent to the "tap" (end of
> this frame), which would be seq + dlen, I think.
Already noticed during my testing and fixed. Strangely enough, it still 
worked well for a while :-(
>
>> +
>>   	if (CONN_V4(conn)) {
>>   		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
>>   		const uint16_t *check = NULL;
>> @@ -2142,7 +2172,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>   			check = &iph->check;
>>   		}
>>   
>> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
>> +		tcp4_seq_update[tcp4_payload_used].conn = conn;
>> +		tcp4_seq_update[tcp4_payload_used].seq = seq;
>>   		tcp4_seq_update[tcp4_payload_used].len = dlen;
>>   
>>   		iov = tcp4_l2_iov[tcp4_payload_used++];
>> @@ -2151,7 +2182,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>   		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>>   			tcp_payload_flush(c);
>>   	} else if (CONN_V6(conn)) {
>> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
>> +		tcp6_seq_update[tcp6_payload_used].conn = conn;
>> +		tcp6_seq_update[tcp6_payload_used].seq = seq;
>>   		tcp6_seq_update[tcp6_payload_used].len = dlen;
>>   
>>   		iov = tcp6_l2_iov[tcp6_payload_used++];

I will fix the loop and repost shortly.

///jon


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] tcp: move seq_to_tap update to when frame is queued
  2024-05-10 16:40 ` Stefano Brivio
  2024-05-10 19:40   ` Jon Maloy
@ 2024-05-13  1:03   ` David Gibson
  1 sibling, 0 replies; 5+ messages in thread
From: David Gibson @ 2024-05-13  1:03 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 7558 bytes --]

On Fri, May 10, 2024 at 06:40:30PM +0200, Stefano Brivio wrote:
> On Wed,  8 May 2024 23:00:23 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
> 
> > commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> > delayed update of conn->seq_to_tap until the moment the corresponding
> > frame has been successfully pushed out. This has the advantage that we
> > immediately can retransmit a buffer that we fail to trasnmit, rather
> > than waiting for the peer side to discover the loss and initiate fast
> > retransmit.
> 
> It's not really fast retransmit, it's a simple retry of the operation
> that didn't succeed. We didn't even transmit.
> 
> > 
> > This approach has turned out to cause a problem with spurious sequence
> > number updates during peer-initiated retransmits, and we have realized
> > it may not be the best way to solve te above issue.
> > 
> > We now restore the previous method, by updating the said field at the
> > moment a frame is added to the outqueue. To retain the advantage of fast
> > retansmit
> 
> Same here.
> 
> > based on local failure detection, we now scan through the part
> > of the outqueue that had do be dropped, and restore the sequence counter
> > for each affected connection to the most appropriate value.
> > 
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> > ---
> >  tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 42 insertions(+), 10 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 21d0af0..58fdbc9 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -412,11 +412,13 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
> >  
> >  /**
> >   * tcp_buf_seq_update - Sequences to update with length of frames once sent
> 
> This is not the case anymore, maybe:
> 
>  * tcp_conn_old_seq() - Old sequence numbers for connections with pending frames
> 
> > - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> > + * @conn:       Pointer to connection corresponding to frame. May need update
> 
> Mixed whitespace and tabs. It looks like the connection pointer might
> need to be updated... what about:
> 
>  * @conn:	Pointer to connection for this frame
> 
> ?
> 
> > + * @seq:	Sequence number of the corresponding frame
> >   * @len:	TCP payload length
> 
> The length is not needed anymore.

Strictly speaking, I don't think you need the sequence number here
either: it should be in the frame itself.  The fiddliness of
extracting it from the buffer might make it worthwhile to store here
anyway.

> >   */
> >  struct tcp_buf_seq_update {
> > -	uint32_t *seq;
> > +	struct tcp_tap_conn *conn;
> > +	uint32_t seq;
> >  	uint16_t len;
> >  };
> >  
> > @@ -1261,25 +1263,52 @@ static void tcp_flags_flush(const struct ctx *c)
> >  	tcp4_flags_used = 0;
> >  }
> >  
> > +/**
> > + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> > + * @seq_update: Array with connection and sequence number data
> > + * @s:          Entry corresponding to first dropped frame
> > + * @e:          Entry corresponding to last dropped frame
> 
> These are not pointer to the entries, though. They are indices of the
> queued frames.
> 
> > + */
> > +static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
> > +{
> > +	struct tcp_tap_conn *conn;
> > +	uint32_t lowest_seq;
> > +	int i, ii;
> > +
> > +	for (i = s; i < e; i++) {
> > +		conn = seq_update[i].conn;
> > +		lowest_seq = seq_update[i].seq;
> > +
> > +		for (ii = i + 1; ii < e; ii++) {
> > +			if (seq_update[ii].conn != conn)
> > +				continue;
> > +			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
> > +				lowest_seq = seq_update[ii].seq;
> > +		}
> 
> If I recall correctly, David suggested a simpler approach that avoids
> this O(n^2) scan, based on the observation that 1. the first entry you
> find in the table also has the lowest sequence number (we don't send
> frames out-of-order), and that 2. you'll never revert to a higher
> sequence number (the two lines below take care of that).

Right..

> That is, you could just scan the table once, and if you find a sequence
> number that's lower than the current sequence stored for the connection,
> store it.
> 
> > +
> > +		if (SEQ_GT(conn->seq_to_tap, lowest_seq))
> > +			conn->seq_to_tap = lowest_seq;

..these lines here, specifically.  Basically we rewind seq_to_tap each
time we find an untransmitted frame that sits before it.
Theoretically that could involve multiple rewinds, but a) that's not
fatal, merely suboptimal and b) it won't happen in practice, since
frames in the queue will (nearly?) always have increasing sequence
numbers.

> > +	}
> > +}
> > +
> >  /**
> >   * tcp_payload_flush() - Send out buffers for segments with data
> >   * @c:		Execution context
> >   */
> >  static void tcp_payload_flush(const struct ctx *c)
> >  {
> > -	unsigned i;
> >  	size_t m;
> >  
> >  	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> >  			    tcp6_payload_used);
> > -	for (i = 0; i < m; i++)
> > -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> > +	if (m != tcp6_payload_used)
> > +		tcp_revert_seq(tcp6_seq_update, m, tcp6_payload_used);
> >  	tcp6_payload_used = 0;
> >  
> >  	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> >  			    tcp4_payload_used);
> > -	for (i = 0; i < m; i++)
> > -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> > +	if (m != tcp4_payload_used)
> > +		tcp_revert_seq(tcp4_seq_update, m, tcp4_payload_used);
> >  	tcp4_payload_used = 0;
> >  }
> >  
> > @@ -2129,10 +2158,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
> >  static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >  			    ssize_t dlen, int no_csum, uint32_t seq)
> >  {
> > -	uint32_t *seq_update = &conn->seq_to_tap;
> >  	struct iovec *iov;
> >  	size_t l4len;
> >  
> > +	conn->seq_to_tap = seq;
> 
> This is the sequence number for the frame we're sending (start of this
> frame), but not the current byte sequence sent to the "tap" (end of
> this frame), which would be seq + dlen, I think.
> 
> > +
> >  	if (CONN_V4(conn)) {
> >  		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> >  		const uint16_t *check = NULL;
> > @@ -2142,7 +2172,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >  			check = &iph->check;
> >  		}
> >  
> > -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> > +		tcp4_seq_update[tcp4_payload_used].conn = conn;
> > +		tcp4_seq_update[tcp4_payload_used].seq = seq;
> >  		tcp4_seq_update[tcp4_payload_used].len = dlen;
> >  
> >  		iov = tcp4_l2_iov[tcp4_payload_used++];
> > @@ -2151,7 +2182,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >  		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> >  			tcp_payload_flush(c);
> >  	} else if (CONN_V6(conn)) {
> > -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> > +		tcp6_seq_update[tcp6_payload_used].conn = conn;
> > +		tcp6_seq_update[tcp6_payload_used].seq = seq;
> >  		tcp6_seq_update[tcp6_payload_used].len = dlen;
> >  
> >  		iov = tcp6_l2_iov[tcp6_payload_used++];
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] tcp: move seq_to_tap update to when frame is queued
  2024-05-10 19:40   ` Jon Maloy
@ 2024-05-13  1:32     ` David Gibson
  0 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2024-05-13  1:32 UTC (permalink / raw)
  To: Jon Maloy; +Cc: Stefano Brivio, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 2934 bytes --]

On Fri, May 10, 2024 at 03:40:41PM -0400, Jon Maloy wrote:
> On 2024-05-10 12:40, Stefano Brivio wrote:
> > On Wed,  8 May 2024 23:00:23 -0400
> > Jon Maloy <jmaloy@redhat.com> wrote:
[snip]
> > > + */
> > > +static void tcp_revert_seq(struct tcp_buf_seq_update *seq_update, int s, int e)
> > > +{
> > > +	struct tcp_tap_conn *conn;
> > > +	uint32_t lowest_seq;
> > > +	int i, ii;
> > > +
> > > +	for (i = s; i < e; i++) {
> > > +		conn = seq_update[i].conn;
> > > +		lowest_seq = seq_update[i].seq;
> > > +
> > > +		for (ii = i + 1; ii < e; ii++) {
> > > +			if (seq_update[ii].conn != conn)
> > > +				continue;
> > > +			if (SEQ_GT(lowest_seq, seq_update[ii].seq))
> > > +				lowest_seq = seq_update[ii].seq;
> > > +		}
> > If I recall correctly, David suggested a simpler approach that avoids
> > this O(n^2) scan, based on the observation that 1. the first entry you
> > find in the table also has the lowest sequence number (we don't send
> > frames out-of-order),
> Not so sure about that. We can be in the middle of retransmit.

We could, but I think that's ok.  If we hit this on retransmit frames,
it means they actually haven't been retransmitted yet because of the
failure, so we need to try again, just like any other frame we failed
to retransmit.  For example in a single epoll cycle:

1. We queue frames 2, 3 & 4 [queue is (2, 3, 4)]
2. We get a dup ack for frame 1, and start retransmit
3. We queue frames 1 & 2 for retransmit [queue is (2, 3, 4, 1, 2)],
   seq_to_tap is 3
4. We flush the queued frames, but there's a failure after the first
   two.  (2, 3) were transmitted, (4, 1, 2) failed.
5. We step through the failed frames
   5.1.  We see frame 4 failed, but seq_to_tap == 3 <= 4, so we ignore it
   5.2.  We see frame 1 failed, and seq_to_tap == 3 > 1 so we rewind
         to 1.  This is correct, because our retransmit failed and we
	 need to do it again.
   5.3.  We see frame 2 failed, but seq_to_tap == 1 <= 2 so ignore

The steps are slightly different, but I'm pretty sure this also does
the right thing if
  - In the retransmit we get further than we got with the initial
    transmits

  - We start a retransmit several times in a single queue batch (not
    sure if that's possible).  This could involve multiple rewinds
    during a single revert scan, but while that's arguably non-optimal
    it should be both rare and not really that expensive.    

  - The retransmit starts from a point after the earliest initial
    frame in the queue (how would the peer request a retransmit for
    something we never transmitted?  But maybe possible if the first
    transmit in this queue batch is itself a retransmit from an
    earlier cycle).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-05-13  1:32 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-09  3:00 [PATCH] tcp: move seq_to_tap update to when frame is queued Jon Maloy
2024-05-10 16:40 ` Stefano Brivio
2024-05-10 19:40   ` Jon Maloy
2024-05-13  1:32     ` David Gibson
2024-05-13  1:03   ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).