On Sat, May 11, 2024 at 11:20:06AM -0400, Jon Maloy wrote: > commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames") > delayed update of conn->seq_to_tap until the moment the corresponding > frame has been successfully pushed out. This has the advantage that we > immediately can make a new attempt to transmit a frame after a failed > trasnmit, rather than waiting for the peer to later discover a gap and > trigger the fast retransmit mechanism to solve the problem. > > This approach has turned out to cause a problem with spurious sequence > number updates during peer-initiated retransmits, and we have realized > it may not be the best way to solve the above issue. > > We now restore the previous method, by updating the said field at the > moment a frame is added to the outqueue. To retain the advantage of > having a quick re-attempt based on local failure detection, we now scan > through the part of the outqueue that had do be dropped, and restore the > sequence counter for each affected connection to the most appropriate > value. > > Signed-off-by: Jon Maloy > > --- > v2: - Re-spun loop in tcp_revert_seq() and some other changes based on > feedback from Stefano Brivio. > - Added paranoid test to avoid that seq_to_tap becomes lower than > seq_ack_from_tap. > --- > tcp.c | 63 ++++++++++++++++++++++++++++++++++++++++++----------------- > 1 file changed, 45 insertions(+), 18 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 21d0af0..21cbfba 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -411,13 +411,14 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; > static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; > > /** > - * tcp_buf_seq_update - Sequences to update with length of frames once sent > - * @seq: Pointer to sequence number sent to tap-side, to be updated > - * @len: TCP payload length > + * tcp_frame_ref - References needed by queued frames in case we need > + * to revert corresponding connection sequence numbers > + * @conn: Pointer to connection for this frame > + * @seq: Sequence number of the corresponding frame > */ > -struct tcp_buf_seq_update { > - uint32_t *seq; > - uint16_t len; > +struct tcp_frame_ref { > + struct tcp_tap_conn *conn; > + uint32_t seq; As noted in another mail, I think we could get the sequence number from the actual frame buffer at revert time. That could be a follow up improvement, though. > }; > > /* Static buffers */ > @@ -461,7 +462,7 @@ static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; > > static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); > > -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; > +static struct tcp_frame_ref tcp4_frame_ref[TCP_FRAMES_MEM]; > static unsigned int tcp4_payload_used; > > static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; > @@ -483,7 +484,7 @@ static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; > > static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); > > -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; > +static struct tcp_frame_ref tcp6_frame_ref[TCP_FRAMES_MEM]; > static unsigned int tcp6_payload_used; > > static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; > @@ -1261,25 +1262,50 @@ static void tcp_flags_flush(const struct ctx *c) > tcp4_flags_used = 0; > } > > +/** > + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission > + * @frames_ref: Array with connection and sequence number data > + * @first: Index of entry corresponding to first dropped frame > + * @last: Index of entry corresponding to last dropped frame > + */ > +static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) > +{ > + struct tcp_tap_conn *conn; > + int i; > + > + for (i = first; i <= last; i++) { > + conn = frame_ref[i].conn; 'conn' could be local to the loop body, and I think one of our static checkers is going to complain that it's not. > + > + if (SEQ_LE(conn->seq_to_tap, frame_ref[i].seq)) > + continue; > + > + conn->seq_to_tap = frame_ref[i].seq; > + > + if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) > + continue; > + > + conn->seq_to_tap = conn->seq_ack_from_tap; > + } > +} > + > /** > * tcp_payload_flush() - Send out buffers for segments with data > * @c: Execution context > */ > static void tcp_payload_flush(const struct ctx *c) > { > - unsigned i; > size_t m; > > m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, > tcp6_payload_used); > - for (i = 0; i < m; i++) > - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; > + if (m != tcp6_payload_used) > + tcp_revert_seq(tcp6_frame_ref, m, tcp6_payload_used - 1); > tcp6_payload_used = 0; > > m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, > tcp4_payload_used); > - for (i = 0; i < m; i++) > - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; > + if (m != tcp4_payload_used) > + tcp_revert_seq(tcp4_frame_ref, m, tcp4_payload_used - 1); > tcp4_payload_used = 0; > } > > @@ -2129,10 +2155,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) > static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > ssize_t dlen, int no_csum, uint32_t seq) > { > - uint32_t *seq_update = &conn->seq_to_tap; > struct iovec *iov; > size_t l4len; > > + conn->seq_to_tap = seq + dlen; > + > if (CONN_V4(conn)) { > struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; > const uint16_t *check = NULL; > @@ -2142,8 +2169,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > check = &iph->check; > } > > - tcp4_seq_update[tcp4_payload_used].seq = seq_update; > - tcp4_seq_update[tcp4_payload_used].len = dlen; > + tcp4_frame_ref[tcp4_payload_used].conn = conn; > + tcp4_frame_ref[tcp4_payload_used].seq = seq; > > iov = tcp4_l2_iov[tcp4_payload_used++]; > l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); > @@ -2151,8 +2178,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > if (tcp4_payload_used > TCP_FRAMES_MEM - 1) > tcp_payload_flush(c); > } else if (CONN_V6(conn)) { > - tcp6_seq_update[tcp6_payload_used].seq = seq_update; > - tcp6_seq_update[tcp6_payload_used].len = dlen; > + tcp6_frame_ref[tcp6_payload_used].conn = conn; > + tcp6_frame_ref[tcp6_payload_used].seq = seq; > > iov = tcp6_l2_iov[tcp6_payload_used++]; > l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq); -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson