public inbox for passt-dev@passt.top
* [PATCH v7 0/3] Support for SO_PEEK_OFF
@ 2024-05-24 17:26 Jon Maloy
  2024-05-24 17:26 ` [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Jon Maloy @ 2024-05-24 17:26 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

v7: Only patch #3 updated.

Jon Maloy (3):
  tcp: move seq_to_tap update to when frame is queued
  tcp: leverage support of SO_PEEK_OFF socket option when available
  tcp: allow retransmit when peer receive window is zero

 tcp.c      | 164 +++++++++++++++++++++++++++++++++++++++--------------
 tcp_conn.h |   2 +
 2 files changed, 123 insertions(+), 43 deletions(-)

-- 
2.45.0



* [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued
  2024-05-24 17:26 [PATCH v7 0/3] Support for SO_PEEK_OFF Jon Maloy
@ 2024-05-24 17:26 ` Jon Maloy
  2024-05-31  1:42   ` David Gibson
  2024-05-24 17:26 ` [PATCH v7 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
  2024-05-24 17:26 ` [PATCH v7 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
  2 siblings, 1 reply; 6+ messages in thread
From: Jon Maloy @ 2024-05-24 17:26 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
delayed the update of conn->seq_to_tap until the moment the corresponding
frame has been successfully pushed out. This has the advantage that we
can immediately make a new attempt to transmit a frame after a failed
transmit, rather than waiting for the peer to discover a gap and
trigger the fast retransmit mechanism to solve the problem.

This approach has turned out to cause a problem with spurious sequence
number updates during peer-initiated retransmits, and we have realized
it may not be the best way to solve the above issue.

We now restore the previous method, updating said field at the
moment a frame is added to the outqueue. To retain the advantage of
a quick re-attempt based on local failure detection, we now scan
through the part of the outqueue that had to be dropped, and restore the
sequence counter for each affected connection to the most appropriate
value.
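
For reference, a minimal, self-contained sketch of the revert logic
(an illustration, not the patch itself: SEQ_LE() here stands in for
the wraparound-safe comparison macro used in the tree, and the values
are hypothetical). Frames are scanned in queueing order, so the guard
naturally keeps the lowest dropped sequence per connection:

	#include <stdint.h>
	#include <stdio.h>

	#define SEQ_LE(a, b) ((int32_t)((a) - (b)) <= 0)

	int main(void)
	{
		uint32_t seq_to_tap = 100;	/* counter after queueing */
		uint32_t dropped[] = { 40, 70 }; /* dropped frame sequences */
		int i;

		for (i = 0; i < 2; i++) {
			if (SEQ_LE(seq_to_tap, dropped[i]))
				continue;
			seq_to_tap = dropped[i];	/* revert */
		}
		printf("reverted seq_to_tap: %u\n", seq_to_tap); /* 40 */
		return 0;
	}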

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 61 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/tcp.c b/tcp.c
index 06acb41..146ab8f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -408,16 +408,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
- * @len:	TCP payload length
- */
-struct tcp_buf_seq_update {
-	uint32_t *seq;
-	uint16_t len;
-};
-
 /* Static buffers */
 /**
  * struct tcp_payload_t - TCP header and data to send segments with payload
@@ -459,7 +449,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
 static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -481,7 +472,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
 static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -1257,25 +1249,51 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
+			   int num_frames)
+{
+	int i;
+
+	for (i = 0; i < num_frames; i++) {
+		struct tcp_tap_conn *conn = conns[i];
+		struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used) {
+		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m],
+			       tcp6_payload_used - m);
+	}
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used) {
+		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m],
+			       tcp4_payload_used - m);
+	}
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2147,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq + dlen;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,8 +2161,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
-		tcp4_seq_update[tcp4_payload_used].len = dlen;
+		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
@@ -2151,8 +2169,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
-		tcp6_seq_update[tcp6_payload_used].len = dlen;
+		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-- 
2.45.0



* [PATCH v7 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-05-24 17:26 [PATCH v7 0/3] Support for SO_PEEK_OFF Jon Maloy
  2024-05-24 17:26 ` [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-24 17:26 ` Jon Maloy
  2024-05-31  1:54   ` David Gibson
  2024-05-24 17:26 ` [PATCH v7 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
  2 siblings, 1 reply; 6+ messages in thread
From: Jon Maloy @ 2024-05-24 17:26 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

From Linux 6.9.0 the kernel contains
commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").

This new feature makes it possible to call recvmsg(MSG_PEEK) and have
it start reading data from a given offset, set by the SO_PEEK_OFF socket
option. This way, we can avoid repeatedly reading already read bytes of
a received message, hence saving read cycles when forwarding TCP
messages in the host->namespace direction.

In this commit, we add functionality to leverage this feature when
available, while falling back to the previous behavior when it is not.

Measurements with iperf3 show that throughput increases by 15-20
percent in the host->namespace direction when this feature is used.
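
For reference, a minimal sketch of the peek-with-offset pattern this
patch builds on (illustrative names, not the passt code; the fallback
mirrors the pre-existing discard-buffer approach, and its return value
still counts the 'off' discarded bytes, which the caller must subtract):

	#include <sys/socket.h>
	#include <sys/types.h>
	#include <sys/uio.h>

	static ssize_t peek_at(int s, size_t off, void *buf, size_t len,
			       void *discard)
	{
		int o = off;
		struct iovec iov[2] = {
			{ .iov_base = discard, .iov_len = off },
			{ .iov_base = buf,     .iov_len = len },
		};
		struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };

		/* With SO_PEEK_OFF (TCP support from Linux 6.9), the
		 * kernel skips the first 'off' bytes for us */
		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &o, sizeof(o)))
			return recv(s, buf, len, MSG_PEEK);

		/* Otherwise, peek the already-read bytes into a scratch
		 * buffer and the new data into 'buf' in one call */
		return recvmsg(s, &mh, MSG_PEEK);
	}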

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/tcp.c b/tcp.c
index 146ab8f..01898f1 100644
--- a/tcp.c
+++ b/tcp.c
@@ -509,6 +509,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
+/* Does the kernel support SO_PEEK_OFF? */
+static bool peek_offset_cap;
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -524,6 +527,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s:          Socket to update
+ * @offset:     Offset in bytes
+ */
+static void tcp_set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1269,6 +1286,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
 			continue;
 
 		conn->seq_to_tap = seq;
+		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
 	}
 }
 
@@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t already_sent, seq;
 	struct iovec *iov;
 
+	/* How much have we read/sent since last received ack ? */
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
 	if (SEQ_LT(already_sent, 0)) {
 		/* RFC 761, section 2.1. */
 		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
+		tcp_set_peek_offset(s, 0);
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
@@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		iov_rem = (wnd_scaled - already_sent) % mss;
 	}
 
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
+	/* Prepare iov according to kernel capability */
+	if (!peek_offset_cap) {
+		mh_sock.msg_iov = iov_sock;
+		iov_sock[0].iov_base = tcp_buf_discard;
+		iov_sock[0].iov_len = already_sent;
+		mh_sock.msg_iovlen = fill_bufs + 1;
+	} else {
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen = fill_bufs;
+	}
 
 	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
 	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
@@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
+
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
+		tcp_set_peek_offset(conn->sock, 0);
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
+	tcp_set_peek_offset(conn->sock, 0);
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
+		tcp_set_peek_offset(conn->sock, 0);
 
 		if (th->fin) {
 			conn->seq_from_tap++;
@@ -2860,6 +2890,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			tcp_set_peek_offset(conn->sock, 0);
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
@@ -3151,7 +3182,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
  */
 int tcp_init(struct ctx *c)
 {
-	unsigned b;
+	unsigned int b, optv = 0;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3175,6 +3207,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			peek_offset_cap = true;
+		close(s);
+	}
+	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
 	return 0;
 }
 
-- 
2.45.0



* [PATCH v7 3/3] tcp: allow retransmit when peer receive window is zero
  2024-05-24 17:26 [PATCH v7 0/3] Support for SO_PEEK_OFF Jon Maloy
  2024-05-24 17:26 ` [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
  2024-05-24 17:26 ` [PATCH v7 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-05-24 17:26 ` Jon Maloy
  2 siblings, 0 replies; 6+ messages in thread
From: Jon Maloy @ 2024-05-24 17:26 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

A bug in kernel TCP may lead to a deadlock where a zero window is sent
from the peer, while it is unable to send out window updates even after
reads have freed up enough buffer space to permit a larger window.
In this situation, new window advertisements from the peer can only be
triggered by data packets arriving from this side.

However, such packets are never sent, because the zero-window condition
currently prevents this side from sending out any packets whatsoever
to the peer.

We notice that the above bug is triggered *only* after the peer has
dropped an arriving packet because of a severe memory squeeze, and that
we hence always enter a retransmission situation when this occurs. This
also means that the peer goes against the RFC 9293 recommendation that
a previously advertised window should never shrink.

RFC 9293 seems to permit sending up to the right edge of the last
advertised non-zero window in such cases, so that is what we do to
resolve this situation. However, we apply this mechanism only to
timer-induced retransmits; the fast-retransmit path is not affected by
this change.

It should be noted that although this solves the problem at hand, it is
a workaround, and not a genuine solution to the described kernel bug.
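
The core of the workaround is the window-edge bookkeeping; a pared-down
sketch of the idea (illustrative struct and names, not the full patch):

	#include <stdint.h>

	#define SEQ_GT(a, b) ((int32_t)((a) - (b)) > 0)

	struct conn_sketch {
		uint32_t seq_ack_from_tap;	/* last ACK from tap */
		uint32_t seq_wnd_edge_from_tap;	/* highest non-zero edge */
	};

	/* Grow the remembered right edge only on non-zero windows, so
	 * that a zero-window update leaves the previous edge available
	 * to timer-induced retransmits */
	static void window_update(struct conn_sketch *c, uint32_t wnd)
	{
		uint32_t edge = c->seq_ack_from_tap + wnd;

		if (wnd && SEQ_GT(edge, c->seq_wnd_edge_from_tap))
			c->seq_wnd_edge_from_tap = edge;
	}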

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c      | 44 +++++++++++++++++++++++++++++++-------------
 tcp_conn.h |  2 ++
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/tcp.c b/tcp.c
index 01898f1..76df04e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1760,9 +1760,17 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
  */
 static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
 {
+	uint32_t wnd_edge;
+
 	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
+
+	/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
 	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
 
+	wnd_edge = conn->seq_ack_from_tap + wnd;
+	if (wnd && SEQ_GT(wnd_edge, conn->seq_wnd_edge_from_tap))
+		conn->seq_wnd_edge_from_tap = wnd_edge;
+
 	/* FIXME: reflect the tap-side receiver's window back to the sock-side
 	 * sender by adjusting SO_RCVBUF? */
 }
@@ -1795,6 +1803,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
 	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
 
 	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
+	conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
 }
 
 /**
@@ -2201,15 +2210,16 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
  * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
  * @c:		Execution context
  * @conn:	Connection pointer
+ * @wnd_edge:	Right edge of window advertised from tap
  *
  * Return: negative on connection reset, 0 otherwise
  *
  * #syscalls recvmsg
  */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
+			      uint32_t wnd_edge)
 {
-	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
-	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
+	int max_send, fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
 	int sendlen, len, dlen, v4 = CONN_V4(conn);
 	int s = conn->sock, i, ret = 0;
 	struct msghdr mh_sock = { 0 };
@@ -2228,19 +2238,24 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		tcp_set_peek_offset(s, 0);
 	}
 
-	if (!wnd_scaled || already_sent >= wnd_scaled) {
+	/* How much can we read/send within current window ? */
+	max_send = wnd_edge - conn->seq_to_tap;
+	if (max_send <= 0) {
+		flow_trace(conn, "Window full: right edge: %u, sent: %u",
+			   wnd_edge, conn->seq_to_tap);
+		conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
 	}
 
 	/* Set up buffer descriptors we'll fill completely and partially. */
-	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
+	fill_bufs = DIV_ROUND_UP(max_send,  mss);
 	if (fill_bufs > TCP_FRAMES) {
 		fill_bufs = TCP_FRAMES;
 		iov_rem = 0;
 	} else {
-		iov_rem = (wnd_scaled - already_sent) % mss;
+		iov_rem = max_send % mss;
 	}
 
 	/* Prepare iov according to kernel capability */
@@ -2468,7 +2483,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
 		tcp_set_peek_offset(conn->sock, 0);
-		tcp_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn, conn->seq_wnd_edge_from_tap);
 	}
 
 	if (!iov_i)
@@ -2565,7 +2580,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
-	tcp_data_from_sock(c, conn);
+	tcp_data_from_sock(c, conn, conn->seq_wnd_edge_from_tap);
 	tcp_send_flag(c, conn, ACK);
 }
 
@@ -2658,7 +2673,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 
 		tcp_tap_window_update(conn, ntohs(th->window));
 
-		tcp_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn, conn->seq_wnd_edge_from_tap);
 
 		if (p->count - idx == 1)
 			return 1;
@@ -2891,7 +2906,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
 			tcp_set_peek_offset(conn->sock, 0);
-			tcp_data_from_sock(c, conn);
+			tcp_data_from_sock(c, conn,
+					   conn->seq_wnd_edge_from_tap);
 			tcp_timer_ctl(c, conn);
 		}
 	} else {
@@ -2945,9 +2961,11 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 		if (events & (EPOLLRDHUP | EPOLLHUP))
 			conn_event(c, conn, SOCK_FIN_RCVD);
 
-		if (events & EPOLLIN)
-			tcp_data_from_sock(c, conn);
-
+		if (events & EPOLLIN) {
+			tcp_data_from_sock(c, conn, conn->wnd_from_tap
+					   ? conn->seq_wnd_edge_from_tap
+					   : conn->seq_to_tap);
+		}
 		if (events & EPOLLOUT)
 			tcp_update_seqack_wnd(c, conn, 0, NULL);
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 5f8c8fb..16228d8 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -30,6 +30,7 @@
  * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
  * @seq_to_tap:		Next sequence for packets to tap
  * @seq_ack_from_tap:	Last ACK number received from tap
+ * @seq_wnd_edge_from_tap: Right edge of last non-zero window from tap
  * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
  * @seq_ack_to_tap:	Last ACK number sent to tap
  * @seq_init_from_tap:	Initial sequence number from tap
@@ -101,6 +102,7 @@ struct tcp_tap_conn {
 
 	uint32_t	seq_to_tap;
 	uint32_t	seq_ack_from_tap;
+	uint32_t	seq_wnd_edge_from_tap;
 	uint32_t	seq_from_tap;
 	uint32_t	seq_ack_to_tap;
 	uint32_t	seq_init_from_tap;
-- 
2.45.0



* Re: [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued
  2024-05-24 17:26 ` [PATCH v7 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-31  1:42   ` David Gibson
  0 siblings, 0 replies; 6+ messages in thread
From: David Gibson @ 2024-05-31  1:42 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

On Fri, May 24, 2024 at 01:26:54PM -0400, Jon Maloy wrote:
> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> delayed the update of conn->seq_to_tap until the moment the corresponding
> frame has been successfully pushed out. This has the advantage that we
> can immediately make a new attempt to transmit a frame after a failed
> transmit, rather than waiting for the peer to discover a gap and
> trigger the fast retransmit mechanism to solve the problem.
> 
> This approach has turned out to cause a problem with spurious sequence
> number updates during peer-initiated retransmits, and we have realized
> it may not be the best way to solve the above issue.
> 
> We now restore the previous method, updating said field at the
> moment a frame is added to the outqueue. To retain the advantage of
> a quick re-attempt based on local failure detection, we now scan
> through the part of the outqueue that had to be dropped, and restore the
> sequence counter for each affected connection to the most appropriate
> value.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>

This still has the issues I pointed out on the last revision...

[snip]
> +/**
> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> + * @conns:       Array of connection pointers corresponding to queued frames
> + * @frames:      Two-dimensional array containing queued frames with sub-iovs
> + * @num_frames:  Number of entries in the two arrays to be compared
> + */
> +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
> +			   int num_frames)
> +{
> +	int i;
> +
> +	for (i = 0; i < num_frames; i++) {
> +		struct tcp_tap_conn *conn = conns[i];
> +		struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
> +		uint32_t seq = ntohl(th->seq);
> +
> +		if (SEQ_LE(conn->seq_to_tap, seq))
> +			continue;
> +
> +		conn->seq_to_tap = seq;

...one trivial - this would be clearer without the continue - ...
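
That is, something like (untested):

	if (SEQ_GT(conn->seq_to_tap, seq))
		conn->seq_to_tap = seq;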

> +	}
> +}
> +
>  /**
>   * tcp_payload_flush() - Send out buffers for segments with data
>   * @c:		Execution context
>   */
>  static void tcp_payload_flush(const struct ctx *c)
>  {
> -	unsigned i;
>  	size_t m;
>  
>  	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>  			    tcp6_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	if (m != tcp6_payload_used) {
> +		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m],
> +			       tcp6_payload_used - m);

.. and one fatal - you're calling this with non-matching entries from
frame_conns[] and l2_iov[].
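
Concretely, I'd expect matching slices of both arrays, something like
(untested):

	tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m],
		       tcp6_payload_used - m);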

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson


* Re: [PATCH v7 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-05-24 17:26 ` [PATCH v7 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-05-31  1:54   ` David Gibson
  0 siblings, 0 replies; 6+ messages in thread
From: David Gibson @ 2024-05-31  1:54 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

On Fri, May 24, 2024 at 01:26:55PM -0400, Jon Maloy wrote:
> From Linux 6.9.0 the kernel contains
> commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").
> 
> This new feature makes it possible to call recvmsg(MSG_PEEK) and have
> it start reading data from a given offset, set by the SO_PEEK_OFF socket
> option. This way, we can avoid repeatedly reading already read bytes of
> a received message, hence saving read cycles when forwarding TCP
> messages in the host->namespace direction.
> 
> In this commit, we add functionality to leverage this feature when
> available, while falling back to the previous behavior when it is not.
> 
> Measurements with iperf3 show that throughput increases by 15-20
> percent in the host->namespace direction when this feature is used.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 51 insertions(+), 8 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 146ab8f..01898f1 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -509,6 +509,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  
> +/* Does the kernel support SO_PEEK_OFF? */
> +static bool peek_offset_cap;
> +
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> @@ -524,6 +527,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
> +/**
> + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
> + * @s:          Socket to update
> + * @offset:     Offset in bytes
> + */
> +static void tcp_set_peek_offset(int s, int offset)
> +{
> +	if (!peek_offset_cap)
> +		return;
> +
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
> +		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);

I feel like we need to reset the connection if we ever reach here.
This means that SO_PEEK_OFF is now out of sync and we apparently can't
fix it.  If we keep the connection alive, we will inevitably send
incorrect data across it, which seems pretty bad.

Or, maybe we think this is unlikely enough we could just die().
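
That would just be (sketch, if we go the die() route):

	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
		die("Failed to set SO_PEEK_OFF to %i in socket %i",
		    offset, s);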

Otherwise, LGTM.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
