public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v6 0/3] Support for SO_PEEK_OFF socket option
@ 2024-05-17 15:24 Jon Maloy
  2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Jon Maloy @ 2024-05-17 15:24 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

Only patch #3 updated from v5.

Jon Maloy (3):
  tcp: move seq_to_tap update to when frame is queued
  tcp: leverage support of SO_PEEK_OFF socket option when available
  tcp: allow retransmit when peer receive window is zero

 tcp.c      | 146 ++++++++++++++++++++++++++++++++++++++++-------------
 tcp_conn.h |   2 +
 2 files changed, 112 insertions(+), 36 deletions(-)

-- 
2.42.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued
  2024-05-17 15:24 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
@ 2024-05-17 15:24 ` Jon Maloy
  2024-05-20  7:46   ` David Gibson
  2024-05-17 15:24 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
  2024-05-17 15:24 ` [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
  2 siblings, 1 reply; 9+ messages in thread
From: Jon Maloy @ 2024-05-17 15:24 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
delayed update of conn->seq_to_tap until the moment the corresponding
frame has been successfully pushed out. This has the advantage that we
immediately can make a new attempt to transmit a frame after a failed
transmit, rather than waiting for the peer to later discover a gap and
trigger the fast retransmit mechanism to solve the problem.

This approach has turned out to cause a problem with spurious sequence
number updates during peer-initiated retransmits, and we have realized
it may not be the best way to solve the above issue.

We now restore the previous method, by updating the said field at the
moment a frame is added to the outqueue. To retain the advantage of
having a quick re-attempt based on local failure detection, we now scan
through the part of the outqueue that had to be dropped, and restore the
sequence counter for each affected connection to the most appropriate
value.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
      feedback from Stefano Brivio.
    - Added paranoid test to avoid that seq_to_tap becomes lower than
      seq_ack_from_tap.

v3: - Identical to v2. Called v3 because it was embedded in a series
      with that version.

v4: - In tcp_revert_seq(), we read the sequence number from the TCP
      header instead of keeping a copy in struct tcp_buf_seq_update.
    - Since the only remaining field in struct tcp_buf_seq_update is
      a pointer to struct tcp_tap_conn, we eliminate the struct
      altogether, and make the tcp6/tcp3_buf_seq_update arrays into
      arrays of said pointer.
    - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
      is not fatal, and will be caught by other code anyway.
    - Separated from the series again.

v5: - A couple of style issues.
---
 tcp.c | 61 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/tcp.c b/tcp.c
index 21d0af0..3a2350a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
- * @len:	TCP payload length
- */
-struct tcp_buf_seq_update {
-	uint32_t *seq;
-	uint16_t len;
-};
-
 /* Static buffers */
 /**
  * struct tcp_payload_t - TCP header and data to send segments with payload
@@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
 static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
 static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -1261,25 +1253,51 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
+			   int num_frames)
+{
+	int i;
+
+	for (i = 0; i < num_frames; i++) {
+		struct tcp_tap_conn *conn = conns[i];
+		struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used) {
+		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m],
+			       tcp6_payload_used - m);
+	}
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used) {
+		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m],
+			       tcp4_payload_used - m);
+	}
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2147,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq + dlen;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,8 +2161,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
-		tcp4_seq_update[tcp4_payload_used].len = dlen;
+		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
@@ -2151,8 +2169,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
-		tcp6_seq_update[tcp6_payload_used].len = dlen;
+		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-- 
@@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq:	Pointer to sequence number sent to tap-side, to be updated
- * @len:	TCP payload length
- */
-struct tcp_buf_seq_update {
-	uint32_t *seq;
-	uint16_t len;
-};
-
 /* Static buffers */
 /**
  * struct tcp_payload_t - TCP header and data to send segments with payload
@@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp4_payload_used;
 
 static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
 
 static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
 
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+/* References tracking the owner connection of frames in the tap outqueue */
+static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
 static unsigned int tcp6_payload_used;
 
 static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
@@ -1261,25 +1253,51 @@ static void tcp_flags_flush(const struct ctx *c)
 	tcp4_flags_used = 0;
 }
 
+/**
+ * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
+ * @conns:       Array of connection pointers corresponding to queued frames
+ * @frames:      Two-dimensional array containing queued frames with sub-iovs
+ * @num_frames:  Number of entries in the two arrays to be compared
+ */
+static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
+			   int num_frames)
+{
+	int i;
+
+	for (i = 0; i < num_frames; i++) {
+		struct tcp_tap_conn *conn = conns[i];
+		struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
+		uint32_t seq = ntohl(th->seq);
+
+		if (SEQ_LE(conn->seq_to_tap, seq))
+			continue;
+
+		conn->seq_to_tap = seq;
+	}
+}
+
 /**
  * tcp_payload_flush() - Send out buffers for segments with data
  * @c:		Execution context
  */
 static void tcp_payload_flush(const struct ctx *c)
 {
-	unsigned i;
 	size_t m;
 
 	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp6_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+	if (m != tcp6_payload_used) {
+		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m],
+			       tcp6_payload_used - m);
+	}
 	tcp6_payload_used = 0;
 
 	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
 			    tcp4_payload_used);
-	for (i = 0; i < m; i++)
-		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+	if (m != tcp4_payload_used) {
+		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m],
+			       tcp4_payload_used - m);
+	}
 	tcp4_payload_used = 0;
 }
 
@@ -2129,10 +2147,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			    ssize_t dlen, int no_csum, uint32_t seq)
 {
-	uint32_t *seq_update = &conn->seq_to_tap;
 	struct iovec *iov;
 	size_t l4len;
 
+	conn->seq_to_tap = seq + dlen;
+
 	if (CONN_V4(conn)) {
 		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
 		const uint16_t *check = NULL;
@@ -2142,8 +2161,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			check = &iph->check;
 		}
 
-		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
-		tcp4_seq_update[tcp4_payload_used].len = dlen;
+		tcp4_frame_conns[tcp4_payload_used] = conn;
 
 		iov = tcp4_l2_iov[tcp4_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
@@ -2151,8 +2169,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
 			tcp_payload_flush(c);
 	} else if (CONN_V6(conn)) {
-		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
-		tcp6_seq_update[tcp6_payload_used].len = dlen;
+		tcp6_frame_conns[tcp6_payload_used] = conn;
 
 		iov = tcp6_l2_iov[tcp6_payload_used++];
 		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-05-17 15:24 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
  2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-17 15:24 ` Jon Maloy
  2024-05-20  8:07   ` David Gibson
  2024-05-17 15:24 ` [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
  2 siblings, 1 reply; 9+ messages in thread
From: Jon Maloy @ 2024-05-17 15:24 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

From linux-6.9.0 the kernel will contain
commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").

This new feature makes it possible to call recvmsg(MSG_PEEK) and make
it start reading data from a given offset set by the SO_PEEK_OFF socket
option. This way, we can avoid repeated reading of already read bytes of
a received message, hence saving read cycles when forwarding TCP
messages in the host->name space direction.

In this commit, we add functionality to leverage this feature when
available, while we fall back to the previous behavior when not.

Measurements with iperf3 shows that throughput increases with 15-20
percent in the host->namespace direction when this feature is used.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio.
    - Moved initial set_peek_offset(0) to only the locations where the socket is set
      to ESTABLISHED.
    - Removed the per-packet synchronization between sk_peek_off and
      already_sent. Instead only doing it in retransmit situations.
    - The problem I found when troubleshooting the occasionally occurring
      out-of-sync values between 'already_sent' and 'sk_peek_offset' may
      have deeper implications that we may need to investigate.

v3: - Rebased to most recent version of tcp.c, plus the previous
      patch in this series.
    - Some changes based on feedback from PASST team

v4: - Some small changes based on feedback from Stefan/David.

v5: - Re-added accidentally dropped set_peek_offset() line.
      Thank you, David.
---
 tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3a2350a..fa13292 100644
--- a/tcp.c
+++ b/tcp.c
@@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
+/* Does the kernel support SO_PEEK_OFF? */
+static bool peek_offset_cap;
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s:          Socket to update
+ * @offset:     Offset in bytes
+ */
+static void tcp_set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
 			continue;
 
 		conn->seq_to_tap = seq;
+		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
 	}
 }
 
@@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t already_sent, seq;
 	struct iovec *iov;
 
+	/* How much have we read/sent since last received ack ? */
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
 	if (SEQ_LT(already_sent, 0)) {
 		/* RFC 761, section 2.1. */
 		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
+		tcp_set_peek_offset(s, 0);
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
@@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		iov_rem = (wnd_scaled - already_sent) % mss;
 	}
 
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
+	/* Prepare iov according to kernel capability */
+	if (!peek_offset_cap) {
+		mh_sock.msg_iov = iov_sock;
+		iov_sock[0].iov_base = tcp_buf_discard;
+		iov_sock[0].iov_len = already_sent;
+		mh_sock.msg_iovlen = fill_bufs + 1;
+	} else {
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen = fill_bufs;
+	}
 
 	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
 	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
@@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
+
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
+		tcp_set_peek_offset(conn->sock, 0);
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
+	tcp_set_peek_offset(conn->sock, 0);
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
+		tcp_set_peek_offset(conn->sock, 0);
 
 		if (th->fin) {
 			conn->seq_from_tap++;
@@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			tcp_set_peek_offset(conn->sock, 0);
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
@@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
  */
 int tcp_init(struct ctx *c)
 {
-	unsigned b;
+	unsigned int b, optv = 0;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			peek_offset_cap = true;
+		close(s);
+	}
+	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
 	return 0;
 }
 
-- 
@@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
+/* Does the kernel support SO_PEEK_OFF? */
+static bool peek_offset_cap;
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s:          Socket to update
+ * @offset:     Offset in bytes
+ */
+static void tcp_set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
 			continue;
 
 		conn->seq_to_tap = seq;
+		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
 	}
 }
 
@@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t already_sent, seq;
 	struct iovec *iov;
 
+	/* How much have we read/sent since last received ack ? */
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
 	if (SEQ_LT(already_sent, 0)) {
 		/* RFC 761, section 2.1. */
 		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
+		tcp_set_peek_offset(s, 0);
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
@@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		iov_rem = (wnd_scaled - already_sent) % mss;
 	}
 
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
+	/* Prepare iov according to kernel capability */
+	if (!peek_offset_cap) {
+		mh_sock.msg_iov = iov_sock;
+		iov_sock[0].iov_base = tcp_buf_discard;
+		iov_sock[0].iov_len = already_sent;
+		mh_sock.msg_iovlen = fill_bufs + 1;
+	} else {
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen = fill_bufs;
+	}
 
 	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
 	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
@@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
+
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
+		tcp_set_peek_offset(conn->sock, 0);
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
+	tcp_set_peek_offset(conn->sock, 0);
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
+		tcp_set_peek_offset(conn->sock, 0);
 
 		if (th->fin) {
 			conn->seq_from_tap++;
@@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			tcp_set_peek_offset(conn->sock, 0);
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
@@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
  */
 int tcp_init(struct ctx *c)
 {
-	unsigned b;
+	unsigned int b, optv = 0;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			peek_offset_cap = true;
+		close(s);
+	}
+	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
 	return 0;
 }
 
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero
  2024-05-17 15:24 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
  2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
  2024-05-17 15:24 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-05-17 15:24 ` Jon Maloy
  2024-05-21  5:51   ` David Gibson
  2 siblings, 1 reply; 9+ messages in thread
From: Jon Maloy @ 2024-05-17 15:24 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

A bug in kernel TCP may lead to a deadlock where a zero window is sent
from the peer, while it is unable to send out window updates even after
reads have freed up enough buffer space to permit a larger window.
In this situation, new window advertisements from the peer can only be
triggered by packets arriving from this side.

However, such packets are never sent, because the zero-window condition
currently prevents this side from sending out any packets whatsoever
to the peer.

We notice that the above bug is triggered *only* after the peer has
dropped an arriving packet because of severe memory squeeze, and that we
hence always enter a retransmission situation when this occurs. This
also means that it goes against the RFC 9293 recommendation that a
previously advertised window never should shrink.

RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find
the following statement:
"A TCP receiver SHOULD NOT shrink the window, i.e., move the right
window edge to the left (SHLD-14). However, a sending TCP peer MUST
be robust against window shrinking, which may cause the
"usable window" (see Section 3.8.6.2.1) to become negative (MUST-34).

If this happens, the sender SHOULD NOT send new data (SHLD-15), but
SHOULD retransmit normally the old unacknowledged data between SND.UNA
and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data
beyond SND.UNA+SND.WND (MAY-7)"

We never see the window become negative, but we interpret this as a
recommendation to use the previously available window during
retransmission even when the currently advertised window is zero.

We use the above mechanism only for timer-induced retransmits, while
the fast-retransmit mechanism won't trigger on this condition.

It should be noted that although this solves the problem we have at
hand, it is not a genuine solution to the kernel bug. There may well
be TCP stacks around in other OS-es which don't do this, nor have
keep-alive probing as an alternative way to solve the situation.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: - Using previously advertised window during retransmission, instead
      of the highest send sequence number in the cycle.
v3: - Rebased to newest code
    - Changes based on feedback from PASST team
    - Sending out empty probe message at timer expiration when
      we are not in retransmit situation.
v4: - Some small changes based on feedback from PASST team.
    - Replaced fast retransmit with a one-time 'fast probe' when
      window is zero.
v5: - Gave up on 'fast probing' for now. When I got the sequence
      numbers right in the flag message (after having emptied the tap
      queue), it turns out an empty message does *not* force a new peer
      window update as was my previous understanding of the code.
    - Added cppcheck suppression line (which I was unable to verify)
      as suggested by S. Brivio.
    - Removed sending of empty probe when window from tap side is zero.
      It looks pointless at the moment, at least for solving the above
      described situation.
v6: - Ensure that arrival of new data doesn't cause us to ignore a
      zero-window situation.
    - Removed the pointless probing referred to in v5 comment.
---
 tcp.c      | 26 ++++++++++++++++++++------
 tcp_conn.h |  2 ++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/tcp.c b/tcp.c
index fa13292..38c3480 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1764,9 +1764,17 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
  */
 static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
 {
+	uint32_t wnd_edge;
+
 	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
+	/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
+
 	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
 
+	wnd_edge = conn->seq_ack_from_tap + wnd;
+	if (wnd && SEQ_GT(wnd_edge, conn->seq_wnd_edge_from_tap))
+		conn->seq_wnd_edge_from_tap = wnd_edge;
+
 	/* FIXME: reflect the tap-side receiver's window back to the sock-side
 	 * sender by adjusting SO_RCVBUF? */
 }
@@ -1799,6 +1807,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
 	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
 
 	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
+	conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
 }
 
 /**
@@ -2208,13 +2217,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
  */
 static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 {
-	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
 	int sendlen, len, dlen, v4 = CONN_V4(conn);
+	uint32_t already_sent, max_send, seq;
 	int s = conn->sock, i, ret = 0;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
-	uint32_t already_sent, seq;
 	struct iovec *iov;
 
 	/* How much have we read/sent since last received ack ? */
@@ -2228,19 +2236,24 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		tcp_set_peek_offset(s, 0);
 	}
 
-	if (!wnd_scaled || already_sent >= wnd_scaled) {
+	/* How much are we still allowed to send within current window ? */
+	max_send = conn->seq_wnd_edge_from_tap - conn->seq_to_tap;
+	if (SEQ_LE(max_send, 0)) {
+		flow_trace(conn, "Window full: right edge: %u, sent: %u",
+			   conn->seq_wnd_edge_from_tap, conn->seq_to_tap);
+		conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
 		conn_flag(c, conn, STALLED);
 		conn_flag(c, conn, ACK_FROM_TAP_DUE);
 		return 0;
 	}
 
 	/* Set up buffer descriptors we'll fill completely and partially. */
-	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
+	fill_bufs = DIV_ROUND_UP(max_send,  mss);
 	if (fill_bufs > TCP_FRAMES) {
 		fill_bufs = TCP_FRAMES;
 		iov_rem = 0;
 	} else {
-		iov_rem = (wnd_scaled - already_sent) % mss;
+		iov_rem = max_send % mss;
 	}
 
 	/* Prepare iov according to kernel capability */
@@ -2347,6 +2360,7 @@ err:
  *
  * Return: count of consumed packets
  */
+
 static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			      const struct pool *p, int idx)
 {
@@ -2950,7 +2964,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 		if (events & (EPOLLRDHUP | EPOLLHUP))
 			conn_event(c, conn, SOCK_FIN_RCVD);
 
-		if (events & EPOLLIN)
+		if (events & EPOLLIN && conn->wnd_from_tap)
 			tcp_data_from_sock(c, conn);
 
 		if (events & EPOLLOUT)
diff --git a/tcp_conn.h b/tcp_conn.h
index d280b22..5cbad2a 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -30,6 +30,7 @@
  * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
  * @seq_to_tap:		Next sequence for packets to tap
  * @seq_ack_from_tap:	Last ACK number received from tap
+ * @seq_wnd_edge_from_tap: Right edge of last non-zero window from tap
  * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
  * @seq_ack_to_tap:	Last ACK number sent to tap
  * @seq_init_from_tap:	Initial sequence number from tap
@@ -101,6 +102,7 @@ struct tcp_tap_conn {
 
 	uint32_t	seq_to_tap;
 	uint32_t	seq_ack_from_tap;
+	uint32_t	seq_wnd_edge_from_tap;
 	uint32_t	seq_from_tap;
 	uint32_t	seq_ack_to_tap;
 	uint32_t	seq_init_from_tap;
-- 
@@ -30,6 +30,7 @@
  * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
  * @seq_to_tap:		Next sequence for packets to tap
  * @seq_ack_from_tap:	Last ACK number received from tap
+ * @seq_wnd_edge_from_tap: Right edge of last non-zero window from tap
  * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
  * @seq_ack_to_tap:	Last ACK number sent to tap
  * @seq_init_from_tap:	Initial sequence number from tap
@@ -101,6 +102,7 @@ struct tcp_tap_conn {
 
 	uint32_t	seq_to_tap;
 	uint32_t	seq_ack_from_tap;
+	uint32_t	seq_wnd_edge_from_tap;
 	uint32_t	seq_from_tap;
 	uint32_t	seq_ack_to_tap;
 	uint32_t	seq_init_from_tap;
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued
  2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
@ 2024-05-20  7:46   ` David Gibson
  0 siblings, 0 replies; 9+ messages in thread
From: David Gibson @ 2024-05-20  7:46 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 8155 bytes --]

On Fri, May 17, 2024 at 11:24:12AM -0400, Jon Maloy wrote:
> commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames")
> delayed update of conn->seq_to_tap until the moment the corresponding
> frame has been successfully pushed out. This has the advantage that we
> immediately can make a new attempt to transmit a frame after a failed
> transmit, rather than waiting for the peer to later discover a gap and
> trigger the fast retransmit mechanism to solve the problem.
> 
> This approach has turned out to cause a problem with spurious sequence
> number updates during peer-initiated retransmits, and we have realized
> it may not be the best way to solve the above issue.
> 
> We now restore the previous method, by updating the said field at the
> moment a frame is added to the outqueue. To retain the advantage of
> having a quick re-attempt based on local failure detection, we now scan
> through the part of the outqueue that had to be dropped, and restore the
> sequence counter for each affected connection to the most appropriate
> value.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> 
> ---
> v2: - Re-spun loop in tcp_revert_seq() and some other changes based on
>       feedback from Stefano Brivio.
>     - Added paranoid test to avoid that seq_to_tap becomes lower than
>       seq_ack_from_tap.
> 
> v3: - Identical to v2. Called v3 because it was embedded in a series
>       with that version.
> 
> v4: - In tcp_revert_seq(), we read the sequence number from the TCP
>       header instead of keeping a copy in struct tcp_buf_seq_update.
>     - Since the only remaining field in struct tcp_buf_seq_update is
>       a pointer to struct tcp_tap_conn, we eliminate the struct
>       altogether, and make the tcp6/tcp3_buf_seq_update arrays into
>       arrays of said pointer.
>     - Removed 'paranoid' test in tcp_revert_seq. If it happens, it
>       is not fatal, and will be caught by other code anyway.
>     - Separated from the series again.
> 
> v5: - A couple of style issues.
> ---
>  tcp.c | 61 ++++++++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 39 insertions(+), 22 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 21d0af0..3a2350a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -410,16 +410,6 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
>   */
>  static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>  
> -/**
> - * tcp_buf_seq_update - Sequences to update with length of frames once sent
> - * @seq:	Pointer to sequence number sent to tap-side, to be updated
> - * @len:	TCP payload length
> - */
> -struct tcp_buf_seq_update {
> -	uint32_t *seq;
> -	uint16_t len;
> -};
> -
>  /* Static buffers */
>  /**
>   * struct tcp_payload_t - TCP header and data to send segments with payload
> @@ -461,7 +451,8 @@ static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];
>  
>  static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
>  
> -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
>  static unsigned int tcp4_payload_used;
>  
>  static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> @@ -483,7 +474,8 @@ static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];
>  
>  static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
>  
> -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
>  static unsigned int tcp6_payload_used;
>  
>  static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> @@ -1261,25 +1253,51 @@ static void tcp_flags_flush(const struct ctx *c)
>  	tcp4_flags_used = 0;
>  }
>  
> +/**
> + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> + * @conns:       Array of connection pointers corresponding to queued frames
> + * @frames:      Two-dimensional array containing queued frames with sub-iovs
> + * @num_frames:  Number of entries in the two arrays to be compared
> + */
> +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
> +			   int num_frames)
> +{
> +	int i;
> +
> +	for (i = 0; i < num_frames; i++) {
> +		struct tcp_tap_conn *conn = conns[i];
> +		struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
> +		uint32_t seq = ntohl(th->seq);
> +
> +		if (SEQ_LE(conn->seq_to_tap, seq))
> +			continue;
> +
> +		conn->seq_to_tap = seq;

Not worth a respin, but given the other simplifications to this, it
would be clearer to have:
	if (!SEQ_LE(conn->seq_to_tap, seq))
		conn->seq_to_tap = seq;

Rather than using continue;


> +	}
> +}
> +
>  /**
>   * tcp_payload_flush() - Send out buffers for segments with data
>   * @c:		Execution context
>   */
>  static void tcp_payload_flush(const struct ctx *c)
>  {
> -	unsigned i;
>  	size_t m;
>  
>  	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
>  			    tcp6_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
> +	if (m != tcp6_payload_used) {
> +		tcp_revert_seq(tcp6_frame_conns, &tcp6_l2_iov[m],
> +			       tcp6_payload_used - m);

Hrm.. AFAICT tcp_revert_seq() is using the same indices into conns[]
and frames[].  But here, aren't you passing the frames array from
entry m onwards, but the conns array from 0 onwards?  Meaning that
revert_seq() might use the wrong connections for each frame.  I think
you either need
	tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m], ...)
Or else pass the unindexed arrays here, and take the start index as a
new parameter to tcp_revert_seq().

> +	}
>  	tcp6_payload_used = 0;
>  
>  	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
>  			    tcp4_payload_used);
> -	for (i = 0; i < m; i++)
> -		*tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
> +	if (m != tcp4_payload_used) {
> +		tcp_revert_seq(tcp4_frame_conns, &tcp4_l2_iov[m],
> +			       tcp4_payload_used - m);

Same thing here, of course.

> +	}
>  	tcp4_payload_used = 0;
>  }
>  
> @@ -2129,10 +2147,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>  static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  			    ssize_t dlen, int no_csum, uint32_t seq)
>  {
> -	uint32_t *seq_update = &conn->seq_to_tap;
>  	struct iovec *iov;
>  	size_t l4len;
>  
> +	conn->seq_to_tap = seq + dlen;

Now that we update seq_to_tap here, we don't really need seq as a
parameter any more, which would also simplify logic in the caller slightly.

>  	if (CONN_V4(conn)) {
>  		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
>  		const uint16_t *check = NULL;
> @@ -2142,8 +2161,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  			check = &iph->check;
>  		}
>  
> -		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
> -		tcp4_seq_update[tcp4_payload_used].len = dlen;
> +		tcp4_frame_conns[tcp4_payload_used] = conn;
>  
>  		iov = tcp4_l2_iov[tcp4_payload_used++];
>  		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
> @@ -2151,8 +2169,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
>  			tcp_payload_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
> -		tcp6_seq_update[tcp6_payload_used].len = dlen;
> +		tcp6_frame_conns[tcp6_payload_used] = conn;
>  
>  		iov = tcp6_l2_iov[tcp6_payload_used++];
>  		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-05-17 15:24 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-05-20  8:07   ` David Gibson
  0 siblings, 0 replies; 9+ messages in thread
From: David Gibson @ 2024-05-20  8:07 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 7472 bytes --]

On Fri, May 17, 2024 at 11:24:13AM -0400, Jon Maloy wrote:
> >From linux-6.9.0 the kernel will contain
> commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").
> 
> This new feature makes it possible to call recv_msg(MSG_PEEK) and make
> it start reading data from a given offset set by the SO_PEEK_OFF socket
> option. This way, we can avoid repeated reading of already read bytes of
> a received message, hence saving read cycles when forwarding TCP
> messages in the host->namespace direction.
> 
> In this commit, we add functionality to leverage this feature when
> available, while we fall back to the previous behavior when not.
> 
> Measurements with iperf3 shows that throughput increases with 15-20
> percent in the host->namespace direction when this feature is used.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> 
> ---
> v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio.
>     - Moved initial set_peek_offset(0) to only the locations where the socket is set
>       to ESTABLISHED.
>     - Removed the per-packet synchronization between sk_peek_off and
>       already_sent. Instead only doing it in retransmit situations.
>     - The problem I found when trouble shooting the occasionally occurring
>       out of synch values between 'already_sent' and 'sk_peek_offset' may
>       have deeper implications that we may need to be investigate.
> 
> v3: - Rebased to most recent version of tcp.c, plus the previous
>       patch in this series.
>     - Some changes based on feedback from PASST team
> 
> v4: - Some small changes based on feedback from Stefan/David.
> 
> v5: - Re-added accidentally dropped set_peek_offset() line.
>       Thank you, David.
> ---
>  tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 51 insertions(+), 8 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 3a2350a..fa13292 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  
> +/* Does the kernel support TCP_PEEK_OFF? */
> +static bool peek_offset_cap;
> +
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> @@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
> +/**
> + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
> + * @s:          Socket to update
> + * @offset:     Offset in bytes
> + */
> +static void tcp_set_peek_offset(int s, int offset)
> +{
> +	if (!peek_offset_cap)
> +		return;
> +
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
> +		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
> +}
> +
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
>   * @events:	Current connection events
> @@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
>  			continue;
>  
>  		conn->seq_to_tap = seq;
> +		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
>  	}
>  }
>  
> @@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	uint32_t already_sent, seq;
>  	struct iovec *iov;
>  
> +	/* How much have we read/sent since last received ack ? */
>  	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> -
>  	if (SEQ_LT(already_sent, 0)) {
>  		/* RFC 761, section 2.1. */
>  		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
>  			   conn->seq_ack_from_tap, conn->seq_to_tap);
>  		conn->seq_to_tap = conn->seq_ack_from_tap;
>  		already_sent = 0;
> +		tcp_set_peek_offset(s, 0);
>  	}
>  
>  	if (!wnd_scaled || already_sent >= wnd_scaled) {
> @@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		iov_rem = (wnd_scaled - already_sent) % mss;
>  	}
>  
> -	mh_sock.msg_iov = iov_sock;
> -	mh_sock.msg_iovlen = fill_bufs + 1;
> -
> -	iov_sock[0].iov_base = tcp_buf_discard;
> -	iov_sock[0].iov_len = already_sent;
> +	/* Prepare iov according to kernel capability */
> +	if (!peek_offset_cap) {
> +		mh_sock.msg_iov = iov_sock;
> +		iov_sock[0].iov_base = tcp_buf_discard;
> +		iov_sock[0].iov_len = already_sent;
> +		mh_sock.msg_iovlen = fill_bufs + 1;
> +	} else {
> +		mh_sock.msg_iov = &iov_sock[1];
> +		mh_sock.msg_iovlen = fill_bufs;
> +	}
>  
>  	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
>  	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> @@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>  
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
> +
>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			   "fast re-transmit, ACK: %u, previous sequence: %u",
>  			   max_ack_seq, conn->seq_to_tap);
>  		conn->seq_to_tap = max_ack_seq;
> +		tcp_set_peek_offset(conn->sock, 0);
>  		tcp_data_from_sock(c, conn);
>  	}
>  
> @@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	conn->seq_ack_to_tap = conn->seq_from_tap;
>  
>  	conn_event(c, conn, ESTABLISHED);
> +	tcp_set_peek_offset(conn->sock, 0);
>  
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
> @@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
>  			goto reset;
>  
>  		conn_event(c, conn, ESTABLISHED);
> +		tcp_set_peek_offset(conn->sock, 0);
>  
>  		if (th->fin) {
>  			conn->seq_from_tap++;
> @@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			flow_dbg(conn, "ACK timeout, retry");
>  			conn->retrans++;
>  			conn->seq_to_tap = conn->seq_ack_from_tap;
> +			tcp_set_peek_offset(conn->sock, 0);
>  			tcp_data_from_sock(c, conn);
>  			tcp_timer_ctl(c, conn);
>  		}
> @@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
>   */
>  int tcp_init(struct ctx *c)
>  {
> -	unsigned b;
> +	unsigned int b, optv = 0;
> +	int s;
>  
>  	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
>  		tc_hash[b] = FLOW_SIDX_NONE;
> @@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>  
> +	/* Probe for SO_PEEK_OFF support */
> +	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
> +	if (s < 0) {
> +		warn("Temporary TCP socket creation failed");
> +	} else {
> +		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
> +			peek_offset_cap = true;
> +		close(s);
> +	}
> +	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
> +
>  	return 0;
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero
  2024-05-17 15:24 ` [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
@ 2024-05-21  5:51   ` David Gibson
  2024-05-21 22:25     ` Jon Maloy
  0 siblings, 1 reply; 9+ messages in thread
From: David Gibson @ 2024-05-21  5:51 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 11201 bytes --]

On Fri, May 17, 2024 at 11:24:14AM -0400, Jon Maloy wrote:
> A bug in kernel TCP may lead to a deadlock where a zero window is sent
> from the peer, while it is unable to send out window updates even after
> reads have freed up enough buffer space to permit a larger window.
> In this situation, new window advertisements from the peer can only be
> triggered by packets arriving from this side.
> 
> However, such packets are never sent, because the zero-window condition
> currently prevents this side from sending out any packets whatsoever
> to the peer.
> 
> We notice that the above bug is triggered *only* after the peer has
> dropped an arriving packet because of severe memory squeeze, and that we
> hence always enter a retransmission situation when this occurs. This
> also means that it goes against the RFC 9293 recommendation that a
> previously advertised window never should shrink.
> 
> RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find
> the following statement:
> "A TCP receiver SHOULD NOT shrink the window, i.e., move the right
> window edge to the left (SHLD-14). However, a sending TCP peer MUST
> be robust against window shrinking, which may cause the
> "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34).
> 
> If this happens, the sender SHOULD NOT send new data (SHLD-15), but
> SHOULD retransmit normally the old unacknowledged data between SND.UNA
> and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data
> beyond SND.UNA+SND.WND (MAY-7)"

So... I'm beginning to think this section of the rfc isn't really
helpful or useful here.  For starters, it doesn't seem to cover all of
what we're trying to do here - particularly the fact that we try to
send keepalive probes when in this situation...

> We never see the window become negative, but we interpret this as a
> recommendation to use the previously available window during
> retransmission even when the currently advertised window is zero.

... but also, looking at the RFC, I'm really not convinced of this
interpretation.  SND.WND generally refers to the last window we've
seen advertised by the guest, and I don't see any indication that in
this specific case we should instead consider the previous version it
had.

Indeed the "usable window" value it's discussing is elsewhere
described in terms of SND.WND, and if we used the previous SND.WND
value it would *not* become negative.

I believe that last MAY-7 bit means we're not violating the RFC by
using the previous window edge, but I don't think there's anything
there to suggest we must or should be doing so.

[In fact, I wonder if the reason behind MAY-7 is that it allows an
 implementation to satisfy this by just ignoring ignore window updates
 which would move the right edge backwards]

So.. moving on from the RFC to what we actually need to do to
workaround this bug.  Do we actually need anything more than
continuing to send keep-alive probes even when the window is zero?

> We use the above mechanism only for timer-induced retransmits, while
> the fast-retransmit mechanism won't trigger on this condition.
> 
> It should be noted that although this solves the problem we have at
> hand, it is not a genuine solution to the kernel bug. There may well
> be TCP stacks around in other OS-es which don't do this, nor have
> keep-alive probing as an alternative way to solve the situation.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> 
> ---
> v2: - Using previously advertised window during retransmission, instead
>       of the highest send sequence number in the cycle.
> v3: - Rebased to newest code
>     - Changes based on feedback from PASST team
>     - Sending out empty probe message at timer expiration when
>       we are not in retransmit situation.
> v4: - Some small changes based on feedback from PASST team.
>     - Replaced fast retransmit with a one-time 'fast probe' when
>       window is zero.
> v5: - Gave up on 'fast probing' for now. When I got the sequence
>       numbers right in the flag message (after having emptied the tap
>       queue), it turns out an empty message does *not* force a new peer
>       window update as was my previous understanding of the code.
>     - Added cppcheck suppression line (which I was unable to verify)
>       as suggested by S. Brivio.
>     - Removed sending of empty probe when window from tap side is zero.
>       It looks pointless at the moment, at least for solving the above
>       described situation.
> v6: - Ensure that arrival of new data doesn't cause us to ignore a
>       zero-window situation.
>     - Removed the pointless probing referred to in v5 comment.
> ---
>  tcp.c      | 26 ++++++++++++++++++++------
>  tcp_conn.h |  2 ++
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index fa13292..38c3480 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1764,9 +1764,17 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
>   */
>  static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
>  {
> +	uint32_t wnd_edge;
> +
>  	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
> +	/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */

If I recall from earlier, we thought this suppression was needed
because of the cppcheck bug referenced in tcp_update_seqack_wnd().  If
that's the case we need something like that comment here as well:
knownConditionTrueFalse is not a check we should be suppressing
lightly.

But also... is it actually that bug?  In that case the check tripped
when we did an if based on the result of the MIN - it thought it was
always zero.  But here the suppression is on the MIN itself, which
suggests something different.  Is it instead that cppcheck is managing
to deduce that wnd >> conn->ws_from_tap cannot be greater than
USHRT_MAX.  Which should indeed be the case, although I can't quickly
see how you'd statically deduce it.

I'm also not sure why this is showing up now, because these lines
aren't changed.

> +

I also don't think inserting a blank line between the suppression and
the line where the error is occuring is a good idea.

>  	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
>  
> +	wnd_edge = conn->seq_ack_from_tap + wnd;
> +	if (wnd && SEQ_GT(wnd_edge, conn->seq_wnd_edge_from_tap))
> +		conn->seq_wnd_edge_from_tap = wnd_edge;
> +
>  	/* FIXME: reflect the tap-side receiver's window back to the sock-side
>  	 * sender by adjusting SO_RCVBUF? */
>  }
> @@ -1799,6 +1807,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
>  	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
>  
>  	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
> +	conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
>  }
>  
>  /**
> @@ -2208,13 +2217,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>   */
>  static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  {
> -	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
>  	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
>  	int sendlen, len, dlen, v4 = CONN_V4(conn);
> +	uint32_t already_sent, max_send, seq;
>  	int s = conn->sock, i, ret = 0;
>  	struct msghdr mh_sock = { 0 };
>  	uint16_t mss = MSS_GET(conn);
> -	uint32_t already_sent, seq;
>  	struct iovec *iov;
>  
>  	/* How much have we read/sent since last received ack ? */
> @@ -2228,19 +2236,24 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		tcp_set_peek_offset(s, 0);
>  	}
>  
> -	if (!wnd_scaled || already_sent >= wnd_scaled) {
> +	/* How much are we still allowed to send within current window ? */
> +	max_send = conn->seq_wnd_edge_from_tap - conn->seq_to_tap;
> +	if (SEQ_LE(max_send, 0)) {

Although the maths probably works out correctly, I dislike using
SEQ_LE on sequence differences here, rather that using SEQ_LE directly
on seq_wnd_edge_from_tap and seq_to_tap.

> +		flow_trace(conn, "Window full: right edge: %u, sent: %u",
> +			   conn->seq_wnd_edge_from_tap, conn->seq_to_tap);
> +		conn->seq_wnd_edge_from_tap = conn->seq_to_tap;

So, here we pull seq_wnd_edge_from_tap back in line with seq_to_tap.
Which might be before even the "current" window of seq_ack_to_tap +
wnd_scaled.  Which means there's a pretty brief window in which
seq_wnd_edge_from_tap will actually be beyond the latest window.  It's
not clear to me why that brief window is important - or why getting
more data from the socket side would be relevant to finishing that
window.

>  		conn_flag(c, conn, STALLED);
>  		conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  		return 0;
>  	}
>  
>  	/* Set up buffer descriptors we'll fill completely and partially. */
> -	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
> +	fill_bufs = DIV_ROUND_UP(max_send,  mss);
>  	if (fill_bufs > TCP_FRAMES) {
>  		fill_bufs = TCP_FRAMES;
>  		iov_rem = 0;
>  	} else {
> -		iov_rem = (wnd_scaled - already_sent) % mss;
> +		iov_rem = max_send % mss;
>  	}
>  
>  	/* Prepare iov according to kernel capability */
> @@ -2347,6 +2360,7 @@ err:
>   *
>   * Return: count of consumed packets
>   */
> +

Spurious whitespace change.

>  static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			      const struct pool *p, int idx)
>  {
> @@ -2950,7 +2964,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>  		if (events & (EPOLLRDHUP | EPOLLHUP))
>  			conn_event(c, conn, SOCK_FIN_RCVD);
>  
> -		if (events & EPOLLIN)
> +		if (events & EPOLLIN && conn->wnd_from_tap)

Hrm.  If we don't even enter tcp_data_from_sock() when there's no
window, doesn't that mean we won't hit the handling for the max_send <
0 case, we won't set STALLED, won't switch the epoll flags for the
socket to edge triggered mode and will therefore just busy loop on
EPOLLIN socket events until the window re-opens.

>  			tcp_data_from_sock(c, conn);
>  
>  		if (events & EPOLLOUT)
> diff --git a/tcp_conn.h b/tcp_conn.h
> index d280b22..5cbad2a 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -30,6 +30,7 @@
>   * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
>   * @seq_to_tap:		Next sequence for packets to tap
>   * @seq_ack_from_tap:	Last ACK number received from tap
> + * @seq_wnd_edge_from_tap: Right edge of last non-zero window from tap
>   * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
>   * @seq_ack_to_tap:	Last ACK number sent to tap
>   * @seq_init_from_tap:	Initial sequence number from tap
> @@ -101,6 +102,7 @@ struct tcp_tap_conn {
>  
>  	uint32_t	seq_to_tap;
>  	uint32_t	seq_ack_from_tap;
> +	uint32_t	seq_wnd_edge_from_tap;
>  	uint32_t	seq_from_tap;
>  	uint32_t	seq_ack_to_tap;
>  	uint32_t	seq_init_from_tap;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero
  2024-05-21  5:51   ` David Gibson
@ 2024-05-21 22:25     ` Jon Maloy
  0 siblings, 0 replies; 9+ messages in thread
From: Jon Maloy @ 2024-05-21 22:25 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson



On 2024-05-21 01:51, David Gibson wrote:
> On Fri, May 17, 2024 at 11:24:14AM -0400, Jon Maloy wrote:
>> A bug in kernel TCP may lead to a deadlock where a zero window is sent
>> from the peer, while it is unable to send out window updates even after
>> reads have freed up enough buffer space to permit a larger window.
>> In this situation, new window advertisements from the peer can only be
>> triggered by packets arriving from this side.
>>
>> However, such packets are never sent, because the zero-window condition
>> currently prevents this side from sending out any packets whatsoever
>> to the peer.
>>
>> We notice that the above bug is triggered *only* after the peer has
>> dropped an arriving packet because of severe memory squeeze, and that we
>> hence always enter a retransmission situation when this occurs. This
>> also means that it goes against the RFC 9293 recommendation that a
>> previously advertised window never should shrink.
>>
>> RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find
>> the following statement:
>> "A TCP receiver SHOULD NOT shrink the window, i.e., move the right
>> window edge to the left (SHLD-14). However, a sending TCP peer MUST
>> be robust against window shrinking, which may cause the
>> "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34).
>>
>> If this happens, the sender SHOULD NOT send new data (SHLD-15), but
>> SHOULD retransmit normally the old unacknowledged data between SND.UNA
>> and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data
>> beyond SND.UNA+SND.WND (MAY-7)"
> So... I'm beginning to think this section of the rfc isn't really
> helpful or useful here.  For starters, it doesn't seem to cover all of
> what we're trying to do here - particularly the fact that we try to
> send keepalive probes when in this situation...
The probes don't resolve the situation, so I skipped them in the latest 
version.
Only payload data solves it.
>
>> We never see the window become negative, but we interpret this as a
>> recommendation to use the previously available window during
>> retransmission even when the currently advertised window is zero.
> ... but also, looking at the RFC, I'm really not convinced of this
> interpretation.  SND.WND generally refers to the last window we've
> seen advertised by the guest, and I don't see any indication that in
> this specific case we should instead consider the previous version it
> had.
>
> Indeed the "usable window" value it's discussing is elsewhere
> described in terms of SND.WND, and if we used the previous SND.WND
> value it would *not* become negative.
>
> I believe that last MAY-7 bit means we're not violating the RFC by
> using the previous window edge, but I don't think there's anything
> there to suggest we must or should be doing so.
Ok. But in my view, we don't have a choice until the kernel bug is fixed.
>
> [In fact, I wonder if the reason behind MAY-7 is that it allows an
>   implementation to satisfy this by just ignoring ignore window updates
>   which would move the right edge backwards]
That would be nice.
>
> So.. moving on from the RFC to what we actually need to do to
> workaround this bug.  Do we actually need anything more than
> continuing to send keep-alive probes even when the window is zero?
Yes. See above.
>
>> We use the above mechanism only for timer-induced retransmits, while
>> the fast-retransmit mechanism won't trigger on this condition.
>>
>> It should be noted that although this solves the problem we have at
>> hand, it is not a genuine solution to the kernel bug. There may well
>> be TCP stacks around in other OS-es which don't do this, nor have
>> keep-alive probing as an alternative way to solve the situation.
>>
>> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>>
>> ---
>> v2: - Using previously advertised window during retransmission, instead
>>        of the highest send sequence number in the cycle.
>> v3: - Rebased to newest code
>>      - Changes based on feedback from PASST team
>>      - Sending out empty probe message at timer expiration when
>>        we are not in retransmit situation.
>> v4: - Some small changes based on feedback from PASST team.
>>      - Replaced fast retransmit with a one-time 'fast probe' when
>>        window is zero.
>> v5: - Gave up on 'fast probing' for now. When I got the sequence
>>        numbers right in the flag message (after having emptied the tap
>>        queue), it turns out an empty message does *not* force a new peer
>>        window update as was my previous understanding of the code.
>>      - Added cppcheck suppression line (which I was unable to verify)
>>        as suggested by S. Brivio.
>>      - Removed sending of empty probe when window from tap side is zero.
>>        It looks pointless at the moment, at least for solving the above
>>        described situation.
>> v6: - Ensure that arrival of new data doesn't cause us to ignore a
>>        zero-window situation.
>>      - Removed the pointless probing referred to in v5 comment.
>> ---
>>   tcp.c      | 26 ++++++++++++++++++++------
>>   tcp_conn.h |  2 ++
>>   2 files changed, 22 insertions(+), 6 deletions(-)
>>
>> diff --git a/tcp.c b/tcp.c
>> index fa13292..38c3480 100644
>> --- a/tcp.c
>> +++ b/tcp.c
>> @@ -1764,9 +1764,17 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
>>    */
>>   static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
>>   {
>> +	uint32_t wnd_edge;
>> +
>>   	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
>> +	/* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */
> If I recall from earlier, we thought this suppression was needed
> because of the cppcheck bug referenced in tcp_update_seqack_wnd().  If
> that's the case we need something like that comment here as well:
> knownConditionTrueFalse is not a check we should be suppressing
> lightly.
>
> But also... is it actually that bug?  In that case the check tripped
> when we did an if based on the result of the MIN - it thought it was
> always zero.  But here the suppression is on the MIN itself, which
> suggests something different.  Is it instead that cppcheck is managing
> to deduce that wnd >> conn->ws_from_tap cannot be greater than
> USHRT_MAX.  Which should indeed be the case, although I can't quickly
> see how you'd statically deduce it.
>
> I'm also not sure why this is showing up now, because these lines
> aren't changed.
Good point. I wonder if Stefano has any theory on that?
>
>> +
> I also don't think inserting a blank line between the suppression and
> the line where the error is occuring is a good idea.
>
>>   	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
>>   
>> +	wnd_edge = conn->seq_ack_from_tap + wnd;
>> +	if (wnd && SEQ_GT(wnd_edge, conn->seq_wnd_edge_from_tap))
>> +		conn->seq_wnd_edge_from_tap = wnd_edge;
>> +
>>   	/* FIXME: reflect the tap-side receiver's window back to the sock-side
>>   	 * sender by adjusting SO_RCVBUF? */
>>   }
>> @@ -1799,6 +1807,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
>>   	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
>>   
>>   	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
>> +	conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
>>   }
>>   
>>   /**
>> @@ -2208,13 +2217,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>>    */
>>   static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>>   {
>> -	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
>>   	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
>>   	int sendlen, len, dlen, v4 = CONN_V4(conn);
>> +	uint32_t already_sent, max_send, seq;
>>   	int s = conn->sock, i, ret = 0;
>>   	struct msghdr mh_sock = { 0 };
>>   	uint16_t mss = MSS_GET(conn);
>> -	uint32_t already_sent, seq;
>>   	struct iovec *iov;
>>   
>>   	/* How much have we read/sent since last received ack ? */
>> @@ -2228,19 +2236,24 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>>   		tcp_set_peek_offset(s, 0);
>>   	}
>>   
>> -	if (!wnd_scaled || already_sent >= wnd_scaled) {
>> +	/* How much are we still allowed to send within current window ? */
>> +	max_send = conn->seq_wnd_edge_from_tap - conn->seq_to_tap;
>> +	if (SEQ_LE(max_send, 0)) {
> Although the maths probably works out correctly, I dislike using
> SEQ_LE on sequence differences here, rather that using SEQ_LE directly
> on seq_wnd_edge_from_tap and seq_to_tap.
I know we discussed this at our last meeting, but then I realized this 
explicitly means reading these two fields, which we just accessed via 
the pointer, once again. It is possible, even likely, that GCC/CLANG are 
smart enough to catch this and optimize, but it is at least ugly.
And again, we have exactly the same construct a few lines further up. If 
we fix it in one place we need to do both.

What was the objection to just making 'already_sent' and 'max_send' to 
signed integers again?

Otherwise, I can easily fix this with a couple of extra stack variables:
'seq' (which we already have),
'ack' (self explaining) and
'wnd_edge' (or just deliver 'max_send' as an argument, see further down)

>
>> +		flow_trace(conn, "Window full: right edge: %u, sent: %u",
>> +			   conn->seq_wnd_edge_from_tap, conn->seq_to_tap);
>> +		conn->seq_wnd_edge_from_tap = conn->seq_to_tap;
> So, here we pull seq_wnd_edge_from_tap back in line with seq_to_tap.
> Which might be before even the "current" window of seq_ack_to_tap +
> wnd_scaled.
TBH, I cannot see SEQ_LT(seq_wnd_edge_from_tap, seq_to_tap) *ever* 
happening.
They can be equal, because we may have consumed the whole permitted window,
but since we logically never can read/send beyond the right edge of window,
the condition SEQ_GE(seq_wnd_edge_from_tap, seq_to_tap) will always be true.

I.e., I could just as well use if (seq_wnd_edge_from_tap == seq_to_tap), 
the assignment
conn->seq_wnd_edge_from_tap = conn->seq_to_tap is in reality redundant.

To put it differently, seq_wnd_edge_from_tap will never ever move to the 
left.

The fact that seq_to_tap occasionally may revert to an older value doesn't
change that.

So, using SEQ_LE() isn't logically necessary here, it is just healthy 
paranoia.

> Which means there's a pretty brief window in which
> seq_wnd_edge_from_tap will actually be beyond the latest window.
How? It is always set to be in sync with the window, except when the 
window is announced to be zero from the peer.
In the latter case it will be beyond it until a new non-zero window is 
announced, but that is the very point with this patch.

> It's
> not clear to me why that brief window is important - or why getting
> more data from the socket side would be relevant to finishing that
> window.
See above.
>
>>   		conn_flag(c, conn, STALLED);
>>   		conn_flag(c, conn, ACK_FROM_TAP_DUE);
>>   		return 0;
>>   	}
>>   
>>   	/* Set up buffer descriptors we'll fill completely and partially. */
>> -	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
>> +	fill_bufs = DIV_ROUND_UP(max_send,  mss);
>>   	if (fill_bufs > TCP_FRAMES) {
>>   		fill_bufs = TCP_FRAMES;
>>   		iov_rem = 0;
>>   	} else {
>> -		iov_rem = (wnd_scaled - already_sent) % mss;
>> +		iov_rem = max_send % mss;
>>   	}
>>   
>>   	/* Prepare iov according to kernel capability */
>> @@ -2347,6 +2360,7 @@ err:
>>    *
>>    * Return: count of consumed packets
>>    */
>> +
> Spurious whitespace change.
ok
>
>>   static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>>   			      const struct pool *p, int idx)
>>   {
>> @@ -2950,7 +2964,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>>   		if (events & (EPOLLRDHUP | EPOLLHUP))
>>   			conn_event(c, conn, SOCK_FIN_RCVD);
>>   
>> -		if (events & EPOLLIN)
>> +		if (events & EPOLLIN && conn->wnd_from_tap)
> Hrm.  If we don't even enter tcp_data_from_sock() when there's no
> window, doesn't that mean we won't hit the handling for the max_send <
> 0 case, we won't set STALLED, won't switch the epoll flags for the
> socket to edge triggered mode and will therefore just busy loop on
> EPOLLIN socket events until the window re-opens.
That is correct.
When we receive a zero-window advertisement from the peer, it is either
1) The memory squeeze case we are dealing with. When that happens, 
ACK_FROM_TAP_DUE is always set anyway.
     We just sent a package which was dropped instead of being acked.

2) It is a "genuine" window exhaustion, where the receiver is not able 
to keep up, but everything is in its read queue.
     In that case, ACK_SEQ_FROM_TAP should *not* be set. The reader has 
received and acked, it has just not been able to consume it yet.

3) There is no third case, since the window edge never moves to the 
left, and we never send beyond that edge.

I must admit I never really paid attention to the STALLED flag, though.

It might be nicer if I can handle this case within tcp_data_from_sock(), 
of course,
but if so I need to find a way to easily distinguish between the case 
when the call comes from
tcp_sock_handler() and all the others.

If I add 'max_send' as an argument to the call instead of calculating it 
inside the call it would actually solve this.
What do you think?

/jon

>
>>   			tcp_data_from_sock(c, conn);
>>   
>>   		if (events & EPOLLOUT)
>> diff --git a/tcp_conn.h b/tcp_conn.h
>> index d280b22..5cbad2a 100644
>> --- a/tcp_conn.h
>> +++ b/tcp_conn.h
>> @@ -30,6 +30,7 @@
>>    * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
>>    * @seq_to_tap:		Next sequence for packets to tap
>>    * @seq_ack_from_tap:	Last ACK number received from tap
>> + * @seq_wnd_edge_from_tap: Right edge of last non-zero window from tap
>>    * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
>>    * @seq_ack_to_tap:	Last ACK number sent to tap
>>    * @seq_init_from_tap:	Initial sequence number from tap
>> @@ -101,6 +102,7 @@ struct tcp_tap_conn {
>>   
>>   	uint32_t	seq_to_tap;
>>   	uint32_t	seq_ack_from_tap;
>> +	uint32_t	seq_wnd_edge_from_tap;
>>   	uint32_t	seq_from_tap;
>>   	uint32_t	seq_ack_to_tap;
>>   	uint32_t	seq_init_from_tap;



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-05-17 15:05 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
@ 2024-05-17 15:06 ` Jon Maloy
  0 siblings, 0 replies; 9+ messages in thread
From: Jon Maloy @ 2024-05-17 15:06 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

From linux-6.9.0 the kernel will contain
commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").

This new feature makes it possible to call recv_msg(MSG_PEEK) and make
it start reading data from a given offset set by the SO_PEEK_OFF socket
option. This way, we can avoid repeated reading of already read bytes of
a received message, hence saving read cycles when forwarding TCP
messages in the host->namespace direction.

In this commit, we add functionality to leverage this feature when
available, while we fall back to the previous behavior when not.

Measurements with iperf3 shows that throughput increases with 15-20
percent in the host->namespace direction when this feature is used.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio.
    - Moved initial set_peek_offset(0) to only the locations where the socket is set
      to ESTABLISHED.
    - Removed the per-packet synchronization between sk_peek_off and
      already_sent. Instead only doing it in retransmit situations.
    - The problem I found when troubleshooting the occasionally occurring
      out of synch values between 'already_sent' and 'sk_peek_offset' may
      have deeper implications that we may need to be investigate.

v3: - Rebased to most recent version of tcp.c, plus the previous
      patch in this series.
    - Some changes based on feedback from PASST team

v4: - Some small changes based on feedback from Stefan/David.

v5: - Re-added accidentally dropped set_peek_offset() line.
      Thank you, David.
---
 tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3a2350a..fa13292 100644
--- a/tcp.c
+++ b/tcp.c
@@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
+/* Does the kernel support TCP_PEEK_OFF? */
+static bool peek_offset_cap;
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s:          Socket to update
+ * @offset:     Offset in bytes
+ */
+static void tcp_set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
 			continue;
 
 		conn->seq_to_tap = seq;
+		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
 	}
 }
 
@@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t already_sent, seq;
 	struct iovec *iov;
 
+	/* How much have we read/sent since last received ack ? */
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
 	if (SEQ_LT(already_sent, 0)) {
 		/* RFC 761, section 2.1. */
 		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
+		tcp_set_peek_offset(s, 0);
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
@@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		iov_rem = (wnd_scaled - already_sent) % mss;
 	}
 
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
+	/* Prepare iov according to kernel capability */
+	if (!peek_offset_cap) {
+		mh_sock.msg_iov = iov_sock;
+		iov_sock[0].iov_base = tcp_buf_discard;
+		iov_sock[0].iov_len = already_sent;
+		mh_sock.msg_iovlen = fill_bufs + 1;
+	} else {
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen = fill_bufs;
+	}
 
 	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
 	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
@@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
+
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
+		tcp_set_peek_offset(conn->sock, 0);
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
+	tcp_set_peek_offset(conn->sock, 0);
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
+		tcp_set_peek_offset(conn->sock, 0);
 
 		if (th->fin) {
 			conn->seq_from_tap++;
@@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			tcp_set_peek_offset(conn->sock, 0);
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
@@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
  */
 int tcp_init(struct ctx *c)
 {
-	unsigned b;
+	unsigned int b, optv = 0;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			peek_offset_cap = true;
+		close(s);
+	}
+	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
 	return 0;
 }
 
-- 
@@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
 
+/* Does the kernel support TCP_PEEK_OFF? */
+static bool peek_offset_cap;
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s:          Socket to update
+ * @offset:     Offset in bytes
+ */
+static void tcp_set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
 			continue;
 
 		conn->seq_to_tap = seq;
+		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
 	}
 }
 
@@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	uint32_t already_sent, seq;
 	struct iovec *iov;
 
+	/* How much have we read/sent since last received ack ? */
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
 	if (SEQ_LT(already_sent, 0)) {
 		/* RFC 761, section 2.1. */
 		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
 			   conn->seq_ack_from_tap, conn->seq_to_tap);
 		conn->seq_to_tap = conn->seq_ack_from_tap;
 		already_sent = 0;
+		tcp_set_peek_offset(s, 0);
 	}
 
 	if (!wnd_scaled || already_sent >= wnd_scaled) {
@@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		iov_rem = (wnd_scaled - already_sent) % mss;
 	}
 
-	mh_sock.msg_iov = iov_sock;
-	mh_sock.msg_iovlen = fill_bufs + 1;
-
-	iov_sock[0].iov_base = tcp_buf_discard;
-	iov_sock[0].iov_len = already_sent;
+	/* Prepare iov according to kernel capability */
+	if (!peek_offset_cap) {
+		mh_sock.msg_iov = iov_sock;
+		iov_sock[0].iov_base = tcp_buf_discard;
+		iov_sock[0].iov_len = already_sent;
+		mh_sock.msg_iovlen = fill_bufs + 1;
+	} else {
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen = fill_bufs;
+	}
 
 	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
 	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
@@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
+
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_to_tap = max_ack_seq;
+		tcp_set_peek_offset(conn->sock, 0);
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
+	tcp_set_peek_offset(conn->sock, 0);
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
+		tcp_set_peek_offset(conn->sock, 0);
 
 		if (th->fin) {
 			conn->seq_from_tap++;
@@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
+			tcp_set_peek_offset(conn->sock, 0);
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
@@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
  */
 int tcp_init(struct ctx *c)
 {
-	unsigned b;
+	unsigned int b, optv = 0;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			peek_offset_cap = true;
+		close(s);
+	}
+	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
 	return 0;
 }
 
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-05-21 22:25 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-17 15:24 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
2024-05-20  7:46   ` David Gibson
2024-05-17 15:24 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
2024-05-20  8:07   ` David Gibson
2024-05-17 15:24 ` [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
2024-05-21  5:51   ` David Gibson
2024-05-21 22:25     ` Jon Maloy
  -- strict thread matches above, loose matches on Subject: below --
2024-05-17 15:05 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
2024-05-17 15:06 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).