public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 0/2] Support for SO_PEEK_OFF when a available
@ 2024-04-20 19:19 Jon Maloy
  2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
  2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy
  0 siblings, 2 replies; 15+ messages in thread
From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

First commit introduces support for SO_PEEK_OFF socket option.
Second commit adds a workaround for a kernel TCP bug.

Jon Maloy (2):
  tcp: leverage support of SO_PEEK_OFF socket option when available
  tcp: allow retransmit when peer receive window is zero

 tcp.c      | 63 +++++++++++++++++++++++++++++++++++++++++++-----------
 tcp_conn.h |  2 ++
 2 files changed, 53 insertions(+), 12 deletions(-)

-- 
2.42.0


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy
@ 2024-04-20 19:19 ` Jon Maloy
  2024-04-23 17:50   ` Stefano Brivio
  2024-04-24  0:44   ` David Gibson
  2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy
  1 sibling, 2 replies; 15+ messages in thread
From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

The kernel may support recvmsg(MSG_PEEK), starting reading data from a
given offset set by the SO_PEEK_OFF socket option. This makes it
possible to avoid repeated reading of already read initial bytes of a
received message, hence saving read cycles when forwarding TCP messages
in the host->name space direction.

In this commit, we add functionality to leverage this feature when available,
while we fall back to the previous behavior when not.

Measurements with iperf3 shows that throughput increases with 15-20 percent
in the host->namespace direction when this feature is used.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index 905d26f..95d400a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
+static bool peek_offset_cap = false;
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
@@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+static void set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		perror("Failed to set SO_PEEK_OFF\n");
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
 			goto cancel;
 	}
-
+	set_peek_offset(s, 0);
 	conn = &flow->tcp;
 	conn->f.type = FLOW_TCP;
 	conn->sock = s;
@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	if (iov_rem)
 		iov_sock[fill_bufs].iov_len = iov_rem;
 
+	if (peek_offset_cap) {
+		/* Don't use discard buffer */
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen -= 1;
+
+		/* Keep kernel sk_peek_off in synch */
+		set_peek_offset(s, already_sent);
+	}
+
 	/* Receive into buffers, don't dequeue until acknowledged by guest. */
 	do
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
 				      s, (struct sockaddr *)&sa))
 		return;
+	set_peek_offset(s, 0);
 
 	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
 			       (struct sockaddr *)&sa, now);
@@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
 int tcp_init(struct ctx *c)
 {
 	unsigned b;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s < 0) {
+		perror("Temporary tcp socket creation failed\n");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
+			peek_offset_cap = true;
+		}
+		close(s);
+	}
+	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");
 	return 0;
 }
 
-- 
@@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
+static bool peek_offset_cap = false;
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
@@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+static void set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		perror("Failed to set SO_PEEK_OFF\n");
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
 			goto cancel;
 	}
-
+	set_peek_offset(s, 0);
 	conn = &flow->tcp;
 	conn->f.type = FLOW_TCP;
 	conn->sock = s;
@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	if (iov_rem)
 		iov_sock[fill_bufs].iov_len = iov_rem;
 
+	if (peek_offset_cap) {
+		/* Don't use discard buffer */
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen -= 1;
+
+		/* Keep kernel sk_peek_off in synch */
+		set_peek_offset(s, already_sent);
+	}
+
 	/* Receive into buffers, don't dequeue until acknowledged by guest. */
 	do
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
 				      s, (struct sockaddr *)&sa))
 		return;
+	set_peek_offset(s, 0);
 
 	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
 			       (struct sockaddr *)&sa, now);
@@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
 int tcp_init(struct ctx *c)
 {
 	unsigned b;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s < 0) {
+		perror("Temporary tcp socket creation failed\n");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
+			peek_offset_cap = true;
+		}
+		close(s);
+	}
+	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");
 	return 0;
 }
 
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] tcp: allow retransmit when peer receive window is zero
  2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy
  2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-04-20 19:19 ` Jon Maloy
  2024-04-24  1:04   ` David Gibson
  1 sibling, 1 reply; 15+ messages in thread
From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

A bug in kernel TCP may lead to a deadlock where a zero window is sent
from the peer, while buffer reads doesn't lead to it being updated.
At the same time, the zero window stops this side from sending out
more data to trigger new advertisements to be sent from the peer.

RFC 793 states that it always is permitted for a sender to send one byte
of data even when the window is zero. This resolves the deadlock described
above, so we choose to introduce it here as a last resort. We allow it both
during fast and as keep-alives when the timer sees no activity on the connection.

However, we notice that this solution doesn´t work well. Traffic sometimes
goes to zero, and onley recovers after the timer has resolved the situation.

Because of this, we chose to improve it slightly: The deadlock happens when a
packet has been dropped at the peer end because of memory squeeze. We therefore
consider it legitimate to retransmit that packet while considering the window
size that was valid at the moment it was first transmitted. This works
much better.

It should be noted that although this solves the problem we have at hand,
it is not a genuine solution to the kernel bug. There may well be TCP stacks
around in other OS-es which don't do this probing.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c      | 26 ++++++++++++++++----------
 tcp_conn.h |  2 ++
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/tcp.c b/tcp.c
index 95d400a..9dea151 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
 	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
 
 	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
+	conn->max_seq_to_tap = conn->seq_to_tap;
 }
 
 /**
@@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
  *
  * #syscalls recvmsg
  */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled)
 {
-	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
 	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
 	int sendlen, len, plen, v4 = CONN_V4(conn);
 	int s = conn->sock, i, ret = 0;
@@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 
 		return 0;
 	}
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
 
 	sendlen = len;
 	if (!peek_offset_cap)
@@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		tcp_data_to_tap(c, conn, plen, no_csum, seq);
 		seq += plen;
 	}
-
+	/* We need this to know this during retransmission: */
+	if (SEQ_GT(seq, conn->max_seq_to_tap))
+		conn->max_seq_to_tap = seq;
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	return 0;
@@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			    SEQ_GE(ack_seq, max_ack_seq)) {
 				/* Fast re-transmit */
 				retr = !len && !th->fin &&
-				       ack_seq == max_ack_seq &&
-				       ntohs(th->window) == max_ack_seq_wnd;
+				       ack_seq == max_ack_seq;
 
 				max_ack_seq_wnd = ntohs(th->window);
 				max_ack_seq = ack_seq;
@@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 		flow_trace(conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
 			   max_ack_seq, conn->seq_to_tap);
+
 		conn->seq_ack_from_tap = max_ack_seq;
 		conn->seq_to_tap = max_ack_seq;
-		tcp_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));
 	}
 
 	if (!iov_i)
@@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
-	tcp_data_from_sock(c, conn);
+	tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
 	tcp_send_flag(c, conn, ACK);
 }
 
@@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
 
 		tcp_tap_window_update(conn, ntohs(th->window));
 
-		tcp_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
 
 		if (p->count - idx == 1)
 			return 1;
@@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
-			tcp_data_from_sock(c, conn);
+			tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));
 			tcp_timer_ctl(c, conn);
 		}
 	} else {
@@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			tcp_rst(c, conn);
 		}
 	}
+
 }
 
 /**
@@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 			conn_event(c, conn, SOCK_FIN_RCVD);
 
 		if (events & EPOLLIN)
-			tcp_data_from_sock(c, conn);
+			tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
 
 		if (events & EPOLLOUT)
 			tcp_update_seqack_wnd(c, conn, 0, NULL);
diff --git a/tcp_conn.h b/tcp_conn.h
index a5f5cfe..afcdec9 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -29,6 +29,7 @@
  * @wnd_from_tap:	Last window size from tap, unscaled (as received)
  * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
  * @seq_to_tap:		Next sequence for packets to tap
+ * @max_seq_to_tap:	Next seq after highest ever sent. Needeed during retransmit
  * @seq_ack_from_tap:	Last ACK number received from tap
  * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
  * @seq_ack_to_tap:	Last ACK number sent to tap
@@ -100,6 +101,7 @@ struct tcp_tap_conn {
 	uint16_t	wnd_to_tap;
 
 	uint32_t	seq_to_tap;
+	uint32_t	max_seq_to_tap;
 	uint32_t	seq_ack_from_tap;
 	uint32_t	seq_from_tap;
 	uint32_t	seq_ack_to_tap;
-- 
@@ -29,6 +29,7 @@
  * @wnd_from_tap:	Last window size from tap, unscaled (as received)
  * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
  * @seq_to_tap:		Next sequence for packets to tap
+ * @max_seq_to_tap:	Next seq after highest ever sent. Needeed during retransmit
  * @seq_ack_from_tap:	Last ACK number received from tap
  * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
  * @seq_ack_to_tap:	Last ACK number sent to tap
@@ -100,6 +101,7 @@ struct tcp_tap_conn {
 	uint16_t	wnd_to_tap;
 
 	uint32_t	seq_to_tap;
+	uint32_t	max_seq_to_tap;
 	uint32_t	seq_ack_from_tap;
 	uint32_t	seq_from_tap;
 	uint32_t	seq_ack_to_tap;
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
@ 2024-04-23 17:50   ` Stefano Brivio
  2024-04-24  0:48     ` David Gibson
  2024-04-24  0:44   ` David Gibson
  1 sibling, 1 reply; 15+ messages in thread
From: Stefano Brivio @ 2024-04-23 17:50 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson

On Sat, 20 Apr 2024 15:19:19 -0400
Jon Maloy <jmaloy@redhat.com> wrote:

> The kernel may support recvmsg(MSG_PEEK), starting reading data from a
> given offset set by the SO_PEEK_OFF socket option.

It would be practical to mention in commit message and in a code
comment which kernel commit introduced the feature, that is, your
commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option")
-- even if it's on net-next, better than nothing (net-next might be
rebased but it's quite rare).

Also the (forecast, at this point) kernel version where this will be
introduced would be nice to have.

> This makes it
> possible to avoid repeated reading of already read initial bytes of a
> received message, hence saving read cycles when forwarding TCP messages
> in the host->name space direction.
> 
> In this commit, we add functionality to leverage this feature when available,
> while we fall back to the previous behavior when not.
> 
> Measurements with iperf3 shows that throughput increases with 15-20 percent
> in the host->namespace direction when this feature is used.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  tcp.c | 37 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 905d26f..95d400a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>  
>  /* recvmsg()/sendmsg() data for tap */
> +static bool peek_offset_cap = false;

No need to initialise, it's static.

The comment just above referred to tcp_buf_discard and iov_sock
("data"), now you're adding a flag just under it, which is a bit
confusing.

I would rather leave the original comment for tcp_buf_discard and
iov_sock, and add another one, say...:

>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

/* Does the kernel support TCP_PEEK_OFF? */
static bool peek_offset_cap;

> @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
> +static void set_peek_offset(int s, int offset)

A kerneldoc-style function comment would be nice, like all other
functions in this file:

/**
 * set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
 * @s		Socket
 * @offset	Offset in bytes
 */

> +{
> +	if (!peek_offset_cap)
> +		return;
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))

gcc ('make') says:

In function ‘set_peek_offset’,
    inlined from ‘tcp_listen_handler’ at tcp.c:2819:2:
tcp.c:592:13: warning: ‘s’ may be used uninitialized [-Wmaybe-uninitialized]
  592 |         if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tcp.c: In function ‘tcp_listen_handler’:
tcp.c:2815:13: note: ‘s’ was declared here
 2815 |         int s;
      |             ^

clang-tidy ('make clang-tidy'):

/home/sbrivio/passt/tcp.c:2819:2: error: 1st function call argument is an uninitialized value [clang-analyzer-core.CallAndMessage,-warnings-as-errors]
        set_peek_offset(s, 0);
        ^               ~
/home/sbrivio/passt/tcp.c:2815:2: note: 's' declared without an initial value
        int s;
        ^~~~~
/home/sbrivio/passt/tcp.c:2817:6: note: Assuming field 'no_tcp' is 0
        if (c->no_tcp || !(flow = flow_alloc()))
            ^~~~~~~~~
/home/sbrivio/passt/tcp.c:2817:6: note: Left side of '||' is false
/home/sbrivio/passt/tcp.c:2817:21: note: Assuming 'flow' is non-null
        if (c->no_tcp || !(flow = flow_alloc()))
                           ^~~~
/home/sbrivio/passt/tcp.c:2817:2: note: Taking false branch
        if (c->no_tcp || !(flow = flow_alloc()))
        ^
/home/sbrivio/passt/tcp.c:2819:2: note: 1st function call argument is an uninitialized value
        set_peek_offset(s, 0);
        ^               ~
/home/sbrivio/passt/tcp.c:2819:18: error: variable 's' is uninitialized when used here [clang-diagnostic-uninitialized,-warnings-as-errors]
        set_peek_offset(s, 0);
                        ^
/home/sbrivio/passt/tcp.c:2815:7: note: initialize the variable 's' to silence this warning
        int s;
             ^
              = 0

and cppcheck ('make cppcheck'):

tcp.c:2819:18: error: Uninitialized variable: s [uninitvar]
 set_peek_offset(s, 0);
                 ^
tcp.c:2817:16: note: Assuming condition is false
 if (c->no_tcp || !(flow = flow_alloc()))
               ^
tcp.c:2819:18: note: Uninitialized variable: s
 set_peek_offset(s, 0);
                 ^
make: *** [Makefile:296: cppcheck] Error 1

> +		perror("Failed to set SO_PEEK_OFF\n");
> +}
> +
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
>   * @events:	Current connection events
> @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
>  			goto cancel;
>  	}
> -

Spurious change.

> +	set_peek_offset(s, 0);

Do we really need to initialise it to zero on a new connection? Extra
system calls on this path matter for latency of connection
establishment.

>  	conn = &flow->tcp;
>  	conn->f.type = FLOW_TCP;
>  	conn->sock = s;
> @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	if (iov_rem)
>  		iov_sock[fill_bufs].iov_len = iov_rem;
>  
> +	if (peek_offset_cap) {
> +		/* Don't use discard buffer */
> +		mh_sock.msg_iov = &iov_sock[1];
> +		mh_sock.msg_iovlen -= 1;
> +
> +		/* Keep kernel sk_peek_off in synch */

While Wiktionary lists "synch" as "Alternative spelling of sync", I
would argue that "sync" is much more common and less surprising.

> +		set_peek_offset(s, already_sent);

Note that there's an early return before this point where already_sent
is set to 0. Don't you need to set_peek_offset() also in that case?

I guess this would be easier to follow if both assignments of
already_sent, earlier in this function, were followed by a
set_peek_offset() call. Or do we risk calling it spuriously?

> +	}
> +
>  	/* Receive into buffers, don't dequeue until acknowledged by guest. */
>  	do
>  		len = recvmsg(s, &mh_sock, MSG_PEEK);
> @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>  
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
>  	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
>  				      s, (struct sockaddr *)&sa))
>  		return;
> +	set_peek_offset(s, 0);

Same here, do we really need to initialise it to zero? If yes, it would
be nice to leave a blank line after that 'return' as it was before.

>  
>  	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
>  			       (struct sockaddr *)&sa, now);
> @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
>  int tcp_init(struct ctx *c)
>  {
>  	unsigned b;
> +	int s;
>  
>  	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
>  		tc_hash[b] = FLOW_SIDX_NONE;
> @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>  
> +	/* Probe for SO_PEEK_OFF support */
> +	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

By default, we should always pass SOCK_CLOEXEC to socket(), just in
case we miss closing one.

> +	if (s < 0) {
> +		perror("Temporary tcp socket creation failed\n");

This should be a warn() call, and s/tcp/TCP/.

> +	} else {
> +		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {

No particular reason to exceed 80 columns here, and curly brackets
aren't needed (coding style is the same as kernel).

For consistency with the rest of the codebase: &((int) { 0 }).

> +			peek_offset_cap = true;
> +		}
> +		close(s);
> +	}
> +	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");

This should be a debug() call, a bit too much for info().

>  	return 0;
>  }
>  

...I'm still reviewing 2/2, sorry for the delay.

-- 
Stefano


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
  2024-04-23 17:50   ` Stefano Brivio
@ 2024-04-24  0:44   ` David Gibson
  2024-04-25 23:23     ` Jon Maloy
  1 sibling, 1 reply; 15+ messages in thread
From: David Gibson @ 2024-04-24  0:44 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 4751 bytes --]

On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:
> The kernel may support recvmsg(MSG_PEEK), starting reading data from a

Not worth a respin on its own, but I think the comma above is
misplaced, and for me makes the sentence much harder to read.

> given offset set by the SO_PEEK_OFF socket option. This makes it
> possible to avoid repeated reading of already read initial bytes of a
> received message, hence saving read cycles when forwarding TCP messages
> in the host->name space direction.
> 
> In this commit, we add functionality to leverage this feature when available,
> while we fall back to the previous behavior when not.
> 
> Measurements with iperf3 shows that throughput increases with 15-20 percent
> in the host->namespace direction when this feature is used.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  tcp.c | 37 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 905d26f..95d400a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>  
>  /* recvmsg()/sendmsg() data for tap */
> +static bool peek_offset_cap = false;
>  static char 		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
> @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
> +static void set_peek_offset(int s, int offset)
> +{
> +	if (!peek_offset_cap)
> +		return;
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
> +		perror("Failed to set SO_PEEK_OFF\n");
> +}
> +
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
>   * @events:	Current connection events
> @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
>  			goto cancel;
>  	}
> -
> +	set_peek_offset(s, 0);
>  	conn = &flow->tcp;
>  	conn->f.type = FLOW_TCP;
>  	conn->sock = s;
> @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	if (iov_rem)
>  		iov_sock[fill_bufs].iov_len = iov_rem;
>  
> +	if (peek_offset_cap) {
> +		/* Don't use discard buffer */
> +		mh_sock.msg_iov = &iov_sock[1];
> +		mh_sock.msg_iovlen -= 1;
> +
> +		/* Keep kernel sk_peek_off in synch */
> +		set_peek_offset(s, already_sent);

I thought we didn't need to set SO_PEEK_OFF here - that it would track
on its own, and we only needed to change it for retransmits.  I don't
think we even need to calculate 'already_sent' when we have
SO_PEEK_OFF.  In fact - if we set already_sent to 0 here, it might
make things a bit cleaner than having to have special cases for
adjusting the iov and sendlen.

> +	}
> +
>  	/* Receive into buffers, don't dequeue until acknowledged by guest. */
>  	do
>  		len = recvmsg(s, &mh_sock, MSG_PEEK);
> @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>  
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
>  	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
>  				      s, (struct sockaddr *)&sa))
>  		return;
> +	set_peek_offset(s, 0);
>  
>  	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
>  			       (struct sockaddr *)&sa, now);
> @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
>  int tcp_init(struct ctx *c)
>  {
>  	unsigned b;
> +	int s;
>  
>  	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
>  		tc_hash[b] = FLOW_SIDX_NONE;
> @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>  
> +	/* Probe for SO_PEEK_OFF support */
> +	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> +	if (s < 0) {
> +		perror("Temporary tcp socket creation failed\n");
> +	} else {
> +		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
> +			peek_offset_cap = true;
> +		}
> +		close(s);
> +	}
> +	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");

Should be an info().

>  	return 0;
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-23 17:50   ` Stefano Brivio
@ 2024-04-24  0:48     ` David Gibson
  2024-04-24 18:30       ` Stefano Brivio
  2024-04-25 23:06       ` Jon Maloy
  0 siblings, 2 replies; 15+ messages in thread
From: David Gibson @ 2024-04-24  0:48 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 1032 bytes --]

On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:
> On Sat, 20 Apr 2024 15:19:19 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
[snip]
> > +	set_peek_offset(s, 0);
> 
> Do we really need to initialise it to zero on a new connection? Extra
> system calls on this path matter for latency of connection
> establishment.

Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
it to 0, rather than the default -1.  We could lazily enable it, but
we'd need either to a) do it later in the handshake (maybe when we set
ESTABLISHED), but we'd need to be careful it is always set before the
first MSG_PEEK or b) keep track of whether it's set on a per-socket
basis (this would have the advantage of robustness if we ever
encountered a kernel that weirdly allows it for some but not all TCP
sockets).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] tcp: allow retransmit when peer receive window is zero
  2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy
@ 2024-04-24  1:04   ` David Gibson
  2024-04-24 18:31     ` Stefano Brivio
  0 siblings, 1 reply; 15+ messages in thread
From: David Gibson @ 2024-04-24  1:04 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 7413 bytes --]

On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote:
> A bug in kernel TCP may lead to a deadlock where a zero window is sent
> from the peer, while buffer reads doesn't lead to it being updated.
> At the same time, the zero window stops this side from sending out
> more data to trigger new advertisements to be sent from the peer.

This is a bit confusing to me.  I guess the buffer reads must be
happening on the peer, not "this side", but that's not very clear.  I
think "received from the peer" might be better than "sent from the
peer" to make it more obvious that the point of view is uniformly from
"this side".

> RFC 793 states that it always is permitted for a sender to send one byte
> of data even when the window is zero. This resolves the deadlock described
> above, so we choose to introduce it here as a last resort. We allow it both
> during fast and as keep-alives when the timer sees no activity on
> the connection.

Looks like there's a missing noun after "during fast".

> However, we notice that this solution doesn´t work well. Traffic sometimes
> goes to zero, and onley recovers after the timer has resolved the situation.

s/onley/only/

> Because of this, we chose to improve it slightly: The deadlock happens when a

Is it actually useful to describe the "bad" workaround if we're using
a different one?  I don't see how the better workaround is an obvious
extension of the bad one.

> packet has been dropped at the peer end because of memory squeeze. We therefore
> consider it legitimate to retransmit that packet while considering the window
> size that was valid at the moment it was first transmitted. This works
> much better.
> 
> It should be noted that although this solves the problem we have at hand,
> it is not a genuine solution to the kernel bug. There may well be TCP stacks
> around in other OS-es which don't do this probing.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  tcp.c      | 26 ++++++++++++++++----------
>  tcp_conn.h |  2 ++
>  2 files changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 95d400a..9dea151 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
>  	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
>  
>  	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
> +	conn->max_seq_to_tap = conn->seq_to_tap;
>  }
>  
>  /**
> @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>   *
>   * #syscalls recvmsg
>   */
> -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled)
>  {
> -	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
>  	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
>  	int sendlen, len, plen, v4 = CONN_V4(conn);
>  	int s = conn->sock, i, ret = 0;
> @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  
>  		return 0;
>  	}
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
>  
>  	sendlen = len;
>  	if (!peek_offset_cap)

Looks like some duplicated lines from a bad rebase.

> @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		tcp_data_to_tap(c, conn, plen, no_csum, seq);
>  		seq += plen;
>  	}
> -
> +	/* We need this to know this during retransmission: */
> +	if (SEQ_GT(seq, conn->max_seq_to_tap))
> +		conn->max_seq_to_tap = seq;
>  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
>  	return 0;
> @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			    SEQ_GE(ack_seq, max_ack_seq)) {
>  				/* Fast re-transmit */
>  				retr = !len && !th->fin &&
> -				       ack_seq == max_ack_seq &&
> -				       ntohs(th->window) == max_ack_seq_wnd;
> +				       ack_seq == max_ack_seq;
>  
>  				max_ack_seq_wnd = ntohs(th->window);
>  				max_ack_seq = ack_seq;
> @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  		flow_trace(conn,
>  			   "fast re-transmit, ACK: %u, previous sequence: %u",
>  			   max_ack_seq, conn->seq_to_tap);
> +
>  		conn->seq_ack_from_tap = max_ack_seq;
>  		conn->seq_to_tap = max_ack_seq;
> -		tcp_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));

Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap
then, by definition, there is nothing to retransmit - everything has
been acked.

>  	}
>  
>  	if (!iov_i)
> @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
>  	 */
> -	tcp_data_from_sock(c, conn);
> +	tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
>  	tcp_send_flag(c, conn, ACK);
>  }
>  
> @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  
>  		tcp_tap_window_update(conn, ntohs(th->window));
>  
> -		tcp_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
>  
>  		if (p->count - idx == 1)
>  			return 1;
> @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			flow_dbg(conn, "ACK timeout, retry");
>  			conn->retrans++;
>  			conn->seq_to_tap = conn->seq_ack_from_tap;
> -			tcp_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));
>  			tcp_timer_ctl(c, conn);
>  		}
>  	} else {
> @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			tcp_rst(c, conn);
>  		}
>  	}
> +

Spurious change

>  }
>  
>  /**
> @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>  			conn_event(c, conn, SOCK_FIN_RCVD);
>  
>  		if (events & EPOLLIN)
> -			tcp_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
>  
>  		if (events & EPOLLOUT)
>  			tcp_update_seqack_wnd(c, conn, 0, NULL);
> diff --git a/tcp_conn.h b/tcp_conn.h
> index a5f5cfe..afcdec9 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -29,6 +29,7 @@
>   * @wnd_from_tap:	Last window size from tap, unscaled (as received)
>   * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
>   * @seq_to_tap:		Next sequence for packets to tap
> + * @max_seq_to_tap:	Next seq after highest ever sent. Needeed during retransmit
>   * @seq_ack_from_tap:	Last ACK number received from tap
>   * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
>   * @seq_ack_to_tap:	Last ACK number sent to tap
> @@ -100,6 +101,7 @@ struct tcp_tap_conn {
>  	uint16_t	wnd_to_tap;
>  
>  	uint32_t	seq_to_tap;
> +	uint32_t	max_seq_to_tap;
>  	uint32_t	seq_ack_from_tap;
>  	uint32_t	seq_from_tap;
>  	uint32_t	seq_ack_to_tap;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-24  0:48     ` David Gibson
@ 2024-04-24 18:30       ` Stefano Brivio
  2024-04-26  3:27         ` David Gibson
  2024-04-25 23:06       ` Jon Maloy
  1 sibling, 1 reply; 15+ messages in thread
From: Stefano Brivio @ 2024-04-24 18:30 UTC (permalink / raw)
  To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

On Wed, 24 Apr 2024 10:48:05 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:
> > On Sat, 20 Apr 2024 15:19:19 -0400
> > Jon Maloy <jmaloy@redhat.com> wrote:  
> [snip]
> > > +	set_peek_offset(s, 0);  
> > 
> > Do we really need to initialise it to zero on a new connection? Extra
> > system calls on this path matter for latency of connection
> > establishment.  
> 
> Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
> it to 0, rather than the default -1.

By the way of which, this is not documented at this point -- a man page
patch (linux-man and linux-api lists) would be nice.

> We could lazily enable it, but
> we'd need either to a) do it later in the handshake (maybe when we set
> ESTABLISHED), but we'd need to be careful it is always set before the
> first MSG_PEEK

I was actually thinking that we could set it only as we receive data
(not every connection will receive data), and keep this out of the
handshake (which we want to keep "faster", I think).

And setting it as we mark a connection as ESTABLISHED should have the
same effect on latency as setting it on a new connection -- that's not
really lazy. So, actually:

> or b) keep track of whether it's set on a per-socket
> basis (this would have the advantage of robustness if we ever
> encountered a kernel that weirdly allows it for some but not all TCP
> sockets).

...this could be done as we receive data in tcp_data_from_sock(), with
a new flag in tcp_tap_conn::flags, to avoid adding latency to the
handshake. It also looks more robust to me, and done/checked in a
single place where we need it.

We have just three bits left there which isn't great, but if we need to
save one at a later point, we can drop this new flag easily.

-- 
Stefano


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] tcp: allow retransmit when peer receive window is zero
  2024-04-24  1:04   ` David Gibson
@ 2024-04-24 18:31     ` Stefano Brivio
  0 siblings, 0 replies; 15+ messages in thread
From: Stefano Brivio @ 2024-04-24 18:31 UTC (permalink / raw)
  To: Jon Maloy; +Cc: David Gibson, passt-dev, lvivier, dgibson

On top of David's comments:

On Wed, 24 Apr 2024 11:04:51 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote:
> > A bug in kernel TCP may lead to a deadlock where a zero window is sent
> > from the peer, while buffer reads doesn't lead to it being updated.
> > At the same time, the zero window stops this side from sending out
> > more data to trigger new advertisements to be sent from the peer.  
> 
> This is a bit confusing to me.  I guess the buffer reads must be
> happening on the peer, not "this side", but that's not very clear.  I
> think "received from the peer" might be better than "sent from the
> peer" to make it more obvious that the point of view is uniformly from
> "this side".
> 
> > RFC 793 states that it always is permitted for a sender to send one byte
> > of data even when the window is zero.

As incredible as it might sound, RFC 793 is now obsolete :) -- you
should refer to RFC 9293 instead.

Further, RFC 9293 3.8.6.1 (just like RFC 793) says that we *must*
(albeit not MUST) "regularly" send that octet -- as long as we have one
available. Not that it's permitted.

Hence, a general comment on this patch: my understanding is that we
want to implement zero-window probing, instead of triggering a fast
re-transmit whenever we get a keep-alive packet from the peer, because
we don't know if we'll get one.

RFC 9293 3.8.6.1 goes on saying:

  The transmitting host SHOULD send the first zero-window probe when a
  zero window has existed for the retransmission timeout period (SHLD-29)
  (Section 3.8.1), and SHOULD increase exponentially the interval between
  successive probes (SHLD-30).

but we currently don't compute the retransmission timeout according to
RFC 6298 (yeah, I know...).

Given that it's a SHOULD, and complying with that is clearly beyond the
scope of this change, we can just use one of the existing timeouts and
timers (ACK_TIMEOUT would do, for the moment).

> > This resolves the deadlock described
> > above, so we choose to introduce it here as a last resort. We allow it both
> > during fast and as keep-alives when the timer sees no activity on
> > the connection.  
> 
> Looks like there's a missing noun after "during fast".
> 
> > However, we notice that this solution doesn´t work well. Traffic sometimes
> > goes to zero, and onley recovers after the timer has resolved the situation.  
> 
> s/onley/only/
> 
> > Because of this, we chose to improve it slightly: The deadlock happens when a  
> 
> Is it actually useful to describe the "bad" workaround if we're using
> a different one?  I don't see how the better workaround is an obvious
> extension of the bad one.
> 
> > packet has been dropped at the peer end because of memory squeeze. We therefore
> > consider it legitimate to retransmit that packet while considering the window
> > size that was valid at the moment it was first transmitted. This works
> > much better.
> > 
> > It should be noted that although this solves the problem we have at hand,
> > it is not a genuine solution to the kernel bug. There may well be TCP stacks
> > around in other OS-es which don't do this probing.
> > 
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> > ---
> >  tcp.c      | 26 ++++++++++++++++----------
> >  tcp_conn.h |  2 ++
> >  2 files changed, 18 insertions(+), 10 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 95d400a..9dea151 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
> >  	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
> >  
> >  	conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
> > +	conn->max_seq_to_tap = conn->seq_to_tap;
> >  }
> >  
> >  /**
> > @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >   *
> >   * #syscalls recvmsg
> >   */
> > -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> > +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled)

This exceeds 80 columns for no particularly good reason, and the
function comment should be updated.

> >  {
> > -	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> >  	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
> >  	int sendlen, len, plen, v4 = CONN_V4(conn);
> >  	int s = conn->sock, i, ret = 0;
> > @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> >  
> >  		return 0;
> >  	}
> > +	sendlen = len;
> > +	if (!peek_offset_cap)
> > +		sendlen -= already_sent;
> >  
> >  	sendlen = len;
> >  	if (!peek_offset_cap)  
> 
> Looks like some duplicated lines from a bad rebase.
> 
> > @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> >  		tcp_data_to_tap(c, conn, plen, no_csum, seq);
> >  		seq += plen;
> >  	}
> > -
> > +	/* We need this to know this during retransmission: */

s/this to know this/to know this/ (I guess)

> > +	if (SEQ_GT(seq, conn->max_seq_to_tap))
> > +		conn->max_seq_to_tap = seq;
> >  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> >  
> >  	return 0;
> > @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
> >  			    SEQ_GE(ack_seq, max_ack_seq)) {
> >  				/* Fast re-transmit */
> >  				retr = !len && !th->fin &&
> > -				       ack_seq == max_ack_seq &&
> > -				       ntohs(th->window) == max_ack_seq_wnd;
> > +				       ack_seq == max_ack_seq;

This matches any keep-alive (that is, a segment without data and
without a window update) we get: I think you should replace the
existing condition with a check that the window is zero, instead.

> >  
> >  				max_ack_seq_wnd = ntohs(th->window);
> >  				max_ack_seq = ack_seq;
> > @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
> >  		flow_trace(conn,
> >  			   "fast re-transmit, ACK: %u, previous sequence: %u",
> >  			   max_ack_seq, conn->seq_to_tap);
> > +

Spurious change.

> >  		conn->seq_ack_from_tap = max_ack_seq;
> >  		conn->seq_to_tap = max_ack_seq;
> > -		tcp_data_from_sock(c, conn);
> > +		tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));  
> 
> Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap
> then, by definition, there is nothing to retransmit - everything has
> been acked.
> 
> >  	}
> >  
> >  	if (!iov_i)
> > @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
> >  	/* The client might have sent data already, which we didn't
> >  	 * dequeue waiting for SYN,ACK from tap -- check now.
> >  	 */
> > -	tcp_data_from_sock(c, conn);
> > +	tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
> >  	tcp_send_flag(c, conn, ACK);
> >  }
> >  
> > @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
> >  
> >  		tcp_tap_window_update(conn, ntohs(th->window));
> >  
> > -		tcp_data_from_sock(c, conn);
> > +		tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
> >  
> >  		if (p->count - idx == 1)
> >  			return 1;
> > @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
> >  			flow_dbg(conn, "ACK timeout, retry");
> >  			conn->retrans++;
> >  			conn->seq_to_tap = conn->seq_ack_from_tap;
> > -			tcp_data_from_sock(c, conn);
> > +			tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));
> >  			tcp_timer_ctl(c, conn);
> >  		}
> >  	} else {
> > @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
> >  			tcp_rst(c, conn);
> >  		}
> >  	}
> > +  
> 
> Spurious change
> 
> >  }
> >  
> >  /**
> > @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
> >  			conn_event(c, conn, SOCK_FIN_RCVD);
> >  
> >  		if (events & EPOLLIN)
> > -			tcp_data_from_sock(c, conn);
> > +			tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap);
> >  
> >  		if (events & EPOLLOUT)
> >  			tcp_update_seqack_wnd(c, conn, 0, NULL);
> > diff --git a/tcp_conn.h b/tcp_conn.h
> > index a5f5cfe..afcdec9 100644
> > --- a/tcp_conn.h
> > +++ b/tcp_conn.h
> > @@ -29,6 +29,7 @@
> >   * @wnd_from_tap:	Last window size from tap, unscaled (as received)
> >   * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
> >   * @seq_to_tap:		Next sequence for packets to tap
> > + * @max_seq_to_tap:	Next seq after highest ever sent. Needeed during retransmit

Sequence numbers regularly wrap around (they are 32 bits), so you can't
really define this. I'm not entirely sure how you use it, though -- I
have the same question about the usage of MAX() above.

> >   * @seq_ack_from_tap:	Last ACK number received from tap
> >   * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
> >   * @seq_ack_to_tap:	Last ACK number sent to tap
> > @@ -100,6 +101,7 @@ struct tcp_tap_conn {
> >  	uint16_t	wnd_to_tap;
> >  
> >  	uint32_t	seq_to_tap;
> > +	uint32_t	max_seq_to_tap;
> >  	uint32_t	seq_ack_from_tap;
> >  	uint32_t	seq_from_tap;
> >  	uint32_t	seq_ack_to_tap;  
> 

-- 
Stefano


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-24  0:48     ` David Gibson
  2024-04-24 18:30       ` Stefano Brivio
@ 2024-04-25 23:06       ` Jon Maloy
  1 sibling, 0 replies; 15+ messages in thread
From: Jon Maloy @ 2024-04-25 23:06 UTC (permalink / raw)
  To: David Gibson, Stefano Brivio; +Cc: passt-dev, lvivier, dgibson

On 2024-04-23 20:48, David Gibson wrote:
> On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:
>> On Sat, 20 Apr 2024 15:19:19 -0400
>> Jon Maloy <jmaloy@redhat.com> wrote:
> [snip]
>>> +	set_peek_offset(s, 0);
>> Do we really need to initialise it to zero on a new connection? Extra
>> system calls on this path matter for latency of connection
>> establishment.
Yes. Although I moved it to  tcp_conn_from_tap() in the next version,
to make it more symmetric with the one in tcp_tap_conn_from_sock()
> Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
> it to 0, rather than the default -1.  We could lazily enable it, but
> we'd need either to a) do it later in the handshake (maybe when we set
> ESTABLISHED), but we'd need to be careful it is always set before the
> first MSG_PEEK or b) keep track of whether it's set on a per-socket
> basis (this would have the advantage of robustness if we ever
> encountered a kernel that weirdly allows it for some but not all TCP
> sockets).
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-24  0:44   ` David Gibson
@ 2024-04-25 23:23     ` Jon Maloy
  2024-04-26  3:29       ` David Gibson
  0 siblings, 1 reply; 15+ messages in thread
From: Jon Maloy @ 2024-04-25 23:23 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson



On 2024-04-23 20:44, David Gibson wrote:
> On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:
>> The kernel may support recvmsg(MSG_PEEK), starting reading data from a
> Not worth a respin on its own, but I think the comma above is
> misplaced, and for me makes the sentence much harder to read.
>
>> given offset set by the SO_PEEK_OFF socket option. This makes it
>> possible to avoid repeated reading of already read initial bytes of a
>> received message, hence saving read cycles when forwarding TCP messages
>> in the host->name space direction.
>>
>> In this commit, we add functionality to leverage this feature when available,
[...]
>> @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>>   	if (iov_rem)
>>   		iov_sock[fill_bufs].iov_len = iov_rem;
>>   
>> +	if (peek_offset_cap) {
>> +		/* Don't use discard buffer */
>> +		mh_sock.msg_iov = &iov_sock[1];
>> +		mh_sock.msg_iovlen -= 1;
>> +
>> +		/* Keep kernel sk_peek_off in synch */
>> +		set_peek_offset(s, already_sent);
> I thought we didn't need to set SO_PEEK_OFF here - that it would track
> on its own, and we only needed to change it for retransmits.  I don't
> think we even need to calculate 'already_sent' when we have
> SO_PEEK_OFF.  In fact - if we set already_sent to 0 here, it might
> make things a bit cleaner than having to have special cases for
> adjusting the iov and sendlen.
In theory yes.
I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for
retransmission.
I observed some strange behavior, like retransmits that seemingly did not
come from fast retransmit or timer retransmit, and that the kernel 
'sk_peek_off'
didn´t always have the expected value when comparing with 'already_sent´.
Since my focus was on the zero-window issue I decided to skip this for now
and take the safe option.
I may revisit this later.
>
>> +	}
>> +
>>   	/* Receive into buffers, don't dequeue until acknowledged by guest. */
>>   	do
>>   		len = recvmsg(s, &mh_sock, MSG_PEEK);
>> @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>>   		return 0;
>>   	}
>>   
[...]
>> +			peek_offset_cap = true;
>> +		}
>> +		close(s);
>> +	}
>> +	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ")
> Should be an info().
Made it a debug() as suggested by Stefano.
>
>>   	return 0;
>>   }
>>   


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-24 18:30       ` Stefano Brivio
@ 2024-04-26  3:27         ` David Gibson
  2024-04-26  5:58           ` Stefano Brivio
  0 siblings, 1 reply; 15+ messages in thread
From: David Gibson @ 2024-04-26  3:27 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 2598 bytes --]

On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:
> On Wed, 24 Apr 2024 10:48:05 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:
> > > On Sat, 20 Apr 2024 15:19:19 -0400
> > > Jon Maloy <jmaloy@redhat.com> wrote:  
> > [snip]
> > > > +	set_peek_offset(s, 0);  
> > > 
> > > Do we really need to initialise it to zero on a new connection? Extra
> > > system calls on this path matter for latency of connection
> > > establishment.  
> > 
> > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
> > it to 0, rather than the default -1.
> 
> By the way of which, this is not documented at this point -- a man page
> patch (linux-man and linux-api lists) would be nice.
> 
> > We could lazily enable it, but
> > we'd need either to a) do it later in the handshake (maybe when we set
> > ESTABLISHED), but we'd need to be careful it is always set before the
> > first MSG_PEEK
> 
> I was actually thinking that we could set it only as we receive data
> (not every connection will receive data), and keep this out of the
> handshake (which we want to keep "faster", I think).

That makes sense, but I think it would need a per-connection flag.

> And setting it as we mark a connection as ESTABLISHED should have the
> same effect on latency as setting it on a new connection -- that's not
> really lazy. So, actually:

Good point.

> > or b) keep track of whether it's set on a per-socket
> > basis (this would have the advantage of robustness if we ever
> > encountered a kernel that weirdly allows it for some but not all TCP
> > sockets).
> 
> ...this could be done as we receive data in tcp_data_from_sock(), with
> a new flag in tcp_tap_conn::flags, to avoid adding latency to the
> handshake. It also looks more robust to me, and done/checked in a
> single place where we need it.
> 
> We have just three bits left there which isn't great, but if we need to
> save one at a later point, we can drop this new flag easily.

I just realised that folding the feature detection into this is a bit
costlier than I thought.  If we globally probe the feature we just
need one bit per connection: is SO_PEEK_OFF set yet or not.  If we
tried to probe per-connection we'd need a tristate: haven't tried /
SO_PEEK_OFF enabled / tried and failed.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-25 23:23     ` Jon Maloy
@ 2024-04-26  3:29       ` David Gibson
  0 siblings, 0 replies; 15+ messages in thread
From: David Gibson @ 2024-04-26  3:29 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 2910 bytes --]

On Thu, Apr 25, 2024 at 07:23:28PM -0400, Jon Maloy wrote:
> 
> 
> On 2024-04-23 20:44, David Gibson wrote:
> > On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:
> > > The kernel may support recvmsg(MSG_PEEK), starting reading data from a
> > Not worth a respin on its own, but I think the comma above is
> > misplaced, and for me makes the sentence much harder to read.
> > 
> > > given offset set by the SO_PEEK_OFF socket option. This makes it
> > > possible to avoid repeated reading of already read initial bytes of a
> > > received message, hence saving read cycles when forwarding TCP messages
> > > in the host->name space direction.
> > > 
> > > In this commit, we add functionality to leverage this feature when available,
> [...]
> > > @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> > >   	if (iov_rem)
> > >   		iov_sock[fill_bufs].iov_len = iov_rem;
> > > +	if (peek_offset_cap) {
> > > +		/* Don't use discard buffer */
> > > +		mh_sock.msg_iov = &iov_sock[1];
> > > +		mh_sock.msg_iovlen -= 1;
> > > +
> > > +		/* Keep kernel sk_peek_off in synch */
> > > +		set_peek_offset(s, already_sent);
> > I thought we didn't need to set SO_PEEK_OFF here - that it would track
> > on its own, and we only needed to change it for retransmits.  I don't
> > think we even need to calculate 'already_sent' when we have
> > SO_PEEK_OFF.  In fact - if we set already_sent to 0 here, it might
> > make things a bit cleaner than having to have special cases for
> > adjusting the iov and sendlen.
> In theory yes.
> I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for
> retransmission.
> I observed some strange behavior, like retransmits that seemingly did not
> come from fast retransmit or timer retransmit, and that the kernel
> 'sk_peek_off'
> didn´t always have the expected value when comparing with 'already_sent´.

Ouch, that sounds bad.  I'm pretty sure that means there's a bug on
one side or the other.

> Since my focus was on the zero-window issue I decided to skip this for now
> and take the safe option.
> I may revisit this later.
> > 
> > > +	}
> > > +
> > >   	/* Receive into buffers, don't dequeue until acknowledged by guest. */
> > >   	do
> > >   		len = recvmsg(s, &mh_sock, MSG_PEEK);
> > > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> > >   		return 0;
> > >   	}
> [...]
> > > +			peek_offset_cap = true;
> > > +		}
> > > +		close(s);
> > > +	}
> > > +	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ")
> > Should be an info().
> Made it a debug() as suggested by Stefano.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-26  3:27         ` David Gibson
@ 2024-04-26  5:58           ` Stefano Brivio
  2024-04-29  1:46             ` David Gibson
  0 siblings, 1 reply; 15+ messages in thread
From: Stefano Brivio @ 2024-04-26  5:58 UTC (permalink / raw)
  To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

On Fri, 26 Apr 2024 13:27:11 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:
> > On Wed, 24 Apr 2024 10:48:05 +1000
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:  
> > > > On Sat, 20 Apr 2024 15:19:19 -0400
> > > > Jon Maloy <jmaloy@redhat.com> wrote:    
> > > [snip]  
> > > > > +	set_peek_offset(s, 0);    
> > > > 
> > > > Do we really need to initialise it to zero on a new connection? Extra
> > > > system calls on this path matter for latency of connection
> > > > establishment.    
> > > 
> > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
> > > it to 0, rather than the default -1.  
> > 
> > By the way of which, this is not documented at this point -- a man page
> > patch (linux-man and linux-api lists) would be nice.
> >   
> > > We could lazily enable it, but
> > > we'd need either to a) do it later in the handshake (maybe when we set
> > > ESTABLISHED), but we'd need to be careful it is always set before the
> > > first MSG_PEEK  
> > 
> > I was actually thinking that we could set it only as we receive data
> > (not every connection will receive data), and keep this out of the
> > handshake (which we want to keep "faster", I think).  
> 
> That makes sense, but I think it would need a per-connection flag.

Definitely.

> > And setting it as we mark a connection as ESTABLISHED should have the
> > same effect on latency as setting it on a new connection -- that's not
> > really lazy. So, actually:  
> 
> Good point.
> 
> > > or b) keep track of whether it's set on a per-socket
> > > basis (this would have the advantage of robustness if we ever
> > > encountered a kernel that weirdly allows it for some but not all TCP
> > > sockets).  
> > 
> > ...this could be done as we receive data in tcp_data_from_sock(), with
> > a new flag in tcp_tap_conn::flags, to avoid adding latency to the
> > handshake. It also looks more robust to me, and done/checked in a
> > single place where we need it.
> > 
> > We have just three bits left there which isn't great, but if we need to
> > save one at a later point, we can drop this new flag easily.  
> 
> I just realised that folding the feature detection into this is a bit
> costlier than I thought.  If we globally probe the feature we just
> need one bit per connection: is SO_PEEK_OFF set yet or not.  If we
> tried to probe per-connection we'd need a tristate: haven't tried /
> SO_PEEK_OFF enabled / tried and failed.

I forgot to mention this part: what I wanted to propose was actually
still a global probe, so that we don't waste one system call per
connection on kernels not supporting this (a substantial use case for a
couple of years from now?), which probably outweighs the advantage of
the weird, purely theoretical kernel not supporting the feature for
some sockets only.

And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward
to me) on top. Another advantage is avoiding the tristate you described.

-- 
Stefano


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
  2024-04-26  5:58           ` Stefano Brivio
@ 2024-04-29  1:46             ` David Gibson
  0 siblings, 0 replies; 15+ messages in thread
From: David Gibson @ 2024-04-29  1:46 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 3630 bytes --]

On Fri, Apr 26, 2024 at 07:58:32AM +0200, Stefano Brivio wrote:
> On Fri, 26 Apr 2024 13:27:11 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:
> > > On Wed, 24 Apr 2024 10:48:05 +1000
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:  
> > > > > On Sat, 20 Apr 2024 15:19:19 -0400
> > > > > Jon Maloy <jmaloy@redhat.com> wrote:    
> > > > [snip]  
> > > > > > +	set_peek_offset(s, 0);    
> > > > > 
> > > > > Do we really need to initialise it to zero on a new connection? Extra
> > > > > system calls on this path matter for latency of connection
> > > > > establishment.    
> > > > 
> > > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting
> > > > it to 0, rather than the default -1.  
> > > 
> > > By the way of which, this is not documented at this point -- a man page
> > > patch (linux-man and linux-api lists) would be nice.
> > >   
> > > > We could lazily enable it, but
> > > > we'd need either to a) do it later in the handshake (maybe when we set
> > > > ESTABLISHED), but we'd need to be careful it is always set before the
> > > > first MSG_PEEK  
> > > 
> > > I was actually thinking that we could set it only as we receive data
> > > (not every connection will receive data), and keep this out of the
> > > handshake (which we want to keep "faster", I think).  
> > 
> > That makes sense, but I think it would need a per-connection flag.
> 
> Definitely.
> 
> > > And setting it as we mark a connection as ESTABLISHED should have the
> > > same effect on latency as setting it on a new connection -- that's not
> > > really lazy. So, actually:  
> > 
> > Good point.
> > 
> > > > or b) keep track of whether it's set on a per-socket
> > > > basis (this would have the advantage of robustness if we ever
> > > > encountered a kernel that weirdly allows it for some but not all TCP
> > > > sockets).  
> > > 
> > > ...this could be done as we receive data in tcp_data_from_sock(), with
> > > a new flag in tcp_tap_conn::flags, to avoid adding latency to the
> > > handshake. It also looks more robust to me, and done/checked in a
> > > single place where we need it.
> > > 
> > > We have just three bits left there which isn't great, but if we need to
> > > save one at a later point, we can drop this new flag easily.  
> > 
> > I just realised that folding the feature detection into this is a bit
> > costlier than I thought.  If we globally probe the feature we just
> > need one bit per connection: is SO_PEEK_OFF set yet or not.  If we
> > tried to probe per-connection we'd need a tristate: haven't tried /
> > SO_PEEK_OFF enabled / tried and failed.
> 
> I forgot to mention this part: what I wanted to propose was actually
> still a global probe, so that we don't waste one system call per
> connection on kernels not supporting this (a substantial use case for a
> couple of years from now?), which probably outweighs the advantage of
> the weird, purely theoretical kernel not supporting the feature for
> some sockets only.

> And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward
> to me) on top. Another advantage is avoiding the tristate you described.

Right, having thought it through I agree this is a better approach.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2024-04-29  1:46 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy
2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
2024-04-23 17:50   ` Stefano Brivio
2024-04-24  0:48     ` David Gibson
2024-04-24 18:30       ` Stefano Brivio
2024-04-26  3:27         ` David Gibson
2024-04-26  5:58           ` Stefano Brivio
2024-04-29  1:46             ` David Gibson
2024-04-25 23:06       ` Jon Maloy
2024-04-24  0:44   ` David Gibson
2024-04-25 23:23     ` Jon Maloy
2024-04-26  3:29       ` David Gibson
2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy
2024-04-24  1:04   ` David Gibson
2024-04-24 18:31     ` Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).