public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Jon Maloy <jmaloy@redhat.com>
To: passt-dev@passt.top, sbrivio@redhat.com, lvivier@redhat.com,
	dgibson@redhat.com, jmaloy@redhat.com
Subject: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available
Date: Sat, 20 Apr 2024 15:19:19 -0400	[thread overview]
Message-ID: <20240420191920.104876-2-jmaloy@redhat.com> (raw)
In-Reply-To: <20240420191920.104876-1-jmaloy@redhat.com>

The kernel may support recvmsg(MSG_PEEK) reading data starting from a
given offset, set by the SO_PEEK_OFF socket option. This makes it
possible to avoid repeatedly reading the initial bytes of a received
message that have already been read, hence saving read cycles when
forwarding TCP messages in the host->namespace direction.

In this commit, we add functionality to leverage this feature when available,
while we fall back to the previous behavior when not.

Measurements with iperf3 show that throughput increases by 15-20 percent
in the host->namespace direction when this feature is used.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index 905d26f..95d400a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
+static bool peek_offset_cap = false;
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
@@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+static void set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		perror("Failed to set SO_PEEK_OFF\n");
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
 			goto cancel;
 	}
-
+	set_peek_offset(s, 0);
 	conn = &flow->tcp;
 	conn->f.type = FLOW_TCP;
 	conn->sock = s;
@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	if (iov_rem)
 		iov_sock[fill_bufs].iov_len = iov_rem;
 
+	if (peek_offset_cap) {
+		/* Don't use discard buffer */
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen -= 1;
+
+		/* Keep kernel sk_peek_off in synch */
+		set_peek_offset(s, already_sent);
+	}
+
 	/* Receive into buffers, don't dequeue until acknowledged by guest. */
 	do
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
 				      s, (struct sockaddr *)&sa))
 		return;
+	set_peek_offset(s, 0);
 
 	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
 			       (struct sockaddr *)&sa, now);
@@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
 int tcp_init(struct ctx *c)
 {
 	unsigned b;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s < 0) {
+		perror("Temporary tcp socket creation failed\n");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
+			peek_offset_cap = true;
+		}
+		close(s);
+	}
+	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");
 	return 0;
 }
 
-- 
@@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
+static bool peek_offset_cap = false;
 static char 		tcp_buf_discard		[MAX_WINDOW];
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
@@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
+static void set_peek_offset(int s, int offset)
+{
+	if (!peek_offset_cap)
+		return;
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		perror("Failed to set SO_PEEK_OFF\n");
+}
+
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
@@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
 			goto cancel;
 	}
-
+	set_peek_offset(s, 0);
 	conn = &flow->tcp;
 	conn->f.type = FLOW_TCP;
 	conn->sock = s;
@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	if (iov_rem)
 		iov_sock[fill_bufs].iov_len = iov_rem;
 
+	if (peek_offset_cap) {
+		/* Don't use discard buffer */
+		mh_sock.msg_iov = &iov_sock[1];
+		mh_sock.msg_iovlen -= 1;
+
+		/* Keep kernel sk_peek_off in synch */
+		set_peek_offset(s, already_sent);
+	}
+
 	/* Receive into buffers, don't dequeue until acknowledged by guest. */
 	do
 		len = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!peek_offset_cap)
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
 				      s, (struct sockaddr *)&sa))
 		return;
+	set_peek_offset(s, 0);
 
 	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
 			       (struct sockaddr *)&sa, now);
@@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
 int tcp_init(struct ctx *c)
 {
 	unsigned b;
+	int s;
 
 	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
 		tc_hash[b] = FLOW_SIDX_NONE;
@@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Probe for SO_PEEK_OFF support */
+	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (s < 0) {
+		perror("Temporary tcp socket creation failed\n");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
+			peek_offset_cap = true;
+		}
+		close(s);
+	}
+	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");
 	return 0;
 }
 
-- 
2.42.0


  reply	other threads:[~2024-04-20 19:19 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy
2024-04-20 19:19 ` Jon Maloy [this message]
2024-04-23 17:50   ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Stefano Brivio
2024-04-24  0:48     ` David Gibson
2024-04-24 18:30       ` Stefano Brivio
2024-04-26  3:27         ` David Gibson
2024-04-26  5:58           ` Stefano Brivio
2024-04-29  1:46             ` David Gibson
2024-04-25 23:06       ` Jon Maloy
2024-04-24  0:44   ` David Gibson
2024-04-25 23:23     ` Jon Maloy
2024-04-26  3:29       ` David Gibson
2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy
2024-04-24  1:04   ` David Gibson
2024-04-24 18:31     ` Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240420191920.104876-2-jmaloy@redhat.com \
    --to=jmaloy@redhat.com \
    --cc=dgibson@redhat.com \
    --cc=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).