public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Jon Maloy <jmaloy@redhat.com>
To: sbrivio@redhat.com, lvivier@redhat.com, dgibson@redhat.com,
	jmaloy@redhat.com, passt-dev@passt.top
Subject: tcp.c: leverage MSG_PEEK with offset kernel capability when available
Date: Tue,  5 Dec 2023 18:36:04 -0500	[thread overview]
Message-ID: <20231205233604.1491317-1-jmaloy@redhat.com> (raw)

The kernel may support recvmsg(MSG_PEEK) starting from a given offset. This
makes it possible to avoid repeatedly reading the already-read initial bytes
of a received message, hence saving us read cycles when forwarding TCP
messages in the host->namespace direction.

When this feature is supported, iov_sock[0].iov_base can be set to NULL. The
kernel code will interpret this as an instruction to skip reading the first
iov_sock[0].iov_len bytes of the message.

Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this
by simply not allocating that buffer, letting the pointer remain NULL, when
we find that we have this kernel support.

There is no macro or function indicating kernel support for this feature. We
therefore have to probe for it by reading from an established TCP connection.
The traffic connection cannot be used for this purpose, because it will be
broken if the probe read fails. We therefore have to create a temporary
local connection for this task. Because of this, we also add a new function,
tcp_probe_msg_peek_offset_cap(), which creates this temporary connection
and performs the probe read on it.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 tcp.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index f506cfd..ab5168e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -402,6 +402,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		*tcp_buf_discard        = NULL;
+
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Only allocate discard buffer if needed */
+	if (!tcp_probe_msg_peek_offset_cap()) {
+		tcp_buf_discard = malloc(MAX_WINDOW);
+		if (!tcp_buf_discard) {
+			perror("failed to allocate discard buffer\n");
+			exit(EXIT_FAILURE);
+		}
+		debug("allocated discard buffer of size %i\n", MAX_WINDOW);
+	}
 	return 0;
 }
 
@@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	int listenerfd, fd;
+	ssize_t len;
+	char buf[8] = { 0 };
+	struct msghdr msg;
+	struct iovec iov[2];
+	struct sockaddr_in addr = { 0, };
+	socklen_t addrlen = sizeof(addr);
+	int option = 1;
+	bool ret = true;
+
+	memset(buf, 0, sizeof(buf));
+	addr.sin_family    = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = 0;
+
+	if ( (listenerfd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) {
+		perror("temporary listener socket creation failed");
+		exit(EXIT_FAILURE);
+	}
+	setsockopt(listenerfd, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option));
+
+	if (bind(listenerfd, (const struct sockaddr *)&addr, sizeof(addr)) < 0 ) {
+		perror("temporary bind() failed");
+		exit(EXIT_FAILURE);
+	}
+	if (0 > listen(listenerfd, 100)) {
+		perror("temporary listen failed");
+		exit(EXIT_FAILURE);
+	}
+	if (0 != getsockname(listenerfd, (struct sockaddr *)&addr, &addrlen)) {
+		perror("temporary getsockname() failed");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Spawn off a child process and let it send a message here: */
+	if (fork() == 0) {
+		if ( (fd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) {
+			perror("temporary socket creation failed");
+			exit(EXIT_FAILURE);
+		}
+		if (0 > connect(fd, (struct sockaddr *)&addr, sizeof(addr))) {
+			perror("temporary connect() failed");
+			exit(EXIT_FAILURE);
+		}
+		if (sizeof(buf) != send(fd, buf, sizeof(buf), 0))
+			perror("send on temporary server socket failed");
+		shutdown(fd, SHUT_RD | SHUT_WR);
+		close(fd);
+		exit(0);
+	}
+	/* Receive the message in mother process: */
+	fd = accept(listenerfd, NULL, NULL);
+	if (fd <= 0) {
+		perror("accept on temporary listener socket failed");
+		exit(EXIT_FAILURE);
+	}
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 2;
+	iov[0].iov_base = NULL;
+	iov[0].iov_len = 4;
+	iov[1].iov_base = &buf[3];
+	iov[1].iov_len = 4;
+	len = recvmsg(fd, &msg, MSG_PEEK);
+	if (len <= 0) {
+		if (errno != EFAULT) {
+			perror("temporary recvmsg() failed");
+			exit(EXIT_FAILURE);
+		}
+		ret = false;
+	} else if ((size_t)len != iov[1].iov_len) {
+		exit(EXIT_FAILURE);
+	}
+	shutdown(fd, SHUT_RD | SHUT_WR);
+	close(fd);
+	info("MSG_PEEK with offset %ssupported by kernel.\n", ret ? "" : "not ");
+	return ret;
+}
-- 
@@ -402,6 +402,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		*tcp_buf_discard        = NULL;
+
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Only allocate discard buffer if needed */
+	if (!tcp_probe_msg_peek_offset_cap()) {
+		tcp_buf_discard = malloc(MAX_WINDOW);
+		if (!tcp_buf_discard) {
+			perror("failed to allocate discard buffer\n");
+			exit(EXIT_FAILURE);
+		}
+		debug("allocated discard buffer of size %i\n", MAX_WINDOW);
+	}
 	return 0;
 }
 
@@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	int listenerfd, fd;
+	ssize_t len;
+	char buf[8] = { 0 };
+	struct msghdr msg;
+	struct iovec iov[2];
+	struct sockaddr_in addr = { 0, };
+	socklen_t addrlen = sizeof(addr);
+	int option = 1;
+	bool ret = true;
+
+	memset(buf, 0, sizeof(buf));
+	addr.sin_family    = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = 0;
+
+	if ( (listenerfd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) {
+		perror("temporary listener socket creation failed");
+		exit(EXIT_FAILURE);
+	}
+	setsockopt(listenerfd, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option));
+
+	if (bind(listenerfd, (const struct sockaddr *)&addr, sizeof(addr)) < 0 ) {
+		perror("temporary bind() failed");
+		exit(EXIT_FAILURE);
+	}
+	if (0 > listen(listenerfd, 100)) {
+		perror("temporary listen failed");
+		exit(EXIT_FAILURE);
+	}
+	if (0 != getsockname(listenerfd, (struct sockaddr *)&addr, &addrlen)) {
+		perror("temporary getsockname() failed");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Spawn off a child process and let it send a message here: */
+	if (fork() == 0) {
+		if ( (fd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) {
+			perror("temporary socket creation failed");
+			exit(EXIT_FAILURE);
+		}
+		if (0 > connect(fd, (struct sockaddr *)&addr, sizeof(addr))) {
+			perror("temporary connect() failed");
+			exit(EXIT_FAILURE);
+		}
+		if (sizeof(buf) != send(fd, buf, sizeof(buf), 0))
+			perror("send on temporary server socket failed");
+		shutdown(fd, SHUT_RD | SHUT_WR);
+		close(fd);
+		exit(0);
+	}
+	/* Receive the message in mother process: */
+	fd = accept(listenerfd, NULL, NULL);
+	if (fd <= 0) {
+		perror("accept on temporary listener socket failed");
+		exit(EXIT_FAILURE);
+	}
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 2;
+	iov[0].iov_base = NULL;
+	iov[0].iov_len = 4;
+	iov[1].iov_base = &buf[3];
+	iov[1].iov_len = 4;
+	len = recvmsg(fd, &msg, MSG_PEEK);
+	if (len <= 0) {
+		if (errno != EFAULT) {
+			perror("temporary recvmsg() failed");
+			exit(EXIT_FAILURE);
+		}
+		ret = false;
+	} else if ((size_t)len != iov[1].iov_len) {
+		exit(EXIT_FAILURE);
+	}
+	shutdown(fd, SHUT_RD | SHUT_WR);
+	close(fd);
+	info("MSG_PEEK with offset %ssupported by kernel.\n", ret ? "" : "not ");
+	return ret;
+}
-- 
2.39.2


             reply	other threads:[~2023-12-05 23:36 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-05 23:36 Jon Maloy [this message]
2023-12-06 14:59 ` tcp.c: leverage MSG_PEEK with offset kernel capability when available Stefano Brivio
2023-12-06 15:08   ` Stefano Brivio
2023-12-06 16:10   ` Jon Maloy
2023-12-06 17:06     ` Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231205233604.1491317-1-jmaloy@redhat.com \
    --to=jmaloy@redhat.com \
    --cc=dgibson@redhat.com \
    --cc=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).