public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
@ 2024-01-14 18:07 Jon Maloy
  2024-01-18  3:05 ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Jon Maloy @ 2024-01-14 18:07 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

The kernel may support recvmsg(MSG_PEEK) starting from a given offset. This
makes it possible to avoid repeatedly reading the initial, already-read bytes
of a received message, saving us read cycles when forwarding TCP messages in
the host->namespace direction.

When this feature is supported, iov_sock[0].iov_base can be set to NULL. The
kernel code will interpret this as an instruction to skip reading of the first
iov_sock[0].iov_len bytes of the message.

Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this
by turning tcp_buf_discard into a pointer, setting it to NULL if we find that
the feature is supported by the kernel, and letting it point to a static
buffer if not.

There is no macro or function indicating kernel support for this feature. We
therefore have to probe for it by creating a temporary TCP connection and
reading from it as if the feature were present. If that fails, we fall back to
the original design. The traffic connection cannot be used for this purpose,
because it would be broken if the probe read failed.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: Changes as suggested by Stefano Brivio:
    - Moved probe process/function into a separate, temporary namespace, to avoid
      occupying port numbers in the current namespace.
    - Put discard buffer back to static memory.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 netlink.c |   2 +-
 netlink.h |   1 +
 tcp.c     | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/netlink.c b/netlink.c
index 379d46e..d5f10de 100644
--- a/netlink.c
+++ b/netlink.c
@@ -55,7 +55,7 @@ static int nl_seq = 1;
  *
  * Return: 0
  */
-static int nl_sock_init_do(void *arg)
+int nl_sock_init_do(void *arg)
 {
 	struct sockaddr_nl addr = { .nl_family = AF_NETLINK, };
 	int *s = arg ? &nl_sock_ns : &nl_sock;
diff --git a/netlink.h b/netlink.h
index 3a1f0de..cadd3b7 100644
--- a/netlink.h
+++ b/netlink.h
@@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
 int nl_link_get_mac(int s, unsigned int ifi, void *mac);
 int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
 int nl_link_up(int s, unsigned int ifi, int mtu);
+int nl_sock_init_do(void *arg);
 
 #endif /* NETLINK_H */
diff --git a/tcp.c b/tcp.c
index f506cfd..4410460 100644
--- a/tcp.c
+++ b/tcp.c
@@ -283,6 +283,7 @@
 #include <sys/timerfd.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/wait.h>
 #include <time.h>
 
 #include <linux/tcp.h> /* For struct tcp_info */
@@ -297,6 +298,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "netlink.h"
 
 #include "tcp_conn.h"
 #include "flow_table.h"
@@ -402,6 +404,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		tcp_discard_buf[MAX_WINDOW];
+static char* tcp_buf_discard = tcp_discard_buf;
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Ignore discard buffer if not needed */
+	if (tcp_probe_msg_peek_offset_cap())
+		tcp_buf_discard = NULL;
 	return 0;
 }
 
@@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+static int tcp_probe_sockets(void *arg)
+{
+        char b;
+        struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, };
+        struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0};
+        struct sockaddr a = { AF_INET, {0, }};
+	int err = EXIT_FAILURE;
+        int s[2] = {0, };
+	int s_recv = 0;
+	int *rc = arg;
+        ssize_t len;
+
+	/* Make sure loopback interface is enabled */
+	nl_sock_init_do(NULL);
+	nl_link_up(nl_sock, 1 /* lo */, 0);
+
+	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	if (s[0] < 0 || s[1] < 0) {
+		perror("Temporary probe socket creation failed\n");
+		goto out;
+	}
+	if (0 > bind(s[0], &a, sizeof(a))) {
+		perror("Temporary probe socket bind() failed\n");
+		goto out;
+	}
+	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
+		perror("Temporary probe socket getsockname() failed\n");
+		goto out;
+	}
+	if (0 > listen(s[0], 0)) {
+		perror("Temporary probe socket listen() failed\n");
+		goto out;
+	}
+	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
+		perror("Temporary probe socket connect() failed\n");
+		goto out;
+	}
+	s_recv = accept(s[0], NULL, NULL);
+	if (s_recv <= 0) {
+		perror("Temporary probe socket accept() failed\n");
+		goto out;
+	}
+	if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {
+		perror("Temporary probe socket send() failed\n");
+		goto out;
+	}
+	len = recvmsg(s_recv, &msg, MSG_PEEK);
+	if (len <= 0 && errno != EFAULT) {
+		perror("Temporary probe socket recvmsg() failed\n");
+		goto out;
+	}
+	printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");
+	*rc = len == 1;
+	err = EXIT_SUCCESS;
+out:
+	close(s_recv);
+	close(s[1]);
+	close(s[0]);
+	return err;
+}
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	char ns_fn_stack[NS_FN_STACK_SIZE];
+	int child_pid, child_ret, rc = 0;
+
+	child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack),
+			     CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+			     (void *)&rc);
+	if (child_pid <= 0) {
+		perror("Failed to clone tcp probe process\n");
+		exit(EXIT_FAILURE);
+	}
+	waitpid(child_pid, &child_ret, 0);
+	return rc;
+}
-- 
@@ -283,6 +283,7 @@
 #include <sys/timerfd.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/wait.h>
 #include <time.h>
 
 #include <linux/tcp.h> /* For struct tcp_info */
@@ -297,6 +298,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "netlink.h"
 
 #include "tcp_conn.h"
 #include "flow_table.h"
@@ -402,6 +404,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		tcp_discard_buf[MAX_WINDOW];
+static char* tcp_buf_discard = tcp_discard_buf;
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Ignore discard buffer if not needed */
+	if (tcp_probe_msg_peek_offset_cap())
+		tcp_buf_discard = NULL;
 	return 0;
 }
 
@@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+static int tcp_probe_sockets(void *arg)
+{
+        char b;
+        struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, };
+        struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0};
+        struct sockaddr a = { AF_INET, {0, }};
+	int err = EXIT_FAILURE;
+        int s[2] = {0, };
+	int s_recv = 0;
+	int *rc = arg;
+        ssize_t len;
+
+	/* Make sure loopback interface is enabled */
+	nl_sock_init_do(NULL);
+	nl_link_up(nl_sock, 1 /* lo */, 0);
+
+	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	if (s[0] < 0 || s[1] < 0) {
+		perror("Temporary probe socket creation failed\n");
+		goto out;
+	}
+	if (0 > bind(s[0], &a, sizeof(a))) {
+		perror("Temporary probe socket bind() failed\n");
+		goto out;
+	}
+	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
+		perror("Temporary probe socket getsockname() failed\n");
+		goto out;
+	}
+	if (0 > listen(s[0], 0)) {
+		perror("Temporary probe socket listen() failed\n");
+		goto out;
+	}
+	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
+		perror("Temporary probe socket connect() failed\n");
+		goto out;
+	}
+	s_recv = accept(s[0], NULL, NULL);
+	if (s_recv <= 0) {
+		perror("Temporary probe socket accept() failed\n");
+		goto out;
+	}
+	if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {
+		perror("Temporary probe socket send() failed\n");
+		goto out;
+	}
+	len = recvmsg(s_recv, &msg, MSG_PEEK);
+	if (len <= 0 && errno != EFAULT) {
+		perror("Temporary probe socket recvmsg() failed\n");
+		goto out;
+	}
+	printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");
+	*rc = len == 1;
+	err = EXIT_SUCCESS;
+out:
+	close(s_recv);
+	close(s[1]);
+	close(s[0]);
+	return err;
+}
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	char ns_fn_stack[NS_FN_STACK_SIZE];
+	int child_pid, child_ret, rc = 0;
+
+	child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack),
+			     CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+			     (void *)&rc);
+	if (child_pid <= 0) {
+		perror("Failed to clone tcp probe process\n");
+		exit(EXIT_FAILURE);
+	}
+	waitpid(child_pid, &child_ret, 0);
+	return rc;
+}
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-01-20  4:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-14 18:07 [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available Jon Maloy
2024-01-18  3:05 ` David Gibson
2024-01-18 16:23   ` Stefano Brivio
2024-01-19  0:05     ` David Gibson
2024-01-19 10:45       ` Stefano Brivio
2024-01-20  4:47         ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).