public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
@ 2024-01-14 18:07 Jon Maloy
  2024-01-18  3:05 ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Jon Maloy @ 2024-01-14 18:07 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This
makes it possible to avoid repeated reading of already read initial bytes of
a received message, hence saving us read cycles when forwarding TCP messages
in the host->name space direction.

When this feature is supported, iov_sock[0].iov_base can be set to NULL. The
kernel code will interpret this as an instruction to skip reading of the first
iov_sock[0].iov_len bytes of the message.

Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this
by making this into a pointer, setting it to NULL if we find that the feature
is supported by the kernel, an letting it point to a static buffer if not.

There is no macro or function indicating kernel support for this feature. We
therefore have to probe for it by creating a temporary tcp connection and
read from it as if the feature is present. If that fails, we fall back to
the original design. The traffic connection cannot be used for this purpose,
because it will be broken if the probe reading fails.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>

---
v2: Changes as suggested by Stefano Brivio:
    - Moved probe process/function into a separate, temporary name space, to avoid
      occupying port numbers in the current name space.
    - Put discard buffer back to static memory.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 netlink.c |   2 +-
 netlink.h |   1 +
 tcp.c     | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/netlink.c b/netlink.c
index 379d46e..d5f10de 100644
--- a/netlink.c
+++ b/netlink.c
@@ -55,7 +55,7 @@ static int nl_seq = 1;
  *
  * Return: 0
  */
-static int nl_sock_init_do(void *arg)
+int nl_sock_init_do(void *arg)
 {
 	struct sockaddr_nl addr = { .nl_family = AF_NETLINK, };
 	int *s = arg ? &nl_sock_ns : &nl_sock;
diff --git a/netlink.h b/netlink.h
index 3a1f0de..cadd3b7 100644
--- a/netlink.h
+++ b/netlink.h
@@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
 int nl_link_get_mac(int s, unsigned int ifi, void *mac);
 int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
 int nl_link_up(int s, unsigned int ifi, int mtu);
+int nl_sock_init_do(void *arg);
 
 #endif /* NETLINK_H */
diff --git a/tcp.c b/tcp.c
index f506cfd..4410460 100644
--- a/tcp.c
+++ b/tcp.c
@@ -283,6 +283,7 @@
 #include <sys/timerfd.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/wait.h>
 #include <time.h>
 
 #include <linux/tcp.h> /* For struct tcp_info */
@@ -297,6 +298,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "netlink.h"
 
 #include "tcp_conn.h"
 #include "flow_table.h"
@@ -402,6 +404,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		tcp_discard_buf[MAX_WINDOW];
+static char* tcp_buf_discard = tcp_discard_buf;
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Ignore discard buffer if not needed */
+	if (tcp_probe_msg_peek_offset_cap())
+		tcp_buf_discard = NULL;
 	return 0;
 }
 
@@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+static int tcp_probe_sockets(void *arg)
+{
+        char b;
+        struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, };
+        struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0};
+        struct sockaddr a = { AF_INET, {0, }};
+	int err = EXIT_FAILURE;
+        int s[2] = {0, };
+	int s_recv = 0;
+	int *rc = arg;
+        ssize_t len;
+
+	/* Make sure loopback interface is enabled */
+	nl_sock_init_do(NULL);
+	nl_link_up(nl_sock, 1 /* lo */, 0);
+
+	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	if (s[0] < 0 || s[1] < 0) {
+		perror("Temporary probe socket creation failed\n");
+		goto out;
+	}
+	if (0 > bind(s[0], &a, sizeof(a))) {
+		perror("Temporary probe socket bind() failed\n");
+		goto out;
+	}
+	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
+		perror("Temporary probe socket getsockname() failed\n");
+		goto out;
+	}
+	if (0 > listen(s[0], 0)) {
+		perror("Temporary probe socket listen() failed\n");
+		goto out;
+	}
+	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
+		perror("Temporary probe socket connect() failed\n");
+		goto out;
+	}
+	s_recv = accept(s[0], NULL, NULL);
+	if (s_recv <= 0) {
+		perror("Temporary probe socket accept() failed\n");
+		goto out;
+	}
+	if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {
+		perror("Temporary probe socket send() failed\n");
+		goto out;
+	}
+	len = recvmsg(s_recv, &msg, MSG_PEEK);
+	if (len <= 0 && errno != EFAULT) {
+		perror("Temporary probe socket recvmsg() failed\n");
+		goto out;
+	}
+	printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");
+	*rc = len == 1;
+	err = EXIT_SUCCESS;
+out:
+	close(s_recv);
+	close(s[1]);
+	close(s[0]);
+	return err;
+}
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	char ns_fn_stack[NS_FN_STACK_SIZE];
+	int child_pid, child_ret, rc = 0;
+
+	child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack),
+			     CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+			     (void *)&rc);
+	if (child_pid <= 0) {
+		perror("Failed to clone tcp probe process\n");
+		exit(EXIT_FAILURE);
+	}
+	waitpid(child_pid, &child_ret, 0);
+	return rc;
+}
-- 
@@ -283,6 +283,7 @@
 #include <sys/timerfd.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/wait.h>
 #include <time.h>
 
 #include <linux/tcp.h> /* For struct tcp_info */
@@ -297,6 +298,7 @@
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "netlink.h"
 
 #include "tcp_conn.h"
 #include "flow_table.h"
@@ -402,6 +404,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 
+static bool tcp_probe_msg_peek_offset_cap();
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
-static char 		tcp_buf_discard		[MAX_WINDOW];
+static char 		tcp_discard_buf[MAX_WINDOW];
+static char* tcp_buf_discard = tcp_discard_buf;
 static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
 
 static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
@@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
+
+/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?
+ */
+static inline  bool msg_peek_offset_cap()
+{
+	return !tcp_buf_discard;
+}
+
+
 /** conn_at_idx() - Find a connection by index, if present
  * @idx:	Index of connection to lookup
  *
@@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	sendlen = len - already_sent;
+	sendlen = len;
+	if (!msg_peek_offset_cap())
+		sendlen -= already_sent;
 	if (sendlen <= 0) {
 		conn_flag(c, conn, STALLED);
 		return 0;
@@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c)
 		NS_CALL(tcp_ns_socks_init, c);
 	}
 
+	/* Ignore discard buffer if not needed */
+	if (tcp_probe_msg_peek_offset_cap())
+		tcp_buf_discard = NULL;
 	return 0;
 }
 
@@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
+
+static int tcp_probe_sockets(void *arg)
+{
+        char b;
+        struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, };
+        struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0};
+        struct sockaddr a = { AF_INET, {0, }};
+	int err = EXIT_FAILURE;
+        int s[2] = {0, };
+	int s_recv = 0;
+	int *rc = arg;
+        ssize_t len;
+
+	/* Make sure loopback interface is enabled */
+	nl_sock_init_do(NULL);
+	nl_link_up(nl_sock, 1 /* lo */, 0);
+
+	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	if (s[0] < 0 || s[1] < 0) {
+		perror("Temporary probe socket creation failed\n");
+		goto out;
+	}
+	if (0 > bind(s[0], &a, sizeof(a))) {
+		perror("Temporary probe socket bind() failed\n");
+		goto out;
+	}
+	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
+		perror("Temporary probe socket getsockname() failed\n");
+		goto out;
+	}
+	if (0 > listen(s[0], 0)) {
+		perror("Temporary probe socket listen() failed\n");
+		goto out;
+	}
+	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
+		perror("Temporary probe socket connect() failed\n");
+		goto out;
+	}
+	s_recv = accept(s[0], NULL, NULL);
+	if (s_recv <= 0) {
+		perror("Temporary probe socket accept() failed\n");
+		goto out;
+	}
+	if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {
+		perror("Temporary probe socket send() failed\n");
+		goto out;
+	}
+	len = recvmsg(s_recv, &msg, MSG_PEEK);
+	if (len <= 0 && errno != EFAULT) {
+		perror("Temporary probe socket recvmsg() failed\n");
+		goto out;
+	}
+	printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");
+	*rc = len == 1;
+	err = EXIT_SUCCESS;
+out:
+	close(s_recv);
+	close(s[1]);
+	close(s[0]);
+	return err;
+}
+
+/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
+ */
+static bool tcp_probe_msg_peek_offset_cap()
+{
+	char ns_fn_stack[NS_FN_STACK_SIZE];
+	int child_pid, child_ret, rc = 0;
+
+	child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack),
+			     CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+			     (void *)&rc);
+	if (child_pid <= 0) {
+		perror("Failed to clone tcp probe process\n");
+		exit(EXIT_FAILURE);
+	}
+	waitpid(child_pid, &child_ret, 0);
+	return rc;
+}
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
  2024-01-14 18:07 [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available Jon Maloy
@ 2024-01-18  3:05 ` David Gibson
  2024-01-18 16:23   ` Stefano Brivio
  0 siblings, 1 reply; 6+ messages in thread
From: David Gibson @ 2024-01-18  3:05 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 11378 bytes --]

On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:
> The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This
> makes it possible to avoid repeated reading of already read initial bytes of
> a received message, hence saving us read cycles when forwarding TCP messages
> in the host->name space direction.
> 
> When this feature is supported, iov_sock[0].iov_base can be set to NULL. The
> kernel code will interpret this as an instruction to skip reading of the first
> iov_sock[0].iov_len bytes of the message.
> 
> Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this
> by making this into a pointer, setting it to NULL if we find that the feature
> is supported by the kernel, an letting it point to a static buffer if not.
> 
> There is no macro or function indicating kernel support for this feature. We
> therefore have to probe for it by creating a temporary tcp connection and
> read from it as if the feature is present. If that fails, we fall back to
> the original design. The traffic connection cannot be used for this purpose,
> because it will be broken if the probe reading fails.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> 
> ---
> v2: Changes as suggested by Stefano Brivio:
>     - Moved probe process/function into a separate, temporary name space, to avoid
>       occupying port numbers in the current name space.
>     - Put discard buffer back to static memory.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  netlink.c |   2 +-
>  netlink.h |   1 +
>  tcp.c     | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 103 insertions(+), 3 deletions(-)
> 
> diff --git a/netlink.c b/netlink.c
> index 379d46e..d5f10de 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -55,7 +55,7 @@ static int nl_seq = 1;
>   *
>   * Return: 0
>   */
> -static int nl_sock_init_do(void *arg)
> +int nl_sock_init_do(void *arg)
>  {
>  	struct sockaddr_nl addr = { .nl_family = AF_NETLINK, };
>  	int *s = arg ? &nl_sock_ns : &nl_sock;
> diff --git a/netlink.h b/netlink.h
> index 3a1f0de..cadd3b7 100644
> --- a/netlink.h
> +++ b/netlink.h
> @@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
>  int nl_link_get_mac(int s, unsigned int ifi, void *mac);
>  int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
>  int nl_link_up(int s, unsigned int ifi, int mtu);
> +int nl_sock_init_do(void *arg);
>  
>  #endif /* NETLINK_H */
> diff --git a/tcp.c b/tcp.c
> index f506cfd..4410460 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -283,6 +283,7 @@
>  #include <sys/timerfd.h>
>  #include <sys/types.h>
>  #include <sys/uio.h>
> +#include <sys/wait.h>
>  #include <time.h>
>  
>  #include <linux/tcp.h> /* For struct tcp_info */
> @@ -297,6 +298,7 @@
>  #include "log.h"
>  #include "inany.h"
>  #include "flow.h"
> +#include "netlink.h"
>  
>  #include "tcp_conn.h"
>  #include "flow_table.h"
> @@ -402,6 +404,8 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
>  	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
>  #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
>  
> +static bool tcp_probe_msg_peek_offset_cap();

We usually prefer to re-order functions in the file, rather than use
forward declarations.  That said, I'm not sure I'd put the probing
logic in tcp.c, but I'm not entirely sure where I would put it.

> +
>  static const char *tcp_event_str[] __attribute((__unused__)) = {
>  	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
>  
> @@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>  
>  /* recvmsg()/sendmsg() data for tap */
> -static char 		tcp_buf_discard		[MAX_WINDOW];
> +static char 		tcp_discard_buf[MAX_WINDOW];
> +static char* tcp_buf_discard = tcp_discard_buf;

tcp_discard_buf vs. tcp_buf_discard seems very easy to confuse.  Maybe
tcp_discard_buf and tcp_discard_ptr?  Or, since you need explicit
tests on the capability boolean anyway, just open code putting either
tcp_buf_discard or NULL into the iovec.

>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>  
>  static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> @@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used;
>  
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>  
> +
> +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?

Needs a "Return:" comment.

> + */
> +static inline  bool msg_peek_offset_cap()

msg_peek_offset_cap() vs. tcp_probe_msg_peek_offset_cap() also seems
very easy to confuse.

> +{
> +	return !tcp_buf_discard;
> +}
> +
> +
>  /** conn_at_idx() - Find a connection by index, if present
>   * @idx:	Index of connection to lookup
>   *
> @@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>  
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!msg_peek_offset_cap())
> +		sendlen -= already_sent;

Hmmm.. I think this is implying that when using this feature the
recvmsg() call won't count the discarded first segment in the total
received length returned.  That doesn't seem quite correct to me.  It
would make sense if we did have an explicit offset sent, but at least
the way I think of this NULL buffer feature, we're logically doing the
full recvmsg(), but choosing to send some of the data to a black hole.
That suggests to me it should return the same value whether or not one
of the buffers is NULL.

Perhaps more to the point, I think keeping the same return value is
strictly more useful: we don't need it here, but I can imagine users
of this feature where they want to discard the next n bytes of data,
but they don't yet know if that data has already been received at the
kernel level.  So, even if there isn't enough data to make it into the
second buffer - so it's all discarded - they want to know how many
bytes it was that were discarded.

>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>  
> +	/* Ignore discard buffer if not needed */
> +	if (tcp_probe_msg_peek_offset_cap())
> +		tcp_buf_discard = NULL;
>  	return 0;
>  }
>  
> @@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
>  	if (c->mode == MODE_PASTA)
>  		tcp_splice_refill(c);
>  }
> +


Function header comment would be good.

> +static int tcp_probe_sockets(void *arg)
> +{
> +        char b;

passt uses tab, not space indents.

> +        struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, };
> +        struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0};
> +        struct sockaddr a = { AF_INET, {0, }};

I think you want sockaddr_in here.  I don't believe plain sockaddr is
guaranteed long enough to hold an IP address, though it probably is in
practice.

> +	int err = EXIT_FAILURE;
> +        int s[2] = {0, };

No need to initialise this, you overwrite both elements anyway.  Also,
you do entirely different things with the two elements, so it might
as well be different variables (as you have with s_recv).

> +	int s_recv = 0;

Again, no need for an initializer, you unconditionally overwrite this.

> +	int *rc = arg;
> +        ssize_t len;
> +
> +	/* Make sure loopback interface is enabled */
> +	nl_sock_init_do(NULL);

Ugh.. this will overwrite the nl_sock global with the netlink socket
in your temporary namespace.  We might get away with that if you do
this early enough, but it's pretty counter-intuitive.

I think we want to refactor no_sock_init_do() as one function that
just returns the new netlink socket, with another helper that does the
ns entering we need for the pasta namespace.  Here you could just use
that base function and put the temporary socket for the temporary ns
into a local.

> +	nl_link_up(nl_sock, 1 /* lo */, 0);
> +
> +	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> +	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
> +	if (s[0] < 0 || s[1] < 0) {
> +		perror("Temporary probe socket creation failed\n");
> +		goto out;
> +	}
> +	if (0 > bind(s[0], &a, sizeof(a))) {

Since the socket address is unspecified, why do you need to bind at
all?  It might be clearer to explicitly set a to localhost + a
specific port - because you're in a temporary namespace, you can rely
on every port being available.

> +		perror("Temporary probe socket bind() failed\n");
> +		goto out;
> +	}
> +	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
> +		perror("Temporary probe socket getsockname() failed\n");
> +		goto out;
> +	}
> +	if (0 > listen(s[0], 0)) {
> +		perror("Temporary probe socket listen() failed\n");
> +		goto out;
> +	}
> +	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
> +		perror("Temporary probe socket connect() failed\n");
> +		goto out;
> +	}

This is assuming that a will now contain the correct address to
connect to.  Although it will have the right port, I think the address
may still be unspecified for the listening socket.

> +	s_recv = accept(s[0], NULL, NULL);
> +	if (s_recv <= 0) {

Should be strictly < 0 (although it's very unlikely to occur in practice).

> +		perror("Temporary probe socket accept() failed\n");
> +		goto out;
> +	}
> +	if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {

IIUC this is only checking errno in the case where send() did *not*
return <= 0, and therefore doesn't contain any relevant value.

> +		perror("Temporary probe socket send() failed\n");
> +		goto out;
> +	}
> +	len = recvmsg(s_recv, &msg, MSG_PEEK);
> +	if (len <= 0 && errno != EFAULT) {
> +		perror("Temporary probe socket recvmsg() failed\n");
> +		goto out;
> +	}
> +	printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");

Better to use the info() or debug() helper here.

> +	*rc = len == 1;

Better to explicitly check if you got an EFAULT or not here, rather
than relying on the subtly different return semantics which as noted
above I don't think are a good idea.

> +	err = EXIT_SUCCESS;

> +out:
> +	close(s_recv);
> +	close(s[1]);
> +	close(s[0]);
> +	return err;
> +}
> +
> +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
> + */
> +static bool tcp_probe_msg_peek_offset_cap()

I believe we prefer the explicit foo(void) for declarations of
functions with no parameters, rather than just foo().

> +{
> +	char ns_fn_stack[NS_FN_STACK_SIZE];
> +	int child_pid, child_ret, rc = 0;
> +
> +	child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack),
> +			     CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
> +			     (void *)&rc);
> +	if (child_pid <= 0) {
> +		perror("Failed to clone tcp probe process\n");
> +		exit(EXIT_FAILURE);
> +	}
> +	waitpid(child_pid, &child_ret, 0);
> +	return rc;
> +}

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
  2024-01-18  3:05 ` David Gibson
@ 2024-01-18 16:23   ` Stefano Brivio
  2024-01-19  0:05     ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Stefano Brivio @ 2024-01-18 16:23 UTC (permalink / raw)
  To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

Not a full review, but a couple of comments, mostly about stuff I also
had in pkt_selfie.c (review of v1):

On Thu, 18 Jan 2024 14:05:38 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:
> >
> > [...]
> >
> > +
> > +	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> > +	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
> > +	if (s[0] < 0 || s[1] < 0) {
> > +		perror("Temporary probe socket creation failed\n");
> > +		goto out;
> > +	}
> > +	if (0 > bind(s[0], &a, sizeof(a))) {  
> 
> Since the socket address is unspecified, why do you need to bind at
> all?  It might be clearer to explicitly set a to localhost + a
> specific port - because you're in a temporary namespace, you can rely
> on every port being available.

There are two advantages of bind() without port, and then getsockname():
first, ip_unprivileged_port_start might have whatever value in our new
namespace (we don't touch it), and I wouldn't take for granted we'll
have CAP_SYS_ADMIN in it for all the possible start-up combinations.

Second, there's no need for a magic value.

> > +		perror("Temporary probe socket bind() failed\n");
> > +		goto out;
> > +	}
> > +	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
> > +		perror("Temporary probe socket getsockname() failed\n");
> > +		goto out;
> > +	}
> > +	if (0 > listen(s[0], 0)) {
> > +		perror("Temporary probe socket listen() failed\n");
> > +		goto out;
> > +	}
> > +	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
> > +		perror("Temporary probe socket connect() failed\n");
> > +		goto out;
> > +	}  
> 
> This is assuming that a will now contain the correct address to
> connect to.  Although it will have the right port, I think the address
> may still be unspecified for the listening socket.

Hmm, why? From getsockname(2):

       getsockname()  returns  the  current address to which the socket
       sockfd is bound [...]

> > [...]
> >
> > +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
> > + */
> > +static bool tcp_probe_msg_peek_offset_cap()  
> 
> I believe we prefer the explicit foo(void) for declarations of
> functions with no parameters, rather than just foo().

Right, because foo() isn't a prototype, while foo(void) is. Perhaps at
some point we should enable -Wstrict-prototypes in CFLAGS (it's not in
-Wextra, I just realised).

Look:

$ cat prototypes.c
int a(void) { ; }
int b() { ; }
int main(char **argv) { a(); a(1); b(); b(1); }

$ gcc prototypes.c
prototypes.c: In function ‘main’:
prototypes.c:3:30: error: too many arguments to function ‘a’
    3 | int main(char **argv) { a(); a(1); b(); b(1); }
      |                              ^
prototypes.c:1:5: note: declared here
    1 | int a(void) { ; }
      |     ^

note that calling b() with any number and type of arguments is fine. And:

$ gcc -Wstrict-prototypes -Werror -Wfatal-errors prototypes.c
prototypes.c:2:5: error: function declaration isn’t a prototype [-Werror=strict-prototypes]
    2 | int b() { ; }
      |     ^
compilation terminated due to -Wfatal-errors.
cc1: all warnings being treated as errors

> > [...]

-- 
Stefano


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
  2024-01-18 16:23   ` Stefano Brivio
@ 2024-01-19  0:05     ` David Gibson
  2024-01-19 10:45       ` Stefano Brivio
  0 siblings, 1 reply; 6+ messages in thread
From: David Gibson @ 2024-01-19  0:05 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 4125 bytes --]

On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:
> Not a full review, but a couple of comments, mostly about stuff I also
> had in pkt_selfie.c (review of v1):
> 
> On Thu, 18 Jan 2024 14:05:38 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:
> > >
> > > [...]
> > >
> > > +
> > > +	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> > > +	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
> > > +	if (s[0] < 0 || s[1] < 0) {
> > > +		perror("Temporary probe socket creation failed\n");
> > > +		goto out;
> > > +	}
> > > +	if (0 > bind(s[0], &a, sizeof(a))) {  
> > 
> > Since the socket address is unspecified, why do you need to bind at
> > all?  It might be clearer to explicitly set a to localhost + a
> > specific port - because you're in a temporary namespace, you can rely
> > on every port being available.
> 
> There are two advantages of bind() without port, and then getsockname():
> first, ip_unprivileged_port_start might have whatever value in our new
> namespace (we don't touch it), and I wouldn't take for granted we'll
> have CAP_SYS_ADMIN in it for all the possible start-up combinations.
> 
> Second, there's no need for a magic value.

Good point.  Note that at present we're not bind()ing to an address
either.

> > > +		perror("Temporary probe socket bind() failed\n");
> > > +		goto out;
> > > +	}
> > > +	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
> > > +		perror("Temporary probe socket getsockname() failed\n");
> > > +		goto out;
> > > +	}
> > > +	if (0 > listen(s[0], 0)) {
> > > +		perror("Temporary probe socket listen() failed\n");
> > > +		goto out;
> > > +	}
> > > +	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
> > > +		perror("Temporary probe socket connect() failed\n");
> > > +		goto out;
> > > +	}  
> > 
> > This is assuming that a will now contain the correct address to
> > connect to.  Although it will have the right port, I think the address
> > may still be unspecified for the listening socket.
> 
> Hmm, why? From getsockname(2):
> 
>        getsockname()  returns  the  current address to which the socket
>        sockfd is bound [...]

But we've only bound ourselves to 0.0.0.0, which while perfectly
cromulent for a listening socket, is no good for connect().  If we
accept(), then the accepted socket will have a specific bound address,
but the listening socket won't (I'm pretty sure I've actually tested
this).

> > > [...]
> > >
> > > +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support
> > > + */
> > > +static bool tcp_probe_msg_peek_offset_cap()  
> > 
> > I believe we prefer the explicit foo(void) for declarations of
> > functions with no parameters, rather than just foo().
> 
> Right, because foo() isn't a prototype, while foo(void) is. Perhaps at
> some point we should enable -Wstrict-prototypes in CFLAGS (it's not in
> -Wextra, I just realised).
> 
> Look:
> 
> $ cat prototypes.c
> int a(void) { ; }
> int b() { ; }
> int main(char **argv) { a(); a(1); b(); b(1); }
> 
> $ gcc prototypes.c
> prototypes.c: In function ‘main’:
> prototypes.c:3:30: error: too many arguments to function ‘a’
>     3 | int main(char **argv) { a(); a(1); b(); b(1); }
>       |                              ^
> prototypes.c:1:5: note: declared here
>     1 | int a(void) { ; }
>       |     ^
> 
> note that calling b() with any number and type of arguments is fine. And:
> 
> $ gcc -Wstrict-prototypes -Werror -Wfatal-errors prototypes.c
> prototypes.c:2:5: error: function declaration isn’t a prototype [-Werror=strict-prototypes]
>     2 | int b() { ; }
>       |     ^
> compilation terminated due to -Wfatal-errors.
> cc1: all warnings being treated as errors
> 
> > > [...]
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
  2024-01-19  0:05     ` David Gibson
@ 2024-01-19 10:45       ` Stefano Brivio
  2024-01-20  4:47         ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Stefano Brivio @ 2024-01-19 10:45 UTC (permalink / raw)
  To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

On Fri, 19 Jan 2024 11:05:02 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:
> > Not a full review, but a couple of comments, mostly about stuff I also
> > had in pkt_selfie.c (review of v1):
> > 
> > On Thu, 18 Jan 2024 14:05:38 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:  
> > > >
> > > > [...]
> > > >
> > > > +
> > > > +	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> > > > +	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
> > > > +	if (s[0] < 0 || s[1] < 0) {
> > > > +		perror("Temporary probe socket creation failed\n");
> > > > +		goto out;
> > > > +	}
> > > > +	if (0 > bind(s[0], &a, sizeof(a))) {    
> > > 
> > > Since the socket address is unspecified, why do you need to bind at
> > > all?  It might be clearer to explicitly set a to localhost + a
> > > specific port - because you're in a temporary namespace, you can rely
> > > on every port being available.  
> > 
> > There are two advantages of bind() without port, and then getsockname():
> > first, ip_unprivileged_port_start might have whatever value in our new
> > namespace (we don't touch it), and I wouldn't take for granted we'll
> > have CAP_SYS_ADMIN in it for all the possible start-up combinations.
> > 
> > Second, there's no need for a magic value.  
> 
> Good point.  Note that at present we're not bind()ing to an address
> either.
> 
> > > > +		perror("Temporary probe socket bind() failed\n");
> > > > +		goto out;
> > > > +	}
> > > > +	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
> > > > +		perror("Temporary probe socket getsockname() failed\n");
> > > > +		goto out;
> > > > +	}
> > > > +	if (0 > listen(s[0], 0)) {
> > > > +		perror("Temporary probe socket listen() failed\n");
> > > > +		goto out;
> > > > +	}
> > > > +	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
> > > > +		perror("Temporary probe socket connect() failed\n");
> > > > +		goto out;
> > > > +	}    
> > > 
> > > This is assuming that a will now contain the correct address to
> > > connect to.  Although it will have the right port, I think the address
> > > may still be unspecified for the listening socket.  
> > 
> > Hmm, why? From getsockname(2):
> > 
> >        getsockname()  returns  the  current address to which the socket
> >        sockfd is bound [...]  
> 
> But we've only bound ourselves to 0.0.0.0, which while perfectly
> cromulent for a listening socket, is no good for connect().

Hah, "cromulent" just embiggened my dictionary! Why not, though? From
RFC 6890, 2.2.2:

              +----------------------+----------------------------+
              | Attribute            | Value                      |
              +----------------------+----------------------------+
              | Address Block        | 0.0.0.0/8                  |
              | Name                 | "This host on this network"|

and:

  $ strace -e connect ./pkt_selfie 
  --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=891325, si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
  connect(5, {sa_family=AF_INET, sin_port=htons(51155), sin_addr=inet_addr("0.0.0.0")}, 16) = -1 EINPROGRESS (Operation now in progress)
  MSG_PEEK with offset not supported
  +++ exited with 0 +++

with pkt_selfie.c from review of v1:

  https://archives.passt.top/passt-dev/20231206160808.3d312733@elisabeth/

-- 
Stefano


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available
  2024-01-19 10:45       ` Stefano Brivio
@ 2024-01-20  4:47         ` David Gibson
  0 siblings, 0 replies; 6+ messages in thread
From: David Gibson @ 2024-01-20  4:47 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 4172 bytes --]

On Fri, Jan 19, 2024 at 11:45:05AM +0100, Stefano Brivio wrote:
> On Fri, 19 Jan 2024 11:05:02 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:
> > > Not a full review, but a couple of comments, mostly about stuff I also
> > > had in pkt_selfie.c (review of v1):
> > > 
> > > On Thu, 18 Jan 2024 14:05:38 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:  
> > > > >
> > > > > [...]
> > > > >
> > > > > +
> > > > > +	s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> > > > > +	s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
> > > > > +	if (s[0] < 0 || s[1] < 0) {
> > > > > +		perror("Temporary probe socket creation failed\n");
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if (0 > bind(s[0], &a, sizeof(a))) {    
> > > > 
> > > > Since the socket address is unspecified, why do you need to bind at
> > > > all?  It might be clearer to explicitly set a to localhost + a
> > > > specific port - because you're in a temporary namespace, you can rely
> > > > on every port being available.  
> > > 
> > > There are two advantages of bind() without port, and then getsockname():
> > > first, ip_unprivileged_port_start might have whatever value in our new
> > > namespace (we don't touch it), and I wouldn't take for granted we'll
> > > have CAP_SYS_ADMIN in it for all the possible start-up combinations.
> > > 
> > > Second, there's no need for a magic value.  
> > 
> > Good point.  Note that at present we're not bind()ing to an address
> > either.
> > 
> > > > > +		perror("Temporary probe socket bind() failed\n");
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) {
> > > > > +		perror("Temporary probe socket getsockname() failed\n");
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if (0 > listen(s[0], 0)) {
> > > > > +		perror("Temporary probe socket listen() failed\n");
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) {
> > > > > +		perror("Temporary probe socket connect() failed\n");
> > > > > +		goto out;
> > > > > +	}    
> > > > 
> > > > This is assuming that a will now contain the correct address to
> > > > connect to.  Although it will have the right port, I think the address
> > > > may still be unspecified for the listening socket.  
> > > 
> > > Hmm, why? From getsockname(2):
> > > 
> > >        getsockname()  returns  the  current address to which the socket
> > >        sockfd is bound [...]  
> > 
> > But we've only bound ourselves to 0.0.0.0, which while perfectly
> > cromulent for a listening socket, is no good for connect().
> 
> Hah, "cromulent" just embiggened my dictionary! Why not, though? From
> RFC 6890, 2.2.2:
> 
>               +----------------------+----------------------------+
>               | Attribute            | Value                      |
>               +----------------------+----------------------------+
>               | Address Block        | 0.0.0.0/8                  |
>               | Name                 | "This host on this network"|

Huh.  I never realised you could use 0.0.0.0 like that.  I guess that
makes it work, though I still feel like explicitly using localhost
would be clearer.

> and:
> 
>   $ strace -e connect ./pkt_selfie 
>   --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=891325, si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
>   connect(5, {sa_family=AF_INET, sin_port=htons(51155), sin_addr=inet_addr("0.0.0.0")}, 16) = -1 EINPROGRESS (Operation now in progress)
>   MSG_PEEK with offset not supported
>   +++ exited with 0 +++
> 
> with pkt_selfie.c from review of v1:
> 
>   https://archives.passt.top/passt-dev/20231206160808.3d312733@elisabeth/
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-01-20  4:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-14 18:07 [PATCH v2] tcp.c: leverage MSG_PEEK with offset kernel capability when available Jon Maloy
2024-01-18  3:05 ` David Gibson
2024-01-18 16:23   ` Stefano Brivio
2024-01-19  0:05     ` David Gibson
2024-01-19 10:45       ` Stefano Brivio
2024-01-20  4:47         ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).