* tcp.c: leverage MSG_PEEK with offset kernel capability when available @ 2023-12-05 23:36 Jon Maloy 2023-12-06 14:59 ` Stefano Brivio 0 siblings, 1 reply; 5+ messages in thread From: Jon Maloy @ 2023-12-05 23:36 UTC (permalink / raw) To: sbrivio, lvivier, dgibson, jmaloy, passt-dev The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving us read cycles when forwarding TCP messages in the host->name space direction. When this feature is supported, iov_sock[0].iov_base can be set to NULL. The kernel code will interpret this as an instruction to skip reading of the first iov_sock[0].iov_len bytes of the message. Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this by simply not allocating that buffer, letting the pointer remain NULL, when we find that we have this kernel support. There is no macro or function indicating kernel support for this feature. We therefore have to probe for it by reading from an established TCP connection. The traffic connection cannot be used for this purpose, because it will be broken if the probe reading fails. We therefore have to create a temporary local connection for this task. Because of this, we also add a new function, tcp_probe_msg_peek_offset_cap(), which creates this temporary connection and performs the probe read on it. Signed-off-by: Jon Maloy <jmaloy@redhat.com> --- tcp.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index f506cfd..ab5168e 100644 --- a/tcp.c +++ b/tcp.c @@ -402,6 +402,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +static bool tcp_probe_msg_peek_offset_cap(); + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; +static char *tcp_buf_discard = NULL; + static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; @@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) + +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? + */ +static inline bool msg_peek_offset_cap() +{ + return !tcp_buf_discard; +} + + /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * @@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!msg_peek_offset_cap()) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Only allocate discard buffer if needed */ + if (!tcp_probe_msg_peek_offset_cap()) { + tcp_buf_discard = malloc(MAX_WINDOW); + if (!tcp_buf_discard) { + perror("failed to allocate discard buffer\n"); + exit(EXIT_FAILURE); + } + debug("allocated discard buffer of size %i\n", MAX_WINDOW); + } return 0; } @@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap() +{ + int listenerfd, fd; + ssize_t len; + char buf[8] = { 0 }; + struct msghdr msg; + struct iovec iov[2]; + struct sockaddr_in addr = { 0, }; + socklen_t addrlen = sizeof(addr); + int option = 1; + bool ret = true; + + memset(buf, 0, sizeof(buf)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = 0; + + if ( (listenerfd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { + perror("temporary listener socket creation failed"); + exit(EXIT_FAILURE); + } + setsockopt(listenerfd, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option)); + + if (bind(listenerfd, (const struct sockaddr *)&addr, sizeof(addr)) < 0 ) { + perror("temporary bind() failed"); + exit(EXIT_FAILURE); + } + if (0 > listen(listenerfd, 100)) { + perror("temporary listen failed"); + exit(EXIT_FAILURE); + } + if (0 != getsockname(listenerfd, (struct sockaddr *)&addr, &addrlen)) { + perror("temporary getsockname() failed"); + exit(EXIT_FAILURE); + } + + /* Spawn off a child process and let it send a message here: */ + if (fork() == 0) { + if ( (fd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { + perror("temporary socket creation failed"); + exit(EXIT_FAILURE); + } + if (0 > connect(fd, (struct sockaddr *)&addr, sizeof(addr))) { + perror("temporary connect() failed"); + exit(EXIT_FAILURE); + } + if (sizeof(buf) != send(fd, buf, sizeof(buf), 0)) + perror("send on temporary server socket failed"); + shutdown(fd, SHUT_RD | SHUT_WR); + close(fd); + exit(0); + } + /* Receive the message in mother process: */ + fd = accept(listenerfd, NULL, NULL); + if (fd <= 0) { + perror("accept on temporary listener socket failed"); + exit(EXIT_FAILURE); + } + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = iov; + msg.msg_iovlen = 2; + iov[0].iov_base = NULL; + iov[0].iov_len = 4; + iov[1].iov_base = &buf[3]; + iov[1].iov_len = 4; + len = recvmsg(fd, &msg, MSG_PEEK); + if (len <= 0) { + if (errno != EFAULT) { + perror("temporary recvmsg() failed"); + exit(EXIT_FAILURE); + } + ret = false; + } else if ((size_t)len != iov[1].iov_len) { + exit(EXIT_FAILURE); + } + shutdown(fd, SHUT_RD | SHUT_WR); + close(fd); + info("MSG_PEEK with offset %ssupported by kernel.\n", ret ? "" : "not "); + return ret; +} -- @@ -402,6 +402,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +static bool tcp_probe_msg_peek_offset_cap(); + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; +static char *tcp_buf_discard = NULL; + static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; @@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) + +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? + */ +static inline bool msg_peek_offset_cap() +{ + return !tcp_buf_discard; +} + + /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * @@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!msg_peek_offset_cap()) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Only allocate discard buffer if needed */ + if (!tcp_probe_msg_peek_offset_cap()) { + tcp_buf_discard = malloc(MAX_WINDOW); + if (!tcp_buf_discard) { + perror("failed to allocate discard buffer\n"); + exit(EXIT_FAILURE); + } + debug("allocated discard buffer of size %i\n", MAX_WINDOW); + } return 0; } @@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap() +{ + int listenerfd, fd; + ssize_t len; + char buf[8] = { 0 }; + struct msghdr msg; + struct iovec iov[2]; + struct sockaddr_in addr = { 0, }; + socklen_t addrlen = sizeof(addr); + int option = 1; + bool ret = true; + + memset(buf, 0, sizeof(buf)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = 0; + + if ( (listenerfd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { + perror("temporary listener socket creation failed"); + exit(EXIT_FAILURE); + } + setsockopt(listenerfd, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option)); + + if (bind(listenerfd, (const struct sockaddr *)&addr, sizeof(addr)) < 0 ) { + perror("temporary bind() failed"); + exit(EXIT_FAILURE); + } + if (0 > listen(listenerfd, 100)) { + perror("temporary listen failed"); + exit(EXIT_FAILURE); + } + if (0 != getsockname(listenerfd, (struct sockaddr *)&addr, &addrlen)) { + perror("temporary getsockname() failed"); + exit(EXIT_FAILURE); + } + + /* Spawn off a child process and let it send a message here: */ + if (fork() == 0) { + if ( (fd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { + perror("temporary socket creation failed"); + exit(EXIT_FAILURE); + } + if (0 > connect(fd, (struct sockaddr *)&addr, sizeof(addr))) { + perror("temporary connect() failed"); + exit(EXIT_FAILURE); + } + if (sizeof(buf) != send(fd, buf, sizeof(buf), 0)) + perror("send on temporary server socket failed"); + shutdown(fd, SHUT_RD | SHUT_WR); + close(fd); + exit(0); + } + /* Receive the message in mother process: */ + fd = accept(listenerfd, NULL, NULL); + if (fd <= 0) { + perror("accept on temporary listener socket failed"); + exit(EXIT_FAILURE); + } + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = iov; + msg.msg_iovlen = 2; + iov[0].iov_base = NULL; + iov[0].iov_len = 4; + iov[1].iov_base = &buf[3]; + iov[1].iov_len = 4; + len = recvmsg(fd, &msg, MSG_PEEK); + if (len <= 0) { + if (errno != EFAULT) { + perror("temporary recvmsg() failed"); + exit(EXIT_FAILURE); + } + ret = false; + } else if ((size_t)len != iov[1].iov_len) { + exit(EXIT_FAILURE); + } + shutdown(fd, SHUT_RD | SHUT_WR); + close(fd); + info("MSG_PEEK with offset %ssupported by kernel.\n", ret ? "" : "not "); + return ret; +} -- 2.39.2 ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: tcp.c: leverage MSG_PEEK with offset kernel capability when available 2023-12-05 23:36 tcp.c: leverage MSG_PEEK with offset kernel capability when available Jon Maloy @ 2023-12-06 14:59 ` Stefano Brivio 2023-12-06 15:08 ` Stefano Brivio 2023-12-06 16:10 ` Jon Maloy 0 siblings, 2 replies; 5+ messages in thread From: Stefano Brivio @ 2023-12-06 14:59 UTC (permalink / raw) To: Jon Maloy; +Cc: lvivier, dgibson, passt-dev [-- Attachment #1: Type: text/plain, Size: 6097 bytes --] On Tue, 5 Dec 2023 18:36:04 -0500 Jon Maloy <jmaloy@redhat.com> wrote: > The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This > makes it possible to avoid repeated reading of already read initial bytes of > a received message, hence saving us read cycles when forwarding TCP messages > in the host->name space direction. > > When this feature is supported, iov_sock[0].iov_base can be set to NULL. The > kernel code will interpret this as an instruction to skip reading of the first > iov_sock[0].iov_len bytes of the message. > > Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this > by simply not allocating that buffer, letting the pointer remain NULL, when > we find that we have this kernel support. > > There is no macro or function indicating kernel support for this feature. We > therefore have to probe for it by reading from an established TCP connection. > The traffic connection cannot be used for this purpose, because it will be > broken if the probe reading fails. We therefore have to create a temporary > local connection for this task. Because of this, we also add a new function, > tcp_probe_msg_peek_offset_cap(), which creates this temporary connection > and performs the probe read on it. > > Signed-off-by: Jon Maloy <jmaloy@redhat.com> > --- > tcp.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 108 insertions(+), 2 deletions(-) > > diff --git a/tcp.c b/tcp.c > index f506cfd..ab5168e 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -402,6 +402,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ > (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) > #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) > > +static bool tcp_probe_msg_peek_offset_cap(); No need for a forward declaration, I guess. > + > static const char *tcp_event_str[] __attribute((__unused__)) = { > "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", > > @@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; > static unsigned int tcp6_l2_buf_used; > > /* recvmsg()/sendmsg() data for tap */ > -static char tcp_buf_discard [MAX_WINDOW]; > +static char *tcp_buf_discard = NULL; > + > static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; > > static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; > @@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used; > > #define CONN(idx) (&(FLOW(idx)->tcp)) > > + > +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? > + */ > +static inline bool msg_peek_offset_cap() > +{ > + return !tcp_buf_discard; > +} > + > + > /** conn_at_idx() - Find a connection by index, if present > * @idx: Index of connection to lookup > * > @@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!msg_peek_offset_cap()) > + sendlen -= already_sent; > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c) > NS_CALL(tcp_ns_socks_init, c); > } > > + /* Only allocate discard buffer if needed */ > + if (!tcp_probe_msg_peek_offset_cap()) { > + tcp_buf_discard = malloc(MAX_WINDOW); I would rather not use the heap at all, even though after commit 0515adceaa8f ("passt, pasta: Namespace-based sandboxing, defer seccomp policy application") we don't ask seccomp to terminate the process if we call a sbrk() (or similar) in this phase. The only specific issue I have in mind is rather minor, that is, at the moment we can reliably calculate our memory footprint using nm(1). But in general, having a single set of memory addresses keep things simpler and probably a bit safer. This would be the only non-static memory we use, and I don't see a strong case for it. I would rather drop this buffer after a few months (in turn, if/after the kernel change is accepted), turning the detection into a build-time step, with passt failing if we find that we don't have this buffer, and we were built for a kernel with support for MSG_PEEK with offset. > + if (!tcp_buf_discard) { > + perror("failed to allocate discard buffer\n"); > + exit(EXIT_FAILURE); > + } > + debug("allocated discard buffer of size %i\n", MAX_WINDOW); > + } > return 0; > } > > @@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) > if (c->mode == MODE_PASTA) > tcp_splice_refill(c); > } > + > +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support > + */ > +static bool tcp_probe_msg_peek_offset_cap() For consistency: (void). I have two main criticisms to this approach: 1. it uses fork() (and that's another usage of heap memory) but we don't actually need connect() and send() to be synchronous for this test, and 2. we bind an actual TCP port where we run. I attached a sketch (pkt_selfie.c) of a slightly different approach that solves 1. by using a non-blocking socket on the client side, and solves 2. by creating the pair of sockets in a detached network namespace, which is essentially invisible and goes away once we're done with the probing. For some reason, the negative case works: $ gcc -o pkt_selfie pkt_selfie.c; strace -f ./pkt_selfie [...] sendto(5, "ab", 2, 0, NULL, 0) = 2 recvmsg(6, {msg_name=0x7ffd2ba5d130, msg_namelen=2 => 0, msg_iov=NULL, msg_iovlen=0, msg_controllen=0, msg_flags=0}, MSG_PEEK) = 0 but on a kernel with your patch, I get ENOTCONN on recvmsg(). If I replace that by a simple recv(): sendto(5, "ab", 2, 0, NULL, 0) = 2 recvfrom(6, "ab", 10, 0, NULL, NULL) = 2 ...so I don't think it's a fundamental issue with this approach, rather something with your patch, but I'm not yet sure what. :) Most of pkt_selfie.c is copied and pasted (with minimal adaptations) from existing passt's codebase, the actual implementation starts at line 107. Of course it's missing all the error checks etc. -- Stefano [-- Attachment #2: pkt_selfie.c --] [-- Type: text/x-c++src, Size: 3400 bytes --] #define _GNU_SOURCE #include <sched.h> #include <errno.h> #include <signal.h> #include <unistd.h> #include <stdlib.h> #include <stddef.h> #include <stdio.h> #include <sys/socket.h> #include <arpa/inet.h> #include <net/if.h> #include <netinet/tcp.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> /* ===> from passt's Makefile and code... */ #define RLIMIT_STACK_VAL 8192 #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg) { #ifdef __ia64__ return __clone2(fn, stack_area + stack_size / 2, stack_size / 2, flags, arg); #else return clone(fn, stack_area + stack_size / 2, flags, arg); #endif } static int nl_sock; static int nl_sock_init_do(void *arg) { struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; int *s = &nl_sock; #ifdef NETLINK_GET_STRICT_CHK int y = 1; #endif *s = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (*s < 0 || bind(*s, (struct sockaddr *)&addr, sizeof(addr))) { *s = -1; return 0; } return 0; } /** * nl_send() - Prepare and send netlink request * @s: Netlink socket * @req: Request (will fill netlink header) * @type: Request type * @flags: Extra request flags (NLM_F_REQUEST and NLM_F_ACK assumed) * @len: Request length * * Return: sequence number of request on success, terminates on error */ static uint32_t nl_send(int s, void *req, uint16_t type, uint16_t flags, ssize_t len) { struct nlmsghdr *nh; ssize_t n; nh = (struct nlmsghdr *)req; nh->nlmsg_type = type; nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; nh->nlmsg_len = len; nh->nlmsg_seq = 1; nh->nlmsg_pid = 0; n = send(s, req, len, 0); return nh->nlmsg_seq; } int nl_link_up(int s, unsigned int ifi, int mtu) { struct req_t { struct nlmsghdr nlh; struct ifinfomsg ifm; struct rtattr rta; unsigned int mtu; } req = { .ifm.ifi_family = AF_UNSPEC, .ifm.ifi_index = ifi, .ifm.ifi_flags = IFF_UP, .ifm.ifi_change = IFF_UP, .rta.rta_type = IFLA_MTU, .rta.rta_len = RTA_LENGTH(sizeof(unsigned int)), .mtu = mtu, }; ssize_t len = sizeof(req); if (!mtu) /* Shorten request to drop MTU attribute */ len = offsetof(struct req_t, rta); return nl_send(s, &req, RTM_NEWLINK, 0, len); /* was nl_do() */ } /* <=== ...until here */ static int tcp_probe_sockets(void *arg) { int *s = (int *)arg; nl_sock_init_do(NULL); nl_link_up(nl_sock, 1 /* lo */, 0); s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); return 0; } int main(int argc, char **argv) { char ns_fn_stack[NS_FN_STACK_SIZE], b; struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, }; struct sockaddr a = { AF_INET, htons(6666), }; struct msghdr msg = { iov, 2 }; int s[2], s_nl, s_recv; ssize_t len; char buf[10]; do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack), CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, (void *)s); bind(s[0], &a, sizeof(a)); getsockname(s[0], &a, &((int){ sizeof(a) })); listen(s[0], 0); connect(s[1], &a, sizeof(a)); s_recv = accept(s[0], NULL, NULL); send(s[1], (char *)("ab"), 2, 0); len = recvmsg(s_recv, &msg, MSG_PEEK); printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not "); close(s_recv); close(s[1]); close(s[0]); return 0; } ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: tcp.c: leverage MSG_PEEK with offset kernel capability when available 2023-12-06 14:59 ` Stefano Brivio @ 2023-12-06 15:08 ` Stefano Brivio 2023-12-06 16:10 ` Jon Maloy 1 sibling, 0 replies; 5+ messages in thread From: Stefano Brivio @ 2023-12-06 15:08 UTC (permalink / raw) To: Jon Maloy; +Cc: lvivier, dgibson, passt-dev [-- Attachment #1: Type: text/plain, Size: 543 bytes --] On Wed, 6 Dec 2023 15:59:40 +0100 Stefano Brivio <sbrivio@redhat.com> wrote: > [...] > > but on a kernel with your patch, I get ENOTCONN on recvmsg(). If I > replace that by a simple recv(): > > sendto(5, "ab", 2, 0, NULL, 0) = 2 > recvfrom(6, "ab", 10, 0, NULL, NULL) = 2 > > ...so I don't think it's a fundamental issue with this approach, rather > something with your patch, but I'm not yet sure what. :) Oops, my bad, I got the order of fields in struct msghdr wrong. New version attached, this one works. -- Stefano [-- Attachment #2: pkt_selfie.c --] [-- Type: text/x-c++src, Size: 3381 bytes --] #define _GNU_SOURCE #include <sched.h> #include <errno.h> #include <signal.h> #include <unistd.h> #include <stdlib.h> #include <stddef.h> #include <stdio.h> #include <sys/socket.h> #include <arpa/inet.h> #include <net/if.h> #include <netinet/tcp.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> /* ===> from passt's Makefile and code... */ #define RLIMIT_STACK_VAL 8192 #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg) { #ifdef __ia64__ return __clone2(fn, stack_area + stack_size / 2, stack_size / 2, flags, arg); #else return clone(fn, stack_area + stack_size / 2, flags, arg); #endif } static int nl_sock; static int nl_sock_init_do(void *arg) { struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; int *s = &nl_sock; #ifdef NETLINK_GET_STRICT_CHK int y = 1; #endif *s = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (*s < 0 || bind(*s, (struct sockaddr *)&addr, sizeof(addr))) { *s = -1; return 0; } return 0; } /** * nl_send() - Prepare and send netlink request * @s: Netlink socket * @req: Request (will fill netlink header) * @type: Request type * @flags: Extra request flags (NLM_F_REQUEST and NLM_F_ACK assumed) * @len: Request length * * Return: sequence number of request on success, terminates on error */ static uint32_t nl_send(int s, void *req, uint16_t type, uint16_t flags, ssize_t len) { struct nlmsghdr *nh; ssize_t n; nh = (struct nlmsghdr *)req; nh->nlmsg_type = type; nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; nh->nlmsg_len = len; nh->nlmsg_seq = 1; nh->nlmsg_pid = 0; n = send(s, req, len, 0); return nh->nlmsg_seq; } int nl_link_up(int s, unsigned int ifi, int mtu) { struct req_t { struct nlmsghdr nlh; struct ifinfomsg ifm; struct rtattr rta; unsigned int mtu; } req = { .ifm.ifi_family = AF_UNSPEC, .ifm.ifi_index = ifi, .ifm.ifi_flags = IFF_UP, .ifm.ifi_change = IFF_UP, .rta.rta_type = IFLA_MTU, .rta.rta_len = RTA_LENGTH(sizeof(unsigned int)), .mtu = mtu, }; ssize_t len = sizeof(req); if (!mtu) /* Shorten request to drop MTU attribute */ len = offsetof(struct req_t, rta); return nl_send(s, &req, RTM_NEWLINK, 0, len); /* was nl_do() */ } /* <=== ...until here */ static int tcp_probe_sockets(void *arg) { int *s = (int *)arg; nl_sock_init_do(NULL); nl_link_up(nl_sock, 1 /* lo */, 0); s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); return 0; } int main(int argc, char **argv) { char ns_fn_stack[NS_FN_STACK_SIZE], b; struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, }; struct sockaddr a = { AF_INET, }; struct msghdr msg = { NULL, 0, iov, 2, }; int s[2], s_nl, s_recv; ssize_t len; do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack), CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, (void *)s); bind(s[0], &a, sizeof(a)); getsockname(s[0], &a, &((int){ sizeof(a) })); listen(s[0], 0); connect(s[1], &a, sizeof(a)); s_recv = accept(s[0], NULL, NULL); send(s[1], (char *)("ab"), 2, 0); len = recvmsg(s_recv, &msg, MSG_PEEK); printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not "); close(s_recv); close(s[1]); close(s[0]); return 0; } ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: tcp.c: leverage MSG_PEEK with offset kernel capability when available 2023-12-06 14:59 ` Stefano Brivio 2023-12-06 15:08 ` Stefano Brivio @ 2023-12-06 16:10 ` Jon Maloy 2023-12-06 17:06 ` Stefano Brivio 1 sibling, 1 reply; 5+ messages in thread From: Jon Maloy @ 2023-12-06 16:10 UTC (permalink / raw) To: Stefano Brivio; +Cc: lvivier, dgibson, passt-dev On 2023-12-06 09:59, Stefano Brivio wrote: > On Tue, 5 Dec 2023 18:36:04 -0500 > Jon Maloy <jmaloy@redhat.com> wrote: > >> The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This >> makes it possible to avoid repeated reading of already read initial bytes of >> a received message, hence saving us read cycles when forwarding TCP messages >> in the host->name space direction. >> >> When this feature is supported, iov_sock[0].iov_base can be set to NULL. The >> kernel code will interpret this as an instruction to skip reading of the first >> iov_sock[0].iov_len bytes of the message. >> >> Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this >> by simply not allocating that buffer, letting the pointer remain NULL, when >> we find that we have this kernel support. >> >> There is no macro or function indicating kernel support for this feature. We >> therefore have to probe for it by reading from an established TCP connection. >> The traffic connection cannot be used for this purpose, because it will be >> broken if the probe reading fails. We therefore have to create a temporary >> local connection for this task. Because of this, we also add a new function, >> tcp_probe_msg_peek_offset_cap(), which creates this temporary connection >> and performs the probe read on it. >> >> Signed-off-by: Jon Maloy <jmaloy@redhat.com> >> --- >> tcp.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 108 insertions(+), 2 deletions(-) >> >> diff --git a/tcp.c b/tcp.c >> index f506cfd..ab5168e 100644 >> --- a/tcp.c >> +++ b/tcp.c >> @@ -402,6 +402,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ >> (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) >> #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) >> >> +static bool tcp_probe_msg_peek_offset_cap(); > No need for a forward declaration, I guess. > >> + >> static const char *tcp_event_str[] __attribute((__unused__)) = { >> "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", >> >> @@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; >> static unsigned int tcp6_l2_buf_used; >> >> /* recvmsg()/sendmsg() data for tap */ >> -static char tcp_buf_discard [MAX_WINDOW]; >> +static char *tcp_buf_discard = NULL; >> + >> static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; >> >> static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; >> @@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used; >> >> #define CONN(idx) (&(FLOW(idx)->tcp)) >> >> + >> +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? >> + */ >> +static inline bool msg_peek_offset_cap() >> +{ >> + return !tcp_buf_discard; >> +} >> + >> + >> /** conn_at_idx() - Find a connection by index, if present >> * @idx: Index of connection to lookup >> * >> @@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) >> return 0; >> } >> >> - sendlen = len - already_sent; >> + sendlen = len; >> + if (!msg_peek_offset_cap()) >> + sendlen -= already_sent; >> if (sendlen <= 0) { >> conn_flag(c, conn, STALLED); >> return 0; >> @@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c) >> NS_CALL(tcp_ns_socks_init, c); >> } >> >> + /* Only allocate discard buffer if needed */ >> + if (!tcp_probe_msg_peek_offset_cap()) { >> + tcp_buf_discard = malloc(MAX_WINDOW); > I would rather not use the heap at all, even though after commit > 0515adceaa8f ("passt, pasta: Namespace-based sandboxing, defer seccomp > policy application") we don't ask seccomp to terminate the process if we > call a sbrk() (or similar) in this phase. > > The only specific issue I have in mind is rather minor, that is, at the > moment we can reliably calculate our memory footprint using nm(1). > > But in general, having a single set of memory addresses keep things > simpler and probably a bit safer. This would be the only non-static > memory we use, and I don't see a strong case for it. It is a rather large chunk of memory, but sure, if we could live with it so far, we still can. > > I would rather drop this buffer after a few months (in turn, if/after > the kernel change is accepted), turning the detection into a build-time > step, with passt failing if we find that we don't have this buffer, and > we were built for a kernel with support for MSG_PEEK with offset. Sounds a litte risky to me. How do we know how are users are building it? Maybe a year or two... > >> + if (!tcp_buf_discard) { >> + perror("failed to allocate discard buffer\n"); >> + exit(EXIT_FAILURE); >> + } >> + debug("allocated discard buffer of size %i\n", MAX_WINDOW); >> + } >> return 0; >> } >> >> @@ -3213,3 +3236,86 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) >> if (c->mode == MODE_PASTA) >> tcp_splice_refill(c); >> } >> + >> +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support >> + */ >> +static bool tcp_probe_msg_peek_offset_cap() > For consistency: (void). > > I have two main criticisms to this approach: 1. it uses fork() > (and that's another usage of heap memory) but we don't actually need > connect() and send() to be synchronous for this test, and 2. we bind > an actual TCP port where we run. > > I attached a sketch (pkt_selfie.c) of a slightly different approach > that solves 1. by using a non-blocking socket on the client side, and > solves 2. by creating the pair of sockets in a detached network > namespace, which is essentially invisible and goes away once we're done > with the probing. > > For some reason, the negative case works: > > $ gcc -o pkt_selfie pkt_selfie.c; strace -f ./pkt_selfie > [...] > sendto(5, "ab", 2, 0, NULL, 0) = 2 > recvmsg(6, {msg_name=0x7ffd2ba5d130, msg_namelen=2 => 0, msg_iov=NULL, msg_iovlen=0, msg_controllen=0, msg_flags=0}, MSG_PEEK) = 0 > > but on a kernel with your patch, I get ENOTCONN on recvmsg(). If I > replace that by a simple recv(): > > sendto(5, "ab", 2, 0, NULL, 0) = 2 > recvfrom(6, "ab", 10, 0, NULL, NULL) = 2 > > ...so I don't think it's a fundamental issue with this approach, rather > something with your patch, but I'm not yet sure what. :) > > Most of pkt_selfie.c is copied and pasted (with minimal adaptations) > from existing passt's codebase, the actual implementation starts at > line 107. Of course it's missing all the error checks etc. > I tend to be rather minimalist when it comes to adding codelines, so that is why I chose the synchronous approach. I'll try your code (the latest version), and see what it looks like. ///jon ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: tcp.c: leverage MSG_PEEK with offset kernel capability when available 2023-12-06 16:10 ` Jon Maloy @ 2023-12-06 17:06 ` Stefano Brivio 0 siblings, 0 replies; 5+ messages in thread From: Stefano Brivio @ 2023-12-06 17:06 UTC (permalink / raw) To: Jon Maloy; +Cc: lvivier, dgibson, passt-dev On Wed, 6 Dec 2023 11:10:50 -0500 Jon Maloy <jmaloy@redhat.com> wrote: > On 2023-12-06 09:59, Stefano Brivio wrote: > > On Tue, 5 Dec 2023 18:36:04 -0500 > > Jon Maloy <jmaloy@redhat.com> wrote: > > > >> The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This > >> makes it possible to avoid repeated reading of already read initial bytes of > >> a received message, hence saving us read cycles when forwarding TCP messages > >> in the host->name space direction. > >> > >> When this feature is supported, iov_sock[0].iov_base can be set to NULL. The > >> kernel code will interpret this as an instruction to skip reading of the first > >> iov_sock[0].iov_len bytes of the message. > >> > >> Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this > >> by simply not allocating that buffer, letting the pointer remain NULL, when > >> we find that we have this kernel support. > >> > >> There is no macro or function indicating kernel support for this feature. We > >> therefore have to probe for it by reading from an established TCP connection. > >> The traffic connection cannot be used for this purpose, because it will be > >> broken if the probe reading fails. We therefore have to create a temporary > >> local connection for this task. Because of this, we also add a new function, > >> tcp_probe_msg_peek_offset_cap(), which creates this temporary connection > >> and performs the probe read on it. > >> > >> Signed-off-by: Jon Maloy <jmaloy@redhat.com> > >> --- > >> tcp.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > >> 1 file changed, 108 insertions(+), 2 deletions(-) > >> > >> diff --git a/tcp.c b/tcp.c > >> index f506cfd..ab5168e 100644 > >> --- a/tcp.c > >> +++ b/tcp.c > >> @@ -402,6 +402,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ > >> (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) > >> #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) > >> > >> +static bool tcp_probe_msg_peek_offset_cap(); > > No need for a forward declaration, I guess. > > > >> + > >> static const char *tcp_event_str[] __attribute((__unused__)) = { > >> "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", > >> > >> @@ -506,7 +508,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; > >> static unsigned int tcp6_l2_buf_used; > >> > >> /* recvmsg()/sendmsg() data for tap */ > >> -static char tcp_buf_discard [MAX_WINDOW]; > >> +static char *tcp_buf_discard = NULL; > >> + > >> static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; > >> > >> static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; > >> @@ -573,6 +576,15 @@ static unsigned int tcp6_l2_flags_buf_used; > >> > >> #define CONN(idx) (&(FLOW(idx)->tcp)) > >> > >> + > >> +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? > >> + */ > >> +static inline bool msg_peek_offset_cap() > >> +{ > >> + return !tcp_buf_discard; > >> +} > >> + > >> + > >> /** conn_at_idx() - Find a connection by index, if present > >> * @idx: Index of connection to lookup > >> * > >> @@ -2224,7 +2236,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > >> return 0; > >> } > >> > >> - sendlen = len - already_sent; > >> + sendlen = len; > >> + if (!msg_peek_offset_cap()) > >> + sendlen -= already_sent; > >> if (sendlen <= 0) { > >> conn_flag(c, conn, STALLED); > >> return 0; > >> @@ -3107,6 +3121,15 @@ int tcp_init(struct ctx *c) > >> NS_CALL(tcp_ns_socks_init, c); > >> } > >> > >> + /* Only allocate discard buffer if needed */ > >> + if (!tcp_probe_msg_peek_offset_cap()) { > >> + tcp_buf_discard = malloc(MAX_WINDOW); > > I would rather not use the heap at all, even though after commit > > 0515adceaa8f ("passt, pasta: Namespace-based sandboxing, defer seccomp > > policy application") we don't ask seccomp to terminate the process if we > > call a sbrk() (or similar) in this phase. > > > > The only specific issue I have in mind is rather minor, that is, at the > > moment we can reliably calculate our memory footprint using nm(1). > > > > But in general, having a single set of memory addresses keep things > > simpler and probably a bit safer. This would be the only non-static > > memory we use, and I don't see a strong case for it. > It is a rather large chunk of memory, but sure, if we could live with it > so far, we still can. ...but we wouldn't actually use that anymore -- as long as we never write to it: $ cat zero_static.c; gcc -o zero_static zero_static.c; echo; ./zero_static #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> static char zero[4 << 20]; int main() { char cmd[100]; snprintf(cmd, 100, "grep VmRSS /proc/%i/status", getpid()); system(cmd); memset(zero, 0xff, sizeof(zero)); system(cmd); return 0; } VmRSS: 1388 kB VmRSS: 5484 kB > > I would rather drop this buffer after a few months (in turn, if/after > > the kernel change is accepted), turning the detection into a build-time > > step, with passt failing if we find that we don't have this buffer, and > > we were built for a kernel with support for MSG_PEEK with offset. > Sounds a litte risky to me. How do we know how are users are building it? The idea is that packages are _typically_ built on a system that vaguely resembles the one where they're going to be installed. Based on that, we already have some conditionals in the Makefile based on kernel headers -- those are strictly needed because we use those headers to access features, so it's not ideal (we have no theoretical guarantees, and doesn't cover users who don't use packages), but we can't do much better. Here, we could bluntly take the kernel version on build, and if it's recent enough (again, leaving a reasonable timeframe in between), skip the discard buffer. I don't see it as very risky in the sense that if we don't have it, and we realise we need it, we can tell the user to either enable a build conditional, or upgrade the kernel. On the other hand: > Maybe a year or two... ...yes, perhaps more reasonable, especially given that we're not using that memory anymore. -- Stefano ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-12-06 17:06 UTC | newest] Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-12-05 23:36 tcp.c: leverage MSG_PEEK with offset kernel capability when available Jon Maloy 2023-12-06 14:59 ` Stefano Brivio 2023-12-06 15:08 ` Stefano Brivio 2023-12-06 16:10 ` Jon Maloy 2023-12-06 17:06 ` Stefano Brivio
Code repositories for project(s) associated with this public inbox https://passt.top/passt This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for IMAP folder(s).