public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v2 16/32] tcp: Use the same sockets to listen for spliced and non-spliced connections
Date: Thu, 17 Nov 2022 16:58:52 +1100	[thread overview]
Message-ID: <20221117055908.2782981-17-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20221117055908.2782981-1-david@gibson.dropbear.id.au>

In pasta mode, tcp_sock_init[46]() create separate sockets to listen for
spliced connections (these are bound to localhost) and non-spliced
connections (these are bound to the host address).  This introduces a
subtle behavioural difference between pasta and passt: by default, pasta
will listen only on a single host address, whereas passt will listen on
all addresses (0.0.0.0 or ::).  This also prevents us using some additional
optimizations that only work with the unspecified (0.0.0.0 or ::) address.

However, it turns out we don't need to do this.  We can splice a connection
if and only if it originates from the loopback address.  Currently we
ensure this by having the "spliced" listening sockets listening only on
loopback.  Instead, defer the decision about whether to splice a connection
until after accept(), by checking if the connection was made from the
loopback address.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        | 127 +++++++++++++--------------------------------------
 tcp_splice.c |  26 +++++++++--
 tcp_splice.h |   5 +-
 3 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/tcp.c b/tcp.c
index a53c70d..7df3c09 100644
--- a/tcp.c
+++ b/tcp.c
@@ -434,7 +434,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
 };
 
 /* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_lo	[NUM_PORTS][IP_VERSIONS];
 static int tcp_sock_init_ext	[NUM_PORTS][IP_VERSIONS];
 static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
 
@@ -2851,21 +2850,31 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	socklen_t sl;
 	int s;
 
+	assert(ref.r.p.tcp.tcp.listen);
+	assert(!ref.r.p.tcp.tcp.splice);
+
 	if (c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
 	sl = sizeof(sa);
+	/* FIXME: Workaround clang-tidy not realizing that accept4()
+	 * writes the socket address.  See
+	 * https://github.com/llvm/llvm-project/issues/58992
+	 */
+	memset(&sa, 0, sizeof(struct sockaddr_in6));
 	s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
 	if (s < 0)
 		return;
 
 	conn = tc + c->tcp.conn_count++;
 
-	if (ref.r.p.tcp.tcp.splice)
-		tcp_splice_conn_from_sock(c, ref, &conn->splice, s);
-	else
-		tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
-				       (struct sockaddr *)&sa, now);
+	if (c->mode == MODE_PASTA &&
+	    tcp_splice_conn_from_sock(c, ref, &conn->splice,
+				      s, (struct sockaddr *)&sa))
+		return;
+
+	tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
+			       (struct sockaddr *)&sa, now);
 }
 
 /**
@@ -3018,47 +3027,16 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
 {
 	in_port_t idx = port + c->tcp.fwd_in.delta[port];
 	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx };
-	bool spliced = false, tap = true;
 	int s;
 
-	if (c->mode == MODE_PASTA) {
-		spliced = !addr || IN4_IS_ADDR_UNSPECIFIED(addr) ||
-			IN4_IS_ADDR_LOOPBACK(addr);
-
-		if (!addr)
-			addr = &c->ip4.addr;
-
-		tap = !IN4_IS_ADDR_LOOPBACK(addr);
-	}
-
-	if (tap) {
-		s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
-			    tref.u32);
-		if (s >= 0)
-			tcp_sock_set_bufsize(c, s);
-		else
-			s = -1;
-
-		if (c->tcp.fwd_in.mode == FWD_AUTO)
-			tcp_sock_init_ext[port][V4] = s;
-	}
-
-	if (spliced) {
-		struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
-		tref.tcp.splice = 1;
-
-		addr = &loopback;
-
-		s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
-			    tref.u32);
-		if (s >= 0)
-			tcp_sock_set_bufsize(c, s);
-		else
-			s = -1;
+	s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, tref.u32);
+	if (s >= 0)
+		tcp_sock_set_bufsize(c, s);
+	else
+		s = -1;
 
-		if (c->tcp.fwd_out.mode == FWD_AUTO)
-			tcp_sock_init_lo[port][V4] = s;
-	}
+	if (c->tcp.fwd_in.mode == FWD_AUTO)
+		tcp_sock_init_ext[port][V4] = s;
 }
 
 /**
@@ -3075,47 +3053,16 @@ static void tcp_sock_init6(const struct ctx *c,
 	in_port_t idx = port + c->tcp.fwd_in.delta[port];
 	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1,
 				     .tcp.index = idx };
-	bool spliced = false, tap = true;
 	int s;
 
-	if (c->mode == MODE_PASTA) {
-		spliced = !addr ||
-			  IN6_IS_ADDR_UNSPECIFIED(addr) ||
-			  IN6_IS_ADDR_LOOPBACK(addr);
-
-		if (!addr)
-			addr = &c->ip6.addr;
-
-		tap = !IN6_IS_ADDR_LOOPBACK(addr);
-	}
-
-	if (tap) {
-		s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
-			    tref.u32);
-		if (s >= 0)
-			tcp_sock_set_bufsize(c, s);
-		else
-			s = -1;
-
-		if (c->tcp.fwd_in.mode == FWD_AUTO)
-			tcp_sock_init_ext[port][V6] = s;
-	}
-
-	if (spliced) {
-		tref.tcp.splice = 1;
-
-		addr = &in6addr_loopback;
-
-		s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
-			    tref.u32);
-		if (s >= 0)
-			tcp_sock_set_bufsize(c, s);
-		else
-			s = -1;
+	s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, tref.u32);
+	if (s >= 0)
+		tcp_sock_set_bufsize(c, s);
+	else
+		s = -1;
 
-		if (c->tcp.fwd_out.mode == FWD_AUTO)
-			tcp_sock_init_lo[port][V6] = s;
-	}
+	if (c->tcp.fwd_in.mode == FWD_AUTO)
+		tcp_sock_init_ext[port][V6] = s;
 }
 
 /**
@@ -3144,7 +3091,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
 {
 	in_port_t idx = port + c->tcp.fwd_out.delta[port];
 	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
-				     .tcp.splice = 1, .tcp.index = idx };
+				     .tcp.index = idx };
 	struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
 	int s;
 
@@ -3169,8 +3116,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
 {
 	in_port_t idx = port + c->tcp.fwd_out.delta[port];
 	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
-				     .tcp.splice = 1, .tcp.v6 = 1,
-				     .tcp.index = idx };
+				     .tcp.v6 = 1, .tcp.index = idx };
 	int s;
 
 	assert(c->mode == MODE_PASTA);
@@ -3337,7 +3283,6 @@ int tcp_init(struct ctx *c)
 	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
 	memset(ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
 	memset(ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
-	memset(tcp_sock_init_lo,	0xff,	sizeof(tcp_sock_init_lo));
 	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
 
@@ -3445,16 +3390,6 @@ static int tcp_port_rebind(void *arg)
 					close(tcp_sock_init_ext[port][V6]);
 					tcp_sock_init_ext[port][V6] = -1;
 				}
-
-				if (tcp_sock_init_lo[port][V4] >= 0) {
-					close(tcp_sock_init_lo[port][V4]);
-					tcp_sock_init_lo[port][V4] = -1;
-				}
-
-				if (tcp_sock_init_lo[port][V6] >= 0) {
-					close(tcp_sock_init_lo[port][V6]);
-					tcp_sock_init_lo[port][V6] = -1;
-				}
 				continue;
 			}
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 2fd88ec..356d194 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -502,30 +502,48 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
 }
 
 /**
- * tcp_splice_conn_from_sock() - Initialize state for spliced connection
+ * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
  * @c:		Execution context
  * @ref:	epoll reference of listening socket
  * @conn:	connection structure to initialize
  * @s:		Accepted socket
+ * @sa:		Peer address of connection
  *
+ * Return: true if able to create a spliced connection, false otherwise
  * #syscalls:pasta setsockopt
  */
-void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
-			       struct tcp_splice_conn *conn, int s)
+bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
+			       struct tcp_splice_conn *conn, int s,
+			       const struct sockaddr *sa)
 {
 	assert(c->mode == MODE_PASTA);
 
+	if (ref.r.p.tcp.tcp.v6) {
+		const struct sockaddr_in6 *sa6;
+
+		sa6 = (const struct sockaddr_in6 *)sa;
+		if (!IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+			return false;
+		conn->flags = SPLICE_V6;
+	} else {
+		const struct sockaddr_in *sa4 = (const struct sockaddr_in *)sa;
+		if (!IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
+			return false;
+		conn->flags = 0;
+	}
+
 	if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
 		trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);
 
 	conn->c.spliced = true;
 	c->tcp.splice_conn_count++;
 	conn->a = s;
-	conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
 
 	if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
 			   ref.r.p.tcp.tcp.outbound))
 		conn_flag(c, conn, CLOSING);
+
+	return true;
 }
 
 /**
diff --git a/tcp_splice.h b/tcp_splice.h
index f9462ae..1a915dd 100644
--- a/tcp_splice.h
+++ b/tcp_splice.h
@@ -10,8 +10,9 @@ struct tcp_splice_conn;
 
 void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 			     uint32_t events);
-void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
-			       struct tcp_splice_conn *conn, int s);
+bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
+			       struct tcp_splice_conn *conn, int s,
+			       const struct sockaddr *sa);
 void tcp_splice_init(struct ctx *c);
 
 #endif /* TCP_SPLICE_H */
-- 
@@ -10,8 +10,9 @@ struct tcp_splice_conn;
 
 void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 			     uint32_t events);
-void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
-			       struct tcp_splice_conn *conn, int s);
+bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
+			       struct tcp_splice_conn *conn, int s,
+			       const struct sockaddr *sa);
 void tcp_splice_init(struct ctx *c);
 
 #endif /* TCP_SPLICE_H */
-- 
2.38.1


  parent reply	other threads:[~2022-11-17  5:59 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-17  5:58 [PATCH v2 00/32] Use dual stack sockets to listen for inbound TCP connections David Gibson
2022-11-17  5:58 ` [PATCH v2 01/32] clang-tidy: Suppress warning about assignments in if statements David Gibson
2022-11-17  5:58 ` [PATCH v2 02/32] style: Minor corrections to function comments David Gibson
2022-11-17  5:58 ` [PATCH v2 03/32] tcp_splice: #include tcp_splice.h in tcp_splice.c David Gibson
2022-11-17  5:58 ` [PATCH v2 04/32] tcp: Remove unused TCP_MAX_SOCKS constant David Gibson
2022-11-17  5:58 ` [PATCH v2 05/32] tcp: Better helpers for converting between connection pointer and index David Gibson
2022-11-17  5:58 ` [PATCH v2 06/32] tcp_splice: Helpers for converting from index to/from tcp_splice_conn David Gibson
2022-11-17  5:58 ` [PATCH v2 07/32] tcp: Move connection state structures into a shared header David Gibson
2022-11-17  5:58 ` [PATCH v2 08/32] tcp: Add connection union type David Gibson
2022-11-18  0:25   ` Stefano Brivio
2022-11-18  1:10     ` David Gibson
2022-11-19  8:39       ` Stefano Brivio
2022-11-17  5:58 ` [PATCH v2 09/32] tcp: Improved helpers to update connections after moving David Gibson
2022-11-17  5:58 ` [PATCH v2 10/32] tcp: Unify spliced and non-spliced connection tables David Gibson
2022-11-17  5:58 ` [PATCH v2 11/32] tcp: Unify tcp_defer_handler and tcp_splice_defer_handler() David Gibson
2022-11-17  5:58 ` [PATCH v2 12/32] tcp: Partially unify tcp_timer() and tcp_splice_timer() David Gibson
2022-11-17  5:58 ` [PATCH v2 13/32] tcp: Unify the IN_EPOLL flag David Gibson
2022-11-17  5:58 ` [PATCH v2 14/32] tcp: Separate helpers to create ns listening sockets David Gibson
2022-11-17  5:58 ` [PATCH v2 15/32] tcp: Unify part of spliced and non-spliced conn_from_sock path David Gibson
2022-11-17  5:58 ` David Gibson [this message]
2022-11-17  5:58 ` [PATCH v2 17/32] tcp: Remove splice from tcp_epoll_ref David Gibson
2022-11-17  5:58 ` [PATCH v2 18/32] tcp: Don't store hash bucket in connection structures David Gibson
2022-11-17  5:58 ` [PATCH v2 19/32] inany: Helper functions for handling addresses which could be IPv4 or IPv6 David Gibson
2022-11-17  5:58 ` [PATCH v2 20/32] tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same David Gibson
2022-11-17  5:58 ` [PATCH v2 21/32] tcp: Take tcp_hash_insert() address from struct tcp_conn David Gibson
2022-11-17  5:58 ` [PATCH v2 22/32] tcp: Simplify tcp_hash_match() to take an inany_addr David Gibson
2022-11-17  5:58 ` [PATCH v2 23/32] tcp: Unify initial sequence number calculation for IPv4 and IPv6 David Gibson
2022-11-17  5:59 ` [PATCH v2 24/32] tcp: Have tcp_seq_init() take its parameters from struct tcp_conn David Gibson
2022-11-17  5:59 ` [PATCH v2 25/32] tcp: Fix small errors in tcp_seq_init() time handling David Gibson
2022-11-17  5:59 ` [PATCH v2 26/32] tcp: Remove v6 flag from tcp_epoll_ref David Gibson
2022-11-17  5:59 ` [PATCH v2 27/32] tcp: NAT IPv4-mapped IPv6 addresses like IPv4 addresses David Gibson
2022-11-17  5:59 ` [PATCH v2 28/32] tcp_splice: Allow splicing of connections from IPv4-mapped loopback David Gibson
2022-11-17  5:59 ` [PATCH v2 29/32] tcp: Consolidate tcp_sock_init[46] David Gibson
2022-11-17  5:59 ` [PATCH v2 30/32] util: Allow sock_l4() to open dual stack sockets David Gibson
2022-11-17  5:59 ` [PATCH v2 31/32] util: Always return -1 on error in sock_l4() David Gibson
2022-11-17  5:59 ` [PATCH v2 32/32] tcp: Use dual stack sockets for port forwarding when possible David Gibson
2022-11-25  9:22 ` [PATCH v2 00/32] Use dual stack sockets to listen for inbound TCP connections Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221117055908.2782981-17-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).