public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 0/5] Cleanups to tcp socket pool handling
@ 2023-01-09  0:50 David Gibson
  2023-01-09  0:50 ` [PATCH 1/5] tcp: Make a helper to refill each socket pool David Gibson
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

This is a bit of a diversion from what I'm notionally working on at
the moment.  While thinking about what we'd need to use the
IP_TRANSPARENT socket option to broaden the cases where we can
"splice", I noticed some inelegancies in how we handle the pool of
pre-opened sockets in the TCP code, and well, here we are.

This makes a number of cleanups to the handling of these pools.  Most
notably, tcp_splice_connect() now has simpler semantics: it now always
runs in the init namespace, and is always given a pre-opened socket
(which could be in the guest ns).

David Gibson (5):
  tcp: Make a helper to refill each socket pool
  tcp: Split init and ns cases for tcp_sock_refill()
  tcp: Move socket pool declarations around
  tcp: Split pool lookup from creating new sockets in
    tcp_conn_new_sock()
  tcp: Improve handling of fallback if socket pool is empty on new
    splice

 tcp.c        | 138 ++++++++++++++++++-------------------------------
 tcp.h        |   2 -
 tcp_conn.h   |  12 ++++-
 tcp_splice.c | 141 ++++++++++++++++++++++++++-------------------------
 4 files changed, 132 insertions(+), 161 deletions(-)

-- 
2.39.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/5] tcp: Make a helper to refill each socket pool
  2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
@ 2023-01-09  0:50 ` David Gibson
  2023-01-12 18:09   ` Stefano Brivio
  2023-01-09  0:50 ` [PATCH 2/5] tcp: Split init and ns cases for tcp_sock_refill() David Gibson
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_sock_refill() contains two near-identical loops to refill the IPv4
and IPv6 socket pools.  In addition, if we get an error on the IPv4 pool
we exit early and won't attempt to refill the IPv6 pool.  That doesn't seem
sensible, since these are essentially completely independent, and there
may be value to filling up either one without the other.

Address both of these with a helper function 'tcp_sock_refill_pool()' to
refill a single given pool.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 63 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/tcp.c b/tcp.c
index 26037b3..becad80 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2982,6 +2982,34 @@ static int tcp_ns_socks_init(void *arg)
 	return 0;
 }
 
+/**
+ * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
+ * @c:		Execution context
+ * @pool:	Pool of sockets to refill
+ * @af:		Address family to use
+ */
+static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
+{
+	int i;
+
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		int *s = &pool[i];
+
+		if (*s >= 0)
+			break;
+
+		*s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+		if (*s > SOCKET_MAX) {
+			close(*s);
+			*s = -1;
+			return;
+		}
+
+		if (*s >= 0)
+			tcp_sock_set_bufsize(c, *s);
+	}
+}
+
 /**
  * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
  * @c:		Execution context
@@ -3001,7 +3029,7 @@ struct tcp_sock_refill_arg {
 static int tcp_sock_refill(void *arg)
 {
 	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int i, *p4, *p6;
+	int *p4, *p6;
 
 	if (a->ns) {
 		ns_enter(a->c);
@@ -3012,36 +3040,11 @@ static int tcp_sock_refill(void *arg)
 		p6 = init_sock_pool6;
 	}
 
-	for (i = 0; a->c->ifi4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) {
-		if (*p4 >= 0)
-			break;
-
-		*p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
-		if (*p4 > SOCKET_MAX) {
-			close(*p4);
-			*p4 = -1;
-			return -EIO;
-		}
-
-		if (*p4 >= 0)
-			tcp_sock_set_bufsize(a->c, *p4);
-	}
-
-	for (i = 0; a->c->ifi6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) {
-		if (*p6 >= 0)
-			break;
-
-		*p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK,
-			     IPPROTO_TCP);
-		if (*p6 > SOCKET_MAX) {
-			close(*p6);
-			*p6 = -1;
-			return -EIO;
-		}
+	if (a->c->ifi4)
+		tcp_sock_refill_pool(a->c, p4, AF_INET);
 
-		if (*p6 >= 0)
-			tcp_sock_set_bufsize(a->c, *p6);
-	}
+	if (a->c->ifi6)
+		tcp_sock_refill_pool(a->c, p6, AF_INET6);
 
 	return 0;
 }
-- 
@@ -2982,6 +2982,34 @@ static int tcp_ns_socks_init(void *arg)
 	return 0;
 }
 
+/**
+ * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
+ * @c:		Execution context
+ * @pool:	Pool of sockets to refill
+ * @af:		Address family to use
+ */
+static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
+{
+	int i;
+
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		int *s = &pool[i];
+
+		if (*s >= 0)
+			break;
+
+		*s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+		if (*s > SOCKET_MAX) {
+			close(*s);
+			*s = -1;
+			return;
+		}
+
+		if (*s >= 0)
+			tcp_sock_set_bufsize(c, *s);
+	}
+}
+
 /**
  * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
  * @c:		Execution context
@@ -3001,7 +3029,7 @@ struct tcp_sock_refill_arg {
 static int tcp_sock_refill(void *arg)
 {
 	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int i, *p4, *p6;
+	int *p4, *p6;
 
 	if (a->ns) {
 		ns_enter(a->c);
@@ -3012,36 +3040,11 @@ static int tcp_sock_refill(void *arg)
 		p6 = init_sock_pool6;
 	}
 
-	for (i = 0; a->c->ifi4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) {
-		if (*p4 >= 0)
-			break;
-
-		*p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
-		if (*p4 > SOCKET_MAX) {
-			close(*p4);
-			*p4 = -1;
-			return -EIO;
-		}
-
-		if (*p4 >= 0)
-			tcp_sock_set_bufsize(a->c, *p4);
-	}
-
-	for (i = 0; a->c->ifi6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) {
-		if (*p6 >= 0)
-			break;
-
-		*p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK,
-			     IPPROTO_TCP);
-		if (*p6 > SOCKET_MAX) {
-			close(*p6);
-			*p6 = -1;
-			return -EIO;
-		}
+	if (a->c->ifi4)
+		tcp_sock_refill_pool(a->c, p4, AF_INET);
 
-		if (*p6 >= 0)
-			tcp_sock_set_bufsize(a->c, *p6);
-	}
+	if (a->c->ifi6)
+		tcp_sock_refill_pool(a->c, p6, AF_INET6);
 
 	return 0;
 }
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/5] tcp: Split init and ns cases for tcp_sock_refill()
  2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
  2023-01-09  0:50 ` [PATCH 1/5] tcp: Make a helper to refill each socket pool David Gibson
@ 2023-01-09  0:50 ` David Gibson
  2023-01-09  0:50 ` [PATCH 3/5] tcp: Move socket pool declarations around David Gibson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

With the creation of the tcp_sock_refill_pool() helper, the ns==true and
ns==false paths for tcp_sock_refill() now have almost nothing in common.
Split the two versions into tcp_sock_refill_init() and tcp_sock_refill_ns()
functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 53 +++++++++++++++++++++--------------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/tcp.c b/tcp.c
index becad80..4da823c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3011,40 +3011,33 @@ static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 }
 
 /**
- * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
+ * tcp_sock_refill_init() - Refill pools of pre-opened sockets in init ns
  * @c:		Execution context
- * @ns:		Set to refill pool of sockets created in namespace
  */
-struct tcp_sock_refill_arg {
-	struct ctx *c;
-	int ns;
-};
+static void tcp_sock_refill_init(const struct ctx *c)
+{
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+}
 
 /**
- * tcp_sock_refill() - Refill pool of pre-opened sockets
- * @arg:	See @tcp_sock_refill_arg
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
  *
  * Return: 0
  */
-static int tcp_sock_refill(void *arg)
+static int tcp_sock_refill_ns(void *arg)
 {
-	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int *p4, *p6;
+	const struct ctx *c = (const struct ctx *)arg;
 
-	if (a->ns) {
-		ns_enter(a->c);
-		p4 = ns_sock_pool4;
-		p6 = ns_sock_pool6;
-	} else {
-		p4 = init_sock_pool4;
-		p6 = init_sock_pool6;
-	}
-
-	if (a->c->ifi4)
-		tcp_sock_refill_pool(a->c, p4, AF_INET);
+	ns_enter(c);
 
-	if (a->c->ifi6)
-		tcp_sock_refill_pool(a->c, p6, AF_INET6);
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
 
 	return 0;
 }
@@ -3057,7 +3050,6 @@ static int tcp_sock_refill(void *arg)
  */
 int tcp_init(struct ctx *c)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	int i;
 #ifndef HAS_GETRANDOM
 	int dev_random = open("/dev/random", O_RDONLY);
@@ -3103,15 +3095,14 @@ int tcp_init(struct ctx *c)
 	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 
 	if (c->mode == MODE_PASTA) {
 		tcp_splice_init(c);
 
 		NS_CALL(tcp_ns_socks_init, c);
 
-		refill_arg.ns = 1;
-		NS_CALL(tcp_sock_refill, &refill_arg);
+		NS_CALL(tcp_sock_refill_ns, c);
 	}
 
 	return 0;
@@ -3231,7 +3222,6 @@ static int tcp_port_rebind(void *arg)
  */
 void tcp_timer(struct ctx *c, const struct timespec *ts)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	union tcp_conn *conn;
 
 	(void)ts;
@@ -3264,12 +3254,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 		}
 	}
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA) {
-		refill_arg.ns = 1;
 		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
 		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
-			NS_CALL(tcp_sock_refill, &refill_arg);
+			NS_CALL(tcp_sock_refill_ns, c);
 
 		tcp_splice_pipe_refill(c);
 	}
-- 
@@ -3011,40 +3011,33 @@ static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 }
 
 /**
- * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
+ * tcp_sock_refill_init() - Refill pools of pre-opened sockets in init ns
  * @c:		Execution context
- * @ns:		Set to refill pool of sockets created in namespace
  */
-struct tcp_sock_refill_arg {
-	struct ctx *c;
-	int ns;
-};
+static void tcp_sock_refill_init(const struct ctx *c)
+{
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+}
 
 /**
- * tcp_sock_refill() - Refill pool of pre-opened sockets
- * @arg:	See @tcp_sock_refill_arg
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
  *
  * Return: 0
  */
-static int tcp_sock_refill(void *arg)
+static int tcp_sock_refill_ns(void *arg)
 {
-	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int *p4, *p6;
+	const struct ctx *c = (const struct ctx *)arg;
 
-	if (a->ns) {
-		ns_enter(a->c);
-		p4 = ns_sock_pool4;
-		p6 = ns_sock_pool6;
-	} else {
-		p4 = init_sock_pool4;
-		p6 = init_sock_pool6;
-	}
-
-	if (a->c->ifi4)
-		tcp_sock_refill_pool(a->c, p4, AF_INET);
+	ns_enter(c);
 
-	if (a->c->ifi6)
-		tcp_sock_refill_pool(a->c, p6, AF_INET6);
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
 
 	return 0;
 }
@@ -3057,7 +3050,6 @@ static int tcp_sock_refill(void *arg)
  */
 int tcp_init(struct ctx *c)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	int i;
 #ifndef HAS_GETRANDOM
 	int dev_random = open("/dev/random", O_RDONLY);
@@ -3103,15 +3095,14 @@ int tcp_init(struct ctx *c)
 	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 
 	if (c->mode == MODE_PASTA) {
 		tcp_splice_init(c);
 
 		NS_CALL(tcp_ns_socks_init, c);
 
-		refill_arg.ns = 1;
-		NS_CALL(tcp_sock_refill, &refill_arg);
+		NS_CALL(tcp_sock_refill_ns, c);
 	}
 
 	return 0;
@@ -3231,7 +3222,6 @@ static int tcp_port_rebind(void *arg)
  */
 void tcp_timer(struct ctx *c, const struct timespec *ts)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	union tcp_conn *conn;
 
 	(void)ts;
@@ -3264,12 +3254,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 		}
 	}
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA) {
-		refill_arg.ns = 1;
 		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
 		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
-			NS_CALL(tcp_sock_refill, &refill_arg);
+			NS_CALL(tcp_sock_refill_ns, c);
 
 		tcp_splice_pipe_refill(c);
 	}
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/5] tcp: Move socket pool declarations around
  2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
  2023-01-09  0:50 ` [PATCH 1/5] tcp: Make a helper to refill each socket pool David Gibson
  2023-01-09  0:50 ` [PATCH 2/5] tcp: Split init and ns cases for tcp_sock_refill() David Gibson
@ 2023-01-09  0:50 ` David Gibson
  2023-01-12 18:09   ` Stefano Brivio
  2023-01-09  0:50 ` [PATCH 4/5] tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock() David Gibson
  2023-01-09  0:50 ` [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice David Gibson
  4 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_splice.c has some explicit extern declarations to access the socket
pools.  This is pretty dangerous - if we changed the type of these
variables in tcp.c, we'd have tcp.c and tcp_splice.c using the memory
differently with no compiler error.  So, move the extern declarations to
tcp_conn.h so they're visible to both tcp.c and tcp_splice.c, but not the
rest of pasta.

In fact the pools for the guest namespace are necessarily only used by
tcp_splice.c - we have no sockets on the guest side if we're not splicing.
So move those declarations - and the functions that deal exclusively with
them - to tcp_splice.c

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        | 41 ++++-------------------------------------
 tcp.h        |  2 --
 tcp_conn.h   | 10 ++++++++--
 tcp_splice.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 55 insertions(+), 48 deletions(-)

diff --git a/tcp.c b/tcp.c
index 4da823c..19fd17b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -370,8 +370,6 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 #define FIN_TIMEOUT			60
 #define ACT_TIMEOUT			7200
 
-#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
-
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */
 
@@ -595,11 +593,9 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
 /* Table for lookup from remote address, local port, remote port */
 static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
 
-/* Pools for pre-opened sockets */
+/* Pools for pre-opened sockets (init namespace) */
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
-int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
-int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
@@ -2988,7 +2984,7 @@ static int tcp_ns_socks_init(void *arg)
  * @pool:	Pool of sockets to refill
  * @af:		Address family to use
  */
-static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
+void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 {
 	int i;
 
@@ -3022,26 +3018,6 @@ static void tcp_sock_refill_init(const struct ctx *c)
 		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
 }
 
-/**
- * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
- * @arg:	Execution context cast to void *
- *
- * Return: 0
- */
-static int tcp_sock_refill_ns(void *arg)
-{
-	const struct ctx *c = (const struct ctx *)arg;
-
-	ns_enter(c);
-
-	if (c->ifi4)
-		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
-	if (c->ifi6)
-		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
-
-	return 0;
-}
-
 /**
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
@@ -3090,8 +3066,6 @@ int tcp_init(struct ctx *c)
 
 	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
 	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
-	memset(ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
-	memset(ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
 	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
 
@@ -3101,8 +3075,6 @@ int tcp_init(struct ctx *c)
 		tcp_splice_init(c);
 
 		NS_CALL(tcp_ns_socks_init, c);
-
-		NS_CALL(tcp_sock_refill_ns, c);
 	}
 
 	return 0;
@@ -3255,11 +3227,6 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	}
 
 	tcp_sock_refill_init(c);
-	if (c->mode == MODE_PASTA) {
-		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
-		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
-			NS_CALL(tcp_sock_refill_ns, c);
-
-		tcp_splice_pipe_refill(c);
-	}
+	if (c->mode == MODE_PASTA)
+		tcp_splice_refill(c);
 }
diff --git a/tcp.h b/tcp.h
index 739b451..236038f 100644
--- a/tcp.h
+++ b/tcp.h
@@ -11,8 +11,6 @@
 #define TCP_CONN_INDEX_BITS		17	/* 128k */
 #define TCP_MAX_CONNS			(1 << TCP_CONN_INDEX_BITS)
 
-#define TCP_SOCK_POOL_SIZE		32
-
 struct ctx;
 
 void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
diff --git a/tcp_conn.h b/tcp_conn.h
index 70f4a7c..9951b0a 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -182,11 +182,17 @@ union tcp_conn {
 /* TCP connections */
 extern union tcp_conn tc[];
 
+/* Socket pools */
+#define TCP_SOCK_POOL_SIZE		32
+
+extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
+extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
+
 void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
 void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
-void tcp_splice_pipe_refill(const struct ctx *c);
-
+void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
+void tcp_splice_refill(const struct ctx *c);
 
 #endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index 72b1672..688d776 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -61,11 +61,11 @@
 #define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
 #define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */
 
-/* From tcp.c */
-extern int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
-extern int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
+/* Pools for pre-opened sockets (guest namespace) */
+#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
+
+static int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
+static int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 /* Pool of pre-opened pipes */
 static int splice_pipe_pool		[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
@@ -783,7 +783,7 @@ smaller:
  * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
  * @c:		Execution context
  */
-void tcp_splice_pipe_refill(const struct ctx *c)
+static void tcp_splice_pipe_refill(const struct ctx *c)
 {
 	int i;
 
@@ -812,6 +812,39 @@ void tcp_splice_pipe_refill(const struct ctx *c)
 	}
 }
 
+/**
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
+ *
+ * Return: 0
+ */
+static int tcp_sock_refill_ns(void *arg)
+{
+	const struct ctx *c = (const struct ctx *)arg;
+
+	ns_enter(c);
+
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+
+	return 0;
+}
+
+/**
+ * tcp_splice_refill() - Refill pools of resources needed for splicing
+ * @c:		Execution context
+ */
+void tcp_splice_refill(const struct ctx *c)
+{
+	if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+	    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+		NS_CALL(tcp_sock_refill_ns, c);
+
+	tcp_splice_pipe_refill(c);
+}
+
 /**
  * tcp_splice_init() - Initialise pipe pool and size
  * @c:		Execution context
@@ -820,7 +853,10 @@ void tcp_splice_init(struct ctx *c)
 {
 	memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
 	tcp_set_pipe_size(c);
-	tcp_splice_pipe_refill(c);
+
+	memset(&ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
+	memset(&ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
+	NS_CALL(tcp_sock_refill_ns, c);
 }
 
 /**
-- 
@@ -61,11 +61,11 @@
 #define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
 #define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */
 
-/* From tcp.c */
-extern int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
-extern int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
+/* Pools for pre-opened sockets (guest namespace) */
+#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
+
+static int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
+static int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 /* Pool of pre-opened pipes */
 static int splice_pipe_pool		[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
@@ -783,7 +783,7 @@ smaller:
  * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
  * @c:		Execution context
  */
-void tcp_splice_pipe_refill(const struct ctx *c)
+static void tcp_splice_pipe_refill(const struct ctx *c)
 {
 	int i;
 
@@ -812,6 +812,39 @@ void tcp_splice_pipe_refill(const struct ctx *c)
 	}
 }
 
+/**
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
+ *
+ * Return: 0
+ */
+static int tcp_sock_refill_ns(void *arg)
+{
+	const struct ctx *c = (const struct ctx *)arg;
+
+	ns_enter(c);
+
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+
+	return 0;
+}
+
+/**
+ * tcp_splice_refill() - Refill pools of resources needed for splicing
+ * @c:		Execution context
+ */
+void tcp_splice_refill(const struct ctx *c)
+{
+	if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+	    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+		NS_CALL(tcp_sock_refill_ns, c);
+
+	tcp_splice_pipe_refill(c);
+}
+
 /**
  * tcp_splice_init() - Initialise pipe pool and size
  * @c:		Execution context
@@ -820,7 +853,10 @@ void tcp_splice_init(struct ctx *c)
 {
 	memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
 	tcp_set_pipe_size(c);
-	tcp_splice_pipe_refill(c);
+
+	memset(&ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
+	memset(&ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
+	NS_CALL(tcp_sock_refill_ns, c);
 }
 
 /**
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/5] tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock()
  2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
                   ` (2 preceding siblings ...)
  2023-01-09  0:50 ` [PATCH 3/5] tcp: Move socket pool declarations around David Gibson
@ 2023-01-09  0:50 ` David Gibson
  2023-01-09  0:50 ` [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice David Gibson
  4 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_conn_new_sock() first looks for a socket in a pre-opened pool, then if
that's empty creates a new socket in the init namespace.  Both parts of
this are duplicated in other places: the pool lookup logic is duplicated in
tcp_splice_new(), and the socket opening logic is duplicated in
tcp_sock_refill_pool().

Split the function into separate parts so we can remove both these
duplications.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        | 53 +++++++++++++++++++++++++++-------------------------
 tcp_conn.h   |  1 +
 tcp_splice.c |  8 ++------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tcp.c b/tcp.c
index 19fd17b..a540235 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1825,24 +1825,35 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
 }
 
 /**
- * tcp_conn_new_sock() - Get socket for new connection from pool or make new one
- * @c:		Execution context
- * @af:		Address family
+ * tcp_conn_pool_sock() - Get socket for new connection from pre-opened pool
+ * @pool:	Pool of pre-opened sockets
  *
- * Return: socket number if available, negative code if socket creation failed
+ * Return: socket number if available, negative code if pool is empty
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_pool_sock(int pool[])
 {
-	int *p = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s = -1;
+	int s = -1, i;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
-		SWAP(s, *p);
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		SWAP(s, pool[i]);
 		if (s >= 0)
-			break;
+			return s;
 	}
+	return -1;
+}
 
-	if (s < 0)
-		s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+/**
+ * tcp_conn_new_sock() - Open and prepare new socket for connection
+ * @c:		Execution context
+ * @af:		Address family
+ *
+ * Return: socket number on success, negative code if socket creation failed
+ */
+static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+{
+	int s;
+
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
 
 	if (s > SOCKET_MAX) {
 		close(s);
@@ -1903,6 +1914,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 			      const struct tcphdr *th, const char *opts,
 			      size_t optlen, const struct timespec *now)
 {
+	int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
 	struct sockaddr_in addr4 = {
 		.sin_family = AF_INET,
 		.sin_port = th->dest,
@@ -1921,8 +1933,9 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	if (c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
-	if ((s = tcp_conn_new_sock(c, af)) < 0)
-		return;
+	if ((s = tcp_conn_pool_sock(pool)) < 0)
+		if ((s = tcp_conn_new_sock(c, af)) < 0)
+			return;
 
 	if (!c->no_map_gw) {
 		if (af == AF_INET && IN4_ARE_ADDR_EQUAL(addr, &c->ip4.gw))
@@ -2989,20 +3002,10 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 	int i;
 
 	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
-		int *s = &pool[i];
-
-		if (*s >= 0)
+		if (pool[i] >= 0)
 			break;
 
-		*s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
-		if (*s > SOCKET_MAX) {
-			close(*s);
-			*s = -1;
-			return;
-		}
-
-		if (*s >= 0)
-			tcp_sock_set_bufsize(c, *s);
+		pool[i] = tcp_conn_new_sock(c, af);
 	}
 }
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 9951b0a..c807e8b 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -192,6 +192,7 @@ void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
 void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
+int tcp_conn_pool_sock(int pool[]);
 void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
 void tcp_splice_refill(const struct ctx *c);
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 688d776..5a88975 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -452,18 +452,14 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, i, s = -1;
+	int *p, s = -1;
 
 	if (outbound)
 		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
 	else
 		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
-		SWAP(s, *p);
-		if (s >= 0)
-			break;
-	}
+	s = tcp_conn_pool_sock(p);
 
 	/* No socket available in namespace: create a new one for connect() */
 	if (s < 0 && !outbound) {
-- 
@@ -452,18 +452,14 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, i, s = -1;
+	int *p, s = -1;
 
 	if (outbound)
 		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
 	else
 		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
-		SWAP(s, *p);
-		if (s >= 0)
-			break;
-	}
+	s = tcp_conn_pool_sock(p);
 
 	/* No socket available in namespace: create a new one for connect() */
 	if (s < 0 && !outbound) {
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice
  2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
                   ` (3 preceding siblings ...)
  2023-01-09  0:50 ` [PATCH 4/5] tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock() David Gibson
@ 2023-01-09  0:50 ` David Gibson
  2023-01-12 18:09   ` Stefano Brivio
  4 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2023-01-09  0:50 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

When creating a new spliced connection, we need to get a socket in the
other ns from the originating one.  To avoid excessive ns switches we
usually get these from a pool refilled on a timer.  However, if the pool
runs out we need a fallback.  Currently that's done by passing -1 as the
socket to tcp_splice_connect() and running it in the target ns.

This means that tcp_splice_connect() itself needs to have different cases
depending on whether it's given an existing socket or not, which is
a separate concern from what it's mostly doing.  We change it to require
a suitable open socket to be passed in, and ensuring in the caller that we
have one.

This requires adding the fallback paths to the caller, tcp_splice_new().
We use slightly different approaches for a socket in the init ns versus the
guest ns.

This also means that we no longer need to run tcp_splice_connect() itself
in the guest ns, which allows us to remove a bunch of boilerplate code.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        |  2 +-
 tcp_conn.h   |  1 +
 tcp_splice.c | 89 ++++++++++++++++++----------------------------------
 3 files changed, 32 insertions(+), 60 deletions(-)

diff --git a/tcp.c b/tcp.c
index a540235..7a88dab 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1849,7 +1849,7 @@ int tcp_conn_pool_sock(int pool[])
  *
  * Return: socket number on success, negative code if socket creation failed
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 {
 	int s;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index c807e8b..a499f34 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -193,6 +193,7 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
 int tcp_conn_pool_sock(int pool[]);
+int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
 void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
 void tcp_splice_refill(const struct ctx *c);
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 5a88975..1cd9fdc 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -88,6 +88,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
 	"RCVLOWAT_ACT_B", "CLOSING",
 };
 
+/* Forward decl */
+static int tcp_sock_refill_ns(void *arg);
+
 /**
  * tcp_splice_conn_epoll_events() - epoll events masks for given state
  * @events:	Connection event flags
@@ -348,12 +351,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
  * Return: 0 for connect() succeeded or in progress, negative value on error
  */
 static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
-			      int s, in_port_t port)
+			      int sock_conn, in_port_t port)
 {
-	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
-							      AF_INET,
-					      SOCK_STREAM | SOCK_NONBLOCK,
-					      IPPROTO_TCP);
 	struct sockaddr_in6 addr6 = {
 		.sin6_family = AF_INET6,
 		.sin6_port = htons(port),
@@ -367,19 +366,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	const struct sockaddr *sa;
 	socklen_t sl;
 
-	if (sock_conn < 0)
-		return -errno;
-
-	if (sock_conn > SOCKET_MAX) {
-		close(sock_conn);
-		return -EIO;
-	}
-
 	conn->b = sock_conn;
 
-	if (s < 0)
-		tcp_sock_set_bufsize(c, conn->b);
-
 	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK,
 		       &((int){ 1 }), sizeof(int))) {
 		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
@@ -410,36 +398,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	return 0;
 }
 
-/**
- * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
- * @c:		Execution context
- * @conn:	Accepted inbound connection
- * @port:	Destination port, host order
- * @ret:	Return value of tcp_splice_connect_ns()
- */
-struct tcp_splice_connect_ns_arg {
-	const struct ctx *c;
-	struct tcp_splice_conn *conn;
-	in_port_t port;
-	int ret;
-};
-
-/**
- * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
- * @arg:	See struct tcp_splice_connect_ns_arg
- *
- * Return: 0
- */
-static int tcp_splice_connect_ns(void *arg)
-{
-	struct tcp_splice_connect_ns_arg *a;
-
-	a = (struct tcp_splice_connect_ns_arg *)arg;
-	ns_enter(a->c);
-	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
-	return 0;
-}
-
 /**
  * tcp_splice_new() - Handle new spliced connection
  * @c:		Execution context
@@ -452,24 +410,37 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, s = -1;
-
-	if (outbound)
-		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
-	else
-		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
+	int s = -1;
+
+	/* If the pool is empty we take slightly different approaches
+	 * for init or ns sockets.  For init sockets we just open a
+	 * new one without refilling the pool to keep latency down.
+	 * For ns sockets, we're going to incur the latency of
+	 * entering the ns anyway, so we might as well refill the
+	 * pool.
+	 */
+	if (outbound) {
+		int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
+		int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
+
+		s = tcp_conn_pool_sock(p);
+		if (s < 0)
+			s = tcp_conn_new_sock(c, af);
+	} else {
+		int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	s = tcp_conn_pool_sock(p);
+		/* If pool is empty, refill it first */
+		if (p[TCP_SOCK_POOL_SIZE-1] < 0)
+			NS_CALL(tcp_sock_refill_ns, c);
 
-	/* No socket available in namespace: create a new one for connect() */
-	if (s < 0 && !outbound) {
-		struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 };
+		s = tcp_conn_pool_sock(p);
+	}
 
-		NS_CALL(tcp_splice_connect_ns, &ns_arg);
-		return ns_arg.ret;
+	if (s < 0) {
+		warn("Couldn't open connectable socket for splice (%d)", s);
+		return s;
 	}
 
-	/* Otherwise, the socket will connect on the side it was created on */
 	return tcp_splice_connect(c, conn, s, port);
 }
 
-- 
@@ -88,6 +88,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
 	"RCVLOWAT_ACT_B", "CLOSING",
 };
 
+/* Forward decl */
+static int tcp_sock_refill_ns(void *arg);
+
 /**
  * tcp_splice_conn_epoll_events() - epoll events masks for given state
  * @events:	Connection event flags
@@ -348,12 +351,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
  * Return: 0 for connect() succeeded or in progress, negative value on error
  */
 static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
-			      int s, in_port_t port)
+			      int sock_conn, in_port_t port)
 {
-	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
-							      AF_INET,
-					      SOCK_STREAM | SOCK_NONBLOCK,
-					      IPPROTO_TCP);
 	struct sockaddr_in6 addr6 = {
 		.sin6_family = AF_INET6,
 		.sin6_port = htons(port),
@@ -367,19 +366,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	const struct sockaddr *sa;
 	socklen_t sl;
 
-	if (sock_conn < 0)
-		return -errno;
-
-	if (sock_conn > SOCKET_MAX) {
-		close(sock_conn);
-		return -EIO;
-	}
-
 	conn->b = sock_conn;
 
-	if (s < 0)
-		tcp_sock_set_bufsize(c, conn->b);
-
 	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK,
 		       &((int){ 1 }), sizeof(int))) {
 		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
@@ -410,36 +398,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	return 0;
 }
 
-/**
- * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
- * @c:		Execution context
- * @conn:	Accepted inbound connection
- * @port:	Destination port, host order
- * @ret:	Return value of tcp_splice_connect_ns()
- */
-struct tcp_splice_connect_ns_arg {
-	const struct ctx *c;
-	struct tcp_splice_conn *conn;
-	in_port_t port;
-	int ret;
-};
-
-/**
- * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
- * @arg:	See struct tcp_splice_connect_ns_arg
- *
- * Return: 0
- */
-static int tcp_splice_connect_ns(void *arg)
-{
-	struct tcp_splice_connect_ns_arg *a;
-
-	a = (struct tcp_splice_connect_ns_arg *)arg;
-	ns_enter(a->c);
-	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
-	return 0;
-}
-
 /**
  * tcp_splice_new() - Handle new spliced connection
  * @c:		Execution context
@@ -452,24 +410,37 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, s = -1;
-
-	if (outbound)
-		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
-	else
-		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
+	int s = -1;
+
+	/* If the pool is empty we take slightly different approaches
+	 * for init or ns sockets.  For init sockets we just open a
+	 * new one without refilling the pool to keep latency down.
+	 * For ns sockets, we're going to incur the latency of
+	 * entering the ns anyway, so we might as well refill the
+	 * pool.
+	 */
+	if (outbound) {
+		int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
+		int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
+
+		s = tcp_conn_pool_sock(p);
+		if (s < 0)
+			s = tcp_conn_new_sock(c, af);
+	} else {
+		int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	s = tcp_conn_pool_sock(p);
+		/* If pool is empty, refill it first */
+		if (p[TCP_SOCK_POOL_SIZE-1] < 0)
+			NS_CALL(tcp_sock_refill_ns, c);
 
-	/* No socket available in namespace: create a new one for connect() */
-	if (s < 0 && !outbound) {
-		struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 };
+		s = tcp_conn_pool_sock(p);
+	}
 
-		NS_CALL(tcp_splice_connect_ns, &ns_arg);
-		return ns_arg.ret;
+	if (s < 0) {
+		warn("Couldn't open connectable socket for splice (%d)", s);
+		return s;
 	}
 
-	/* Otherwise, the socket will connect on the side it was created on */
 	return tcp_splice_connect(c, conn, s, port);
 }
 
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/5] tcp: Make a helper to refill each socket pool
  2023-01-09  0:50 ` [PATCH 1/5] tcp: Make a helper to refill each socket pool David Gibson
@ 2023-01-12 18:09   ` Stefano Brivio
  2023-01-15  0:22     ` David Gibson
  0 siblings, 1 reply; 13+ messages in thread
From: Stefano Brivio @ 2023-01-12 18:09 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  9 Jan 2023 11:50:22 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> tcp_sock_refill() contains two near-identical loops to refill the IPv4
> and IPv6 socket pools.  In addition, if we get an error on the IPv4 pool
> we exit early and won't attempt to refill the IPv6 pool.  That doesn't seem
> sensible, since these are essentially completely independent, and there
> may be value to filling up either one without the other.

Well, for the purposes of the single error condition that's checked
there, they're not independent, in practice: if we just had a socket
number that's too high, the next one will be, too. This is not formally
guaranteed, though.

In any case, this is indeed more elegant, and also drops a
'return -EIO' which doesn't really match the "Return: 0" comment.

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/5] tcp: Move socket pool declarations around
  2023-01-09  0:50 ` [PATCH 3/5] tcp: Move socket pool declarations around David Gibson
@ 2023-01-12 18:09   ` Stefano Brivio
  2023-01-15  0:26     ` David Gibson
  0 siblings, 1 reply; 13+ messages in thread
From: Stefano Brivio @ 2023-01-12 18:09 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  9 Jan 2023 11:50:24 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> tcp_splice.c has some explicit extern declarations to access the socket
> pools.  This is pretty dangerous - if we changed the type of these
> variables in tcp.c, we'd have tcp.c and tcp_splice.c using the memory
> differently with no compiler error.  So, move the extern declarations to
> tcp_conn.h so they're visible to both tcp.c and tcp_splice.c, but not the
> rest of pasta.
> 
> In fact the pools for the guest namespace are necessarily only used by
> tcp_splice.c - we have no sockets on the guest side if we're not splicing.
> So move those declarations - and the functions that deal exclusively with
> them - to tcp_splice.c
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c        | 41 ++++-------------------------------------
>  tcp.h        |  2 --
>  tcp_conn.h   | 10 ++++++++--
>  tcp_splice.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
>  4 files changed, 55 insertions(+), 48 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 4da823c..19fd17b 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -370,8 +370,6 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
>  #define FIN_TIMEOUT			60
>  #define ACT_TIMEOUT			7200
>  
> -#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
> -
>  #define LOW_RTT_TABLE_SIZE		8
>  #define LOW_RTT_THRESHOLD		10 /* us */
>  
> @@ -595,11 +593,9 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
>  /* Table for lookup from remote address, local port, remote port */
>  static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
>  
> -/* Pools for pre-opened sockets */
> +/* Pools for pre-opened sockets (init namespace) */
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
> -int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
> -int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
> @@ -2988,7 +2984,7 @@ static int tcp_ns_socks_init(void *arg)
>   * @pool:	Pool of sockets to refill
>   * @af:		Address family to use
>   */
> -static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
> +void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
>  {
>  	int i;
>  
> @@ -3022,26 +3018,6 @@ static void tcp_sock_refill_init(const struct ctx *c)
>  		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
>  }
>  
> -/**
> - * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
> - * @arg:	Execution context cast to void *
> - *
> - * Return: 0
> - */
> -static int tcp_sock_refill_ns(void *arg)
> -{
> -	const struct ctx *c = (const struct ctx *)arg;
> -
> -	ns_enter(c);
> -
> -	if (c->ifi4)
> -		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
> -	if (c->ifi6)
> -		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
> -
> -	return 0;
> -}
> -
>  /**
>   * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
>   * @c:		Execution context
> @@ -3090,8 +3066,6 @@ int tcp_init(struct ctx *c)
>  
>  	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
>  	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
> -	memset(ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
> -	memset(ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
>  	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
>  	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
>  
> @@ -3101,8 +3075,6 @@ int tcp_init(struct ctx *c)
>  		tcp_splice_init(c);
>  
>  		NS_CALL(tcp_ns_socks_init, c);
> -
> -		NS_CALL(tcp_sock_refill_ns, c);
>  	}
>  
>  	return 0;
> @@ -3255,11 +3227,6 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
>  	}
>  
>  	tcp_sock_refill_init(c);
> -	if (c->mode == MODE_PASTA) {
> -		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
> -		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
> -			NS_CALL(tcp_sock_refill_ns, c);
> -
> -		tcp_splice_pipe_refill(c);
> -	}
> +	if (c->mode == MODE_PASTA)
> +		tcp_splice_refill(c);
>  }
> diff --git a/tcp.h b/tcp.h
> index 739b451..236038f 100644
> --- a/tcp.h
> +++ b/tcp.h
> @@ -11,8 +11,6 @@
>  #define TCP_CONN_INDEX_BITS		17	/* 128k */
>  #define TCP_MAX_CONNS			(1 << TCP_CONN_INDEX_BITS)
>  
> -#define TCP_SOCK_POOL_SIZE		32
> -
>  struct ctx;
>  
>  void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
> diff --git a/tcp_conn.h b/tcp_conn.h
> index 70f4a7c..9951b0a 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -182,11 +182,17 @@ union tcp_conn {
>  /* TCP connections */
>  extern union tcp_conn tc[];
>  
> +/* Socket pools */
> +#define TCP_SOCK_POOL_SIZE		32
> +
> +extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
> +extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
> +
>  void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
>  void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
>  void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
>  void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
> -void tcp_splice_pipe_refill(const struct ctx *c);
> -
> +void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
> +void tcp_splice_refill(const struct ctx *c);
>  
>  #endif /* TCP_CONN_H */
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 72b1672..688d776 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -61,11 +61,11 @@
>  #define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
>  #define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */
>  
> -/* From tcp.c */
> -extern int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
> -extern int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
> -extern int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
> -extern int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
> +/* Pools for pre-opened sockets (guest namespace) */

What you mean by "guest namespace" is perfectly clear, but strictly
speaking, that's not a thing, and I'm a bit concerned that it might
cause confusion (especially with passt).

What about using "init" (instead of "init namespace") above, and
"namespace" here?

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice
  2023-01-09  0:50 ` [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice David Gibson
@ 2023-01-12 18:09   ` Stefano Brivio
  2023-01-15  0:31     ` David Gibson
  0 siblings, 1 reply; 13+ messages in thread
From: Stefano Brivio @ 2023-01-12 18:09 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  9 Jan 2023 11:50:26 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> When creating a new spliced connection, we need to get a socket in the
> other ns from the originating one.  To avoid excessive ns switches we
> usually get these from a pool refilled on a timer.  However, if the pool
> runs out we need a fallback.  Currently that's done by passing -1 as the
> socket to tcp_splice_connnect() and running it in the target ns.
> 
> This means that tcp_splice_connect() itself needs to have different cases
> depending on whether it's given an existing socket or not, which is
> a separate concern from what it's mostly doing.  We change it to require
> a suitable open socket to be passed in, and ensuring in the caller that we
> have one.
> 
> This requires adding the fallback paths to the caller, tcp_splice_new().
> We use slightly different approaches for a socket in the init ns versus the
> guest ns.
> 
> This also means that we no longer need to run tcp_splice_connect() itself
> in the guest ns, which allows us to remove a bunch of boilerplate code.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c        |  2 +-
>  tcp_conn.h   |  1 +
>  tcp_splice.c | 89 ++++++++++++++++++----------------------------------
>  3 files changed, 32 insertions(+), 60 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index a540235..7a88dab 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1849,7 +1849,7 @@ int tcp_conn_pool_sock(int pool[])
>   *
>   * Return: socket number on success, negative code if socket creation failed
>   */
> -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
> +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
>  {
>  	int s;
>  
> diff --git a/tcp_conn.h b/tcp_conn.h
> index c807e8b..a499f34 100644
> --- a/tcp_conn.h
> +++ b/tcp_conn.h
> @@ -193,6 +193,7 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
>  void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
>  void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
>  int tcp_conn_pool_sock(int pool[]);
> +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
>  void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
>  void tcp_splice_refill(const struct ctx *c);
>  
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 5a88975..1cd9fdc 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -88,6 +88,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
>  	"RCVLOWAT_ACT_B", "CLOSING",
>  };
>  
> +/* Forward decl */

s/decl/declaration/

> +static int tcp_sock_refill_ns(void *arg);
> +
>  /**
>   * tcp_splice_conn_epoll_events() - epoll events masks for given state
>   * @events:	Connection event flags
> @@ -348,12 +351,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
>   * Return: 0 for connect() succeeded or in progress, negative value on error
>   */
>  static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> -			      int s, in_port_t port)
> +			      int sock_conn, in_port_t port)
>  {
> -	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
> -							      AF_INET,
> -					      SOCK_STREAM | SOCK_NONBLOCK,
> -					      IPPROTO_TCP);
>  	struct sockaddr_in6 addr6 = {
>  		.sin6_family = AF_INET6,
>  		.sin6_port = htons(port),
> @@ -367,19 +366,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
>  	const struct sockaddr *sa;
>  	socklen_t sl;
>  
> -	if (sock_conn < 0)
> -		return -errno;
> -
> -	if (sock_conn > SOCKET_MAX) {
> -		close(sock_conn);
> -		return -EIO;
> -	}
> -
>  	conn->b = sock_conn;
>  
> -	if (s < 0)
> -		tcp_sock_set_bufsize(c, conn->b);
> -
>  	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK,
>  		       &((int){ 1 }), sizeof(int))) {
>  		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
> @@ -410,36 +398,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
>  	return 0;
>  }
>  
> -/**
> - * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
> - * @c:		Execution context
> - * @conn:	Accepted inbound connection
> - * @port:	Destination port, host order
> - * @ret:	Return value of tcp_splice_connect_ns()
> - */
> -struct tcp_splice_connect_ns_arg {
> -	const struct ctx *c;
> -	struct tcp_splice_conn *conn;
> -	in_port_t port;
> -	int ret;
> -};
> -
> -/**
> - * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
> - * @arg:	See struct tcp_splice_connect_ns_arg
> - *
> - * Return: 0
> - */
> -static int tcp_splice_connect_ns(void *arg)
> -{
> -	struct tcp_splice_connect_ns_arg *a;
> -
> -	a = (struct tcp_splice_connect_ns_arg *)arg;
> -	ns_enter(a->c);
> -	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
> -	return 0;
> -}
> -
>  /**
>   * tcp_splice_new() - Handle new spliced connection
>   * @c:		Execution context
> @@ -452,24 +410,37 @@ static int tcp_splice_connect_ns(void *arg)
>  static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
>  			  in_port_t port, int outbound)
>  {
> -	int *p, s = -1;
> -
> -	if (outbound)
> -		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
> -	else
> -		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
> +	int s = -1;
> +
> +	/* If the pool is empty we take slightly different approaches
> +	 * for init or ns sockets.  For init sockets we just open a
> +	 * new one without refilling the pool to keep latency down.
> +	 * For ns sockets, we're going to incur the latency of
> +	 * entering the ns anyway, so we might as well refill the
> +	 * pool.
> +	 */

Well, wait, one thing is the latency of clone() and setns(), another
thing is the latency coming from a series of (up to) 32 socket() calls.

It's much lower, indeed, but somewhat comparable. Could we just have the
same approach for both cases?

> +	if (outbound) {
> +		int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
> +		int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
> +
> +		s = tcp_conn_pool_sock(p);
> +		if (s < 0)
> +			s = tcp_conn_new_sock(c, af);
> +	} else {
> +		int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
>  
> -	s = tcp_conn_pool_sock(p);
> +		/* If pool is empty, refill it first */
> +		if (p[TCP_SOCK_POOL_SIZE-1] < 0)

[TCP_SOCK_POOL_SIZE - 1]

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/5] tcp: Make a helper to refill each socket pool
  2023-01-12 18:09   ` Stefano Brivio
@ 2023-01-15  0:22     ` David Gibson
  0 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2023-01-15  0:22 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 1158 bytes --]

On Thu, Jan 12, 2023 at 07:09:21PM +0100, Stefano Brivio wrote:
> On Mon,  9 Jan 2023 11:50:22 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > tcp_sock_refill() contains two near-identical loops to refill the IPv4
> > and IPv6 socket pools.  In addition, if we get an error on the IPv4 pool
> > we exit early and won't attempt to refill the IPv6 pool.  That doesn't seem
> > sensible, since these are essentially completely independent, and there
> > may be value to filling up either one without the other.
> 
> Well, for the purposes of the single error condition that's checked
> there, they're not independent, in practice: if we just had a socket
> number that's too high, the next one will be, too. This is not formally
> guaranteed, though.

Ok, I've tweaked the commit message a bit.

> In any case, this is indeed more elegant, and also drops a
> 'return -EIO' which doesn't really match the "Return: 0" comment.
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/5] tcp: Move socket pool declarations around
  2023-01-12 18:09   ` Stefano Brivio
@ 2023-01-15  0:26     ` David Gibson
  0 siblings, 0 replies; 13+ messages in thread
From: David Gibson @ 2023-01-15  0:26 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 7089 bytes --]

On Thu, Jan 12, 2023 at 07:09:25PM +0100, Stefano Brivio wrote:
> On Mon,  9 Jan 2023 11:50:24 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > tcp_splice.c has some explicit extern declarations to access the socket
> > pools.  This is pretty dangerous - if we changed the type of these
> > variables in tcp.c, we'd have tcp.c and tcp_splice.c using the memory
> > differently with no compiler error.  So, move the extern declarations to
> > tcp_conn.h so they're visible to both tcp.c and tcp_splice.c, but not the
> > rest of pasta.
> > 
> > In fact the pools for the guest namespace are necessarily only used by
> > tcp_splice.c - we have no sockets on the guest side if we're not splicing.
> > So move those declarations - and the functions that deal exclusively with
> > them - to tcp_splice.c
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c        | 41 ++++-------------------------------------
> >  tcp.h        |  2 --
> >  tcp_conn.h   | 10 ++++++++--
> >  tcp_splice.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
> >  4 files changed, 55 insertions(+), 48 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 4da823c..19fd17b 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -370,8 +370,6 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> >  #define FIN_TIMEOUT			60
> >  #define ACT_TIMEOUT			7200
> >  
> > -#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
> > -
> >  #define LOW_RTT_TABLE_SIZE		8
> >  #define LOW_RTT_THRESHOLD		10 /* us */
> >  
> > @@ -595,11 +593,9 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
> >  /* Table for lookup from remote address, local port, remote port */
> >  static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
> >  
> > -/* Pools for pre-opened sockets */
> > +/* Pools for pre-opened sockets (init namespace) */
> >  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
> >  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
> > -int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
> > -int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
> >  
> >  /**
> >   * tcp_conn_epoll_events() - epoll events mask for given connection state
> > @@ -2988,7 +2984,7 @@ static int tcp_ns_socks_init(void *arg)
> >   * @pool:	Pool of sockets to refill
> >   * @af:		Address family to use
> >   */
> > -static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
> > +void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
> >  {
> >  	int i;
> >  
> > @@ -3022,26 +3018,6 @@ static void tcp_sock_refill_init(const struct ctx *c)
> >  		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
> >  }
> >  
> > -/**
> > - * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
> > - * @arg:	Execution context cast to void *
> > - *
> > - * Return: 0
> > - */
> > -static int tcp_sock_refill_ns(void *arg)
> > -{
> > -	const struct ctx *c = (const struct ctx *)arg;
> > -
> > -	ns_enter(c);
> > -
> > -	if (c->ifi4)
> > -		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
> > -	if (c->ifi6)
> > -		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
> > -
> > -	return 0;
> > -}
> > -
> >  /**
> >   * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
> >   * @c:		Execution context
> > @@ -3090,8 +3066,6 @@ int tcp_init(struct ctx *c)
> >  
> >  	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
> >  	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
> > -	memset(ns_sock_pool4,		0xff,	sizeof(ns_sock_pool4));
> > -	memset(ns_sock_pool6,		0xff,	sizeof(ns_sock_pool6));
> >  	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
> >  	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
> >  
> > @@ -3101,8 +3075,6 @@ int tcp_init(struct ctx *c)
> >  		tcp_splice_init(c);
> >  
> >  		NS_CALL(tcp_ns_socks_init, c);
> > -
> > -		NS_CALL(tcp_sock_refill_ns, c);
> >  	}
> >  
> >  	return 0;
> > @@ -3255,11 +3227,6 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
> >  	}
> >  
> >  	tcp_sock_refill_init(c);
> > -	if (c->mode == MODE_PASTA) {
> > -		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
> > -		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
> > -			NS_CALL(tcp_sock_refill_ns, c);
> > -
> > -		tcp_splice_pipe_refill(c);
> > -	}
> > +	if (c->mode == MODE_PASTA)
> > +		tcp_splice_refill(c);
> >  }
> > diff --git a/tcp.h b/tcp.h
> > index 739b451..236038f 100644
> > --- a/tcp.h
> > +++ b/tcp.h
> > @@ -11,8 +11,6 @@
> >  #define TCP_CONN_INDEX_BITS		17	/* 128k */
> >  #define TCP_MAX_CONNS			(1 << TCP_CONN_INDEX_BITS)
> >  
> > -#define TCP_SOCK_POOL_SIZE		32
> > -
> >  struct ctx;
> >  
> >  void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
> > diff --git a/tcp_conn.h b/tcp_conn.h
> > index 70f4a7c..9951b0a 100644
> > --- a/tcp_conn.h
> > +++ b/tcp_conn.h
> > @@ -182,11 +182,17 @@ union tcp_conn {
> >  /* TCP connections */
> >  extern union tcp_conn tc[];
> >  
> > +/* Socket pools */
> > +#define TCP_SOCK_POOL_SIZE		32
> > +
> > +extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
> > +extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
> > +
> >  void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
> >  void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
> >  void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
> >  void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
> > -void tcp_splice_pipe_refill(const struct ctx *c);
> > -
> > +void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
> > +void tcp_splice_refill(const struct ctx *c);
> >  
> >  #endif /* TCP_CONN_H */
> > diff --git a/tcp_splice.c b/tcp_splice.c
> > index 72b1672..688d776 100644
> > --- a/tcp_splice.c
> > +++ b/tcp_splice.c
> > @@ -61,11 +61,11 @@
> >  #define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
> >  #define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */
> >  
> > -/* From tcp.c */
> > -extern int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
> > -extern int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
> > -extern int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
> > -extern int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
> > +/* Pools for pre-opened sockets (guest namespace) */
> 
> What you mean by "guest namespace" is perfectly clear, but strictly
> speaking, that's not a thing, and I'm a bit concerned that it might
> cause confusion (especially with passt).

Hm, right.

> What about using "init" (instead of "init namespace") above, and
> "namespace" here?

Yeah, alright.  I don't love this either, since just "namespace" and
"init" are also kind of ambiguous without context.  But, it's more
consistent with the terminology in other places, so it will do for
now.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice
  2023-01-12 18:09   ` Stefano Brivio
@ 2023-01-15  0:31     ` David Gibson
  2023-01-23 17:47       ` Stefano Brivio
  0 siblings, 1 reply; 13+ messages in thread
From: David Gibson @ 2023-01-15  0:31 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 7636 bytes --]

On Thu, Jan 12, 2023 at 07:09:27PM +0100, Stefano Brivio wrote:
> On Mon,  9 Jan 2023 11:50:26 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > When creating a new spliced connection, we need to get a socket in the
> > other ns from the originating one.  To avoid excessive ns switches we
> > usually get these from a pool refilled on a timer.  However, if the pool
> > runs out we need a fallback.  Currently that's done by passing -1 as the
> > socket to tcp_splice_connnect() and running it in the target ns.
> > 
> > This means that tcp_splice_connect() itself needs to have different cases
> > depending on whether it's given an existing socket or not, which is
> > a separate concern from what it's mostly doing.  We change it to require
> > a suitable open socket to be passed in, and ensuring in the caller that we
> > have one.
> > 
> > This requires adding the fallback paths to the caller, tcp_splice_new().
> > We use slightly different approaches for a socket in the init ns versus the
> > guest ns.
> > 
> > This also means that we no longer need to run tcp_splice_connect() itself
> > in the guest ns, which allows us to remove a bunch of boilerplate code.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c        |  2 +-
> >  tcp_conn.h   |  1 +
> >  tcp_splice.c | 89 ++++++++++++++++++----------------------------------
> >  3 files changed, 32 insertions(+), 60 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index a540235..7a88dab 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -1849,7 +1849,7 @@ int tcp_conn_pool_sock(int pool[])
> >   *
> >   * Return: socket number on success, negative code if socket creation failed
> >   */
> > -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
> > +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
> >  {
> >  	int s;
> >  
> > diff --git a/tcp_conn.h b/tcp_conn.h
> > index c807e8b..a499f34 100644
> > --- a/tcp_conn.h
> > +++ b/tcp_conn.h
> > @@ -193,6 +193,7 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
> >  void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
> >  void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
> >  int tcp_conn_pool_sock(int pool[]);
> > +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
> >  void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
> >  void tcp_splice_refill(const struct ctx *c);
> >  
> > diff --git a/tcp_splice.c b/tcp_splice.c
> > index 5a88975..1cd9fdc 100644
> > --- a/tcp_splice.c
> > +++ b/tcp_splice.c
> > @@ -88,6 +88,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
> >  	"RCVLOWAT_ACT_B", "CLOSING",
> >  };
> >  
> > +/* Forward decl */
> 
> s/decl/declaration/

Fixed.

> > +static int tcp_sock_refill_ns(void *arg);
> > +
> >  /**
> >   * tcp_splice_conn_epoll_events() - epoll events masks for given state
> >   * @events:	Connection event flags
> > @@ -348,12 +351,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
> >   * Return: 0 for connect() succeeded or in progress, negative value on error
> >   */
> >  static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> > -			      int s, in_port_t port)
> > +			      int sock_conn, in_port_t port)
> >  {
> > -	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
> > -							      AF_INET,
> > -					      SOCK_STREAM | SOCK_NONBLOCK,
> > -					      IPPROTO_TCP);
> >  	struct sockaddr_in6 addr6 = {
> >  		.sin6_family = AF_INET6,
> >  		.sin6_port = htons(port),
> > @@ -367,19 +366,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> >  	const struct sockaddr *sa;
> >  	socklen_t sl;
> >  
> > -	if (sock_conn < 0)
> > -		return -errno;
> > -
> > -	if (sock_conn > SOCKET_MAX) {
> > -		close(sock_conn);
> > -		return -EIO;
> > -	}
> > -
> >  	conn->b = sock_conn;
> >  
> > -	if (s < 0)
> > -		tcp_sock_set_bufsize(c, conn->b);
> > -
> >  	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK,
> >  		       &((int){ 1 }), sizeof(int))) {
> >  		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
> > @@ -410,36 +398,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> >  	return 0;
> >  }
> >  
> > -/**
> > - * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
> > - * @c:		Execution context
> > - * @conn:	Accepted inbound connection
> > - * @port:	Destination port, host order
> > - * @ret:	Return value of tcp_splice_connect_ns()
> > - */
> > -struct tcp_splice_connect_ns_arg {
> > -	const struct ctx *c;
> > -	struct tcp_splice_conn *conn;
> > -	in_port_t port;
> > -	int ret;
> > -};
> > -
> > -/**
> > - * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
> > - * @arg:	See struct tcp_splice_connect_ns_arg
> > - *
> > - * Return: 0
> > - */
> > -static int tcp_splice_connect_ns(void *arg)
> > -{
> > -	struct tcp_splice_connect_ns_arg *a;
> > -
> > -	a = (struct tcp_splice_connect_ns_arg *)arg;
> > -	ns_enter(a->c);
> > -	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
> > -	return 0;
> > -}
> > -
> >  /**
> >   * tcp_splice_new() - Handle new spliced connection
> >   * @c:		Execution context
> > @@ -452,24 +410,37 @@ static int tcp_splice_connect_ns(void *arg)
> >  static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
> >  			  in_port_t port, int outbound)
> >  {
> > -	int *p, s = -1;
> > -
> > -	if (outbound)
> > -		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
> > -	else
> > -		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
> > +	int s = -1;
> > +
> > +	/* If the pool is empty we take slightly different approaches
> > +	 * for init or ns sockets.  For init sockets we just open a
> > +	 * new one without refilling the pool to keep latency down.
> > +	 * For ns sockets, we're going to incur the latency of
> > +	 * entering the ns anyway, so we might as well refill the
> > +	 * pool.
> > +	 */
> 
> Well, wait, one thing is the latency of clone() and setns(), another
> thing is the latency coming from a series of (up to) 32 socket() calls.
> 
> It's much lower, indeed, but somewhat comparable. Could we just have the
> same approach for both cases?

Well, we could.  Making both paths refill the pool is very simple, but
might introduce more latency than we really want for the easier init
case.

Making both paths just open a single socket is a little bit fiddlier,
because we'll need some more boilerplate for another helper that can
be NS_CALL()ed, instead of reusing the refill one.

What we have here was the tradeoff I settled on, admittedly without
much quantitative idea of the latencies involved.  Happy to accept your
call on a better one.

> > +	if (outbound) {
> > +		int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
> > +		int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
> > +
> > +		s = tcp_conn_pool_sock(p);
> > +		if (s < 0)
> > +			s = tcp_conn_new_sock(c, af);
> > +	} else {
> > +		int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
> >  
> > -	s = tcp_conn_pool_sock(p);
> > +		/* If pool is empty, refill it first */
> > +		if (p[TCP_SOCK_POOL_SIZE-1] < 0)
> 
> [TCP_SOCK_POOL_SIZE - 1]
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice
  2023-01-15  0:31     ` David Gibson
@ 2023-01-23 17:47       ` Stefano Brivio
  0 siblings, 0 replies; 13+ messages in thread
From: Stefano Brivio @ 2023-01-23 17:47 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Sun, 15 Jan 2023 10:31:08 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Thu, Jan 12, 2023 at 07:09:27PM +0100, Stefano Brivio wrote:
> > On Mon,  9 Jan 2023 11:50:26 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > When creating a new spliced connection, we need to get a socket in the
> > > other ns from the originating one.  To avoid excessive ns switches we
> > > usually get these from a pool refilled on a timer.  However, if the pool
> > > runs out we need a fallback.  Currently that's done by passing -1 as the
> > > socket to tcp_splice_connect() and running it in the target ns.
> > > 
> > > This means that tcp_splice_connect() itself needs to have different cases
> > > depending on whether it's given an existing socket or not, which is
> > > a separate concern from what it's mostly doing.  We change it to require
> > > a suitable open socket to be passed in, and ensure in the caller that we
> > > have one.
> > > 
> > > This requires adding the fallback paths to the caller, tcp_splice_new().
> > > We use slightly different approaches for a socket in the init ns versus the
> > > guest ns.
> > > 
> > > This also means that we no longer need to run tcp_splice_connect() itself
> > > in the guest ns, which allows us to remove a bunch of boilerplate code.
> > > 
> > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > ---
> > >  tcp.c        |  2 +-
> > >  tcp_conn.h   |  1 +
> > >  tcp_splice.c | 89 ++++++++++++++++++----------------------------------
> > >  3 files changed, 32 insertions(+), 60 deletions(-)
> > > 
> > > diff --git a/tcp.c b/tcp.c
> > > index a540235..7a88dab 100644
> > > --- a/tcp.c
> > > +++ b/tcp.c
> > > @@ -1849,7 +1849,7 @@ int tcp_conn_pool_sock(int pool[])
> > >   *
> > >   * Return: socket number on success, negative code if socket creation failed
> > >   */
> > > -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
> > > +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
> > >  {
> > >  	int s;
> > >  
> > > diff --git a/tcp_conn.h b/tcp_conn.h
> > > index c807e8b..a499f34 100644
> > > --- a/tcp_conn.h
> > > +++ b/tcp_conn.h
> > > @@ -193,6 +193,7 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
> > >  void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
> > >  void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
> > >  int tcp_conn_pool_sock(int pool[]);
> > > +int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
> > >  void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
> > >  void tcp_splice_refill(const struct ctx *c);
> > >  
> > > diff --git a/tcp_splice.c b/tcp_splice.c
> > > index 5a88975..1cd9fdc 100644
> > > --- a/tcp_splice.c
> > > +++ b/tcp_splice.c
> > > @@ -88,6 +88,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
> > >  	"RCVLOWAT_ACT_B", "CLOSING",
> > >  };
> > >  
> > > +/* Forward decl */  
> > 
> > s/decl/declaration/  
> 
> Fixed.
> 
> > > +static int tcp_sock_refill_ns(void *arg);
> > > +
> > >  /**
> > >   * tcp_splice_conn_epoll_events() - epoll events masks for given state
> > >   * @events:	Connection event flags
> > > @@ -348,12 +351,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
> > >   * Return: 0 for connect() succeeded or in progress, negative value on error
> > >   */
> > >  static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> > > -			      int s, in_port_t port)
> > > +			      int sock_conn, in_port_t port)
> > >  {
> > > -	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
> > > -							      AF_INET,
> > > -					      SOCK_STREAM | SOCK_NONBLOCK,
> > > -					      IPPROTO_TCP);
> > >  	struct sockaddr_in6 addr6 = {
> > >  		.sin6_family = AF_INET6,
> > >  		.sin6_port = htons(port),
> > > @@ -367,19 +366,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> > >  	const struct sockaddr *sa;
> > >  	socklen_t sl;
> > >  
> > > -	if (sock_conn < 0)
> > > -		return -errno;
> > > -
> > > -	if (sock_conn > SOCKET_MAX) {
> > > -		close(sock_conn);
> > > -		return -EIO;
> > > -	}
> > > -
> > >  	conn->b = sock_conn;
> > >  
> > > -	if (s < 0)
> > > -		tcp_sock_set_bufsize(c, conn->b);
> > > -
> > >  	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK,
> > >  		       &((int){ 1 }), sizeof(int))) {
> > >  		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
> > > @@ -410,36 +398,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
> > >  	return 0;
> > >  }
> > >  
> > > -/**
> > > - * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
> > > - * @c:		Execution context
> > > - * @conn:	Accepted inbound connection
> > > - * @port:	Destination port, host order
> > > - * @ret:	Return value of tcp_splice_connect_ns()
> > > - */
> > > -struct tcp_splice_connect_ns_arg {
> > > -	const struct ctx *c;
> > > -	struct tcp_splice_conn *conn;
> > > -	in_port_t port;
> > > -	int ret;
> > > -};
> > > -
> > > -/**
> > > - * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
> > > - * @arg:	See struct tcp_splice_connect_ns_arg
> > > - *
> > > - * Return: 0
> > > - */
> > > -static int tcp_splice_connect_ns(void *arg)
> > > -{
> > > -	struct tcp_splice_connect_ns_arg *a;
> > > -
> > > -	a = (struct tcp_splice_connect_ns_arg *)arg;
> > > -	ns_enter(a->c);
> > > -	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
> > > -	return 0;
> > > -}
> > > -
> > >  /**
> > >   * tcp_splice_new() - Handle new spliced connection
> > >   * @c:		Execution context
> > > @@ -452,24 +410,37 @@ static int tcp_splice_connect_ns(void *arg)
> > >  static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
> > >  			  in_port_t port, int outbound)
> > >  {
> > > -	int *p, s = -1;
> > > -
> > > -	if (outbound)
> > > -		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
> > > -	else
> > > -		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
> > > +	int s = -1;
> > > +
> > > +	/* If the pool is empty we take slightly different approaches
> > > +	 * for init or ns sockets.  For init sockets we just open a
> > > +	 * new one without refilling the pool to keep latency down.
> > > +	 * For ns sockets, we're going to incur the latency of
> > > +	 * entering the ns anyway, so we might as well refill the
> > > +	 * pool.
> > > +	 */  
> > 
> > Well, wait, one thing is the latency of clone() and setns(), another
> > thing is the latency coming from a series of (up to) 32 socket() calls.
> > 
> > It's much lower, indeed, but somewhat comparable. Could we just have the
> > same approach for both cases?  
> 
> Well, we could.  Making both paths refill the pool is very simple, but
> might introduce more latency than we really want for the easier init
> case.
> 
> Making both paths just open a single socket is a little bit fiddlier,
> because we'll need some more boilerplate for another helper that can
> be NS_CALL()ed, instead of reusing the refill one.
> 
> What we have here was the tradeoff I settled on, admittedly without
> much quantitative idea of the latencies involved.  Happy to accept your
> call on a better one.

Thinking about this a bit further, I guess it's actually okay to have
this path refill the pool -- after all, it only happens if the pool is
already depleted, which shouldn't be a frequent condition, in practice.

-- 
Stefano


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-01-23 17:47 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-09  0:50 [PATCH 0/5] Cleanups to tcp socket pool handling David Gibson
2023-01-09  0:50 ` [PATCH 1/5] tcp: Make a helper to refill each socket pool David Gibson
2023-01-12 18:09   ` Stefano Brivio
2023-01-15  0:22     ` David Gibson
2023-01-09  0:50 ` [PATCH 2/5] tcp: Split init and ns cases for tcp_sock_refill() David Gibson
2023-01-09  0:50 ` [PATCH 3/5] tcp: Move socket pool declarations around David Gibson
2023-01-12 18:09   ` Stefano Brivio
2023-01-15  0:26     ` David Gibson
2023-01-09  0:50 ` [PATCH 4/5] tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock() David Gibson
2023-01-09  0:50 ` [PATCH 5/5] tcp: Improve handling of fallback if socket pool is empty on new splice David Gibson
2023-01-12 18:09   ` Stefano Brivio
2023-01-15  0:31     ` David Gibson
2023-01-23 17:47       ` Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).