From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 08/14] tcp: Unify spliced and non-spliced connection tables
Date: Mon, 14 Nov 2022 17:17:05 +1100 [thread overview]
Message-ID: <20221114061711.1655510-9-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20221114061711.1655510-1-david@gibson.dropbear.id.au>
Currently spliced and non-spliced connections are stored in completely
separate tables, so there are independent limits on the number of spliced
and non-spliced connections. This is a bit counter-intuitive.
More importantly, the fact that the tables are separate prevents us from
unifying some other logic between the two cases. So, merge these two
tables into one, using the 'c.spliced' common field to distinguish between
them when necessary.
For now we keep a common limit of 128k connections, whether they're spliced
or non-spliced, which means we save memory overall. If necessary we could
increase this to a total of 256k or more, which would cost memory but give
some more flexibility.
For now, the code paths which need to step through all extant connections
are still separate for the two cases, just skipping over entries which
aren't for them. We'll improve that in later patches.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 46 ++++++++++++++++++++----------------
tcp.h | 2 +-
tcp_conn.h | 6 +++++
tcp_splice.c | 66 ++++++++++++++--------------------------------------
4 files changed, 51 insertions(+), 69 deletions(-)
diff --git a/tcp.c b/tcp.c
index 44e1640..ffc030e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -98,11 +98,11 @@
* Connection tracking and storage
* -------------------------------
*
- * Connections are tracked by the @tc array of struct tcp_tap_conn, containing
- * addresses, ports, TCP states and parameters. This is statically allocated and
- * indexed by an arbitrary connection number. The array is compacted whenever a
- * connection is closed, by remapping the highest connection index in use to the
- * one freed up.
+ * Connections are tracked by struct tcp_tap_conn entries in the @tc
+ * array, containing addresses, ports, TCP states and parameters. This
+ * is statically allocated and indexed by an arbitrary connection
+ * number. The array is compacted whenever a connection is closed, by
+ * remapping the highest connection index in use to the one freed up.
*
* References used for the epoll interface report the connection index used for
* the @tc array.
@@ -588,10 +588,10 @@ static unsigned int tcp6_l2_flags_buf_used;
static size_t tcp6_l2_flags_buf_bytes;
/* TCP connections */
-static struct tcp_tap_conn tc[TCP_MAX_CONNS];
+union tcp_conn tc[TCP_MAX_CONNS];
-#define CONN(index) (tc + (index))
-#define CONN_IDX(conn) ((conn) - tc)
+#define CONN(index) (&tc[(index)].tap)
+#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
/** conn_at_idx() - Find a connection by index, if present
* @index: Index of connection to lookup
@@ -1350,26 +1350,28 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, int af,
* @c: Execution context
* @hole: Pointer to recently closed connection
*/
-static void tcp_table_compact(struct ctx *c, struct tcp_tap_conn *hole)
+void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
{
- struct tcp_tap_conn *from, *to;
+ union tcp_conn *from;
if (CONN_IDX(hole) == --c->tcp.conn_count) {
- debug("TCP: hash table compaction: maximum index was %li (%p)",
+ debug("TCP: table compaction: maximum index was %li (%p)",
CONN_IDX(hole), hole);
memset(hole, 0, sizeof(*hole));
return;
}
- from = CONN(c->tcp.conn_count);
+ from = tc + c->tcp.conn_count;
memcpy(hole, from, sizeof(*hole));
- to = hole;
- tcp_tap_conn_update(c, from, to);
+ if (from->c.spliced)
+ tcp_splice_conn_update(c, &hole->splice);
+ else
+ tcp_tap_conn_update(c, &from->tap, &hole->tap);
- debug("TCP: hash table compaction: old index %li, new index %li, "
- "sock %i, from: %p, to: %p",
- CONN_IDX(from), CONN_IDX(to), from->sock, from, to);
+ debug("TCP: table compaction (spliced=%d): old index %li, new index %li, "
+ "from: %p, to: %p",
+ from->c.spliced, CONN_IDX(from), CONN_IDX(hole), from, hole);
memset(from, 0, sizeof(*from));
}
@@ -1386,7 +1388,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
close(conn->timer);
tcp_hash_remove(conn);
- tcp_table_compact(c, conn);
+ tcp_table_compact(c, (union tcp_conn *)conn);
}
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
@@ -1534,7 +1536,9 @@ void tcp_defer_handler(struct ctx *c)
if (c->tcp.conn_count < MIN(max_files, max_conns))
return;
- for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (conn->c.spliced)
+ continue;
if (conn->events == CLOSED)
tcp_conn_destroy(c, conn);
}
@@ -3432,7 +3436,9 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
}
}
- for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (conn->c.spliced)
+ continue;
if (conn->events == CLOSED)
tcp_conn_destroy(c, conn);
}
diff --git a/tcp.h b/tcp.h
index bba0f38..49738ef 100644
--- a/tcp.h
+++ b/tcp.h
@@ -54,7 +54,7 @@ union tcp_epoll_ref {
/**
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
- * @conn_count: Count of connections (not spliced) in connection table
+ * @conn_count: Count of total connections in connection table
* @splice_conn_count: Count of spliced connections in connection table
* @port_to_tap: Ports bound host-side, packets to tap or spliced
* @fwd_in: Port forwarding configuration for inbound packets
diff --git a/tcp_conn.h b/tcp_conn.h
index 39d104a..4295f7d 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -195,4 +195,10 @@ union tcp_conn {
struct tcp_splice_conn splice;
};
+/* TCP connections */
+extern union tcp_conn tc[];
+
+void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
+void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
+
#endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index 42133af..f12dc2b 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -16,7 +16,7 @@
* For local traffic directed to TCP ports configured for direct
* mapping between namespaces, packets are directly translated between
* L4 sockets using a pair of splice() syscalls. These connections are
- * tracked in the @tc_splice array of struct tcp_splice_conn, using
+ * tracked by struct tcp_splice_conn entries in the @tc array, using
* these events:
*
* - SPLICE_CONNECT: connection accepted, connecting to target
@@ -57,7 +57,7 @@
#define MAX_PIPE_SIZE (8UL * 1024 * 1024)
#define TCP_SPLICE_MAX_CONNS (128 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE 16
-#define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */
+#define TCP_SPLICE_CONN_PRESSURE 30 /* % of conn_count */
#define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */
/* From tcp.c */
@@ -72,11 +72,8 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
#define CONN_V6(x) (x->flags & SPLICE_V6)
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
-#define CONN(index) (tc_splice + (index))
-#define CONN_IDX(conn) ((conn) - tc_splice)
-
-/* Spliced connections */
-static struct tcp_splice_conn tc_splice[TCP_SPLICE_MAX_CONNS];
+#define CONN(index) (&tc[(index)].splice)
+#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -248,43 +245,13 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
* @c: Execution context
* @new: New location of tcp_splice_conn
*/
-static void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new)
+void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new)
{
tcp_splice_epoll_ctl(c, new);
if (tcp_splice_epoll_ctl(c, new))
conn_flag(c, new, CLOSING);
}
-/**
- * tcp_table_splice_compact - Compact spliced connection table
- * @c: Execution context
- * @hole: Pointer to recently closed connection
- */
-static void tcp_table_splice_compact(struct ctx *c,
- struct tcp_splice_conn *hole)
-{
- struct tcp_splice_conn *move;
-
- if (CONN_IDX(hole) == --c->tcp.splice_conn_count) {
- debug("TCP (spliced): index %li (max) removed", CONN_IDX(hole));
- return;
- }
-
- move = CONN(c->tcp.splice_conn_count);
-
- memcpy(hole, move, sizeof(*hole));
-
- move->a = move->b = -1;
- move->a_read = move->a_written = move->b_read = move->b_written = 0;
- move->pipe_a_b[0] = move->pipe_a_b[1] = -1;
- move->pipe_b_a[0] = move->pipe_b_a[1] = -1;
- move->flags = move->events = 0;
-
- debug("TCP (spliced): index %li moved to %li",
- CONN_IDX(move), CONN_IDX(hole));
- tcp_splice_conn_update(c, hole);
-}
-
/**
* tcp_splice_destroy() - Close spliced connection and pipes, clear
* @c: Execution context
@@ -319,7 +286,8 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
conn->flags = 0;
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
- tcp_table_splice_compact(c, conn);
+ c->tcp.splice_conn_count--;
+ tcp_table_compact(c, (union tcp_conn *)conn);
}
/**
@@ -553,7 +521,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (ref.r.p.tcp.tcp.listen) {
int s;
- if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
+ if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
@@ -565,8 +533,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
s);
}
- conn = CONN(c->tcp.splice_conn_count++);
+ conn = CONN(c->tcp.conn_count++);
conn->c.spliced = true;
+ c->tcp.splice_conn_count++;
conn->a = s;
conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
@@ -845,9 +814,10 @@ void tcp_splice_timer(struct ctx *c)
{
struct tcp_splice_conn *conn;
- for (conn = CONN(c->tcp.splice_conn_count - 1);
- conn >= tc_splice;
- conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (!conn->c.spliced)
+ continue;
+
if (conn->flags & CLOSING) {
tcp_splice_destroy(c, conn);
return;
@@ -890,12 +860,12 @@ void tcp_splice_defer_handler(struct ctx *c)
int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE;
struct tcp_splice_conn *conn;
- if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns))
+ if (c->tcp.conn_count < MIN(max_files / 6, max_conns))
return;
- for (conn = CONN(c->tcp.splice_conn_count - 1);
- conn >= tc_splice;
- conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (!conn->c.spliced)
+ continue;
if (conn->flags & CLOSING)
tcp_splice_destroy(c, conn);
}
--
@@ -16,7 +16,7 @@
* For local traffic directed to TCP ports configured for direct
* mapping between namespaces, packets are directly translated between
* L4 sockets using a pair of splice() syscalls. These connections are
- * tracked in the @tc_splice array of struct tcp_splice_conn, using
+ * tracked by struct tcp_splice_conn entries in the @tc array, using
* these events:
*
* - SPLICE_CONNECT: connection accepted, connecting to target
@@ -57,7 +57,7 @@
#define MAX_PIPE_SIZE (8UL * 1024 * 1024)
#define TCP_SPLICE_MAX_CONNS (128 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE 16
-#define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */
+#define TCP_SPLICE_CONN_PRESSURE 30 /* % of conn_count */
#define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */
/* From tcp.c */
@@ -72,11 +72,8 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
#define CONN_V6(x) (x->flags & SPLICE_V6)
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
-#define CONN(index) (tc_splice + (index))
-#define CONN_IDX(conn) ((conn) - tc_splice)
-
-/* Spliced connections */
-static struct tcp_splice_conn tc_splice[TCP_SPLICE_MAX_CONNS];
+#define CONN(index) (&tc[(index)].splice)
+#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -248,43 +245,13 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
* @c: Execution context
* @new: New location of tcp_splice_conn
*/
-static void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new)
+void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new)
{
tcp_splice_epoll_ctl(c, new);
if (tcp_splice_epoll_ctl(c, new))
conn_flag(c, new, CLOSING);
}
-/**
- * tcp_table_splice_compact - Compact spliced connection table
- * @c: Execution context
- * @hole: Pointer to recently closed connection
- */
-static void tcp_table_splice_compact(struct ctx *c,
- struct tcp_splice_conn *hole)
-{
- struct tcp_splice_conn *move;
-
- if (CONN_IDX(hole) == --c->tcp.splice_conn_count) {
- debug("TCP (spliced): index %li (max) removed", CONN_IDX(hole));
- return;
- }
-
- move = CONN(c->tcp.splice_conn_count);
-
- memcpy(hole, move, sizeof(*hole));
-
- move->a = move->b = -1;
- move->a_read = move->a_written = move->b_read = move->b_written = 0;
- move->pipe_a_b[0] = move->pipe_a_b[1] = -1;
- move->pipe_b_a[0] = move->pipe_b_a[1] = -1;
- move->flags = move->events = 0;
-
- debug("TCP (spliced): index %li moved to %li",
- CONN_IDX(move), CONN_IDX(hole));
- tcp_splice_conn_update(c, hole);
-}
-
/**
* tcp_splice_destroy() - Close spliced connection and pipes, clear
* @c: Execution context
@@ -319,7 +286,8 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
conn->flags = 0;
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
- tcp_table_splice_compact(c, conn);
+ c->tcp.splice_conn_count--;
+ tcp_table_compact(c, (union tcp_conn *)conn);
}
/**
@@ -553,7 +521,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (ref.r.p.tcp.tcp.listen) {
int s;
- if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
+ if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
@@ -565,8 +533,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
s);
}
- conn = CONN(c->tcp.splice_conn_count++);
+ conn = CONN(c->tcp.conn_count++);
conn->c.spliced = true;
+ c->tcp.splice_conn_count++;
conn->a = s;
conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
@@ -845,9 +814,10 @@ void tcp_splice_timer(struct ctx *c)
{
struct tcp_splice_conn *conn;
- for (conn = CONN(c->tcp.splice_conn_count - 1);
- conn >= tc_splice;
- conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (!conn->c.spliced)
+ continue;
+
if (conn->flags & CLOSING) {
tcp_splice_destroy(c, conn);
return;
@@ -890,12 +860,12 @@ void tcp_splice_defer_handler(struct ctx *c)
int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE;
struct tcp_splice_conn *conn;
- if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns))
+ if (c->tcp.conn_count < MIN(max_files / 6, max_conns))
return;
- for (conn = CONN(c->tcp.splice_conn_count - 1);
- conn >= tc_splice;
- conn--) {
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= CONN(0); conn--) {
+ if (!conn->c.spliced)
+ continue;
if (conn->flags & CLOSING)
tcp_splice_destroy(c, conn);
}
--
2.38.1
next prev parent reply other threads:[~2022-11-14 6:17 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-14 6:16 [PATCH 00/14] RFC: tcp: Don't use separate listening sockets for spliced and non-spliced connections David Gibson
2022-11-14 6:16 ` [PATCH 01/14] style: Minor corrections to function comments David Gibson
2022-11-14 6:16 ` [PATCH 02/14] tcp: Remove unused TCP_MAX_SOCKS constant David Gibson
2022-11-14 6:17 ` [PATCH 03/14] tcp: Better helpers for converting between connection pointer and index David Gibson
2022-11-14 6:17 ` [PATCH 04/14] tcp_splice: Helpers for converting from index to/from tcp_splice_conn David Gibson
2022-11-14 6:17 ` [PATCH 05/14] tcp: Move connection state structures into a shared header David Gibson
2022-11-14 6:17 ` [PATCH 06/14] tcp: Add connection union type David Gibson
2022-11-14 6:17 ` [PATCH 07/14] tcp: Improved helpers to update connections after moving David Gibson
2022-11-14 6:17 ` David Gibson [this message]
2022-11-14 6:17 ` [PATCH 09/14] tcp: Unify tcp_defer_handler and tcp_splice_defer_handler() David Gibson
2022-11-14 6:17 ` [PATCH 10/14] tcp: Partially unify tcp_timer() and tcp_splice_timer() David Gibson
2022-11-14 6:17 ` [PATCH 11/14] tcp: Unify the IN_EPOLL flag David Gibson
2022-11-14 6:17 ` [PATCH 12/14] tcp: Separate helpers to create ns listening sockets David Gibson
2022-11-14 6:17 ` [PATCH 13/14] tcp: Unify part of spliced and non-spliced conn_from_sock path David Gibson
2022-11-14 6:17 ` [PATCH 14/14] tcp: Use the same sockets to listen for spliced and non-spliced connections David Gibson
2022-11-15 1:22 ` [PATCH 00/14] RFC: tcp: Don't use separate listening sockets " Stefano Brivio
2022-11-15 4:57 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221114061711.1655510-9-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).