From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 14/14] tcp: Use the same sockets to listen for spliced and non-spliced connections
Date: Mon, 14 Nov 2022 17:17:11 +1100 [thread overview]
Message-ID: <20221114061711.1655510-15-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20221114061711.1655510-1-david@gibson.dropbear.id.au>
In pasta mode, tcp_sock_init[46]() create separate sockets to listen for
spliced connections (these are bound to localhost) and non-spliced
connections (these are bound to the host address). This introduces a
subtle behavioural difference between pasta and passt: by default, pasta
will listen only on a single host address, whereas passt will listen on
all addresses (0.0.0.0 or ::). This also prevents us using some additional
optimizations that only work with the unspecified (0.0.0.0 or ::) address.
However, it turns out we don't need to do this. We can splice a connection
if and only if it originates from the loopback address. Currently we
ensure this by having the "spliced" listening sockets listening only on
loopback. However, we can defer the decision about whether to splice a
connection until after accept(), by checking if the connection was made
from localhost.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 131 +++++++++++++++++-----------------------------------------
1 file changed, 39 insertions(+), 92 deletions(-)
diff --git a/tcp.c b/tcp.c
index e7bfc8c..ac70d4e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -434,7 +434,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
};
/* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_lo [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
@@ -2847,9 +2846,13 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
{
struct sockaddr_storage sa;
union tcp_conn *conn;
+ bool can_splice = false;
socklen_t sl;
int s;
+ assert(ref.r.p.tcp.tcp.listen);
+ assert(!ref.r.p.tcp.tcp.splice);
+
if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
@@ -2860,7 +2863,25 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn = tc + c->tcp.conn_count++;
- if (ref.r.p.tcp.tcp.splice)
+ if (c->mode == MODE_PASTA) {
+ if (ref.r.p.tcp.tcp.v6) {
+ const struct sockaddr_in6 *sa6
+ = (const struct sockaddr_in6 *)&sa;
+ /* clang-tidy doesn't realize accept() initializes sa/sa6 */
+ /* NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) */
+ if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+ can_splice = true;
+ } else {
+ const struct sockaddr_in *sa4 =
+ (const struct sockaddr_in *)&sa;
+ /* clang-tidy doesn't realize accept() initializes sa/sa4 */
+ /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
+ if (htonl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK)
+ can_splice = true;
+ }
+ }
+
+ if (can_splice)
tcp_splice_conn_from_sock(c, ref, &conn->splice, s);
else
tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
@@ -3017,47 +3038,16 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
{
in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx };
- bool spliced = false, tap = true;
int s;
- if (c->mode == MODE_PASTA) {
- spliced = !addr || IN4_IS_ADDR_UNSPECIFIED(addr) ||
- IN4_IS_ADDR_LOOPBACK(addr);
-
- if (!addr)
- addr = &c->ip4.addr;
-
- tap = !IN4_IS_ADDR_LOOPBACK(addr);
- }
-
- if (tap) {
- s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
-
- if (c->tcp.fwd_in.mode == FWD_AUTO)
- tcp_sock_init_ext[port][V4] = s;
- }
-
- if (spliced) {
- struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
- tref.tcp.splice = 1;
-
- addr = &loopback;
-
- s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
+ s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, tref.u32);
+ if (s >= 0)
+ tcp_sock_set_bufsize(c, s);
+ else
+ s = -1;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_init_lo[port][V4] = s;
- }
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_sock_init_ext[port][V4] = s;
}
/**
@@ -3074,47 +3064,16 @@ static void tcp_sock_init6(const struct ctx *c,
in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1,
.tcp.index = idx };
- bool spliced = false, tap = true;
int s;
- if (c->mode == MODE_PASTA) {
- spliced = !addr ||
- IN6_IS_ADDR_UNSPECIFIED(addr) ||
- IN6_IS_ADDR_LOOPBACK(addr);
-
- if (!addr)
- addr = &c->ip6.addr;
-
- tap = !IN6_IS_ADDR_LOOPBACK(addr);
- }
-
- if (tap) {
- s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
-
- if (c->tcp.fwd_in.mode == FWD_AUTO)
- tcp_sock_init_ext[port][V6] = s;
- }
-
- if (spliced) {
- tref.tcp.splice = 1;
-
- addr = &in6addr_loopback;
-
- s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
+ s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, tref.u32);
+ if (s >= 0)
+ tcp_sock_set_bufsize(c, s);
+ else
+ s = -1;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_init_lo[port][V6] = s;
- }
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_sock_init_ext[port][V6] = s;
}
/**
@@ -3143,7 +3102,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
{
in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
- .tcp.splice = 1, .tcp.index = idx };
+ .tcp.index = idx };
struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
int s;
@@ -3168,8 +3127,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
{
in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
- .tcp.splice = 1, .tcp.v6 = 1,
- .tcp.index = idx};
+ .tcp.v6 = 1, .tcp.index = idx};
int s;
assert(c->mode == MODE_PASTA);
@@ -3336,7 +3294,6 @@ int tcp_init(struct ctx *c)
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
- memset(tcp_sock_init_lo, 0xff, sizeof(tcp_sock_init_lo));
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
@@ -3444,16 +3401,6 @@ static int tcp_port_rebind(void *arg)
close(tcp_sock_init_ext[port][V6]);
tcp_sock_init_ext[port][V6] = -1;
}
-
- if (tcp_sock_init_lo[port][V4] >= 0) {
- close(tcp_sock_init_lo[port][V4]);
- tcp_sock_init_lo[port][V4] = -1;
- }
-
- if (tcp_sock_init_lo[port][V6] >= 0) {
- close(tcp_sock_init_lo[port][V6]);
- tcp_sock_init_lo[port][V6] = -1;
- }
continue;
}
--
@@ -434,7 +434,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
};
/* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_lo [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
@@ -2847,9 +2846,13 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
{
struct sockaddr_storage sa;
union tcp_conn *conn;
+ bool can_splice = false;
socklen_t sl;
int s;
+ assert(ref.r.p.tcp.tcp.listen);
+ assert(!ref.r.p.tcp.tcp.splice);
+
if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
@@ -2860,7 +2863,25 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn = tc + c->tcp.conn_count++;
- if (ref.r.p.tcp.tcp.splice)
+ if (c->mode == MODE_PASTA) {
+ if (ref.r.p.tcp.tcp.v6) {
+ const struct sockaddr_in6 *sa6
+ = (const struct sockaddr_in6 *)&sa;
+ /* clang-tidy doesn't realize accept() initializes sa/sa6 */
+ /* NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) */
+ if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+ can_splice = true;
+ } else {
+ const struct sockaddr_in *sa4 =
+ (const struct sockaddr_in *)&sa;
+ /* clang-tidy doesn't realize accept() initializes sa/sa4 */
+ /* NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) */
+ if (htonl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK)
+ can_splice = true;
+ }
+ }
+
+ if (can_splice)
tcp_splice_conn_from_sock(c, ref, &conn->splice, s);
else
tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
@@ -3017,47 +3038,16 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
{
in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx };
- bool spliced = false, tap = true;
int s;
- if (c->mode == MODE_PASTA) {
- spliced = !addr || IN4_IS_ADDR_UNSPECIFIED(addr) ||
- IN4_IS_ADDR_LOOPBACK(addr);
-
- if (!addr)
- addr = &c->ip4.addr;
-
- tap = !IN4_IS_ADDR_LOOPBACK(addr);
- }
-
- if (tap) {
- s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
-
- if (c->tcp.fwd_in.mode == FWD_AUTO)
- tcp_sock_init_ext[port][V4] = s;
- }
-
- if (spliced) {
- struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
- tref.tcp.splice = 1;
-
- addr = &loopback;
-
- s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
+ s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, tref.u32);
+ if (s >= 0)
+ tcp_sock_set_bufsize(c, s);
+ else
+ s = -1;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_init_lo[port][V4] = s;
- }
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_sock_init_ext[port][V4] = s;
}
/**
@@ -3074,47 +3064,16 @@ static void tcp_sock_init6(const struct ctx *c,
in_port_t idx = port + c->tcp.fwd_in.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1,
.tcp.index = idx };
- bool spliced = false, tap = true;
int s;
- if (c->mode == MODE_PASTA) {
- spliced = !addr ||
- IN6_IS_ADDR_UNSPECIFIED(addr) ||
- IN6_IS_ADDR_LOOPBACK(addr);
-
- if (!addr)
- addr = &c->ip6.addr;
-
- tap = !IN6_IS_ADDR_LOOPBACK(addr);
- }
-
- if (tap) {
- s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
-
- if (c->tcp.fwd_in.mode == FWD_AUTO)
- tcp_sock_init_ext[port][V6] = s;
- }
-
- if (spliced) {
- tref.tcp.splice = 1;
-
- addr = &in6addr_loopback;
-
- s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
- tref.u32);
- if (s >= 0)
- tcp_sock_set_bufsize(c, s);
- else
- s = -1;
+ s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, tref.u32);
+ if (s >= 0)
+ tcp_sock_set_bufsize(c, s);
+ else
+ s = -1;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_init_lo[port][V6] = s;
- }
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_sock_init_ext[port][V6] = s;
}
/**
@@ -3143,7 +3102,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
{
in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
- .tcp.splice = 1, .tcp.index = idx };
+ .tcp.index = idx };
struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
int s;
@@ -3168,8 +3127,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
{
in_port_t idx = port + c->tcp.fwd_out.delta[port];
union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
- .tcp.splice = 1, .tcp.v6 = 1,
- .tcp.index = idx};
+ .tcp.v6 = 1, .tcp.index = idx};
int s;
assert(c->mode == MODE_PASTA);
@@ -3336,7 +3294,6 @@ int tcp_init(struct ctx *c)
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
- memset(tcp_sock_init_lo, 0xff, sizeof(tcp_sock_init_lo));
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
@@ -3444,16 +3401,6 @@ static int tcp_port_rebind(void *arg)
close(tcp_sock_init_ext[port][V6]);
tcp_sock_init_ext[port][V6] = -1;
}
-
- if (tcp_sock_init_lo[port][V4] >= 0) {
- close(tcp_sock_init_lo[port][V4]);
- tcp_sock_init_lo[port][V4] = -1;
- }
-
- if (tcp_sock_init_lo[port][V6] >= 0) {
- close(tcp_sock_init_lo[port][V6]);
- tcp_sock_init_lo[port][V6] = -1;
- }
continue;
}
--
2.38.1
next prev parent reply other threads:[~2022-11-14 6:17 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-14 6:16 [PATCH 00/14] RFC: tcp: Don't use separate listening sockets for spliced and non-spliced connections David Gibson
2022-11-14 6:16 ` [PATCH 01/14] style: Minor corrections to function comments David Gibson
2022-11-14 6:16 ` [PATCH 02/14] tcp: Remove unused TCP_MAX_SOCKS constant David Gibson
2022-11-14 6:17 ` [PATCH 03/14] tcp: Better helpers for converting between connection pointer and index David Gibson
2022-11-14 6:17 ` [PATCH 04/14] tcp_splice: Helpers for converting from index to/from tcp_splice_conn David Gibson
2022-11-14 6:17 ` [PATCH 05/14] tcp: Move connection state structures into a shared header David Gibson
2022-11-14 6:17 ` [PATCH 06/14] tcp: Add connection union type David Gibson
2022-11-14 6:17 ` [PATCH 07/14] tcp: Improved helpers to update connections after moving David Gibson
2022-11-14 6:17 ` [PATCH 08/14] tcp: Unify spliced and non-spliced connection tables David Gibson
2022-11-14 6:17 ` [PATCH 09/14] tcp: Unify tcp_defer_handler and tcp_splice_defer_handler() David Gibson
2022-11-14 6:17 ` [PATCH 10/14] tcp: Partially unify tcp_timer() and tcp_splice_timer() David Gibson
2022-11-14 6:17 ` [PATCH 11/14] tcp: Unify the IN_EPOLL flag David Gibson
2022-11-14 6:17 ` [PATCH 12/14] tcp: Separate helpers to create ns listening sockets David Gibson
2022-11-14 6:17 ` [PATCH 13/14] tcp: Unify part of spliced and non-spliced conn_from_sock path David Gibson
2022-11-14 6:17 ` David Gibson [this message]
2022-11-15 1:22 ` [PATCH 00/14] RFC: tcp: Don't use separate listening sockets for spliced and non-spliced connections Stefano Brivio
2022-11-15 4:57 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221114061711.1655510-15-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).