* [PATCH v2] tcp: Don't rely on bind() to fail to decide that connection target is valid
@ 2024-06-18 16:30 Stefano Brivio
2024-06-19 1:27 ` David Gibson
0 siblings, 1 reply; 2+ messages in thread
From: Stefano Brivio @ 2024-06-18 16:30 UTC (permalink / raw)
To: passt-dev
Commit e1a2e2780c91 ("tcp: Check if connection is local or low RTT
was seen before using large MSS") added a call to bind() before we
issue a connect() to the target for an outbound connection.
If bind() fails, but neither with EADDRNOTAVAIL, nor with EACCESS, we
can conclude that the target address is a local (host) address, and we
can use an unlimited MSS.
While at it, according to the reasoning of that commit, if bind()
succeeds, we would know right away that nobody is listening at that
(local) address and port, and we don't even need to call connect(): we
can just fail early and reset the connection attempt.
But if non-local binds are enabled via net.ipv4.ip_nonlocal_bind or
net.ipv6.ip_nonlocal_bind sysctl, binding to a non-local address will
actually succeed, so we can't rely on it to fail in general.
The visible issue with the existing behaviour is that we would reset
any outbound connection to non-local addresses, if non-local binds are
enabled.
Keep the significant optimisation for local addresses along with the
bind() call, but if it succeeds, don't draw any conclusion: close the
socket, grab another one, and proceed normally.
This will incur a small latency penalty if non-local binds are
enabled (we'll likely fetch an existing socket from the pool but
additionally call close()), or if the target is local but not bound:
we'll need to call connect() and get a failure before relaying that
failure back.
Link: https://github.com/containers/podman/issues/23003
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
v2: Avoid (bogus) reports of uninitialised 'sa' in bind() using an
assertion on 'af' (it must be AF_INET or AF_INET6)
tcp.c | 48 +++++++++++++++++++++++++++++++-----------------
1 file changed, 31 insertions(+), 17 deletions(-)
diff --git a/tcp.c b/tcp.c
index 231f63b..698e7ec 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1623,6 +1623,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
flow_initiate(flow, PIF_TAP);
+ flow_target(flow, PIF_HOST);
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
if (af == AF_INET) {
if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
IN4_IS_ADDR_BROADCAST(saddr) ||
@@ -1639,6 +1642,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
dstport);
goto cancel;
}
+
+ sa = (struct sockaddr *)&addr4;
+ sl = sizeof(addr4);
} else if (af == AF_INET6) {
if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
@@ -1653,6 +1659,11 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
dstport);
goto cancel;
}
+
+ sa = (struct sockaddr *)&addr6;
+ sl = sizeof(addr6);
+ } else {
+ ASSERT(0);
}
if ((s = tcp_conn_sock(c, af)) < 0)
@@ -1665,6 +1676,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
addr6.sin6_addr = in6addr_loopback;
}
+ /* Use bind() to check if the target address is local (EADDRINUSE or
+ * similar) and already bound, and set the LOCAL flag in that case.
+ *
+ * If bind() succeeds, in general, we could infer that nobody (else) is
+ * listening on that address and port and reset the connection attempt
+ * early, but we can't rely on that if non-local binds are enabled,
+ * because bind() would succeed for any non-local address we can reach.
+ *
+ * So, if bind() succeeds, close the socket, get a new one, and proceed.
+ */
+ if (bind(s, sa, sl)) {
+ if (errno != EADDRNOTAVAIL && errno != EACCES)
+ conn_flag(c, conn, LOCAL);
+ } else {
+ /* Not a local, bound destination, inconclusive test */
+ close(s);
+ if ((s = tcp_conn_sock(c, af)) < 0)
+ goto cancel;
+ }
+
if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
struct sockaddr_in6 addr6_ll = {
.sin6_family = AF_INET6,
@@ -1675,8 +1706,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
goto cancel;
}
- flow_target(flow, PIF_HOST);
- conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
@@ -1698,14 +1727,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
inany_from_af(&conn->faddr, af, daddr);
- if (af == AF_INET) {
- sa = (struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- } else {
- sa = (struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- }
-
conn->fport = dstport;
conn->eport = srcport;
@@ -1718,13 +1739,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
tcp_hash_insert(c, conn);
- if (!bind(s, sa, sl)) {
- tcp_rst(c, conn); /* Nobody is listening then */
- goto cancel;
- }
- if (errno != EADDRNOTAVAIL && errno != EACCES)
- conn_flag(c, conn, LOCAL);
-
if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
(af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
!IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
--
@@ -1623,6 +1623,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
flow_initiate(flow, PIF_TAP);
+ flow_target(flow, PIF_HOST);
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
if (af == AF_INET) {
if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
IN4_IS_ADDR_BROADCAST(saddr) ||
@@ -1639,6 +1642,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
dstport);
goto cancel;
}
+
+ sa = (struct sockaddr *)&addr4;
+ sl = sizeof(addr4);
} else if (af == AF_INET6) {
if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
@@ -1653,6 +1659,11 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
dstport);
goto cancel;
}
+
+ sa = (struct sockaddr *)&addr6;
+ sl = sizeof(addr6);
+ } else {
+ ASSERT(0);
}
if ((s = tcp_conn_sock(c, af)) < 0)
@@ -1665,6 +1676,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
addr6.sin6_addr = in6addr_loopback;
}
+ /* Use bind() to check if the target address is local (EADDRINUSE or
+ * similar) and already bound, and set the LOCAL flag in that case.
+ *
+ * If bind() succeeds, in general, we could infer that nobody (else) is
+ * listening on that address and port and reset the connection attempt
+ * early, but we can't rely on that if non-local binds are enabled,
+ * because bind() would succeed for any non-local address we can reach.
+ *
+ * So, if bind() succeeds, close the socket, get a new one, and proceed.
+ */
+ if (bind(s, sa, sl)) {
+ if (errno != EADDRNOTAVAIL && errno != EACCES)
+ conn_flag(c, conn, LOCAL);
+ } else {
+ /* Not a local, bound destination, inconclusive test */
+ close(s);
+ if ((s = tcp_conn_sock(c, af)) < 0)
+ goto cancel;
+ }
+
if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
struct sockaddr_in6 addr6_ll = {
.sin6_family = AF_INET6,
@@ -1675,8 +1706,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
goto cancel;
}
- flow_target(flow, PIF_HOST);
- conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
@@ -1698,14 +1727,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
inany_from_af(&conn->faddr, af, daddr);
- if (af == AF_INET) {
- sa = (struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- } else {
- sa = (struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- }
-
conn->fport = dstport;
conn->eport = srcport;
@@ -1718,13 +1739,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
tcp_hash_insert(c, conn);
- if (!bind(s, sa, sl)) {
- tcp_rst(c, conn); /* Nobody is listening then */
- goto cancel;
- }
- if (errno != EADDRNOTAVAIL && errno != EACCES)
- conn_flag(c, conn, LOCAL);
-
if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
(af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
!IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v2] tcp: Don't rely on bind() to fail to decide that connection target is valid
2024-06-18 16:30 [PATCH v2] tcp: Don't rely on bind() to fail to decide that connection target is valid Stefano Brivio
@ 2024-06-19 1:27 ` David Gibson
0 siblings, 0 replies; 2+ messages in thread
From: David Gibson @ 2024-06-19 1:27 UTC (permalink / raw)
To: Stefano Brivio; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 5599 bytes --]
On Tue, Jun 18, 2024 at 06:30:57PM +0200, Stefano Brivio wrote:
> Commit e1a2e2780c91 ("tcp: Check if connection is local or low RTT
> was seen before using large MSS") added a call to bind() before we
> issue a connect() to the target for an outbound connection.
>
> If bind() fails, but neither with EADDRNOTAVAIL, nor with EACCESS, we
> can conclude that the target address is a local (host) address, and we
> can use an unlimited MSS.
>
> While at it, according to the reasoning of that commit, if bind()
> succeeds, we would know right away that nobody is listening at that
> (local) address and port, and we don't even need to call connect(): we
> can just fail early and reset the connection attempt.
>
> But if non-local binds are enabled via net.ipv4.ip_nonlocal_bind or
> net.ipv6.ip_nonlocal_bind sysctl, binding to a non-local address will
> actually succeed, so we can't rely on it to fail in general.
>
> The visible issue with the existing behaviour is that we would reset
> any outbound connection to non-local addresses, if non-local binds are
> enabled.
>
> Keep the significant optimisation for local addresses along with the
> bind() call, but if it succeeds, don't draw any conclusion: close the
> socket, grab another one, and proceed normally.
>
> This will incur a small latency penalty if non-local binds are
> enabled (we'll likely fetch an existing socket from the pool but
> additionally call close()), or if the target is local but not bound:
> we'll need to call connect() and get a failure before relaying that
> failure back.
>
> Link: https://github.com/containers/podman/issues/23003
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> v2: Avoid (bogus) reports of uninitialised 'sa' in bind() using an
> assertion on 'af' (it must be AF_INET or AF_INET6)
>
> tcp.c | 48 +++++++++++++++++++++++++++++++-----------------
> 1 file changed, 31 insertions(+), 17 deletions(-)
>
> diff --git a/tcp.c b/tcp.c
> index 231f63b..698e7ec 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1623,6 +1623,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
>
> flow_initiate(flow, PIF_TAP);
>
> + flow_target(flow, PIF_HOST);
> + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
> +
> if (af == AF_INET) {
> if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
> IN4_IS_ADDR_BROADCAST(saddr) ||
> @@ -1639,6 +1642,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
> dstport);
> goto cancel;
> }
> +
> + sa = (struct sockaddr *)&addr4;
> + sl = sizeof(addr4);
> } else if (af == AF_INET6) {
> if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
> IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
> @@ -1653,6 +1659,11 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
> dstport);
> goto cancel;
> }
> +
> + sa = (struct sockaddr *)&addr6;
> + sl = sizeof(addr6);
> + } else {
> + ASSERT(0);
> }
>
> if ((s = tcp_conn_sock(c, af)) < 0)
> @@ -1665,6 +1676,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
> addr6.sin6_addr = in6addr_loopback;
> }
>
> + /* Use bind() to check if the target address is local (EADDRINUSE or
> + * similar) and already bound, and set the LOCAL flag in that case.
> + *
> + * If bind() succeeds, in general, we could infer that nobody (else) is
> + * listening on that address and port and reset the connection attempt
> + * early, but we can't rely on that if non-local binds are enabled,
> + * because bind() would succeed for any non-local address we can reach.
> + *
> + * So, if bind() succeeds, close the socket, get a new one, and proceed.
> + */
> + if (bind(s, sa, sl)) {
> + if (errno != EADDRNOTAVAIL && errno != EACCES)
> + conn_flag(c, conn, LOCAL);
> + } else {
> + /* Not a local, bound destination, inconclusive test */
> + close(s);
> + if ((s = tcp_conn_sock(c, af)) < 0)
> + goto cancel;
> + }
> +
> if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
> struct sockaddr_in6 addr6_ll = {
> .sin6_family = AF_INET6,
> @@ -1675,8 +1706,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
> goto cancel;
> }
>
> - flow_target(flow, PIF_HOST);
> - conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
> conn->sock = s;
> conn->timer = -1;
> conn_event(c, conn, TAP_SYN_RCVD);
> @@ -1698,14 +1727,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
>
> inany_from_af(&conn->faddr, af, daddr);
>
> - if (af == AF_INET) {
> - sa = (struct sockaddr *)&addr4;
> - sl = sizeof(addr4);
> - } else {
> - sa = (struct sockaddr *)&addr6;
> - sl = sizeof(addr6);
> - }
> -
> conn->fport = dstport;
> conn->eport = srcport;
>
> @@ -1718,13 +1739,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
>
> tcp_hash_insert(c, conn);
>
> - if (!bind(s, sa, sl)) {
> - tcp_rst(c, conn); /* Nobody is listening then */
> - goto cancel;
> - }
> - if (errno != EADDRNOTAVAIL && errno != EACCES)
> - conn_flag(c, conn, LOCAL);
> -
> if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
> (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
> !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-06-19 1:50 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-18 16:30 [PATCH v2] tcp: Don't rely on bind() to fail to decide that connection target is valid Stefano Brivio
2024-06-19 1:27 ` David Gibson
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).