public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Subject: [PATCH] tcp: Don't rely on bind() to fail to decide that connection target is valid
Date: Tue, 18 Jun 2024 12:48:58 +0200	[thread overview]
Message-ID: <20240618104858.1773178-1-sbrivio@redhat.com> (raw)

Commit e1a2e2780c91 ("tcp: Check if connection is local or low RTT
was seen before using large MSS") added a call to bind() before we
issue a connect() to the target for an outbound connection.

If bind() fails, but neither with EADDRNOTAVAIL, nor with EACCESS, we
can conclude that the target address is a local (host) address, and we
can use an unlimited MSS.

While at it, according to the reasoning of that commit, if bind()
succeeds, we would know right away that nobody is listening at that
(local) address and port, and we don't even need to call connect(): we
can just fail early and reset the connection attempt.

But if non-local binds are enabled via net.ipv4.ip_nonlocal_bind or
net.ipv6.ip_nonlocal_bind sysctl, binding to a non-local address will
actually succeed, so we can't rely on it to fail in general.

The visible issue with the existing behaviour is that we would reset
any outbound connection to non-local addresses, if non-local binds are
enabled.

Keep the significant optimisation for local addresses along with the
bind() call, but if it succeeds, don't draw any conclusion: close the
socket, grab another one, and proceed normally.

This will incur a small latency penalty if non-local binds are
enabled (we'll likely fetch an existing socket from the pool but
additionally call close()), or if the target is local but not bound:
we'll need to call connect() and get a failure before relaying that
failure back.

Link: https://github.com/containers/podman/issues/23003
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcp.c b/tcp.c
index 231f63b..fd1c550 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1615,14 +1615,17 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	const struct sockaddr *sa;
 	struct tcp_tap_conn *conn;
 	union flow *flow;
+	socklen_t sl = 0;
 	int s = -1, mss;
-	socklen_t sl;
 
 	if (!(flow = flow_alloc()))
 		return;
 
 	flow_initiate(flow, PIF_TAP);
 
+	flow_target(flow, PIF_HOST);
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
 	if (af == AF_INET) {
 		if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
 		    IN4_IS_ADDR_BROADCAST(saddr) ||
@@ -1639,6 +1642,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			      dstport);
 			goto cancel;
 		}
+
+		sa = (struct sockaddr *)&addr4;
+		sl = sizeof(addr4);
 	} else if (af == AF_INET6) {
 		if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
 		    IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
@@ -1653,6 +1659,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			      dstport);
 			goto cancel;
 		}
+
+		sa = (struct sockaddr *)&addr6;
+		sl = sizeof(addr6);
 	}
 
 	if ((s = tcp_conn_sock(c, af)) < 0)
@@ -1665,6 +1674,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			addr6.sin6_addr	= in6addr_loopback;
 	}
 
+	/* Use bind() to check if the target address is local (EADDRINUSE or
+	 * similar) and already bound, and set the LOCAL flag in that case.
+	 *
+	 * If bind() succeeds, in general, we could infer that nobody (else) is
+	 * listening on that address and port and reset the connection attempt
+	 * early, but we can't rely on that if non-local binds are enabled,
+	 * because bind() would succeed for any non-local address we can reach.
+	 *
+	 * So, if bind() succeeds, close the socket, get a new one, and proceed.
+	 */
+	if (bind(s, sa, sl)) {
+		if (errno != EADDRNOTAVAIL && errno != EACCES)
+			conn_flag(c, conn, LOCAL);
+	} else {
+		/* Not a local, bound destination, inconclusive test */
+		close(s);
+		if ((s = tcp_conn_sock(c, af)) < 0)
+			goto cancel;
+	}
+
 	if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
 		struct sockaddr_in6 addr6_ll = {
 			.sin6_family = AF_INET6,
@@ -1675,8 +1704,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			goto cancel;
 	}
 
-	flow_target(flow, PIF_HOST);
-	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 	conn->sock = s;
 	conn->timer = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
@@ -1698,14 +1725,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 
 	inany_from_af(&conn->faddr, af, daddr);
 
-	if (af == AF_INET) {
-		sa = (struct sockaddr *)&addr4;
-		sl = sizeof(addr4);
-	} else {
-		sa = (struct sockaddr *)&addr6;
-		sl = sizeof(addr6);
-	}
-
 	conn->fport = dstport;
 	conn->eport = srcport;
 
@@ -1718,13 +1737,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 
 	tcp_hash_insert(c, conn);
 
-	if (!bind(s, sa, sl)) {
-		tcp_rst(c, conn);	/* Nobody is listening then */
-		goto cancel;
-	}
-	if (errno != EADDRNOTAVAIL && errno != EACCES)
-		conn_flag(c, conn, LOCAL);
-
 	if ((af == AF_INET &&  !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
 	    (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
 			       !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
-- 
@@ -1615,14 +1615,17 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	const struct sockaddr *sa;
 	struct tcp_tap_conn *conn;
 	union flow *flow;
+	socklen_t sl = 0;
 	int s = -1, mss;
-	socklen_t sl;
 
 	if (!(flow = flow_alloc()))
 		return;
 
 	flow_initiate(flow, PIF_TAP);
 
+	flow_target(flow, PIF_HOST);
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
 	if (af == AF_INET) {
 		if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
 		    IN4_IS_ADDR_BROADCAST(saddr) ||
@@ -1639,6 +1642,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			      dstport);
 			goto cancel;
 		}
+
+		sa = (struct sockaddr *)&addr4;
+		sl = sizeof(addr4);
 	} else if (af == AF_INET6) {
 		if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
 		    IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
@@ -1653,6 +1659,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			      dstport);
 			goto cancel;
 		}
+
+		sa = (struct sockaddr *)&addr6;
+		sl = sizeof(addr6);
 	}
 
 	if ((s = tcp_conn_sock(c, af)) < 0)
@@ -1665,6 +1674,26 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			addr6.sin6_addr	= in6addr_loopback;
 	}
 
+	/* Use bind() to check if the target address is local (EADDRINUSE or
+	 * similar) and already bound, and set the LOCAL flag in that case.
+	 *
+	 * If bind() succeeds, in general, we could infer that nobody (else) is
+	 * listening on that address and port and reset the connection attempt
+	 * early, but we can't rely on that if non-local binds are enabled,
+	 * because bind() would succeed for any non-local address we can reach.
+	 *
+	 * So, if bind() succeeds, close the socket, get a new one, and proceed.
+	 */
+	if (bind(s, sa, sl)) {
+		if (errno != EADDRNOTAVAIL && errno != EACCES)
+			conn_flag(c, conn, LOCAL);
+	} else {
+		/* Not a local, bound destination, inconclusive test */
+		close(s);
+		if ((s = tcp_conn_sock(c, af)) < 0)
+			goto cancel;
+	}
+
 	if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
 		struct sockaddr_in6 addr6_ll = {
 			.sin6_family = AF_INET6,
@@ -1675,8 +1704,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 			goto cancel;
 	}
 
-	flow_target(flow, PIF_HOST);
-	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 	conn->sock = s;
 	conn->timer = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
@@ -1698,14 +1725,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 
 	inany_from_af(&conn->faddr, af, daddr);
 
-	if (af == AF_INET) {
-		sa = (struct sockaddr *)&addr4;
-		sl = sizeof(addr4);
-	} else {
-		sa = (struct sockaddr *)&addr6;
-		sl = sizeof(addr6);
-	}
-
 	conn->fport = dstport;
 	conn->eport = srcport;
 
@@ -1718,13 +1737,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 
 	tcp_hash_insert(c, conn);
 
-	if (!bind(s, sa, sl)) {
-		tcp_rst(c, conn);	/* Nobody is listening then */
-		goto cancel;
-	}
-	if (errno != EADDRNOTAVAIL && errno != EACCES)
-		conn_flag(c, conn, LOCAL);
-
 	if ((af == AF_INET &&  !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
 	    (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
 			       !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
-- 
2.43.0


                 reply	other threads:[~2024-06-18 10:48 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240618104858.1773178-1-sbrivio@redhat.com \
    --to=sbrivio@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).