From mboxrd@z Thu Jan  1 00:00:00 1970
Received: by passt.top (Postfix, from userid 1000)
	id EF8615A0640; Thu, 13 Feb 2025 23:16:50 +0100 (CET)
From: Stefano Brivio
To: passt-dev@passt.top
Subject: [PATCH] tcp, tcp_splice: Don't set SO_SNDBUF and SO_RCVBUF to maximum values
Date: Thu, 13 Feb 2025 23:16:50 +0100
Message-ID: <20250213221650.4086105-1-sbrivio@redhat.com>
X-Mailer: git-send-email 2.43.0
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Message-ID-Hash: QBOIA3VDSTMEIWLWNTJXB2HNUVFZ4QMI
X-MailFrom: sbrivio@passt.top
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency;
	loop; banned-address; member-moderation; nonmember-moderation;
	administrivia; implicit-dest; max-recipients; max-size;
	news-moderation; no-subject; digests; suspicious-header
CC: David Gibson
X-Mailman-Version: 3.3.8
Precedence: list
List-Id: Development discussion and patches for passt

I added this a long long time ago because it dramatically improved
throughput back then: with rmem_max and wmem_max >= 4 MiB, we would
force send and receive buffer sizes for TCP sockets to the maximum
allowed value.

This effectively disables TCP auto-tuning, which would otherwise allow
us to exceed those limits, as crazy as it might sound. But in any case,
it made sense.

Now that we have zero (internal) copies on every path, plus vhost-user
support, it turns out that these settings are entirely obsolete. I get
substantially the same throughput in every test we perform, even with
very short durations (one second).

The settings are not just useless: they actually cause us quite some
trouble on guest state migration, because they lead to huge queues that
need to be moved as well.

Drop those settings.
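For reference, the mechanism being dropped is a plain setsockopt() on
SO_RCVBUF and SO_SNDBUF: Linux clamps the requested size to
net.core.rmem_max / net.core.wmem_max and doubles it for bookkeeping
overhead, and a socket with an explicit SO_RCVBUF no longer gets
receive auto-tuning. A minimal standalone sketch, outside the passt
tree (the helper name and the getsockopt() read-back are only for
illustration, they are not part of this patch):

	#include <limits.h>
	#include <stdio.h>
	#include <sys/socket.h>

	/* Request an oversized receive buffer, as the removed code did, then
	 * read back what the kernel actually applied: the request is clamped
	 * to net.core.rmem_max and doubled for bookkeeping overhead.
	 */
	static void force_and_report_rcvbuf(int s)
	{
		int v = INT_MAX / 2;	/* kernel clamps and rounds */
		socklen_t sl = sizeof(v);

		if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
			perror("setsockopt(SO_RCVBUF)");

		if (!getsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, &sl))
			printf("effective SO_RCVBUF: %i\n", v);
	}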
Signed-off-by: Stefano Brivio
---
 tcp.c        | 41 +++++++++--------------------------------
 tcp_conn.h   |  4 ++--
 tcp_splice.c |  6 +++---
 3 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/tcp.c b/tcp.c
index 16d01f6..b978b30 100644
--- a/tcp.c
+++ b/tcp.c
@@ -738,24 +738,6 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
 	SNDBUF_SET(conn, MIN(INT_MAX, v));
 }
 
-/**
- * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
- * @s:		Socket, can be -1 to avoid check in the caller
- */
-static void tcp_sock_set_bufsize(const struct ctx *c, int s)
-{
-	int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
-
-	if (s == -1)
-		return;
-
-	if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_RCVBUF to %i", v);
-
-	if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_SNDBUF to %i", v);
-}
-
 /**
  * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
  * @s:		Socket, can be -1 to avoid check in the caller
@@ -1278,12 +1260,11 @@ int tcp_conn_pool_sock(int pool[])
 
 /**
  * tcp_conn_new_sock() - Open and prepare new socket for connection
- * @c:		Execution context
  * @af:		Address family
  *
  * Return: socket number on success, negative code if socket creation failed
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+static int tcp_conn_new_sock(sa_family_t af)
 {
 	int s;
 
@@ -1297,7 +1278,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 	if (s < 0)
 		return -errno;
 
-	tcp_sock_set_bufsize(c, s);
 	tcp_sock_set_nodelay(s);
 
 	return s;
@@ -1305,12 +1285,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 
 /**
  * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
- * @c:		Execution context
  * @af:		Address family (AF_INET or AF_INET6)
  *
  * Return: Socket fd on success, -errno on failure
  */
-int tcp_conn_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_sock(sa_family_t af)
 {
 	int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
 	int s;
@@ -1321,7 +1300,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
 	/* If the pool is empty we just open a new one without refilling the
 	 * pool to keep latency down.
 	 */
-	if ((s = tcp_conn_new_sock(c, af)) >= 0)
+	if ((s = tcp_conn_new_sock(af)) >= 0)
 		return s;
 
 	err("TCP: Unable to open socket for new connection: %s",
@@ -1462,7 +1441,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 		goto cancel;
 	}
 
-	if ((s = tcp_conn_sock(c, af)) < 0)
+	if ((s = tcp_conn_sock(af)) < 0)
 		goto cancel;
 
 	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
@@ -1483,7 +1462,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 	} else {
 		/* Not a local, bound destination, inconclusive test */
 		close(s);
-		if ((s = tcp_conn_sock(c, af)) < 0)
+		if ((s = tcp_conn_sock(af)) < 0)
 			goto cancel;
 	}
 
@@ -2090,7 +2069,6 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
-	tcp_sock_set_bufsize(c, s);
 	tcp_sock_set_nodelay(s);
 
 	/* FIXME: If useful: when the listening port has a specific bound
@@ -2434,13 +2412,12 @@ static int tcp_ns_socks_init(void *arg)
 
 /**
  * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
- * @c:		Execution context
 * @pool:	Pool of sockets to refill
  * @af:		Address family to use
  *
  * Return: 0 on success, negative error code if there was at least one error
  */
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
+int tcp_sock_refill_pool(int pool[], sa_family_t af)
 {
 	int i;
 
@@ -2450,7 +2427,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
 		if (pool[i] >= 0)
 			continue;
 
-		if ((fd = tcp_conn_new_sock(c, af)) < 0)
+		if ((fd = tcp_conn_new_sock(af)) < 0)
 			return fd;
 
 		pool[i] = fd;
@@ -2466,13 +2443,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
 static void tcp_sock_refill_init(const struct ctx *c)
 {
 	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+		int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 host socket pool: %s",
 			     strerror_(-rc));
 	}
 	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+		int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 host socket pool: %s",
 			     strerror_(-rc));
diff --git a/tcp_conn.h b/tcp_conn.h
index d342680..8c20805 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -143,8 +143,8 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
-int tcp_conn_sock(const struct ctx *c, sa_family_t af);
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
+int tcp_conn_sock(sa_family_t af);
+int tcp_sock_refill_pool(int pool[], sa_family_t af);
 void tcp_splice_refill(const struct ctx *c);
 
 #endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index f048a82..f1a9223 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -351,7 +351,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
 	int one = 1;
 
 	if (tgtpif == PIF_HOST)
-		conn->s[1] = tcp_conn_sock(c, af);
+		conn->s[1] = tcp_conn_sock(af);
 	else if (tgtpif == PIF_SPLICE)
 		conn->s[1] = tcp_conn_sock_ns(c, af);
 	else
@@ -703,13 +703,13 @@ static int tcp_sock_refill_ns(void *arg)
 	ns_enter(c);
 
 	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+		int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv4 ns socket pool: %s",
 			     strerror_(-rc));
 	}
 	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+		int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
 		if (rc < 0)
 			warn("TCP: Error refilling IPv6 ns socket pool: %s",
 			     strerror_(-rc));
-- 
2.43.0