From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH] tcp, tcp_splice: Don't set SO_SNDBUF and SO_RCVBUF to maximum values
Date: Thu, 13 Feb 2025 23:16:50 +0100 [thread overview]
Message-ID: <20250213221650.4086105-1-sbrivio@redhat.com> (raw)
I added this a long long time ago because it dramatically improved
throughput back then: with rmem_max and wmem_max >= 4 MiB, we would
force send and receive buffer sizes for TCP sockets to the maximum
allowed value.
This effectively disables TCP auto-tuning, which would otherwise allow
us to exceed those limits, as crazy as it might sound. But in any
case, it made sense.
Now that we have zero (internal) copies on every path, plus vhost-user
support, it turns out that these settings are entirely obsolete. I get
substantially the same throughput in every test we perform, even with
very short durations (one second).
The settings are not just useless: they actually cause us quite some
trouble on guest state migration, because they lead to huge queues
that need to be moved as well.
Drop those settings.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp.c | 41 +++++++++--------------------------------
tcp_conn.h | 4 ++--
tcp_splice.c | 6 +++---
3 files changed, 14 insertions(+), 37 deletions(-)
diff --git a/tcp.c b/tcp.c
index 16d01f6..b978b30 100644
--- a/tcp.c
+++ b/tcp.c
@@ -738,24 +738,6 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
SNDBUF_SET(conn, MIN(INT_MAX, v));
}
-/**
- * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
- * @s: Socket, can be -1 to avoid check in the caller
- */
-static void tcp_sock_set_bufsize(const struct ctx *c, int s)
-{
- int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
-
- if (s == -1)
- return;
-
- if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
- trace("TCP: failed to set SO_RCVBUF to %i", v);
-
- if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
- trace("TCP: failed to set SO_SNDBUF to %i", v);
-}
-
/**
* tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
* @s: Socket, can be -1 to avoid check in the caller
@@ -1278,12 +1260,11 @@ int tcp_conn_pool_sock(int pool[])
/**
* tcp_conn_new_sock() - Open and prepare new socket for connection
- * @c: Execution context
* @af: Address family
*
* Return: socket number on success, negative code if socket creation failed
*/
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+static int tcp_conn_new_sock(sa_family_t af)
{
int s;
@@ -1297,7 +1278,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
if (s < 0)
return -errno;
- tcp_sock_set_bufsize(c, s);
tcp_sock_set_nodelay(s);
return s;
@@ -1305,12 +1285,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
/**
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
- * @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
* Return: Socket fd on success, -errno on failure
*/
-int tcp_conn_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_sock(sa_family_t af)
{
int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
int s;
@@ -1321,7 +1300,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
/* If the pool is empty we just open a new one without refilling the
* pool to keep latency down.
*/
- if ((s = tcp_conn_new_sock(c, af)) >= 0)
+ if ((s = tcp_conn_new_sock(af)) >= 0)
return s;
err("TCP: Unable to open socket for new connection: %s",
@@ -1462,7 +1441,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
goto cancel;
}
- if ((s = tcp_conn_sock(c, af)) < 0)
+ if ((s = tcp_conn_sock(af)) < 0)
goto cancel;
pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
@@ -1483,7 +1462,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
} else {
/* Not a local, bound destination, inconclusive test */
close(s);
- if ((s = tcp_conn_sock(c, af)) < 0)
+ if ((s = tcp_conn_sock(af)) < 0)
goto cancel;
}
@@ -2090,7 +2069,6 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
if (s < 0)
goto cancel;
- tcp_sock_set_bufsize(c, s);
tcp_sock_set_nodelay(s);
/* FIXME: If useful: when the listening port has a specific bound
@@ -2434,13 +2412,12 @@ static int tcp_ns_socks_init(void *arg)
/**
* tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
- * @c: Execution context
* @pool: Pool of sockets to refill
* @af: Address family to use
*
* Return: 0 on success, negative error code if there was at least one error
*/
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
+int tcp_sock_refill_pool(int pool[], sa_family_t af)
{
int i;
@@ -2450,7 +2427,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
if (pool[i] >= 0)
continue;
- if ((fd = tcp_conn_new_sock(c, af)) < 0)
+ if ((fd = tcp_conn_new_sock(af)) < 0)
return fd;
pool[i] = fd;
@@ -2466,13 +2443,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
static void tcp_sock_refill_init(const struct ctx *c)
{
if (c->ifi4) {
- int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+ int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 host socket pool: %s",
strerror_(-rc));
}
if (c->ifi6) {
- int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+ int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 host socket pool: %s",
strerror_(-rc));
diff --git a/tcp_conn.h b/tcp_conn.h
index d342680..8c20805 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -143,8 +143,8 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
-int tcp_conn_sock(const struct ctx *c, sa_family_t af);
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
+int tcp_conn_sock(sa_family_t af);
+int tcp_sock_refill_pool(int pool[], sa_family_t af);
void tcp_splice_refill(const struct ctx *c);
#endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index f048a82..f1a9223 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -351,7 +351,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
int one = 1;
if (tgtpif == PIF_HOST)
- conn->s[1] = tcp_conn_sock(c, af);
+ conn->s[1] = tcp_conn_sock(af);
else if (tgtpif == PIF_SPLICE)
conn->s[1] = tcp_conn_sock_ns(c, af);
else
@@ -703,13 +703,13 @@ static int tcp_sock_refill_ns(void *arg)
ns_enter(c);
if (c->ifi4) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+ int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 ns socket pool: %s",
strerror_(-rc));
}
if (c->ifi6) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+ int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 ns socket pool: %s",
strerror_(-rc));
--
@@ -351,7 +351,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
int one = 1;
if (tgtpif == PIF_HOST)
- conn->s[1] = tcp_conn_sock(c, af);
+ conn->s[1] = tcp_conn_sock(af);
else if (tgtpif == PIF_SPLICE)
conn->s[1] = tcp_conn_sock_ns(c, af);
else
@@ -703,13 +703,13 @@ static int tcp_sock_refill_ns(void *arg)
ns_enter(c);
if (c->ifi4) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+ int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 ns socket pool: %s",
strerror_(-rc));
}
if (c->ifi6) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+ int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 ns socket pool: %s",
strerror_(-rc));
--
2.43.0
next reply other threads:[~2025-02-13 22:16 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-13 22:16 Stefano Brivio [this message]
2025-02-13 23:02 ` [PATCH] tcp, tcp_splice: Don't set SO_SNDBUF and SO_RCVBUF to maximum values David Gibson
2025-02-14 2:24 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250213221650.4086105-1-sbrivio@redhat.com \
--to=sbrivio@redhat.com \
--cc=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).