* [PATCH v2 1/9] conf: Use the same optstring for passt and pasta modes
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 2/9] conf: Move mode detection into helper function David Gibson
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Currently we rely on detecting our mode first and use different sets of
(single character) options for each. This means that if you use an option
valid in only one mode in another you'll get the generic usage() message.
We can give more helpful errors with little extra effort by combining all
the options into a single value of the option string and giving bespoke
messages if an option for the wrong mode is used; in fact we already did
this for some single mode options like '-1'.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
conf.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/conf.c b/conf.c
index 065e7201..7f20bc84 100644
--- a/conf.c
+++ b/conf.c
@@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv)
{"repair-path", required_argument, NULL, 28 },
{ 0 },
};
+ const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
bool copy_addrs_opt = false, copy_routes_opt = false;
@@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv)
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
- const char *optstring;
size_t logsize = 0;
char *runas = NULL;
long fd_tap_opt;
@@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode == MODE_PASTA) {
c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
fwd_default = FWD_AUTO;
- optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
- } else {
- optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
}
c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
@@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv)
c->foreground = 1;
break;
case 's':
+ if (c->mode == MODE_PASTA)
+ die("-s is for passt / vhost-user mode only");
+
ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
optarg);
if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv)
*c->sock_path = 0;
break;
case 'I':
+ if (c->mode != MODE_PASTA)
+ die("-I is for pasta mode only");
+
ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
optarg);
if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 't':
case 'u':
- case 'T':
- case 'U':
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 'T':
+ case 'U':
+ if (c->mode != MODE_PASTA)
+ die("-%c is for pasta mode only", name);
+
+ /* Handle properly later, once addresses are configured */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
--
@@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv)
{"repair-path", required_argument, NULL, 28 },
{ 0 },
};
+ const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
bool copy_addrs_opt = false, copy_routes_opt = false;
@@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv)
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
- const char *optstring;
size_t logsize = 0;
char *runas = NULL;
long fd_tap_opt;
@@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode == MODE_PASTA) {
c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
fwd_default = FWD_AUTO;
- optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
- } else {
- optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
}
c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
@@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv)
c->foreground = 1;
break;
case 's':
+ if (c->mode == MODE_PASTA)
+ die("-s is for passt / vhost-user mode only");
+
ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
optarg);
if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv)
*c->sock_path = 0;
break;
case 'I':
+ if (c->mode != MODE_PASTA)
+ die("-I is for pasta mode only");
+
ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
optarg);
if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 't':
case 'u':
- case 'T':
- case 'U':
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 'T':
+ case 'U':
+ if (c->mode != MODE_PASTA)
+ die("-%c is for pasta mode only", name);
+
+ /* Handle properly later, once addresses are configured */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 2/9] conf: Move mode detection into helper function
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
2025-03-12 2:18 ` [PATCH v2 1/9] conf: Use the same optstring for passt and pasta modes David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 3/9] conf: Detect vhost-user mode earlier David Gibson
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
One of the first things we need to do is determine if we're in passt mode
or pasta mode. Currently this is open-coded in main(), by examining
argv[0]. We want to complexify this a bit in future to cover vhost-user
mode as well. Prepare for this, by moving the mode detection into a new
conf_mode() function.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
conf.c | 26 ++++++++++++++++++++++++++
conf.h | 1 +
passt.c | 14 ++------------
3 files changed, 29 insertions(+), 12 deletions(-)
diff --git a/conf.c b/conf.c
index 7f20bc84..2022ea1d 100644
--- a/conf.c
+++ b/conf.c
@@ -991,6 +991,32 @@ pasta_opts:
_exit(status);
}
+/**
+ * conf_mode() - Determine passt/pasta's operating mode from command line
+ * @argc: Argument count
+ * @argv: Command line arguments
+ *
+ * Return: mode to operate in, PASTA or PASST
+ */
+/* cppcheck-suppress constParameter */
+enum passt_modes conf_mode(int argc, char *argv[])
+{
+ char argv0[PATH_MAX], *basearg0;
+
+ if (argc < 1)
+ die("Cannot determine argv[0]");
+
+ strncpy(argv0, argv[0], PATH_MAX - 1);
+ basearg0 = basename(argv0);
+ if (strstr(basearg0, "pasta"))
+ return MODE_PASTA;
+
+ if (strstr(basearg0, "passt"))
+ return MODE_PASST;
+
+ die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
+}
+
/**
* conf_print() - Print fundamental configuration parameters
* @c: Execution context
diff --git a/conf.h b/conf.h
index 9d2143db..b45ad746 100644
--- a/conf.h
+++ b/conf.h
@@ -6,6 +6,7 @@
#ifndef CONF_H
#define CONF_H
+enum passt_modes conf_mode(int argc, char *argv[]);
void conf(struct ctx *c, int argc, char **argv);
#endif /* CONF_H */
diff --git a/passt.c b/passt.c
index 868842b0..0bd2a292 100644
--- a/passt.c
+++ b/passt.c
@@ -191,7 +191,6 @@ int main(int argc, char **argv)
{
struct epoll_event events[EPOLL_EVENTS];
int nfds, i, devnull_fd = -1;
- char argv0[PATH_MAX], *name;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
@@ -213,21 +212,12 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
- if (argc < 1)
- _exit(EXIT_FAILURE);
+ c.mode = conf_mode(argc, argv);
- strncpy(argv0, argv[0], PATH_MAX - 1);
- name = basename(argv0);
- if (strstr(name, "pasta")) {
+ if (c.mode == MODE_PASTA) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
-
- c.mode = MODE_PASTA;
- } else if (strstr(name, "passt")) {
- c.mode = MODE_PASST;
- } else {
- _exit(EXIT_FAILURE);
}
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
--
@@ -191,7 +191,6 @@ int main(int argc, char **argv)
{
struct epoll_event events[EPOLL_EVENTS];
int nfds, i, devnull_fd = -1;
- char argv0[PATH_MAX], *name;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
@@ -213,21 +212,12 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
- if (argc < 1)
- _exit(EXIT_FAILURE);
+ c.mode = conf_mode(argc, argv);
- strncpy(argv0, argv[0], PATH_MAX - 1);
- name = basename(argv0);
- if (strstr(name, "pasta")) {
+ if (c.mode == MODE_PASTA) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
-
- c.mode = MODE_PASTA;
- } else if (strstr(name, "passt")) {
- c.mode = MODE_PASST;
- } else {
- _exit(EXIT_FAILURE);
}
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 3/9] conf: Detect vhost-user mode earlier
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
2025-03-12 2:18 ` [PATCH v2 1/9] conf: Use the same optstring for passt and pasta modes David Gibson
2025-03-12 2:18 ` [PATCH v2 2/9] conf: Move mode detection into helper function David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 4/9] packet: Give explicit name to maximum packet size David Gibson
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
We detect our operating mode in conf_mode(), unless we're using vhost-user
mode, in which case we change it later when we parse the --vhost-user
option. That means we need to delay parsing the --repair-path option (for
vhost-user only) until still later.
However, there are many other places in the main option parsing loop which
also rely on mode. We get away with those, because they happen to be able
to treat passt and vhost-user modes identically. This is potentially
confusing, though. So, move setting of MODE_VU into conf_mode() so
c->mode always has its final value from that point onwards.
To match, we move the parsing of --repair-path back into the main option
parsing loop.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
conf.c | 43 ++++++++++++++++++++++++++-----------------
1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/conf.c b/conf.c
index 2022ea1d..b58e2a6e 100644
--- a/conf.c
+++ b/conf.c
@@ -998,10 +998,23 @@ pasta_opts:
*
* Return: mode to operate in, PASTA or PASST
*/
-/* cppcheck-suppress constParameter */
enum passt_modes conf_mode(int argc, char *argv[])
{
+ int vhost_user = 0;
+ const struct option optvu[] = {
+ {"vhost-user", no_argument, &vhost_user, 1 },
+ { 0 },
+ };
char argv0[PATH_MAX], *basearg0;
+ int name;
+
+ optind = 0;
+ do {
+ name = getopt_long(argc, argv, "-:", optvu, NULL);
+ } while (name != -1);
+
+ if (vhost_user)
+ return MODE_VU;
if (argc < 1)
die("Cannot determine argv[0]");
@@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid host nameserver address: %s", optarg);
case 25:
- if (c->mode == MODE_PASTA)
- die("--vhost-user is for passt mode only");
- c->mode = MODE_VU;
+ /* Already handled in conf_mode() */
+ ASSERT(c->mode == MODE_VU);
break;
case 26:
vu_print_capabilities();
@@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid FQDN: %s", optarg);
break;
case 28:
- /* Handle this once we checked --vhost-user */
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
break;
case 'd':
c->debug = 1;
@@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options, DNS, and --repair-path can be parsed now, after
- * IPv4/IPv6 settings and --vhost-user.
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+ * settings
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
- } else if (name == 28) {
- if (c->mode != MODE_VU && strcmp(optarg, "none"))
- die("--repair-path is for vhost-user mode only");
-
- if (snprintf_check(c->repair_path,
- sizeof(c->repair_path), "%s",
- optarg))
- die("Invalid passt-repair path: %s", optarg);
-
- break;
}
} while (name != -1);
--
@@ -998,10 +998,23 @@ pasta_opts:
*
* Return: mode to operate in, PASTA or PASST
*/
-/* cppcheck-suppress constParameter */
enum passt_modes conf_mode(int argc, char *argv[])
{
+ int vhost_user = 0;
+ const struct option optvu[] = {
+ {"vhost-user", no_argument, &vhost_user, 1 },
+ { 0 },
+ };
char argv0[PATH_MAX], *basearg0;
+ int name;
+
+ optind = 0;
+ do {
+ name = getopt_long(argc, argv, "-:", optvu, NULL);
+ } while (name != -1);
+
+ if (vhost_user)
+ return MODE_VU;
if (argc < 1)
die("Cannot determine argv[0]");
@@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid host nameserver address: %s", optarg);
case 25:
- if (c->mode == MODE_PASTA)
- die("--vhost-user is for passt mode only");
- c->mode = MODE_VU;
+ /* Already handled in conf_mode() */
+ ASSERT(c->mode == MODE_VU);
break;
case 26:
vu_print_capabilities();
@@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid FQDN: %s", optarg);
break;
case 28:
- /* Handle this once we checked --vhost-user */
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
break;
case 'd':
c->debug = 1;
@@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options, DNS, and --repair-path can be parsed now, after
- * IPv4/IPv6 settings and --vhost-user.
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+ * settings
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
- } else if (name == 28) {
- if (c->mode != MODE_VU && strcmp(optarg, "none"))
- die("--repair-path is for vhost-user mode only");
-
- if (snprintf_check(c->repair_path,
- sizeof(c->repair_path), "%s",
- optarg))
- die("Invalid passt-repair path: %s", optarg);
-
- break;
}
} while (name != -1);
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 4/9] packet: Give explicit name to maximum packet size
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (2 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 3/9] conf: Detect vhost-user mode earlier David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 5/9] packet: Remove redundant TAP_BUF_BYTES define David Gibson
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
We verify that every packet we store in a pool (and every partial packet
we retreive from it) has a length no longer than UINT16_MAX. This
originated in the older packet pool implementation which stored packet
lengths in a uint16_t. Now, that packets are represented by a struct
iovec with its size_t length, this check serves only as a sanity / security
check that we don't have some wildly out of range length due to a bug
elsewhere.
We have may reasons to (slightly) increase this limit in future, so in
preparation, give this quantity an explicit name - PACKET_MAX_LEN.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
packet.c | 4 ++--
packet.h | 3 +++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/packet.c b/packet.c
index 0330b548..bcac0375 100644
--- a/packet.c
+++ b/packet.c
@@ -83,7 +83,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
if (packet_check_range(p, start, len, func, line))
return;
- if (len > UINT16_MAX) {
+ if (len > PACKET_MAX_LEN) {
trace("add packet length %zu, %s:%i", len, func, line);
return;
}
@@ -119,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
return NULL;
}
- if (len > UINT16_MAX) {
+ if (len > PACKET_MAX_LEN) {
if (func) {
trace("packet data length %zu, %s:%i",
len, func, line);
diff --git a/packet.h b/packet.h
index bdc07fef..d099f026 100644
--- a/packet.h
+++ b/packet.h
@@ -6,6 +6,9 @@
#ifndef PACKET_H
#define PACKET_H
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN UINT16_MAX
+
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
--
@@ -6,6 +6,9 @@
#ifndef PACKET_H
#define PACKET_H
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN UINT16_MAX
+
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 5/9] packet: Remove redundant TAP_BUF_BYTES define
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (3 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 4/9] packet: Give explicit name to maximum packet size David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 6/9] tap: Use explicit defines for maximum length of L2 frame David Gibson
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Currently we define both TAP_BUF_BYTES and PKT_BUF_BYTES as essentially
the same thing. They'll be different only if TAP_BUF_BYTES is negative,
which makes no sense. So, remove TAP_BUF_BYTES and just use PKT_BUF_BYTES.
In addition, most places we use this to just mean the size of the main
packet buffer (pkt_buf) for which we can just directly use sizeof.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
passt.c | 2 +-
passt.h | 5 ++---
tap.c | 4 ++--
3 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/passt.c b/passt.c
index 0bd2a292..cd067728 100644
--- a/passt.c
+++ b/passt.c
@@ -223,7 +223,7 @@ int main(int argc, char **argv)
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
die_perror("Couldn't set disposition for SIGPIPE");
- madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+ madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
diff --git a/passt.h b/passt.h
index 28d13892..6b248051 100644
--- a/passt.h
+++ b/passt.h
@@ -69,12 +69,11 @@ union epoll_ref {
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
"epoll_ref must have same size as epoll_data");
-#define TAP_BUF_BYTES \
+#define PKT_BUF_BYTES \
ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
#define TAP_MSGS \
- DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+ DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
-#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
extern char *epoll_type_str[];
diff --git a/tap.c b/tap.c
index 4541f51d..fb306e75 100644
--- a/tap.c
+++ b/tap.c
@@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
- TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+ sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) {
@@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
- for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+ for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
if (len == 0) {
--
@@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
- TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+ sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) {
@@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
- for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
+ for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
if (len == 0) {
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 6/9] tap: Use explicit defines for maximum length of L2 frame
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (4 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 5/9] packet: Remove redundant TAP_BUF_BYTES define David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 7/9] Simplify sizing of pkt_buf David Gibson
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson, David Gibson
Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of
an L2 frame. This define comes from the kernel, but it's badly named and
used confusingly.
First, it doesn't really have anything to do with Ethernet, which has no
structural limit on frame lengths. It comes more from either a) IP which
imposes a 64k datagram limit or b) from internal buffers used in various
places in the kernel (and in passt).
Worse, MTU generally means the maximum size of the IP (L3) datagram which
may be transferred, _not_ counting the L2 headers. In the kernel
ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as
a maximum frame length, _including_ L2 headers. In tap.c we're mostly
using it in the second way.
Finally, each of our tap backends could have different limits on the frame
size imposed by the mechanisms they're using.
Start clearing up this confusion by replacing it in tap.c with new
L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame
length for each backend.
Signed-off-by: David Gibson <dgibson@redhat.com>
---
tap.c | 23 +++++++++++++++++++----
tap.h | 25 +++++++++++++++++++++++++
2 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/tap.c b/tap.c
index fb306e75..ede547ce 100644
--- a/tap.c
+++ b/tap.c
@@ -62,6 +62,19 @@
#include "vhost_user.h"
#include "vu_common.h"
+/* Maximum allowed frame lengths (including L2 header) */
+
+/* Verify that an L2 frame length limit is large enough to contain the header,
+ * but small enough to fit in the packet pool
+ */
+#define CHECK_FRAME_LEN(len) \
+ static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
+ #len " has bad value")
+
+CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
+CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
+CHECK_FRAME_LEN(L2_MAX_LEN_VU);
+
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
@@ -1097,7 +1110,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
while (n >= (ssize_t)sizeof(uint32_t)) {
uint32_t l2len = ntohl_unaligned(p);
- if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
+ if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
err("Bad frame size from guest, resetting connection");
tap_sock_reset(c);
return;
@@ -1151,8 +1164,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
- for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) {
- len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+ for (n = 0;
+ n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
+ n += len) {
+ len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
if (len == 0) {
die("EOF on tap device, exiting");
@@ -1170,7 +1185,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
/* Ignore frames of bad length */
if (len < (ssize_t)sizeof(struct ethhdr) ||
- len > (ssize_t)ETH_MAX_MTU)
+ len > (ssize_t)L2_MAX_LEN_PASTA)
continue;
tap_add_packet(c, len, pkt_buf + n);
diff --git a/tap.h b/tap.h
index a2c3b87d..84e9fdb8 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,31 @@
#ifndef TAP_H
#define TAP_H
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice. For now,
+ * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is rather large. Much larger than is
+ * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
+ * Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU USHRT_MAX
+
struct udphdr;
/**
--
@@ -6,6 +6,31 @@
#ifndef TAP_H
#define TAP_H
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice. For now,
+ * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is rather large. Much larger than is
+ * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
+ * Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU USHRT_MAX
+
struct udphdr;
/**
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 7/9] Simplify sizing of pkt_buf
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (5 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 6/9] tap: Use explicit defines for maximum length of L2 frame David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 8/9] pcap: Correctly set snaplen based on tap backend type David Gibson
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
We define the size of pkt_buf as large enough to hold 128 maximum size
packets. Well, approximately, since we round down to the page size. We
don't have any specific reliance on how many packets can fit in the buffer,
we just want it to be big enough to allow reasonable batching. The
current definition relies on the confusingly named ETH_MAX_MTU and adds
in sizeof(uint32_t) rather non-obviously for the pseudo-physical header
used by the qemu socket (passt mode) protocol.
Instead, just define it to be 8MiB, which is what that complex calculation
works out to.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
passt.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/passt.h b/passt.h
index 6b248051..8f450912 100644
--- a/passt.h
+++ b/passt.h
@@ -69,8 +69,8 @@ union epoll_ref {
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
"epoll_ref must have same size as epoll_data");
-#define PKT_BUF_BYTES \
- ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES (8UL << 20)
#define TAP_MSGS \
DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
--
@@ -69,8 +69,8 @@ union epoll_ref {
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
"epoll_ref must have same size as epoll_data");
-#define PKT_BUF_BYTES \
- ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES (8UL << 20)
#define TAP_MSGS \
DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 8/9] pcap: Correctly set snaplen based on tap backend type
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (6 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 7/9] Simplify sizing of pkt_buf David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 2:18 ` [PATCH v2 9/9] conf: Limit maximum MTU based on backend frame size David Gibson
2025-03-12 23:04 ` [PATCH v2 0/9] Improve handling of MTU limits Stefano Brivio
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
The pcap header includes a value indicating how much of each frame is
captured. We always capture the entire frame, so we want to set this to
the maximum possible frame size. Currently we do that by setting it to
ETH_MAX_MTU, but that's a confusingly named constant which might not always
be correct depending on the details of our tap backend.
Instead add a tap_l2_max_len() function that explicitly returns the maximum
frame size for the current mode and use that to set snaplen. While we're
there, there's no particular need for the pcap header to be defined in a
global; make it local to pcap_init() instead.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
pcap.c | 46 ++++++++++++++++++++++++----------------------
tap.c | 19 +++++++++++++++++++
tap.h | 1 +
3 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/pcap.c b/pcap.c
index 3d623cfe..e95aa6fe 100644
--- a/pcap.c
+++ b/pcap.c
@@ -33,33 +33,12 @@
#include "log.h"
#include "pcap.h"
#include "iov.h"
+#include "tap.h"
#define PCAP_VERSION_MINOR 4
static int pcap_fd = -1;
-/* See pcap.h from libpcap, or pcap-savefile(5) */
-static const struct {
- uint32_t magic;
-#define PCAP_MAGIC 0xa1b2c3d4
-
- uint16_t major;
-#define PCAP_VERSION_MAJOR 2
-
- uint16_t minor;
-#define PCAP_VERSION_MINOR 4
-
- int32_t thiszone;
- uint32_t sigfigs;
- uint32_t snaplen;
-
- uint32_t linktype;
-#define PCAP_LINKTYPE_ETHERNET 1
-} pcap_hdr = {
- PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
- PCAP_LINKTYPE_ETHERNET
-};
-
struct pcap_pkthdr {
uint32_t tv_sec;
uint32_t tv_usec;
@@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/
void pcap_init(struct ctx *c)
{
+ /* See pcap.h from libpcap, or pcap-savefile(5) */
+#define PCAP_MAGIC 0xa1b2c3d4
+#define PCAP_VERSION_MAJOR 2
+#define PCAP_VERSION_MINOR 4
+#define PCAP_LINKTYPE_ETHERNET 1
+ const struct {
+ uint32_t magic;
+ uint16_t major;
+ uint16_t minor;
+
+ int32_t thiszone;
+ uint32_t sigfigs;
+ uint32_t snaplen;
+
+ uint32_t linktype;
+ } pcap_hdr = {
+ .magic = PCAP_MAGIC,
+ .major = PCAP_VERSION_MAJOR,
+ .minor = PCAP_VERSION_MINOR,
+ .snaplen = tap_l2_max_len(c),
+ .linktype = PCAP_LINKTYPE_ETHERNET
+ };
+
if (pcap_fd != -1)
return;
diff --git a/tap.c b/tap.c
index ede547ce..182a1151 100644
--- a/tap.c
+++ b/tap.c
@@ -82,6 +82,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
+/**
+ * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
+ * @c: Execution context
+ */
+unsigned long tap_l2_max_len(const struct ctx *c)
+{
+ /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
+ switch (c->mode) {
+ case MODE_PASST:
+ return L2_MAX_LEN_PASST;
+ case MODE_PASTA:
+ return L2_MAX_LEN_PASTA;
+ case MODE_VU:
+ return L2_MAX_LEN_VU;
+ }
+ /* NOLINTEND(bugprone-branch-clone) */
+ ASSERT(0);
+}
+
/**
* tap_send_single() - Send a single frame
* @c: Execution context
diff --git a/tap.h b/tap.h
index 84e9fdb8..dd39fd89 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
--
@@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 9/9] conf: Limit maximum MTU based on backend frame size
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (7 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 8/9] pcap: Correctly set snaplen based on tap backend type David Gibson
@ 2025-03-12 2:18 ` David Gibson
2025-03-12 23:04 ` [PATCH v2 0/9] Improve handling of MTU limits Stefano Brivio
9 siblings, 0 replies; 11+ messages in thread
From: David Gibson @ 2025-03-12 2:18 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
The -m option controls the MTU, that is the maximum transmissible L3
datagram, not including L2 headers. We currently limit it to ETH_MAX_MTU
which sounds like it makes sense. But ETH_MAX_MTU is confusing: it's not
consistently used as to whether it means the maximum L3 datagram size or
the maximum L2 frame size. Even within conf() we explicitly account for
the L2 header size when computing the default --mtu value, but not when
we compute the maximum --mtu value.
Clean this up by reworking the maximum MTU computation to be the minimum of
IP_MAX_MTU (65535) and the maximum sized IP datagram which can fit into
our L2 frames when we account for the L2 header. The latter can vary
depending on our tap backend, although it doesn't right now.
Link: https://bugs.passt.top/show_bug.cgi?id=66
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
conf.c | 11 +++++++----
util.h | 3 ---
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/conf.c b/conf.c
index b58e2a6e..c760f791 100644
--- a/conf.c
+++ b/conf.c
@@ -1434,6 +1434,7 @@ void conf(struct ctx *c, int argc, char **argv)
enum fwd_ports_mode fwd_default = FWD_NONE;
bool v4_only = false, v6_only = false;
unsigned dns4_idx = 0, dns6_idx = 0;
+ unsigned long max_mtu = IP_MAX_MTU;
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
@@ -1449,7 +1450,9 @@ void conf(struct ctx *c, int argc, char **argv)
fwd_default = FWD_AUTO;
}
- c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+ if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
+ max_mtu = tap_l2_max_len(c) - ETH_HLEN;
+ c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1711,9 +1714,9 @@ void conf(struct ctx *c, int argc, char **argv)
if (errno || *e)
die("Invalid MTU: %s", optarg);
- if (mtu > ETH_MAX_MTU) {
- die("MTU %lu too large (max %u)",
- mtu, ETH_MAX_MTU);
+ if (mtu > max_mtu) {
+ die("MTU %lu too large (max %lu)",
+ mtu, max_mtu);
}
c->mtu = mtu;
diff --git a/util.h b/util.h
index 0f70f4d4..4d512fab 100644
--- a/util.h
+++ b/util.h
@@ -31,9 +31,6 @@
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
#endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU USHRT_MAX
-#endif
#ifndef IP_MAX_MTU
#define IP_MAX_MTU USHRT_MAX
#endif
--
@@ -31,9 +31,6 @@
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
#endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU USHRT_MAX
-#endif
#ifndef IP_MAX_MTU
#define IP_MAX_MTU USHRT_MAX
#endif
--
2.48.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v2 0/9] Improve handling of MTU limits
2025-03-12 2:18 [PATCH v2 0/9] Improve handling of MTU limits David Gibson
` (8 preceding siblings ...)
2025-03-12 2:18 ` [PATCH v2 9/9] conf: Limit maximum MTU based on backend frame size David Gibson
@ 2025-03-12 23:04 ` Stefano Brivio
9 siblings, 0 replies; 11+ messages in thread
From: Stefano Brivio @ 2025-03-12 23:04 UTC (permalink / raw)
To: David Gibson; +Cc: passt-dev
On Wed, 12 Mar 2025 13:18:30 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:
> After considerable lead up, this corrects the handling of the --mtu
> option so that it will respect limits imposed by the backend tap
> layers, correctly accounting for L2 headers.
>
> This incorporates the earlier mode setting patches, on which the rest
> depend. There's been no change in those patches, there just included
> here for self-containedness.
>
> v2:
> * More static_assert() validation of frame limits in 6/9
> * Cosmetic fixups in 6/9
Applied.
--
Stefano
^ permalink raw reply [flat|nested] 11+ messages in thread