* [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper
2023-05-22 8:52 [PATCH v2 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
@ 2023-05-22 8:52 ` Stefano Brivio
2023-05-22 9:03 ` David Gibson
2023-05-23 14:11 ` Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init Stefano Brivio
2 siblings, 2 replies; 6+ messages in thread
From: Stefano Brivio @ 2023-05-22 8:52 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
We'll need this in isolate_initial(). While at it, don't rely on
BUFSIZ: the earlier issue we had with musl reminded me it's not a
magic "everything will fit" value. Size the read buffer to what we
actually need from uid_map, and check for the final newline too,
because uid_map is organised in lines.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
conf.c | 16 +---------------
util.c | 25 +++++++++++++++++++++++++
util.h | 2 ++
3 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/conf.c b/conf.c
index 447b000..984c3ce 100644
--- a/conf.c
+++ b/conf.c
@@ -1096,10 +1096,6 @@ static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid)
*/
static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
{
- const char root_uid_map[] = " 0 0 4294967295";
- char buf[BUFSIZ];
- int fd;
-
/* If user has specified --runas, that takes precedence... */
if (runas) {
if (conf_runas(runas, uid, gid))
@@ -1116,18 +1112,8 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
return;
/* ...or at least not root in the init namespace... */
- if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
- die("Can't determine if we're in init namespace: %s",
- strerror(errno));
- }
-
- if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
- strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
- close(fd);
+ if (!ns_is_init())
return;
- }
-
- close(fd);
/* ...otherwise use nobody:nobody */
warn("Don't run as root. Changing to nobody...");
diff --git a/util.c b/util.c
index c3e3471..3c5d51f 100644
--- a/util.c
+++ b/util.c
@@ -390,6 +390,31 @@ int ns_enter(const struct ctx *c)
return 0;
}
+/**
+ * ns_is_init() - Is the caller running in the "init" user namespace?
+ *
+ * Return: true if caller is in init, false otherwise, won't return on failure
+ */
+bool ns_is_init(void)
+{
+ const char root_uid_map[] = " 0 0 4294967295\n";
+ char buf[sizeof(root_uid_map)];
+ bool ret = true;
+ int fd;
+
+ if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
+ die("Can't determine if we're in init namespace: %s",
+ strerror(errno));
+ }
+
+ if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 ||
+ strncmp(buf, root_uid_map, sizeof(root_uid_map)))
+ ret = false;
+
+ close(fd);
+ return ret;
+}
+
/**
* pid_file() - Write PID to file, if requested to do so, and close it
* @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
diff --git a/util.h b/util.h
index ba3e3da..26892aa 100644
--- a/util.h
+++ b/util.h
@@ -8,6 +8,7 @@
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
#include "log.h"
@@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
uint8_t *map, uint8_t *exclude);
int ns_enter(const struct ctx *c);
+bool ns_is_init(void);
void write_pidfile(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
--
@@ -8,6 +8,7 @@
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
#include "log.h"
@@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
uint8_t *map, uint8_t *exclude);
int ns_enter(const struct ctx *c);
+bool ns_is_init(void);
void write_pidfile(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
--
2.39.2
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper
2023-05-22 8:52 ` [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
@ 2023-05-22 9:03 ` David Gibson
2023-05-23 14:11 ` Stefano Brivio
1 sibling, 0 replies; 6+ messages in thread
From: David Gibson @ 2023-05-22 9:03 UTC (permalink / raw)
To: Stefano Brivio; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 3802 bytes --]
On Mon, May 22, 2023 at 10:52:03AM +0200, Stefano Brivio wrote:
> We'll need this in isolate_initial(). While at it, don't rely on
> BUFSIZ: the earlier issue we had with musl reminded me it's not a
> magic "everything will fit" value. Size the read buffer to what we
> actually need from uid_map, and check for the final newline too,
> because uid_map is organised in lines.
>
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> conf.c | 16 +---------------
> util.c | 25 +++++++++++++++++++++++++
> util.h | 2 ++
> 3 files changed, 28 insertions(+), 15 deletions(-)
>
> diff --git a/conf.c b/conf.c
> index 447b000..984c3ce 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -1096,10 +1096,6 @@ static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid)
> */
> static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> {
> - const char root_uid_map[] = " 0 0 4294967295";
> - char buf[BUFSIZ];
> - int fd;
> -
> /* If user has specified --runas, that takes precedence... */
> if (runas) {
> if (conf_runas(runas, uid, gid))
> @@ -1116,18 +1112,8 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> return;
>
> /* ...or at least not root in the init namespace... */
> - if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> - die("Can't determine if we're in init namespace: %s",
> - strerror(errno));
> - }
> -
> - if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
> - strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
> - close(fd);
> + if (!ns_is_init())
> return;
> - }
> -
> - close(fd);
>
> /* ...otherwise use nobody:nobody */
> warn("Don't run as root. Changing to nobody...");
> diff --git a/util.c b/util.c
> index c3e3471..3c5d51f 100644
> --- a/util.c
> +++ b/util.c
> @@ -390,6 +390,31 @@ int ns_enter(const struct ctx *c)
> return 0;
> }
>
> +/**
> + * ns_is_init() - Is the caller running in the "init" user namespace?
> + *
> + * Return: true if caller is in init, false otherwise, won't return on failure
> + */
> +bool ns_is_init(void)
> +{
> + const char root_uid_map[] = " 0 0 4294967295\n";
> + char buf[sizeof(root_uid_map)];
> + bool ret = true;
> + int fd;
> +
> + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> + die("Can't determine if we're in init namespace: %s",
> + strerror(errno));
> + }
> +
> + if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 ||
> + strncmp(buf, root_uid_map, sizeof(root_uid_map)))
Personally I'd use memcmp() when the size is known like this, but this
strncmp() should do the same thing.
> + ret = false;
> +
> + close(fd);
> + return ret;
> +}
> +
> /**
> * pid_file() - Write PID to file, if requested to do so, and close it
> * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
> diff --git a/util.h b/util.h
> index ba3e3da..26892aa 100644
> --- a/util.h
> +++ b/util.h
> @@ -8,6 +8,7 @@
>
> #include <stdlib.h>
> #include <stdarg.h>
> +#include <stdbool.h>
>
> #include "log.h"
>
> @@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
> void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
> uint8_t *map, uint8_t *exclude);
> int ns_enter(const struct ctx *c);
> +bool ns_is_init(void);
> void write_pidfile(int fd, pid_t pid);
> int __daemon(int pidfile_fd, int devnull_fd);
> int fls(unsigned long x);
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper
2023-05-22 8:52 ` [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
2023-05-22 9:03 ` David Gibson
@ 2023-05-23 14:11 ` Stefano Brivio
1 sibling, 0 replies; 6+ messages in thread
From: Stefano Brivio @ 2023-05-23 14:11 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
On Mon, 22 May 2023 10:52:03 +0200
Stefano Brivio <sbrivio@redhat.com> wrote:
> +bool ns_is_init(void)
> +{
> + const char root_uid_map[] = " 0 0 4294967295\n";
> + char buf[sizeof(root_uid_map)];
> + bool ret = true;
> + int fd;
> +
> + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> + die("Can't determine if we're in init namespace: %s",
> + strerror(errno));
> + }
> +
> + if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 ||
> + strncmp(buf, root_uid_map, sizeof(root_uid_map)))
Oops, valgrind noticed I didn't actually initialise that extra byte at the end
of 'buf'. Fixing this up as well before pushing...
--
Stefano
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH v2 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command
2023-05-22 8:52 [PATCH v2 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
@ 2023-05-22 8:52 ` Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init Stefano Brivio
2 siblings, 0 replies; 6+ messages in thread
From: Stefano Brivio @ 2023-05-22 8:52 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
If we want /proc contents to be consistent after pasta spawns a child
process in a new PID namespace (only for operation without a
pre-existing namespace), we need to mount /proc after the clone(2)
call with CLONE_NEWPID, and we enable the child to do that by
passing, in the same call, the CLONE_NEWNS flag, as described by
pid_namespaces(7).
This is not really a remount: in fact, passing MS_REMOUNT to mount(2)
would make the call fail. We're in another mount namespace now, so
it's a fresh mount that has the effect of hiding the existing one.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
pasta.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pasta.c b/pasta.c
index 3a4d704..b30ce70 100644
--- a/pasta.c
+++ b/pasta.c
@@ -29,6 +29,7 @@
#include <syslog.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
+#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -172,6 +173,10 @@ static int pasta_spawn_cmd(void *arg)
const struct pasta_spawn_cmd_arg *a;
sigset_t set;
+ /* We run in a detached PID and mount namespace: mount /proc over */
+ if (mount("", "/proc", "proc", 0, NULL))
+ warn("Couldn't mount /proc: %s", strerror(errno));
+
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
@@ -243,7 +248,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
sizeof(ns_fn_stack),
CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWUTS | SIGCHLD,
+ CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
(void *)&arg);
if (pasta_child_pid == -1) {
--
@@ -29,6 +29,7 @@
#include <syslog.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
+#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -172,6 +173,10 @@ static int pasta_spawn_cmd(void *arg)
const struct pasta_spawn_cmd_arg *a;
sigset_t set;
+ /* We run in a detached PID and mount namespace: mount /proc over */
+ if (mount("", "/proc", "proc", 0, NULL))
+ warn("Couldn't mount /proc: %s", strerror(errno));
+
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
@@ -243,7 +248,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
sizeof(ns_fn_stack),
CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWUTS | SIGCHLD,
+ CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
(void *)&arg);
if (pasta_child_pid == -1) {
--
2.39.2
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH v2 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init
2023-05-22 8:52 [PATCH v2 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
2023-05-22 8:52 ` [PATCH v2 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command Stefano Brivio
@ 2023-05-22 8:52 ` Stefano Brivio
2 siblings, 0 replies; 6+ messages in thread
From: Stefano Brivio @ 2023-05-22 8:52 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
If pasta spawns a child process while running as UID 0, which is only
allowed from a non-init namespace, we need to keep CAP_SETFCAP before
pasta_start_ns() is called: otherwise, starting from Linux 5.12, we
won't be able to update /proc/self/uid_map with the intended mapping
(from 0 to 0). See user_namespaces(7).
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
isolation.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/isolation.c b/isolation.c
index 5f89047..19932bf 100644
--- a/isolation.c
+++ b/isolation.c
@@ -177,6 +177,8 @@ static void clamp_caps(void)
*/
void isolate_initial(void)
{
+ uint64_t keep;
+
/* We want to keep CAP_NET_BIND_SERVICE in the initial
* namespace if we have it, so that we can forward low ports
* into the guest/namespace
@@ -193,9 +195,18 @@ void isolate_initial(void)
* further capabilites in isolate_user() and
* isolate_prefork().
*/
- drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) |
- BIT(CAP_SETUID) | BIT(CAP_SETGID) |
- BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN));
+ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
+ BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
+
+ /* Since Linux 5.12, if we want to update /proc/self/uid_map to create
+ * a mapping from UID 0, which only happens with pasta spawning a child
+ * from a non-init user namespace (pasta can't run as root), we need to
+ * retain CAP_SETFCAP too.
+ */
+ if (!ns_is_init() && !geteuid())
+ keep |= BIT(CAP_SETFCAP);
+
+ drop_caps_ep_except(keep);
}
/**
--
@@ -177,6 +177,8 @@ static void clamp_caps(void)
*/
void isolate_initial(void)
{
+ uint64_t keep;
+
/* We want to keep CAP_NET_BIND_SERVICE in the initial
* namespace if we have it, so that we can forward low ports
* into the guest/namespace
@@ -193,9 +195,18 @@ void isolate_initial(void)
* further capabilites in isolate_user() and
* isolate_prefork().
*/
- drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) |
- BIT(CAP_SETUID) | BIT(CAP_SETGID) |
- BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN));
+ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
+ BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
+
+ /* Since Linux 5.12, if we want to update /proc/self/uid_map to create
+ * a mapping from UID 0, which only happens with pasta spawning a child
+ * from a non-init user namespace (pasta can't run as root), we need to
+ * retain CAP_SETFCAP too.
+ */
+ if (!ns_is_init() && !geteuid())
+ keep |= BIT(CAP_SETFCAP);
+
+ drop_caps_ep_except(keep);
}
/**
--
2.39.2
^ permalink raw reply related [flat|nested] 6+ messages in thread