* [PATCH 1/3] util, conf: Add and use ns_is_init() helper
2023-05-21 23:41 [PATCH 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
@ 2023-05-21 23:41 ` Stefano Brivio
2023-05-22 5:41 ` David Gibson
2023-05-21 23:41 ` [PATCH 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command Stefano Brivio
2023-05-21 23:41 ` [PATCH 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init Stefano Brivio
2 siblings, 1 reply; 8+ messages in thread
From: Stefano Brivio @ 2023-05-21 23:41 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
We'll need this in isolate_initial(). While at it, don't rely on
BUFSIZ: the earlier issue we had with musl reminded me it's not a
magic "everything will fit" value. Size the read buffer to what we
actually need from uid_map.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
conf.c | 16 +---------------
util.c | 25 +++++++++++++++++++++++++
util.h | 2 ++
3 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/conf.c b/conf.c
index 447b000..984c3ce 100644
--- a/conf.c
+++ b/conf.c
@@ -1096,10 +1096,6 @@ static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid)
*/
static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
{
- const char root_uid_map[] = " 0 0 4294967295";
- char buf[BUFSIZ];
- int fd;
-
/* If user has specified --runas, that takes precedence... */
if (runas) {
if (conf_runas(runas, uid, gid))
@@ -1116,18 +1112,8 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
return;
/* ...or at least not root in the init namespace... */
- if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
- die("Can't determine if we're in init namespace: %s",
- strerror(errno));
- }
-
- if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
- strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
- close(fd);
+ if (!ns_is_init())
return;
- }
-
- close(fd);
/* ...otherwise use nobody:nobody */
warn("Don't run as root. Changing to nobody...");
diff --git a/util.c b/util.c
index c3e3471..5ec8a6c 100644
--- a/util.c
+++ b/util.c
@@ -390,6 +390,31 @@ int ns_enter(const struct ctx *c)
return 0;
}
+/**
+ * ns_is_init() - Is the caller running in the "init" user namespace?
+ *
+ * Return: true if caller is in init, false otherwise, won't return on failure
+ */
+bool ns_is_init(void)
+{
+ const char root_uid_map[] = " 0 0 4294967295";
+ char buf[sizeof(root_uid_map) + 1];
+ bool ret = true;
+ int fd;
+
+ if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
+ die("Can't determine if we're in init namespace: %s",
+ strerror(errno));
+ }
+
+ if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) ||
+ strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1))
+ ret = false;
+
+ close(fd);
+ return ret;
+}
+
/**
* pid_file() - Write PID to file, if requested to do so, and close it
* @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
diff --git a/util.h b/util.h
index ba3e3da..26892aa 100644
--- a/util.h
+++ b/util.h
@@ -8,6 +8,7 @@
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
#include "log.h"
@@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
uint8_t *map, uint8_t *exclude);
int ns_enter(const struct ctx *c);
+bool ns_is_init(void);
void write_pidfile(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
--
@@ -8,6 +8,7 @@
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
#include "log.h"
@@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
uint8_t *map, uint8_t *exclude);
int ns_enter(const struct ctx *c);
+bool ns_is_init(void);
void write_pidfile(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
--
2.39.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/3] util, conf: Add and use ns_is_init() helper
2023-05-21 23:41 ` [PATCH 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
@ 2023-05-22 5:41 ` David Gibson
2023-05-22 8:50 ` Stefano Brivio
0 siblings, 1 reply; 8+ messages in thread
From: David Gibson @ 2023-05-22 5:41 UTC (permalink / raw)
To: Stefano Brivio; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 3865 bytes --]
On Mon, May 22, 2023 at 01:41:56AM +0200, Stefano Brivio wrote:
> We'll need this in isolate_initial(). While at it, don't rely on
> BUFSIZ: the earlier issue we had with musl reminded me it's not a
> magic "everything will fit" value. Size the read buffer to what we
> actually need from uid_map.
>
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Although...
> ---
> conf.c | 16 +---------------
> util.c | 25 +++++++++++++++++++++++++
> util.h | 2 ++
> 3 files changed, 28 insertions(+), 15 deletions(-)
>
> diff --git a/conf.c b/conf.c
> index 447b000..984c3ce 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -1096,10 +1096,6 @@ static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid)
> */
> static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> {
> - const char root_uid_map[] = " 0 0 4294967295";
> - char buf[BUFSIZ];
> - int fd;
> -
> /* If user has specified --runas, that takes precedence... */
> if (runas) {
> if (conf_runas(runas, uid, gid))
> @@ -1116,18 +1112,8 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> return;
>
> /* ...or at least not root in the init namespace... */
> - if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> - die("Can't determine if we're in init namespace: %s",
> - strerror(errno));
> - }
> -
> - if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
> - strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
> - close(fd);
> + if (!ns_is_init())
> return;
> - }
> -
> - close(fd);
>
> /* ...otherwise use nobody:nobody */
> warn("Don't run as root. Changing to nobody...");
> diff --git a/util.c b/util.c
> index c3e3471..5ec8a6c 100644
> --- a/util.c
> +++ b/util.c
> @@ -390,6 +390,31 @@ int ns_enter(const struct ctx *c)
> return 0;
> }
>
> +/**
> + * ns_is_init() - Is the caller running in the "init" user namespace?
> + *
> + * Return: true if caller is in init, false otherwise, won't return on failure
> + */
> +bool ns_is_init(void)
> +{
> + const char root_uid_map[] = " 0 0 4294967295";
> + char buf[sizeof(root_uid_map) + 1];
> + bool ret = true;
> + int fd;
> +
> + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> + die("Can't determine if we're in init namespace: %s",
> + strerror(errno));
> + }
> +
> + if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) ||
I don't think it can go bad in practice, but I think you want to pass
a slightly larger buffer than root_uid_map[], otherwise this test will
succeed if the uid_map contains the expected thing for init, followed
by something else.
> + strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1))
> + ret = false;
> +
> + close(fd);
> + return ret;
> +}
> +
> /**
> * pid_file() - Write PID to file, if requested to do so, and close it
> * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
> diff --git a/util.h b/util.h
> index ba3e3da..26892aa 100644
> --- a/util.h
> +++ b/util.h
> @@ -8,6 +8,7 @@
>
> #include <stdlib.h>
> #include <stdarg.h>
> +#include <stdbool.h>
>
> #include "log.h"
>
> @@ -216,6 +217,7 @@ char *line_read(char *buf, size_t len, int fd);
> void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
> uint8_t *map, uint8_t *exclude);
> int ns_enter(const struct ctx *c);
> +bool ns_is_init(void);
> void write_pidfile(int fd, pid_t pid);
> int __daemon(int pidfile_fd, int devnull_fd);
> int fls(unsigned long x);
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/3] util, conf: Add and use ns_is_init() helper
2023-05-22 5:41 ` David Gibson
@ 2023-05-22 8:50 ` Stefano Brivio
0 siblings, 0 replies; 8+ messages in thread
From: Stefano Brivio @ 2023-05-22 8:50 UTC (permalink / raw)
To: David Gibson; +Cc: passt-dev
On Mon, 22 May 2023 15:41:17 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:
> On Mon, May 22, 2023 at 01:41:56AM +0200, Stefano Brivio wrote:
> > We'll need this in isolate_initial(). While at it, don't rely on
> > BUFSIZ: the earlier issue we had with musl reminded me it's not a
> > magic "everything will fit" value. Size the read buffer to what we
> > actually need from uid_map.
> >
> > Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Although...
>
> > ---
> > conf.c | 16 +---------------
> > util.c | 25 +++++++++++++++++++++++++
> > util.h | 2 ++
> > 3 files changed, 28 insertions(+), 15 deletions(-)
> >
> > diff --git a/conf.c b/conf.c
> > index 447b000..984c3ce 100644
> > --- a/conf.c
> > +++ b/conf.c
> > @@ -1096,10 +1096,6 @@ static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid)
> > */
> > static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> > {
> > - const char root_uid_map[] = " 0 0 4294967295";
> > - char buf[BUFSIZ];
> > - int fd;
> > -
> > /* If user has specified --runas, that takes precedence... */
> > if (runas) {
> > if (conf_runas(runas, uid, gid))
> > @@ -1116,18 +1112,8 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
> > return;
> >
> > /* ...or at least not root in the init namespace... */
> > - if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> > - die("Can't determine if we're in init namespace: %s",
> > - strerror(errno));
> > - }
> > -
> > - if (read(fd, buf, BUFSIZ) != sizeof(root_uid_map) ||
> > - strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1)) {
> > - close(fd);
> > + if (!ns_is_init())
> > return;
> > - }
> > -
> > - close(fd);
> >
> > /* ...otherwise use nobody:nobody */
> > warn("Don't run as root. Changing to nobody...");
> > diff --git a/util.c b/util.c
> > index c3e3471..5ec8a6c 100644
> > --- a/util.c
> > +++ b/util.c
> > @@ -390,6 +390,31 @@ int ns_enter(const struct ctx *c)
> > return 0;
> > }
> >
> > +/**
> > + * ns_is_init() - Is the caller running in the "init" user namespace?
> > + *
> > + * Return: true if caller is in init, false otherwise, won't return on failure
> > + */
> > +bool ns_is_init(void)
> > +{
> > + const char root_uid_map[] = " 0 0 4294967295";
> > + char buf[sizeof(root_uid_map) + 1];
> > + bool ret = true;
> > + int fd;
> > +
> > + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
> > + die("Can't determine if we're in init namespace: %s",
> > + strerror(errno));
> > + }
> > +
> > + if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) ||
>
> I don't think it can go bad in practice, but I think you want to pass
> a slightly larger buffer than root_uid_map[], otherwise this test will
> succeed if the uid_map contains the expected thing for init, followed
> by something else.
Ah, yes, I thought of doing that, then I also thought that with
4294967295 as range size in the first line, there can't be other lines.
Then I (clumsily) went back to the original idea, because... go figure,
let's say it gets extended with something one day...
But now it's inconsistent (and probably confusing): the buffer is long
enough, but I don't use it:
const char root_uid_map[] = " 0 0 4294967295";
32 bytes string length, sizeof() is 33
char buf[sizeof(root_uid_map) + 1];
34 bytes (could be 33)
read(fd, buf, sizeof(root_uid_map))
read at most 33 bytes
!= sizeof(root_uid_map) ||
isn't 33 bytes (should be 32)
strncmp(buf, root_uid_map, sizeof(root_uid_map) - 1))
or the first 32 bytes, up to "5", differ (we should compare one more)
and, by the way, uid_map has _lines_:
$ hexdump -C /proc/self/uid_map
00000000 20 20 20 20 20 20 20 20 20 30 20 20 20 20 20 20 | 0 |
00000010 20 20 20 20 30 20 34 32 39 34 39 36 37 32 39 35 | 0 4294967295|
00000020 0a |.|
so it kind of works, but not for the reason I intended. Respinning
now...
--
Stefano
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command
2023-05-21 23:41 [PATCH 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
2023-05-21 23:41 ` [PATCH 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
@ 2023-05-21 23:41 ` Stefano Brivio
2023-05-22 5:42 ` David Gibson
2023-05-21 23:41 ` [PATCH 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init Stefano Brivio
2 siblings, 1 reply; 8+ messages in thread
From: Stefano Brivio @ 2023-05-21 23:41 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
If we want /proc contents to be consistent after pasta spawns a child
process in a new PID namespace (only for operation without a
pre-existing namespace), we need to mount /proc after the clone(2)
call with CLONE_NEWPID, and we enable the child to do that by
passing, in the same call, the CLONE_NEWNS flag, as described by
pid_namespaces(7).
This is not really a remount: in fact, passing MS_REMOUNT to mount(2)
would make the call fail. We're in another mount namespace now, so
it's a fresh mount that has the effect of hiding the existing one.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
pasta.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pasta.c b/pasta.c
index 3a4d704..b30ce70 100644
--- a/pasta.c
+++ b/pasta.c
@@ -29,6 +29,7 @@
#include <syslog.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
+#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -172,6 +173,10 @@ static int pasta_spawn_cmd(void *arg)
const struct pasta_spawn_cmd_arg *a;
sigset_t set;
+ /* We run in a detached PID and mount namespace: mount /proc over */
+ if (mount("", "/proc", "proc", 0, NULL))
+ warn("Couldn't mount /proc: %s", strerror(errno));
+
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
@@ -243,7 +248,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
sizeof(ns_fn_stack),
CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWUTS | SIGCHLD,
+ CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
(void *)&arg);
if (pasta_child_pid == -1) {
--
@@ -29,6 +29,7 @@
#include <syslog.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
+#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -172,6 +173,10 @@ static int pasta_spawn_cmd(void *arg)
const struct pasta_spawn_cmd_arg *a;
sigset_t set;
+ /* We run in a detached PID and mount namespace: mount /proc over */
+ if (mount("", "/proc", "proc", 0, NULL))
+ warn("Couldn't mount /proc: %s", strerror(errno));
+
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
@@ -243,7 +248,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
sizeof(ns_fn_stack),
CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWUTS | SIGCHLD,
+ CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
(void *)&arg);
if (pasta_child_pid == -1) {
--
2.39.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command
2023-05-21 23:41 ` [PATCH 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command Stefano Brivio
@ 2023-05-22 5:42 ` David Gibson
0 siblings, 0 replies; 8+ messages in thread
From: David Gibson @ 2023-05-22 5:42 UTC (permalink / raw)
To: Stefano Brivio; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 2181 bytes --]
On Mon, May 22, 2023 at 01:41:57AM +0200, Stefano Brivio wrote:
> If we want /proc contents to be consistent after pasta spawns a child
> process in a new PID namespace (only for operation without a
> pre-existing namespace), we need to mount /proc after the clone(2)
> call with CLONE_NEWPID, and we enable the child to do that by
> passing, in the same call, the CLONE_NEWNS flag, as described by
> pid_namespaces(7).
>
> This is not really a remount: in fact, passing MS_REMOUNT to mount(2)
> would make the call fail. We're in another mount namespace now, so
> it's a fresh mount that has the effect of hiding the existing one.
>
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> pasta.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/pasta.c b/pasta.c
> index 3a4d704..b30ce70 100644
> --- a/pasta.c
> +++ b/pasta.c
> @@ -29,6 +29,7 @@
> #include <syslog.h>
> #include <sys/epoll.h>
> #include <sys/inotify.h>
> +#include <sys/mount.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> @@ -172,6 +173,10 @@ static int pasta_spawn_cmd(void *arg)
> const struct pasta_spawn_cmd_arg *a;
> sigset_t set;
>
> + /* We run in a detached PID and mount namespace: mount /proc over */
> + if (mount("", "/proc", "proc", 0, NULL))
> + warn("Couldn't mount /proc: %s", strerror(errno));
> +
> if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
> warn("Cannot set ping_group_range, ICMP requests might fail");
>
> @@ -243,7 +248,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
> pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
> sizeof(ns_fn_stack),
> CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
> - CLONE_NEWUTS | SIGCHLD,
> + CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD,
> (void *)&arg);
>
> if (pasta_child_pid == -1) {
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init
2023-05-21 23:41 [PATCH 0/3] Fix pasta-in-pasta operation (and similar) Stefano Brivio
2023-05-21 23:41 ` [PATCH 1/3] util, conf: Add and use ns_is_init() helper Stefano Brivio
2023-05-21 23:41 ` [PATCH 2/3] pasta: Detach mount namespace, (re)mount procfs before spawning command Stefano Brivio
@ 2023-05-21 23:41 ` Stefano Brivio
2023-05-22 5:48 ` David Gibson
2 siblings, 1 reply; 8+ messages in thread
From: Stefano Brivio @ 2023-05-21 23:41 UTC (permalink / raw)
To: passt-dev; +Cc: David Gibson
If pasta spawns a child process while running as UID 0, which is only
allowed from a non-init namespace, we need to keep CAP_SETFCAP before
pasta_start_ns() is called: otherwise, starting from Linux 5.12, we
won't be able to update /proc/self/uid_map with the intended mapping
(from 0 to 0). See user_namespaces(7).
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
isolation.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/isolation.c b/isolation.c
index 5f89047..19932bf 100644
--- a/isolation.c
+++ b/isolation.c
@@ -177,6 +177,8 @@ static void clamp_caps(void)
*/
void isolate_initial(void)
{
+ uint64_t keep;
+
/* We want to keep CAP_NET_BIND_SERVICE in the initial
* namespace if we have it, so that we can forward low ports
* into the guest/namespace
@@ -193,9 +195,18 @@ void isolate_initial(void)
* further capabilites in isolate_user() and
* isolate_prefork().
*/
- drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) |
- BIT(CAP_SETUID) | BIT(CAP_SETGID) |
- BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN));
+ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
+ BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
+
+ /* Since Linux 5.12, if we want to update /proc/self/uid_map to create
+ * a mapping from UID 0, which only happens with pasta spawning a child
+ * from a non-init user namespace (pasta can't run as root), we need to
+ * retain CAP_SETFCAP too.
+ */
+ if (!ns_is_init() && !geteuid())
+ keep |= BIT(CAP_SETFCAP);
+
+ drop_caps_ep_except(keep);
}
/**
--
@@ -177,6 +177,8 @@ static void clamp_caps(void)
*/
void isolate_initial(void)
{
+ uint64_t keep;
+
/* We want to keep CAP_NET_BIND_SERVICE in the initial
* namespace if we have it, so that we can forward low ports
* into the guest/namespace
@@ -193,9 +195,18 @@ void isolate_initial(void)
* further capabilites in isolate_user() and
* isolate_prefork().
*/
- drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) |
- BIT(CAP_SETUID) | BIT(CAP_SETGID) |
- BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN));
+ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
+ BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
+
+ /* Since Linux 5.12, if we want to update /proc/self/uid_map to create
+ * a mapping from UID 0, which only happens with pasta spawning a child
+ * from a non-init user namespace (pasta can't run as root), we need to
+ * retain CAP_SETFCAP too.
+ */
+ if (!ns_is_init() && !geteuid())
+ keep |= BIT(CAP_SETFCAP);
+
+ drop_caps_ep_except(keep);
}
/**
--
2.39.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init
2023-05-21 23:41 ` [PATCH 3/3] isolation: Initially Keep CAP_SETFCAP if running as UID 0 in non-init Stefano Brivio
@ 2023-05-22 5:48 ` David Gibson
0 siblings, 0 replies; 8+ messages in thread
From: David Gibson @ 2023-05-22 5:48 UTC (permalink / raw)
To: Stefano Brivio; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 2033 bytes --]
On Mon, May 22, 2023 at 01:41:58AM +0200, Stefano Brivio wrote:
> If pasta spawns a child process while running as UID 0, which is only
> allowed from a non-init namespace, we need to keep CAP_SETFCAP before
> pasta_start_ns() is called: otherwise, starting from Linux 5.12, we
> won't be able to update /proc/self/uid_map with the intended mapping
> (from 0 to 0). See user_namespaces(7).
>
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> isolation.c | 17 ++++++++++++++---
> 1 file changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/isolation.c b/isolation.c
> index 5f89047..19932bf 100644
> --- a/isolation.c
> +++ b/isolation.c
> @@ -177,6 +177,8 @@ static void clamp_caps(void)
> */
> void isolate_initial(void)
> {
> + uint64_t keep;
> +
> /* We want to keep CAP_NET_BIND_SERVICE in the initial
> * namespace if we have it, so that we can forward low ports
> * into the guest/namespace
> @@ -193,9 +195,18 @@ void isolate_initial(void)
> * further capabilites in isolate_user() and
> * isolate_prefork().
> */
> - drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) |
> - BIT(CAP_SETUID) | BIT(CAP_SETGID) |
> - BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN));
> + keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
> + BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
> +
> + /* Since Linux 5.12, if we want to update /proc/self/uid_map to create
> + * a mapping from UID 0, which only happens with pasta spawning a child
> + * from a non-init user namespace (pasta can't run as root), we need to
> + * retain CAP_SETFCAP too.
> + */
> + if (!ns_is_init() && !geteuid())
> + keep |= BIT(CAP_SETFCAP);
> +
> + drop_caps_ep_except(keep);
> }
>
> /**
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread