// SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * isolation.c - Self isolation helpers * * Copyright Red Hat * Author: Stefano Brivio * Author: David Gibson */ /** * DOC: Theory of Operation * * For security the passt/pasta process performs a number of * self-isolations steps, dropping capabilities, setting namespaces * and otherwise minimizing the impact we can have on the system at * large if we were compromised. * * Obviously we can't isolate ourselves from resources before we've * done anything we need to do with those resources, so we have * multiple stages of self-isolation. In order these are: * * 1. isolate_initial() * ==================== * * Executed immediately after startup, drops capabilities we don't * need at any point during execution (or which we gain back when we * need by joining other namespaces). * * 2. isolate_user() * ================= * * Executed once we know what user and user namespace we want to * operate in. Sets our final UID & GID, and enters the correct user * namespace. * * 3. isolate_prefork() * ==================== * * Executed after all setup, but before daemonizing (fork()ing into * the background). Uses mount namespace and pivot_root() to remove * our access to the filesystem(). * * 4. isolate_postfork() * ===================== * * Executed immediately after daemonizing, but before entering the * actual packet forwarding phase of operation. Or, if not * daemonizing, immediately after isolate_prefork(). Uses seccomp() * to restrict ourselves to the handful of syscalls we need during * runtime operation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "seccomp.h" #include "passt.h" #include "isolation.h" #define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 #define CAP_WORDS _LINUX_CAPABILITY_U32S_3 /** * drop_caps_ep_except() - Drop capabilities from effective & permitted sets * @keep: Capabilities to keep */ static void drop_caps_ep_except(uint64_t keep) { struct __user_cap_header_struct hdr = { .version = CAP_VERSION, .pid = 0, }; struct __user_cap_data_struct data[CAP_WORDS]; int i; if (syscall(SYS_capget, &hdr, data)) { err("Couldn't get current capabilities: %s", strerror(errno)); exit(EXIT_FAILURE); } for (i = 0; i < CAP_WORDS; i++) { uint32_t mask = keep >> (32 * i); data[i].effective &= mask; data[i].permitted &= mask; } if (syscall(SYS_capset, &hdr, data)) { err("Couldn't drop capabilities: %s", strerror(errno)); exit(EXIT_FAILURE); } } /** * isolate_initial() - Early, config independent self isolation * * Should: * - drop unneeded capabilities * Musn't: * - remove filessytem access (we need to access files during setup) */ void isolate_initial(void) { /* We want to keep CAP_NET_BIND_SERVICE in the initial * namespace if we have it, so that we can forward low ports * into the guest/namespace */ drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE)); } /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty * * Should: * - set our final UID and GID * - enter our final user namespace * Mustn't: * - remove filesystem access (we need that for further setup) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { char uidmap[BUFSIZ]; char gidmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ if (errno != EPERM) { err("Can't drop supplementary groups: %s", strerror(errno)); exit(EXIT_FAILURE); } } if (setgid(gid) != 0) { err("Can't set GID to %u: %s", gid, strerror(errno)); exit(EXIT_FAILURE); } if (setuid(uid) != 0) { err("Can't set UID to %u: %s", uid, strerror(errno)); exit(EXIT_FAILURE); } /* If we're told not to use a userns, nothing more to do */ if (!use_userns) return; /* Otherwise, if given a userns, join it */ if (*userns) { int ufd; ufd = open(userns, O_RDONLY | O_CLOEXEC); if (ufd < 0) { err("Couldn't open user namespace %s: %s", userns, strerror(errno)); exit(EXIT_FAILURE); } if (setns(ufd, CLONE_NEWUSER) != 0) { err("Couldn't enter user namespace %s: %s", userns, strerror(errno)); exit(EXIT_FAILURE); } close(ufd); return; } /* Otherwise, create our own userns */ if (unshare(CLONE_NEWUSER) != 0) { err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } /* Configure user and group mappings */ snprintf(uidmap, BUFSIZ, "0 %u 1", uid); snprintf(gidmap, BUFSIZ, "0 %u 1", gid); if (write_file("/proc/self/uid_map", uidmap) || write_file("/proc/self/setgroups", "deny") || write_file("/proc/self/gid_map", gidmap)) { warn("Couldn't configure user namespace"); } } /** * isolate_prefork() - Self isolation before daemonizing * @c: Execution context * * Should: * - Moves us to our own IPC and UTS namespaces * - Moves us to a mount namespace with only an empty directory * - Drops unneeded capabilities (in the new user namespace) * Mustn't: * - Remove syscalls we need to daemonize * * Return: negative error code on failure, zero on success */ int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody * ever gets around seccomp profiles -- there's no harm in passing it. */ if (!c->foreground || c->mode == MODE_PASST) flags |= CLONE_NEWPID; if (unshare(flags)) { perror("unshare"); return -errno; } if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) { perror("mount /"); return -errno; } if (mount("", TMPDIR, "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, "nr_inodes=2,nr_blocks=0")) { perror("mount tmpfs"); return -errno; } if (chdir(TMPDIR)) { perror("chdir"); return -errno; } if (syscall(SYS_pivot_root, ".", ".")) { perror("pivot_root"); return -errno; } if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) { perror("umount2"); return -errno; } /* Drop capabilites in our new userns */ if (c->mode == MODE_PASTA) { /* Keep CAP_SYS_ADMIN, so that we can setns() to the * netns when we need to act upon it */ ns_caps |= 1UL << CAP_SYS_ADMIN; /* Keep CAP_NET_BIND_SERVICE, so we can splice * outbound connections to low port numbers */ ns_caps |= 1UL << CAP_NET_BIND_SERVICE; } drop_caps_ep_except(ns_caps); return 0; } /** * isolate_postfork() - Self isolation after daemonizing * @c: Execution context * * Should: * - disable core dumps * - limit to a minimal set of syscalls */ void isolate_postfork(const struct ctx *c) { struct sock_fprog prog; prctl(PR_SET_DUMPABLE, 0); if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; } else { prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); prog.filter = filter_pasta; } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { perror("prctl"); exit(EXIT_FAILURE); } }