public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
blob 2468f8452587b6cbed0f8d6281a790a55dd96e71 8120 bytes (raw)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
 
// SPDX-License-Identifier: AGPL-3.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * isolation.c - Self isolation helpers
 *
 * Copyright Red Hat
 * Author: Stefano Brivio <sbrivio@redhat.com>
 * Author: David Gibson <david@gibson.dropbear.id.au>
 */
/**
 * DOC: Theory of Operation
 *
 * For security the passt/pasta process performs a number of
 * self-isolations steps, dropping capabilities, setting namespaces
 * and otherwise minimizing the impact we can have on the system at
 * large if we were compromised.
 *
 * Obviously we can't isolate ourselves from resources before we've
 * done anything we need to do with those resources, so we have
 * multiple stages of self-isolation.  In order these are:
 *
 * 1. isolate_initial()
 * ====================
 *
 * Executed immediately after startup, drops capabilities we don't
 * need at any point during execution (or which we gain back when we
 * need by joining other namespaces).
 *
 * 2. isolate_user()
 * =================
 *
 * Executed once we know what user and user namespace we want to
 * operate in.  Sets our final UID & GID, and enters the correct user
 * namespace.
 *
 * 3. isolate_prefork()
 * ====================
 *
 * Executed after all setup, but before daemonizing (fork()ing into
 * the background).  Uses mount namespace and pivot_root() to remove
 * our access to the filesystem().
 *
 * 4. isolate_postfork()
 * =====================
 *
 * Executed immediately after daemonizing, but before entering the
 * actual packet forwarding phase of operation.  Or, if not
 * daemonizing, immediately after isolate_prefork().  Uses seccomp()
 * to restrict ourselves to the handful of syscalls we need during
 * runtime operation.
 */

#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <inttypes.h>
#include <limits.h>
#include <pwd.h>
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#include "util.h"
#include "seccomp.h"
#include "passt.h"
#include "isolation.h"

#define CAP_VERSION	_LINUX_CAPABILITY_VERSION_3
#define CAP_WORDS	_LINUX_CAPABILITY_U32S_3

/**
 * drop_caps_ep_except() - Drop capabilities from effective & permitted sets
 * @keep:	Capabilities to keep
 */
static void drop_caps_ep_except(uint64_t keep)
{
	struct __user_cap_header_struct hdr = {
		.version = CAP_VERSION,
		.pid = 0,
	};
	struct __user_cap_data_struct data[CAP_WORDS];
	int i;

	if (syscall(SYS_capget, &hdr, data)) {
		err("Couldn't get current capabilities: %s", strerror(errno));
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < CAP_WORDS; i++) {
		uint32_t mask = keep >> (32 * i);

		data[i].effective &= mask;
		data[i].permitted &= mask;
	}

	if (syscall(SYS_capset, &hdr, data)) {
		err("Couldn't drop capabilities: %s", strerror(errno));
		exit(EXIT_FAILURE);
	}
}

/**
 * isolate_initial() - Early, config independent self isolation
 *
 * Should:
 *  - drop unneeded capabilities
 * Musn't:
 *  - remove filessytem access (we need to access files during setup)
 */
void isolate_initial(void)
{
	/* We want to keep CAP_NET_BIND_SERVICE in the initial
	 * namespace if we have it, so that we can forward low ports
	 * into the guest/namespace
	 */
	drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE));
}

/**
 * isolate_user() - Switch to final UID/GID and move into userns
 * @uid:	User ID to run as (in original userns)
 * @gid:	Group ID to run as (in original userns)
 * @use_userns:	Whether to join or create a userns
 * @userns:	userns path to enter, may be empty
 *
 * Should:
 *  - set our final UID and GID
 *  - enter our final user namespace
 * Mustn't:
 *  - remove filesystem access (we need that for further setup)
 */
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns)
{
	char uidmap[BUFSIZ];
	char gidmap[BUFSIZ];

	/* First set our UID & GID in the original namespace */
	if (setgroups(0, NULL)) {
		/* If we don't have CAP_SETGID, this will EPERM */
		if (errno != EPERM) {
			err("Can't drop supplementary groups: %s",
			    strerror(errno));
			exit(EXIT_FAILURE);
		}
	}

	if (setgid(gid) != 0) {
		err("Can't set GID to %u: %s", gid, strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (setuid(uid) != 0) {
		err("Can't set UID to %u: %s", uid, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* If we're told not to use a userns, nothing more to do */
	if (!use_userns)
		return;

	/* Otherwise, if given a userns, join it */
	if (*userns) {
		int ufd;

		ufd = open(userns, O_RDONLY | O_CLOEXEC);
		if (ufd < 0) {
			err("Couldn't open user namespace %s: %s",
			    userns, strerror(errno));
			exit(EXIT_FAILURE);
		}

		if (setns(ufd, CLONE_NEWUSER) != 0) {
			err("Couldn't enter user namespace %s: %s",
			    userns, strerror(errno));
			exit(EXIT_FAILURE);
		}

		close(ufd);

		return;
	}

	/* Otherwise, create our own userns */
	if (unshare(CLONE_NEWUSER) != 0) {
		err("Couldn't create user namespace: %s", strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* Configure user and group mappings */
	snprintf(uidmap, BUFSIZ, "0 %u 1", uid);
	snprintf(gidmap, BUFSIZ, "0 %u 1", gid);

	if (write_file("/proc/self/uid_map", uidmap) ||
	    write_file("/proc/self/setgroups", "deny") ||
	    write_file("/proc/self/gid_map", gidmap)) {
		warn("Couldn't configure user namespace");
	}
}

/**
 * isolate_prefork() - Self isolation before daemonizing
 * @c:		Execution context
 *
 * Should:
 *  - Moves us to our own IPC and UTS namespaces
 *  - Moves us to a mount namespace with only an empty directory
 *  - Drops unneeded capabilities (in the new user namespace)
 * Mustn't:
 *  - Remove syscalls we need to daemonize
 *
 * Return: negative error code on failure, zero on success
 */
int isolate_prefork(struct ctx *c)
{
	int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS;
	uint64_t ns_caps = 0;

	/* If we run in foreground, we have no chance to actually move to a new
	 * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
	 * ever gets around seccomp profiles -- there's no harm in passing it.
	 */
	if (!c->foreground || c->mode == MODE_PASST)
		flags |= CLONE_NEWPID;

	if (unshare(flags)) {
		perror("unshare");
		return -errno;
	}

	if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
		perror("mount /");
		return -errno;
	}

	if (mount("", TMPDIR, "tmpfs",
		  MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
		  "nr_inodes=2,nr_blocks=0")) {
		perror("mount tmpfs");
		return -errno;
	}

	if (chdir(TMPDIR)) {
		perror("chdir");
		return -errno;
	}

	if (syscall(SYS_pivot_root, ".", ".")) {
		perror("pivot_root");
		return -errno;
	}

	if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
		perror("umount2");
		return -errno;
	}

	/* Drop capabilites in our new userns */
	if (c->mode == MODE_PASTA) {
		/* Keep CAP_SYS_ADMIN, so that we can setns() to the
		 * netns when we need to act upon it
		 */
		ns_caps |= 1UL << CAP_SYS_ADMIN;
		/* Keep CAP_NET_BIND_SERVICE, so we can splice
		 * outbound connections to low port numbers
		 */
		ns_caps |= 1UL << CAP_NET_BIND_SERVICE;
	}

	drop_caps_ep_except(ns_caps);

	return 0;
}

/**
 * isolate_postfork() - Self isolation after daemonizing
 * @c:		Execution context
 *
 * Should:
 *  - disable core dumps
 *  - limit to a minimal set of syscalls
 */
void isolate_postfork(const struct ctx *c)
{
	struct sock_fprog prog;

	prctl(PR_SET_DUMPABLE, 0);

	if (c->mode == MODE_PASST) {
		prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
		prog.filter = filter_passt;
	} else {
		prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
		prog.filter = filter_pasta;
	}

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
		perror("prctl");
		exit(EXIT_FAILURE);
	}
}

debug log:

solving 2468f84 ...
found 2468f84 in https://archives.passt.top/passt-dev/20221011054018.1449506-8-david@gibson.dropbear.id.au/
found 4aa75e6 in https://archives.passt.top/passt-dev/20221011054018.1449506-7-david@gibson.dropbear.id.au/
found 10cef05 in https://archives.passt.top/passt-dev/20221011054018.1449506-6-david@gibson.dropbear.id.au/
found 124dea4 in https://passt.top/passt
preparing index
index prepared:
100644 124dea44c1895995f25b91d1982ce994c79b6a24	isolation.c

applying [1/3] https://archives.passt.top/passt-dev/20221011054018.1449506-6-david@gibson.dropbear.id.au/
diff --git a/isolation.c b/isolation.c
index 124dea4..10cef05 100644


applying [2/3] https://archives.passt.top/passt-dev/20221011054018.1449506-7-david@gibson.dropbear.id.au/
diff --git a/isolation.c b/isolation.c
index 10cef05..4aa75e6 100644


applying [3/3] https://archives.passt.top/passt-dev/20221011054018.1449506-8-david@gibson.dropbear.id.au/
diff --git a/isolation.c b/isolation.c
index 4aa75e6..2468f84 100644

Checking patch isolation.c...
Applied patch isolation.c cleanly.
Checking patch isolation.c...
Applied patch isolation.c cleanly.
Checking patch isolation.c...
Applied patch isolation.c cleanly.

index at:
100644 2468f8452587b6cbed0f8d6281a790a55dd96e71	isolation.c

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).