libminijail.c revision 3c84df1c18b410cb33da3c9df010b59f960785a9
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _GNU_SOURCE
8
9#include <asm/unistd.h>
10#include <ctype.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <grp.h>
14#include <inttypes.h>
15#include <limits.h>
16#include <linux/capability.h>
17#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
20#include <stdarg.h>
21#include <stddef.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <syscall.h>
26#include <sys/capability.h>
27#include <sys/mount.h>
28#include <sys/param.h>
29#include <sys/prctl.h>
30#include <sys/stat.h>
31#include <sys/types.h>
32#include <sys/user.h>
33#include <sys/wait.h>
34#include <unistd.h>
35
36#include "libminijail.h"
37#include "libminijail-private.h"
38
39#include "signal.h"
40#include "syscall_filter.h"
41#include "util.h"
42
43#ifdef HAVE_SECUREBITS_H
44#include <linux/securebits.h>
45#else
46#define SECURE_ALL_BITS         0x15
47#define SECURE_ALL_LOCKS        (SECURE_ALL_BITS << 1)
48#endif
49
50/* Until these are reliably available in linux/prctl.h */
51#ifndef PR_SET_SECCOMP
52# define PR_SET_SECCOMP 22
53#endif
54
55/* For seccomp_filter using BPF. */
56#ifndef PR_SET_NO_NEW_PRIVS
57# define PR_SET_NO_NEW_PRIVS 38
58#endif
59#ifndef SECCOMP_MODE_FILTER
60# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
61#endif
62
63#ifdef USE_SECCOMP_SOFTFAIL
64# define SECCOMP_SOFTFAIL 1
65#else
66# define SECCOMP_SOFTFAIL 0
67#endif
68
/* One requested bind mount; entries form a singly linked list. */
struct binding {
	char *src;		/* source path handed to mount(2) */
	char *dest;		/* absolute path; prefixed with the chroot dir */
	int writeable;		/* zero means the bind is remounted read-only */
	struct binding *next;	/* following entry, NULL at the tail */
};
75
/*
 * In-memory description of a jail. The struct itself is copied verbatim
 * by the marshalling code, followed by the data its pointers refer to.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned on purpose: a signed 1-bit bit-field can
	 * only hold 0 and -1, so storing 1 in it would read back as -1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int seccomp:1;
		unsigned int readonly:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int mount_tmp:1;
	} flags;
	uid_t uid;			/* target uid (flags.uid) */
	gid_t gid;			/* target gid (flags.gid) */
	gid_t usergid;			/* primary gid of |user|, for initgroups() */
	char *user;			/* heap copy of the user name, or NULL */
	uint64_t caps;			/* capability mask to keep (flags.caps) */
	pid_t initpid;			/* pid of the child created when running */
	int mountns_fd;			/* fd passed to setns() (flags.enter_vfs) */
	int filter_len;			/* instruction count of |filter_prog| */
	int binding_count;		/* number of entries in the bindings list */
	char *chrootdir;		/* heap copy of the chroot path, or NULL */
	struct sock_fprog *filter_prog;	/* compiled seccomp filter, or NULL */
	struct binding *bindings_head;	/* first bind mount, or NULL */
	struct binding *bindings_tail;	/* last bind mount, or NULL */
};
113
114/*
115 * Strip out flags meant for the parent.
116 * We keep things that are not inherited across execve(2) (e.g. capabilities),
117 * or are easier to set after execve(2) (e.g. seccomp filters).
118 */
119void minijail_preenter(struct minijail *j)
120{
121	j->flags.vfs = 0;
122	j->flags.enter_vfs = 0;
123	j->flags.readonly = 0;
124	j->flags.pids = 0;
125}
126
127/*
128 * Strip out flags meant for the child.
129 * We keep things that are inherited across execve(2).
130 */
131void minijail_preexec(struct minijail *j)
132{
133	int vfs = j->flags.vfs;
134	int enter_vfs = j->flags.enter_vfs;
135	int readonly = j->flags.readonly;
136	if (j->user)
137		free(j->user);
138	j->user = NULL;
139	memset(&j->flags, 0, sizeof(j->flags));
140	/* Now restore anything we meant to keep. */
141	j->flags.vfs = vfs;
142	j->flags.enter_vfs = enter_vfs;
143	j->flags.readonly = readonly;
144	/* Note, |pids| will already have been used before this call. */
145}
146
147/* Minijail API. */
148
149struct minijail API *minijail_new(void)
150{
151	return calloc(1, sizeof(struct minijail));
152}
153
154void API minijail_change_uid(struct minijail *j, uid_t uid)
155{
156	if (uid == 0)
157		die("useless change to uid 0");
158	j->uid = uid;
159	j->flags.uid = 1;
160}
161
162void API minijail_change_gid(struct minijail *j, gid_t gid)
163{
164	if (gid == 0)
165		die("useless change to gid 0");
166	j->gid = gid;
167	j->flags.gid = 1;
168}
169
170int API minijail_change_user(struct minijail *j, const char *user)
171{
172	char *buf = NULL;
173	struct passwd pw;
174	struct passwd *ppw = NULL;
175	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
176	if (sz == -1)
177		sz = 65536;	/* your guess is as good as mine... */
178
179	/*
180	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
181	 * the maximum needed size of the buffer, so we don't have to search.
182	 */
183	buf = malloc(sz);
184	if (!buf)
185		return -ENOMEM;
186	getpwnam_r(user, &pw, buf, sz, &ppw);
187	/*
188	 * We're safe to free the buffer here. The strings inside pw point
189	 * inside buf, but we don't use any of them; this leaves the pointers
190	 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
191	 */
192	free(buf);
193	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
194	if (!ppw)
195		return -1;
196	minijail_change_uid(j, ppw->pw_uid);
197	j->user = strdup(user);
198	if (!j->user)
199		return -ENOMEM;
200	j->usergid = ppw->pw_gid;
201	return 0;
202}
203
204int API minijail_change_group(struct minijail *j, const char *group)
205{
206	char *buf = NULL;
207	struct group gr;
208	struct group *pgr = NULL;
209	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
210	if (sz == -1)
211		sz = 65536;	/* and mine is as good as yours, really */
212
213	/*
214	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
215	 * the maximum needed size of the buffer, so we don't have to search.
216	 */
217	buf = malloc(sz);
218	if (!buf)
219		return -ENOMEM;
220	getgrnam_r(group, &gr, buf, sz, &pgr);
221	/*
222	 * We're safe to free the buffer here. The strings inside gr point
223	 * inside buf, but we don't use any of them; this leaves the pointers
224	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
225	 */
226	free(buf);
227	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
228	if (!pgr)
229		return -1;
230	minijail_change_gid(j, pgr->gr_gid);
231	return 0;
232}
233
234void API minijail_use_seccomp(struct minijail *j)
235{
236	j->flags.seccomp = 1;
237}
238
239void API minijail_no_new_privs(struct minijail *j)
240{
241	j->flags.no_new_privs = 1;
242}
243
244void API minijail_use_seccomp_filter(struct minijail *j)
245{
246	j->flags.seccomp_filter = 1;
247}
248
249void API minijail_log_seccomp_filter_failures(struct minijail *j)
250{
251	j->flags.log_seccomp_filter = 1;
252}
253
254void API minijail_use_caps(struct minijail *j, uint64_t capmask)
255{
256	j->caps = capmask;
257	j->flags.caps = 1;
258}
259
260void API minijail_namespace_vfs(struct minijail *j)
261{
262	j->flags.vfs = 1;
263}
264
265void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
266{
267	int ns_fd = open(ns_path, O_RDONLY);
268	if (ns_fd < 0) {
269		pdie("failed to open namespace '%s'", ns_path);
270	}
271	j->mountns_fd = ns_fd;
272	j->flags.enter_vfs = 1;
273}
274
275void API minijail_namespace_pids(struct minijail *j)
276{
277	j->flags.vfs = 1;
278	j->flags.readonly = 1;
279	j->flags.pids = 1;
280}
281
282void API minijail_namespace_net(struct minijail *j)
283{
284	j->flags.net = 1;
285}
286
287void API minijail_remount_readonly(struct minijail *j)
288{
289	j->flags.vfs = 1;
290	j->flags.readonly = 1;
291}
292
293void API minijail_inherit_usergroups(struct minijail *j)
294{
295	j->flags.usergroups = 1;
296}
297
298void API minijail_disable_ptrace(struct minijail *j)
299{
300	j->flags.ptrace = 1;
301}
302
303int API minijail_enter_chroot(struct minijail *j, const char *dir)
304{
305	if (j->chrootdir)
306		return -EINVAL;
307	j->chrootdir = strdup(dir);
308	if (!j->chrootdir)
309		return -ENOMEM;
310	j->flags.chroot = 1;
311	return 0;
312}
313
314void API minijail_mount_tmp(struct minijail *j)
315{
316	j->flags.mount_tmp = 1;
317}
318
319int API minijail_bind(struct minijail *j, const char *src, const char *dest,
320		      int writeable)
321{
322	struct binding *b;
323
324	if (*dest != '/')
325		return -EINVAL;
326	b = calloc(1, sizeof(*b));
327	if (!b)
328		return -ENOMEM;
329	b->dest = strdup(dest);
330	if (!b->dest)
331		goto error;
332	b->src = strdup(src);
333	if (!b->src)
334		goto error;
335	b->writeable = writeable;
336
337	info("bind %s -> %s", src, dest);
338
339	/*
340	 * Force vfs namespacing so the bind mounts don't leak out into the
341	 * containing vfs namespace.
342	 */
343	minijail_namespace_vfs(j);
344
345	if (j->bindings_tail)
346		j->bindings_tail->next = b;
347	else
348		j->bindings_head = b;
349	j->bindings_tail = b;
350	j->binding_count++;
351
352	return 0;
353
354error:
355	free(b->src);
356	free(b->dest);
357	free(b);
358	return -ENOMEM;
359}
360
361void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
362{
363	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
364		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
365			warn("not loading seccomp filter, seccomp not supported");
366			return;
367		}
368	}
369	FILE *file = fopen(path, "r");
370	if (!file) {
371		pdie("failed to open seccomp filter file '%s'", path);
372	}
373
374	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
375	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
376		die("failed to compile seccomp filter BPF program in '%s'",
377		    path);
378	}
379
380	j->filter_len = fprog->len;
381	j->filter_prog = fprog;
382
383	fclose(file);
384}
385
/* Cursor used while serializing a minijail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
391
392void marshal_state_init(struct marshal_state *state,
393			char *buf, size_t available)
394{
395	state->available = available;
396	state->buf = buf;
397	state->total = 0;
398}
399
400void marshal_append(struct marshal_state *state,
401		    char *src, size_t length)
402{
403	size_t copy_len = MIN(state->available, length);
404
405	/* Up to |available| will be written. */
406	if (copy_len) {
407		memcpy(state->buf, src, copy_len);
408		state->buf += copy_len;
409		state->available -= copy_len;
410	}
411	/* |total| will contain the expected length. */
412	state->total += length;
413}
414
415void minijail_marshal_helper(struct marshal_state *state,
416			     const struct minijail *j)
417{
418	struct binding *b = NULL;
419	marshal_append(state, (char *)j, sizeof(*j));
420	if (j->user)
421		marshal_append(state, j->user, strlen(j->user) + 1);
422	if (j->chrootdir)
423		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
424	if (j->flags.seccomp_filter && j->filter_prog) {
425		struct sock_fprog *fp = j->filter_prog;
426		marshal_append(state, (char *)fp->filter,
427				fp->len * sizeof(struct sock_filter));
428	}
429	for (b = j->bindings_head; b; b = b->next) {
430		marshal_append(state, b->src, strlen(b->src) + 1);
431		marshal_append(state, b->dest, strlen(b->dest) + 1);
432		marshal_append(state, (char *)&b->writeable,
433				sizeof(b->writeable));
434	}
435}
436
437size_t API minijail_size(const struct minijail *j)
438{
439	struct marshal_state state;
440	marshal_state_init(&state, NULL, 0);
441	minijail_marshal_helper(&state, j);
442	return state.total;
443}
444
445int minijail_marshal(const struct minijail *j, char *buf, size_t available)
446{
447	struct marshal_state state;
448	marshal_state_init(&state, buf, available);
449	minijail_marshal_helper(&state, j);
450	return (state.total > available);
451}
452
/* consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       In/out: current position, advanced past the bytes on success
 * @buflength In/out: bytes remaining at @buf, reduced on success
 *
 * Returns a pointer to the first consumed byte, or NULL (leaving both
 * in/out values untouched) when fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start = *buf;

	if (*buflength < length)
		return NULL;

	*buflength -= length;
	*buf = start + length;
	return start;
}
469
/* consumestr: takes a NUL-terminated string off the front of a buffer
 * @buf       In/out: current position, advanced past the terminator
 * @buflength In/out: bytes remaining at @buf
 *
 * Returns a pointer to the start of the string, or NULL when no
 * terminator is found within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t remaining = *buflength;
	const size_t str_len = strnlen(*buf, remaining);

	/* A full-length scan means the terminator is missing. */
	if (str_len == remaining)
		return NULL;

	/* Consume the characters plus their terminator. */
	return consumebytes(str_len + 1, buf, buflength);
}
484
485int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
486{
487	int i;
488	int count;
489	int ret = -EINVAL;
490
491	if (length < sizeof(*j))
492		goto out;
493	memcpy((void *)j, serialized, sizeof(*j));
494	serialized += sizeof(*j);
495	length -= sizeof(*j);
496
497	/* Potentially stale pointers not used as signals. */
498	j->bindings_head = NULL;
499	j->bindings_tail = NULL;
500	j->filter_prog = NULL;
501
502	if (j->user) {		/* stale pointer */
503		char *user = consumestr(&serialized, &length);
504		if (!user)
505			goto clear_pointers;
506		j->user = strdup(user);
507		if (!j->user)
508			goto clear_pointers;
509	}
510
511	if (j->chrootdir) {	/* stale pointer */
512		char *chrootdir = consumestr(&serialized, &length);
513		if (!chrootdir)
514			goto bad_chrootdir;
515		j->chrootdir = strdup(chrootdir);
516		if (!j->chrootdir)
517			goto bad_chrootdir;
518	}
519
520	if (j->flags.seccomp_filter && j->filter_len > 0) {
521		size_t ninstrs = j->filter_len;
522		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
523		    ninstrs > USHRT_MAX)
524			goto bad_filters;
525
526		size_t program_len = ninstrs * sizeof(struct sock_filter);
527		void *program = consumebytes(program_len, &serialized, &length);
528		if (!program)
529			goto bad_filters;
530
531		j->filter_prog = malloc(sizeof(struct sock_fprog));
532		j->filter_prog->len = ninstrs;
533		j->filter_prog->filter = malloc(program_len);
534		memcpy(j->filter_prog->filter, program, program_len);
535	}
536
537	count = j->binding_count;
538	j->binding_count = 0;
539	for (i = 0; i < count; ++i) {
540		int *writeable;
541		const char *dest;
542		const char *src = consumestr(&serialized, &length);
543		if (!src)
544			goto bad_bindings;
545		dest = consumestr(&serialized, &length);
546		if (!dest)
547			goto bad_bindings;
548		writeable = consumebytes(sizeof(*writeable), &serialized, &length);
549		if (!writeable)
550			goto bad_bindings;
551		if (minijail_bind(j, src, dest, *writeable))
552			goto bad_bindings;
553	}
554
555	return 0;
556
557bad_bindings:
558	if (j->flags.seccomp_filter && j->filter_len > 0) {
559		free(j->filter_prog->filter);
560		free(j->filter_prog);
561	}
562bad_filters:
563	if (j->chrootdir)
564		free(j->chrootdir);
565bad_chrootdir:
566	if (j->user)
567		free(j->user);
568clear_pointers:
569	j->user = NULL;
570	j->chrootdir = NULL;
571out:
572	return ret;
573}
574
575/* bind_one: Applies bindings from @b for @j, recursing as needed.
576 * @j Minijail these bindings are for
577 * @b Head of list of bindings
578 *
579 * Returns 0 for success.
580 */
581int bind_one(const struct minijail *j, struct binding *b)
582{
583	int ret = 0;
584	char *dest = NULL;
585	if (ret)
586		return ret;
587	/* dest has a leading "/" */
588	if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
589		return -ENOMEM;
590	ret = mount(b->src, dest, NULL, MS_BIND, NULL);
591	if (ret)
592		pdie("bind: %s -> %s", b->src, dest);
593	if (!b->writeable) {
594		ret = mount(b->src, dest, NULL,
595			    MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
596		if (ret)
597			pdie("bind ro: %s -> %s", b->src, dest);
598	}
599	free(dest);
600	if (b->next)
601		return bind_one(j, b->next);
602	return ret;
603}
604
605int enter_chroot(const struct minijail *j)
606{
607	int ret;
608	if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
609		return ret;
610
611	if (chroot(j->chrootdir))
612		return -errno;
613
614	if (chdir("/"))
615		return -errno;
616
617	return 0;
618}
619
/* Mounts a tmpfs on /tmp; returns the mount(2) result unchanged. */
int mount_tmp(void)
{
	static const char tmpfs_opts[] = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_opts);
}
624
/*
 * remount_readonly: replaces the /proc mount with a fresh read-only one.
 * Returns 0 on success or -errno from umount(2)/mount(2).
 */
int remount_readonly(void)
{
	static const char proc_path[] = "/proc";
	const unsigned int ro_flags =
	    MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

	/*
	 * We still hold a reference to the parent's old /proc mount in our
	 * namespace, so MS_REMOUNT would mutate the parent's mount too, even
	 * though we're in a VFS namespace (!). Drop their mount from our
	 * namespace and create our own instead.
	 */
	if (umount(proc_path) != 0)
		return -errno;
	if (mount("", proc_path, "proc", ro_flags, "") != 0)
		return -errno;
	return 0;
}
642
643void drop_ugid(const struct minijail *j)
644{
645	if (j->flags.usergroups) {
646		if (initgroups(j->user, j->usergid))
647			pdie("initgroups");
648	} else {
649		/* Only attempt to clear supplemental groups if we are changing
650		 * users. */
651		if ((j->uid || j->gid) && setgroups(0, NULL))
652			pdie("setgroups");
653	}
654
655	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
656		pdie("setresgid");
657
658	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
659		pdie("setresuid");
660}
661
/*
 * run_cap_valid: reports whether @cap exists on the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says).  If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).  So suck up the answer via /proc.
 *
 * The value read is cached in a static for subsequent calls. Dies when
 * the /proc file cannot be opened or parsed.
 *
 * Returns non-zero when @cap is at or below the kernel's last capability.
 */
static int run_cap_valid(unsigned int cap)
{
	static unsigned int last_cap;

	if (!last_cap) {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* fscanf()/fclose() must never see a NULL stream. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}

	return cap <= last_cap;
}
683
684void drop_caps(const struct minijail *j)
685{
686	cap_t caps = cap_get_proc();
687	cap_value_t flag[1];
688	const uint64_t one = 1;
689	unsigned int i;
690	if (!caps)
691		die("can't get process caps");
692	if (cap_clear_flag(caps, CAP_INHERITABLE))
693		die("can't clear inheritable caps");
694	if (cap_clear_flag(caps, CAP_EFFECTIVE))
695		die("can't clear effective caps");
696	if (cap_clear_flag(caps, CAP_PERMITTED))
697		die("can't clear permitted caps");
698	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
699		/* Keep CAP_SETPCAP for dropping bounding set bits. */
700		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
701			continue;
702		flag[0] = i;
703		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
704			die("can't add effective cap");
705		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
706			die("can't add permitted cap");
707		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
708			die("can't add inheritable cap");
709	}
710	if (cap_set_proc(caps))
711		die("can't apply initial cleaned capset");
712
713	/*
714	 * Instead of dropping bounding set first, do it here in case
715	 * the caller had a more permissive bounding set which could
716	 * have been used above to raise a capability that wasn't already
717	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
718	 */
719	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
720		if (j->caps & (one << i))
721			continue;
722		if (prctl(PR_CAPBSET_DROP, i))
723			pdie("prctl(PR_CAPBSET_DROP)");
724	}
725
726	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
727	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
728		flag[0] = CAP_SETPCAP;
729		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
730			die("can't clear effective cap");
731		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
732			die("can't clear permitted cap");
733		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
734			die("can't clear inheritable cap");
735	}
736
737	if (cap_set_proc(caps))
738		die("can't apply final cleaned capset");
739
740	cap_free(caps);
741}
742
743void set_seccomp_filter(const struct minijail *j)
744{
745	/*
746	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
747	 * in the kernel source tree for an explanation of the parameters.
748	 */
749	if (j->flags.no_new_privs) {
750		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
751			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
752	}
753
754	/*
755	 * If we're logging seccomp filter failures,
756	 * install the SIGSYS handler first.
757	 */
758	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
759		if (install_sigsys_handler())
760			pdie("install SIGSYS handler");
761		warn("logging seccomp filter failures");
762	}
763
764	/*
765	 * Install the syscall filter.
766	 */
767	if (j->flags.seccomp_filter) {
768		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
769			if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
770				warn("seccomp not supported");
771				return;
772			}
773			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
774		}
775	}
776}
777
778void API minijail_enter(const struct minijail *j)
779{
780	if (j->flags.pids)
781		die("tried to enter a pid-namespaced jail;"
782		    " try minijail_run()?");
783
784	if (j->flags.usergroups && !j->user)
785		die("usergroup inheritance without username");
786
787	/*
788	 * We can't recover from failures if we've dropped privileges partially,
789	 * so we don't even try. If any of our operations fail, we abort() the
790	 * entire process.
791	 */
792	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
793		pdie("setns(CLONE_NEWNS)");
794
795	if (j->flags.vfs && unshare(CLONE_NEWNS))
796		pdie("unshare(vfs)");
797
798	if (j->flags.net && unshare(CLONE_NEWNET))
799		pdie("unshare(net)");
800
801	if (j->flags.chroot && enter_chroot(j))
802		pdie("chroot");
803
804	if (j->flags.mount_tmp && mount_tmp())
805		pdie("mount_tmp");
806
807	if (j->flags.readonly && remount_readonly())
808		pdie("remount");
809
810	if (j->flags.caps) {
811		/*
812		 * POSIX capabilities are a bit tricky. If we drop our
813		 * capability to change uids, our attempt to use setuid()
814		 * below will fail. Hang on to root caps across setuid(), then
815		 * lock securebits.
816		 */
817		if (prctl(PR_SET_KEEPCAPS, 1))
818			pdie("prctl(PR_SET_KEEPCAPS)");
819		if (prctl
820		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
821			pdie("prctl(PR_SET_SECUREBITS)");
822	}
823
824	/*
825	 * If we're setting no_new_privs, we can drop privileges
826	 * before setting seccomp filter. This way filter policies
827	 * don't need to allow privilege-dropping syscalls.
828	 */
829	if (j->flags.no_new_privs) {
830		drop_ugid(j);
831		if (j->flags.caps)
832			drop_caps(j);
833
834		set_seccomp_filter(j);
835	} else {
836		/*
837		 * If we're not setting no_new_privs,
838		 * we need to set seccomp filter *before* dropping privileges.
839		 * WARNING: this means that filter policies *must* allow
840		 * setgroups()/setresgid()/setresuid() for dropping root and
841		 * capget()/capset()/prctl() for dropping caps.
842		 */
843		set_seccomp_filter(j);
844
845		drop_ugid(j);
846		if (j->flags.caps)
847			drop_caps(j);
848	}
849
850	/*
851	 * seccomp has to come last since it cuts off all the other
852	 * privilege-dropping syscalls :)
853	 */
854	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
855		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
856			warn("seccomp not supported");
857			return;
858		}
859		pdie("prctl(PR_SET_SECCOMP)");
860	}
861}
862
/* TODO(wad) will visibility affect this variable? */
/* Wait status recorded by init() for the root pid; zero until then. */
static int init_exitstatus;

/* SIGTERM handler installed by init(): exits with the recorded status. */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
870
871int init(pid_t rootpid)
872{
873	pid_t pid;
874	int status;
875	/* so that we exit with the right status */
876	signal(SIGTERM, init_term);
877	/* TODO(wad) self jail with seccomp_filters here. */
878	while ((pid = wait(&status)) > 0) {
879		/*
880		 * This loop will only end when either there are no processes
881		 * left inside our pid namespace or we get a signal.
882		 */
883		if (pid == rootpid)
884			init_exitstatus = status;
885	}
886	if (!WIFEXITED(init_exitstatus))
887		_exit(MINIJAIL_ERR_INIT);
888	_exit(WEXITSTATUS(init_exitstatus));
889}
890
891int API minijail_from_fd(int fd, struct minijail *j)
892{
893	size_t sz = 0;
894	size_t bytes = read(fd, &sz, sizeof(sz));
895	char *buf;
896	int r;
897	if (sizeof(sz) != bytes)
898		return -EINVAL;
899	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
900		return -E2BIG;
901	buf = malloc(sz);
902	if (!buf)
903		return -ENOMEM;
904	bytes = read(fd, buf, sz);
905	if (bytes != sz) {
906		free(buf);
907		return -EINVAL;
908	}
909	r = minijail_unmarshal(j, buf, sz);
910	free(buf);
911	return r;
912}
913
914int API minijail_to_fd(struct minijail *j, int fd)
915{
916	char *buf;
917	size_t sz = minijail_size(j);
918	ssize_t written;
919	int r;
920
921	if (!sz)
922		return -EINVAL;
923	buf = malloc(sz);
924	r = minijail_marshal(j, buf, sz);
925	if (r) {
926		free(buf);
927		return r;
928	}
929	/* Sends [size][minijail]. */
930	written = write(fd, &sz, sizeof(sz));
931	if (written != sizeof(sz)) {
932		free(buf);
933		return -EFAULT;
934	}
935	written = write(fd, buf, sz);
936	if (written < 0 || (size_t) written != sz) {
937		free(buf);
938		return -EFAULT;
939	}
940	free(buf);
941	return 0;
942}
943
944int setup_preload(void)
945{
946	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
947	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
948	if (!newenv)
949		return -ENOMEM;
950
951	/* Only insert a separating space if we have something to separate... */
952	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
953		PRELOADPATH);
954
955	/* setenv() makes a copy of the string we give it */
956	setenv(kLdPreloadEnvVar, newenv, 1);
957	free(newenv);
958	return 0;
959}
960
961int setup_pipe(int fds[2])
962{
963	int r = pipe(fds);
964	char fd_buf[11];
965	if (r)
966		return r;
967	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
968	if (r <= 0)
969		return -EINVAL;
970	setenv(kFdEnvVar, fd_buf, 1);
971	return 0;
972}
973
/*
 * setup_pipe_end: closes the end of @fds that is not @index and returns
 * the descriptor at @index. Returns -1 when @index is neither 0 nor 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index > 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);
	return fds[index];
}
982
/*
 * setup_and_dupe_pipe_end: closes the end of @fds that is not @index and
 * duplicates the end at @index onto @fd.
 * Returns the dup2(2) result, or -1 when @index is neither 0 nor 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index > 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);
	/* dup2(2) the kept end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
992
993int API minijail_run(struct minijail *j, const char *filename,
994		     char *const argv[])
995{
996	return minijail_run_pid_pipes(j, filename, argv,
997				      NULL, NULL, NULL, NULL);
998}
999
1000int API minijail_run_pid(struct minijail *j, const char *filename,
1001			 char *const argv[], pid_t *pchild_pid)
1002{
1003	return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
1004				      NULL, NULL, NULL);
1005}
1006
1007int API minijail_run_pipe(struct minijail *j, const char *filename,
1008			  char *const argv[], int *pstdin_fd)
1009{
1010	return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
1011				      NULL, NULL);
1012}
1013
1014int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
1015			      char *const argv[], pid_t *pchild_pid,
1016			      int *pstdin_fd)
1017{
1018	return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
1019				      NULL, NULL);
1020}
1021
1022int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1023			       char *const argv[], pid_t *pchild_pid,
1024			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1025{
1026	char *oldenv, *oldenv_copy = NULL;
1027	pid_t child_pid;
1028	int pipe_fds[2];
1029	int stdin_fds[2];
1030	int stdout_fds[2];
1031	int stderr_fds[2];
1032	int ret;
1033	/* We need to remember this across the minijail_preexec() call. */
1034	int pid_namespace = j->flags.pids;
1035
1036	oldenv = getenv(kLdPreloadEnvVar);
1037	if (oldenv) {
1038		oldenv_copy = strdup(oldenv);
1039		if (!oldenv_copy)
1040			return -ENOMEM;
1041	}
1042
1043	if (setup_preload())
1044		return -EFAULT;
1045
1046	/*
1047	 * Make the process group ID of this process equal to its PID, so that
1048	 * both the Minijail process and the jailed process can be killed
1049	 * together.
1050	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1051	 * the process is already a process group leader.
1052	 */
1053	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1054		if (errno != EPERM) {
1055			pdie("setpgid(0, 0)");
1056		}
1057	}
1058
1059	/*
1060	 * Before we fork(2) and execve(2) the child process, we need to open
1061	 * a pipe(2) to send the minijail configuration over.
1062	 */
1063	if (setup_pipe(pipe_fds))
1064		return -EFAULT;
1065
1066	/*
1067	 * If we want to write to the child process' standard input,
1068	 * create the pipe(2) now.
1069	 */
1070	if (pstdin_fd) {
1071		if (pipe(stdin_fds))
1072			return -EFAULT;
1073	}
1074
1075	/*
1076	 * If we want to read from the child process' standard output,
1077	 * create the pipe(2) now.
1078	 */
1079	if (pstdout_fd) {
1080		if (pipe(stdout_fds))
1081			return -EFAULT;
1082	}
1083
1084	/*
1085	 * If we want to read from the child process' standard error,
1086	 * create the pipe(2) now.
1087	 */
1088	if (pstderr_fd) {
1089		if (pipe(stderr_fds))
1090			return -EFAULT;
1091	}
1092
1093	/* Use sys_clone() if and only if we're creating a pid namespace.
1094	 *
1095	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1096	 *
1097	 * In multithreaded programs, there are a bunch of locks inside libc,
1098	 * some of which may be held by other threads at the time that we call
1099	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1100	 * ensure that we hold all of these locks before it calls clone()
1101	 * internally and drop them after clone() returns, but when we call
1102	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1103	 * child address space where some of libc's important locks are held by
1104	 * other threads (which did not get cloned, and hence will never release
1105	 * those locks). This is okay so long as we call exec() immediately
1106	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1107	 * take locks.
1108	 *
1109	 * Hence, only call sys_clone() if we need to, in order to get at pid
1110	 * namespacing. If we follow this path, the child's address space might
1111	 * have broken locks; you may only call functions that do not acquire
1112	 * any locks.
1113	 *
1114	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1115	 * previously detailed, so this function is highly likely to deadlock
1116	 * later on (see "deadlock here") if we're multithreaded.
1117	 *
1118	 * We might hack around this by having the clone()d child (init of the
1119	 * pid namespace) return directly, rather than leaving the clone()d
1120	 * process hanging around to be init for the new namespace (and having
1121	 * its fork()ed child return in turn), but that process would be crippled
1122	 * with its libc locks potentially broken. We might try fork()ing in the
1123	 * parent before we clone() to ensure that we own all the locks, but
1124	 * then we have to have the forked child hanging around consuming
1125	 * resources (and possibly having file descriptors / shared memory
1126	 * regions / etc attached). We'd need to keep the child around to avoid
1127	 * having its children get reparented to init.
1128	 *
1129	 * TODO(ellyjones): figure out if the "forked child hanging around"
1130	 * problem is fixable or not. It would be nice if we worked in this
1131	 * case.
1132	 */
1133	if (pid_namespace)
1134		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1135	else
1136		child_pid = fork();
1137
1138	if (child_pid < 0) {
1139		free(oldenv_copy);
1140		die("failed to fork child");
1141	}
1142
1143	if (child_pid) {
1144		/* Restore parent's LD_PRELOAD. */
1145		if (oldenv_copy) {
1146			setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1147			free(oldenv_copy);
1148		} else {
1149			unsetenv(kLdPreloadEnvVar);
1150		}
1151		unsetenv(kFdEnvVar);
1152
1153		j->initpid = child_pid;
1154
1155		/* Send marshalled minijail. */
1156		close(pipe_fds[0]);	/* read endpoint */
1157		ret = minijail_to_fd(j, pipe_fds[1]);
1158		close(pipe_fds[1]);	/* write endpoint */
1159		if (ret) {
1160			kill(j->initpid, SIGKILL);
1161			die("failed to send marshalled minijail");
1162		}
1163
1164		if (pchild_pid)
1165			*pchild_pid = child_pid;
1166
1167		/*
1168		 * If we want to write to the child process' standard input,
1169		 * set up the write end of the pipe.
1170		 */
1171		if (pstdin_fd)
1172			*pstdin_fd = setup_pipe_end(stdin_fds,
1173						    1	/* write end */);
1174
1175		/*
1176		 * If we want to read from the child process' standard output,
1177		 * set up the read end of the pipe.
1178		 */
1179		if (pstdout_fd)
1180			*pstdout_fd = setup_pipe_end(stdout_fds,
1181						     0	/* read end */);
1182
1183		/*
1184		 * If we want to read from the child process' standard error,
1185		 * set up the read end of the pipe.
1186		 */
1187		if (pstderr_fd)
1188			*pstderr_fd = setup_pipe_end(stderr_fds,
1189						     0	/* read end */);
1190
1191		return 0;
1192	}
1193	free(oldenv_copy);
1194
1195	/*
1196	 * If we want to write to the jailed process' standard input,
1197	 * set up the read end of the pipe.
1198	 */
1199	if (pstdin_fd) {
1200		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1201					    STDIN_FILENO) < 0)
1202			die("failed to set up stdin pipe");
1203	}
1204
1205	/*
1206	 * If we want to read from the jailed process' standard output,
1207	 * set up the write end of the pipe.
1208	 */
1209	if (pstdout_fd) {
1210		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1211					    STDOUT_FILENO) < 0)
1212			die("failed to set up stdout pipe");
1213	}
1214
1215	/*
1216	 * If we want to read from the jailed process' standard error,
1217	 * set up the write end of the pipe.
1218	 */
1219	if (pstderr_fd) {
1220		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1221					    STDERR_FILENO) < 0)
1222			die("failed to set up stderr pipe");
1223	}
1224
1225	/* Strip out flags that cannot be inherited across execve. */
1226	minijail_preexec(j);
1227	/* Jail this process and its descendants... */
1228	minijail_enter(j);
1229
1230	if (pid_namespace) {
1231		/*
1232		 * pid namespace: this process will become init inside the new
1233		 * namespace, so fork off a child to actually run the program
1234		 * (we don't want all programs we might exec to have to know
1235		 * how to be init).
1236		 *
1237		 * If we're multithreaded, we'll probably deadlock here. See
1238		 * WARNING above.
1239		 */
1240		child_pid = fork();
1241		if (child_pid < 0)
1242			_exit(child_pid);
1243		else if (child_pid > 0)
1244			init(child_pid);	/* never returns */
1245	}
1246
1247	/*
1248	 * If we aren't pid-namespaced:
1249	 *   calling process
1250	 *   -> execve()-ing process
1251	 * If we are:
1252	 *   calling process
1253	 *   -> init()-ing process
1254	 *      -> execve()-ing process
1255	 */
1256	_exit(execve(filename, argv, environ));
1257}
1258
1259int API minijail_run_static(struct minijail *j, const char *filename,
1260			    char *const argv[])
1261{
1262	pid_t child_pid;
1263	int pid_namespace = j->flags.pids;
1264
1265	if (j->flags.caps)
1266		die("caps not supported with static targets");
1267
1268	if (pid_namespace)
1269		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1270	else
1271		child_pid = fork();
1272
1273	if (child_pid < 0) {
1274		die("failed to fork child");
1275	}
1276	if (child_pid > 0 ) {
1277		j->initpid = child_pid;
1278		return 0;
1279	}
1280
1281	/*
1282	 * We can now drop this child into the sandbox
1283	 * then execve the target.
1284	 */
1285
1286	j->flags.pids = 0;
1287	minijail_enter(j);
1288
1289	if (pid_namespace) {
1290		/*
1291		 * pid namespace: this process will become init inside the new
1292		 * namespace, so fork off a child to actually run the program
1293		 * (we don't want all programs we might exec to have to know
1294		 * how to be init).
1295		 *
1296		 * If we're multithreaded, we'll probably deadlock here. See
1297		 * WARNING above.
1298		 */
1299		child_pid = fork();
1300		if (child_pid < 0)
1301			_exit(child_pid);
1302		else if (child_pid > 0)
1303			init(child_pid);	/* never returns */
1304	}
1305
1306	_exit(execve(filename, argv, environ));
1307}
1308
1309int API minijail_kill(struct minijail *j)
1310{
1311	int st;
1312	if (kill(j->initpid, SIGTERM))
1313		return -errno;
1314	if (waitpid(j->initpid, &st, 0) < 0)
1315		return -errno;
1316	return st;
1317}
1318
1319int API minijail_wait(struct minijail *j)
1320{
1321	int st;
1322	if (waitpid(j->initpid, &st, 0) < 0)
1323		return -errno;
1324
1325	if (!WIFEXITED(st)) {
1326		int error_status = st;
1327		if (WIFSIGNALED(st)) {
1328			int signum = WTERMSIG(st);
1329			warn("child process %d received signal %d",
1330			     j->initpid, signum);
1331			/*
1332			 * We return MINIJAIL_ERR_JAIL if the process received
1333			 * SIGSYS, which happens when a syscall is blocked by
1334			 * seccomp filters.
1335			 * If not, we do what bash(1) does:
1336			 * $? = 128 + signum
1337			 */
1338			if (signum == SIGSYS) {
1339				error_status = MINIJAIL_ERR_JAIL;
1340			} else {
1341				error_status = 128 + signum;
1342			}
1343		}
1344		return error_status;
1345	}
1346
1347	int exit_status = WEXITSTATUS(st);
1348	if (exit_status != 0)
1349		info("child process %d exited with status %d",
1350		     j->initpid, exit_status);
1351
1352	return exit_status;
1353}
1354
1355void API minijail_destroy(struct minijail *j)
1356{
1357	if (j->flags.seccomp_filter && j->filter_prog) {
1358		free(j->filter_prog->filter);
1359		free(j->filter_prog);
1360	}
1361	while (j->bindings_head) {
1362		struct binding *b = j->bindings_head;
1363		j->bindings_head = j->bindings_head->next;
1364		free(b->dest);
1365		free(b->src);
1366		free(b);
1367	}
1368	j->bindings_tail = NULL;
1369	if (j->user)
1370		free(j->user);
1371	if (j->chrootdir)
1372		free(j->chrootdir);
1373	free(j);
1374}
1375