libminijail.c revision 5b7a318e9785d6ffe1698ecd73121befea77259f
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _GNU_SOURCE
8
9#include <asm/unistd.h>
10#include <ctype.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <grp.h>
14#include <inttypes.h>
15#include <limits.h>
16#include <linux/capability.h>
17#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
20#include <stdarg.h>
21#include <stddef.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <syscall.h>
26#include <sys/capability.h>
27#include <sys/mount.h>
28#include <sys/param.h>
29#include <sys/prctl.h>
30#include <sys/stat.h>
31#include <sys/types.h>
32#include <sys/user.h>
33#include <sys/wait.h>
34#include <unistd.h>
35
36#include "libminijail.h"
37#include "libminijail-private.h"
38
39#include "signal_handler.h"
40#include "syscall_filter.h"
41#include "util.h"
42
/*
 * Securebits masks: taken from <linux/securebits.h> when the build defines
 * HAVE_SECUREBITS_H; otherwise the fallback values below are used.
 */
#ifdef HAVE_SECUREBITS_H
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS         0x15
#define SECURE_ALL_LOCKS        (SECURE_ALL_BITS << 1)
#endif

/* Until these are reliably available in linux/prctl.h */
#ifndef PR_SET_SECCOMP
# define PR_SET_SECCOMP 22
#endif

/* For seccomp_filter using BPF. */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif
#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

/*
 * SECCOMP_SOFTFAIL is 1 when the build defines USE_SECCOMP_SOFTFAIL and 0
 * otherwise. When it is 1, the seccomp prctl() call sites in this file
 * downgrade an ENOSYS failure to a warning instead of aborting.
 */
#ifdef USE_SECCOMP_SOFTFAIL
# define SECCOMP_SOFTFAIL 1
#else
# define SECCOMP_SOFTFAIL 0
#endif
68
/*
 * One requested bind mount. Bindings are chained into a singly-linked list
 * hanging off struct minijail (bindings_head / bindings_tail).
 */
struct binding {
	/* Source path handed to mount(2); heap copy owned by the node. */
	char *src;
	/* Destination path; must start with '/' and is appended to chrootdir. */
	char *dest;
	/* Zero means the bind is remounted read-only after being created. */
	int writeable;
	/* Next binding in insertion order, or NULL at the tail. */
	struct binding *next;
};
75
/*
 * In-memory description of a jail: which restrictions to apply (flags) and
 * the parameters those restrictions need. The whole struct is copied
 * byte-for-byte when marshalled; pointer members are re-created from the
 * trailing serialized data on unmarshal.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned single-bit fields so that storing 1 reads
	 * back as 1; a signed one-bit field can only hold 0 and -1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int seccomp:1;
		unsigned int readonly:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int mount_tmp:1;
	} flags;
	uid_t uid;		/* target uid when flags.uid is set */
	gid_t gid;		/* target gid when flags.gid is set */
	gid_t usergid;		/* primary gid of |user|, used by initgroups() */
	char *user;		/* user name (heap copy), or NULL */
	uint64_t caps;		/* bitmask of capabilities to keep */
	pid_t initpid;		/* pid of the child created by minijail_run*() */
	int mountns_fd;		/* fd of the mount namespace to setns() into */
	int filter_len;		/* number of instructions in |filter_prog| */
	int binding_count;	/* number of nodes in the bindings list */
	char *chrootdir;	/* chroot directory (heap copy), or NULL */
	struct sock_fprog *filter_prog;	/* compiled seccomp BPF program */
	struct binding *bindings_head;	/* first bind mount, or NULL */
	struct binding *bindings_tail;	/* last bind mount, or NULL */
};
113
114/*
115 * Strip out flags meant for the parent.
116 * We keep things that are not inherited across execve(2) (e.g. capabilities),
117 * or are easier to set after execve(2) (e.g. seccomp filters).
118 */
119void minijail_preenter(struct minijail *j)
120{
121	j->flags.vfs = 0;
122	j->flags.enter_vfs = 0;
123	j->flags.readonly = 0;
124	j->flags.pids = 0;
125}
126
127/*
128 * Strip out flags meant for the child.
129 * We keep things that are inherited across execve(2).
130 */
131void minijail_preexec(struct minijail *j)
132{
133	int vfs = j->flags.vfs;
134	int enter_vfs = j->flags.enter_vfs;
135	int readonly = j->flags.readonly;
136	if (j->user)
137		free(j->user);
138	j->user = NULL;
139	memset(&j->flags, 0, sizeof(j->flags));
140	/* Now restore anything we meant to keep. */
141	j->flags.vfs = vfs;
142	j->flags.enter_vfs = enter_vfs;
143	j->flags.readonly = readonly;
144	/* Note, |pids| will already have been used before this call. */
145}
146
147/* Minijail API. */
148
149struct minijail API *minijail_new(void)
150{
151	return calloc(1, sizeof(struct minijail));
152}
153
154void API minijail_change_uid(struct minijail *j, uid_t uid)
155{
156	if (uid == 0)
157		die("useless change to uid 0");
158	j->uid = uid;
159	j->flags.uid = 1;
160}
161
162void API minijail_change_gid(struct minijail *j, gid_t gid)
163{
164	if (gid == 0)
165		die("useless change to gid 0");
166	j->gid = gid;
167	j->flags.gid = 1;
168}
169
170int API minijail_change_user(struct minijail *j, const char *user)
171{
172	char *buf = NULL;
173	struct passwd pw;
174	struct passwd *ppw = NULL;
175	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
176	if (sz == -1)
177		sz = 65536;	/* your guess is as good as mine... */
178
179	/*
180	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
181	 * the maximum needed size of the buffer, so we don't have to search.
182	 */
183	buf = malloc(sz);
184	if (!buf)
185		return -ENOMEM;
186	getpwnam_r(user, &pw, buf, sz, &ppw);
187	/*
188	 * We're safe to free the buffer here. The strings inside pw point
189	 * inside buf, but we don't use any of them; this leaves the pointers
190	 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
191	 */
192	free(buf);
193	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
194	if (!ppw)
195		return -1;
196	minijail_change_uid(j, ppw->pw_uid);
197	j->user = strdup(user);
198	if (!j->user)
199		return -ENOMEM;
200	j->usergid = ppw->pw_gid;
201	return 0;
202}
203
204int API minijail_change_group(struct minijail *j, const char *group)
205{
206	char *buf = NULL;
207	struct group gr;
208	struct group *pgr = NULL;
209	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
210	if (sz == -1)
211		sz = 65536;	/* and mine is as good as yours, really */
212
213	/*
214	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
215	 * the maximum needed size of the buffer, so we don't have to search.
216	 */
217	buf = malloc(sz);
218	if (!buf)
219		return -ENOMEM;
220	getgrnam_r(group, &gr, buf, sz, &pgr);
221	/*
222	 * We're safe to free the buffer here. The strings inside gr point
223	 * inside buf, but we don't use any of them; this leaves the pointers
224	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
225	 */
226	free(buf);
227	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
228	if (!pgr)
229		return -1;
230	minijail_change_gid(j, pgr->gr_gid);
231	return 0;
232}
233
234void API minijail_use_seccomp(struct minijail *j)
235{
236	j->flags.seccomp = 1;
237}
238
239void API minijail_no_new_privs(struct minijail *j)
240{
241	j->flags.no_new_privs = 1;
242}
243
244void API minijail_use_seccomp_filter(struct minijail *j)
245{
246	j->flags.seccomp_filter = 1;
247}
248
249void API minijail_log_seccomp_filter_failures(struct minijail *j)
250{
251	j->flags.log_seccomp_filter = 1;
252}
253
254void API minijail_use_caps(struct minijail *j, uint64_t capmask)
255{
256	j->caps = capmask;
257	j->flags.caps = 1;
258}
259
260void API minijail_namespace_vfs(struct minijail *j)
261{
262	j->flags.vfs = 1;
263}
264
265void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
266{
267	int ns_fd = open(ns_path, O_RDONLY);
268	if (ns_fd < 0) {
269		pdie("failed to open namespace '%s'", ns_path);
270	}
271	j->mountns_fd = ns_fd;
272	j->flags.enter_vfs = 1;
273}
274
275void API minijail_namespace_pids(struct minijail *j)
276{
277	j->flags.vfs = 1;
278	j->flags.readonly = 1;
279	j->flags.pids = 1;
280}
281
282void API minijail_namespace_net(struct minijail *j)
283{
284	j->flags.net = 1;
285}
286
287void API minijail_remount_readonly(struct minijail *j)
288{
289	j->flags.vfs = 1;
290	j->flags.readonly = 1;
291}
292
293void API minijail_inherit_usergroups(struct minijail *j)
294{
295	j->flags.usergroups = 1;
296}
297
298void API minijail_disable_ptrace(struct minijail *j)
299{
300	j->flags.ptrace = 1;
301}
302
303int API minijail_enter_chroot(struct minijail *j, const char *dir)
304{
305	if (j->chrootdir)
306		return -EINVAL;
307	j->chrootdir = strdup(dir);
308	if (!j->chrootdir)
309		return -ENOMEM;
310	j->flags.chroot = 1;
311	return 0;
312}
313
314void API minijail_mount_tmp(struct minijail *j)
315{
316	j->flags.mount_tmp = 1;
317}
318
319int API minijail_bind(struct minijail *j, const char *src, const char *dest,
320		      int writeable)
321{
322	struct binding *b;
323
324	if (*dest != '/')
325		return -EINVAL;
326	b = calloc(1, sizeof(*b));
327	if (!b)
328		return -ENOMEM;
329	b->dest = strdup(dest);
330	if (!b->dest)
331		goto error;
332	b->src = strdup(src);
333	if (!b->src)
334		goto error;
335	b->writeable = writeable;
336
337	info("bind %s -> %s", src, dest);
338
339	/*
340	 * Force vfs namespacing so the bind mounts don't leak out into the
341	 * containing vfs namespace.
342	 */
343	minijail_namespace_vfs(j);
344
345	if (j->bindings_tail)
346		j->bindings_tail->next = b;
347	else
348		j->bindings_head = b;
349	j->bindings_tail = b;
350	j->binding_count++;
351
352	return 0;
353
354error:
355	free(b->src);
356	free(b->dest);
357	free(b);
358	return -ENOMEM;
359}
360
361void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
362{
363	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
364		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
365			warn("not loading seccomp filter, seccomp not supported");
366			return;
367		}
368	}
369	FILE *file = fopen(path, "r");
370	if (!file) {
371		pdie("failed to open seccomp filter file '%s'", path);
372	}
373
374	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
375	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
376		die("failed to compile seccomp filter BPF program in '%s'",
377		    path);
378	}
379
380	j->filter_len = fprog->len;
381	j->filter_prog = fprog;
382
383	fclose(file);
384}
385
/*
 * Cursor used while serializing a jail into a caller-supplied buffer.
 * Writing stops silently once the buffer is full, but |total| keeps
 * counting so the caller can learn the size that would have been needed.
 */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};

/* Point |state| at the start of |buf|, which has room for |available| bytes. */
void marshal_state_init(struct marshal_state *state,
			char *buf, size_t available)
{
	state->total = 0;
	state->buf = buf;
	state->available = available;
}

/*
 * Append |length| bytes from |src|. At most |available| bytes are actually
 * copied; |total| always grows by the full |length|.
 */
void marshal_append(struct marshal_state *state,
		    char *src, size_t length)
{
	size_t fits = length;

	if (fits > state->available)
		fits = state->available;

	/* Account for the request first; the copy below may be partial. */
	state->total += length;

	if (fits == 0)
		return;

	memcpy(state->buf, src, fits);
	state->buf += fits;
	state->available -= fits;
}
414
415void minijail_marshal_helper(struct marshal_state *state,
416			     const struct minijail *j)
417{
418	struct binding *b = NULL;
419	marshal_append(state, (char *)j, sizeof(*j));
420	if (j->user)
421		marshal_append(state, j->user, strlen(j->user) + 1);
422	if (j->chrootdir)
423		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
424	if (j->flags.seccomp_filter && j->filter_prog) {
425		struct sock_fprog *fp = j->filter_prog;
426		marshal_append(state, (char *)fp->filter,
427				fp->len * sizeof(struct sock_filter));
428	}
429	for (b = j->bindings_head; b; b = b->next) {
430		marshal_append(state, b->src, strlen(b->src) + 1);
431		marshal_append(state, b->dest, strlen(b->dest) + 1);
432		marshal_append(state, (char *)&b->writeable,
433				sizeof(b->writeable));
434	}
435}
436
437size_t API minijail_size(const struct minijail *j)
438{
439	struct marshal_state state;
440	marshal_state_init(&state, NULL, 0);
441	minijail_marshal_helper(&state, j);
442	return state.total;
443}
444
445int minijail_marshal(const struct minijail *j, char *buf, size_t available)
446{
447	struct marshal_state state;
448	marshal_state_init(&state, buf, available);
449	minijail_marshal_helper(&state, j);
450	return (state.total > available);
451}
452
/* consumebytes: take @length bytes off the front of a buffer
 * @length    Number of bytes to consume
 * @buf       Cursor into the buffer; advanced past the consumed bytes
 * @buflength Bytes remaining at @buf; reduced by @length
 *
 * Returns a pointer to the first consumed byte, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buf = start + length;
	*buflength -= length;
	return start;
}
469
/* consumestr: take a NUL-terminated string off the front of a buffer
 * @buf       Cursor into the buffer; advanced past the string and its NUL
 * @buflength Bytes remaining at @buf; reduced accordingly
 *
 * Returns a pointer to the start of the string, or NULL if no terminator
 * is found within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t remaining = *buflength;
	const size_t text_len = strnlen(*buf, remaining);

	/* strnlen() hitting the limit means there's no null-terminator. */
	if (text_len == remaining)
		return NULL;

	/* Consume the text plus its terminator. */
	return consumebytes(text_len + 1, buf, buflength);
}
484
485int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
486{
487	int i;
488	int count;
489	int ret = -EINVAL;
490
491	if (length < sizeof(*j))
492		goto out;
493	memcpy((void *)j, serialized, sizeof(*j));
494	serialized += sizeof(*j);
495	length -= sizeof(*j);
496
497	/* Potentially stale pointers not used as signals. */
498	j->bindings_head = NULL;
499	j->bindings_tail = NULL;
500	j->filter_prog = NULL;
501
502	if (j->user) {		/* stale pointer */
503		char *user = consumestr(&serialized, &length);
504		if (!user)
505			goto clear_pointers;
506		j->user = strdup(user);
507		if (!j->user)
508			goto clear_pointers;
509	}
510
511	if (j->chrootdir) {	/* stale pointer */
512		char *chrootdir = consumestr(&serialized, &length);
513		if (!chrootdir)
514			goto bad_chrootdir;
515		j->chrootdir = strdup(chrootdir);
516		if (!j->chrootdir)
517			goto bad_chrootdir;
518	}
519
520	if (j->flags.seccomp_filter && j->filter_len > 0) {
521		size_t ninstrs = j->filter_len;
522		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
523		    ninstrs > USHRT_MAX)
524			goto bad_filters;
525
526		size_t program_len = ninstrs * sizeof(struct sock_filter);
527		void *program = consumebytes(program_len, &serialized, &length);
528		if (!program)
529			goto bad_filters;
530
531		j->filter_prog = malloc(sizeof(struct sock_fprog));
532		j->filter_prog->len = ninstrs;
533		j->filter_prog->filter = malloc(program_len);
534		memcpy(j->filter_prog->filter, program, program_len);
535	}
536
537	count = j->binding_count;
538	j->binding_count = 0;
539	for (i = 0; i < count; ++i) {
540		int *writeable;
541		const char *dest;
542		const char *src = consumestr(&serialized, &length);
543		if (!src)
544			goto bad_bindings;
545		dest = consumestr(&serialized, &length);
546		if (!dest)
547			goto bad_bindings;
548		writeable = consumebytes(sizeof(*writeable), &serialized, &length);
549		if (!writeable)
550			goto bad_bindings;
551		if (minijail_bind(j, src, dest, *writeable))
552			goto bad_bindings;
553	}
554
555	return 0;
556
557bad_bindings:
558	if (j->flags.seccomp_filter && j->filter_len > 0) {
559		free(j->filter_prog->filter);
560		free(j->filter_prog);
561	}
562bad_filters:
563	if (j->chrootdir)
564		free(j->chrootdir);
565bad_chrootdir:
566	if (j->user)
567		free(j->user);
568clear_pointers:
569	j->user = NULL;
570	j->chrootdir = NULL;
571out:
572	return ret;
573}
574
575/* bind_one: Applies bindings from @b for @j, recursing as needed.
576 * @j Minijail these bindings are for
577 * @b Head of list of bindings
578 *
579 * Returns 0 for success.
580 */
581int bind_one(const struct minijail *j, struct binding *b)
582{
583	int ret = 0;
584	char *dest = NULL;
585	if (ret)
586		return ret;
587	/* dest has a leading "/" */
588	if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
589		return -ENOMEM;
590	ret = mount(b->src, dest, NULL, MS_BIND, NULL);
591	if (ret)
592		pdie("bind: %s -> %s", b->src, dest);
593	if (!b->writeable) {
594		ret = mount(b->src, dest, NULL,
595			    MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
596		if (ret)
597			pdie("bind ro: %s -> %s", b->src, dest);
598	}
599	free(dest);
600	if (b->next)
601		return bind_one(j, b->next);
602	return ret;
603}
604
605int enter_chroot(const struct minijail *j)
606{
607	int ret;
608	if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
609		return ret;
610
611	if (chroot(j->chrootdir))
612		return -errno;
613
614	if (chdir("/"))
615		return -errno;
616
617	return 0;
618}
619
/*
 * Mount a tmpfs over /tmp with the options "size=64M,mode=777".
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *options = "size=64M,mode=777";
	const unsigned long no_flags = 0;

	return mount("none", "/tmp", "tmpfs", no_flags, options);
}
624
/*
 * Replace the inherited /proc mount with a fresh read-only one
 * (nodev, noexec, nosuid).
 *
 * Returns 0 on success or -errno from umount(2)/mount(2).
 */
int remount_readonly(void)
{
	static const char proc_path[] = "/proc";
	const unsigned long proc_flags =
	    MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

	/*
	 * Right now, we're holding a reference to our parent's old mount of
	 * /proc in our namespace, which means using MS_REMOUNT here would
	 * mutate our parent's mount as well, even though we're in a VFS
	 * namespace (!). Instead, remove their mount from our namespace
	 * and make our own.
	 */
	if (umount(proc_path) != 0)
		return -errno;

	if (mount("", proc_path, "proc", proc_flags, "") != 0)
		return -errno;

	return 0;
}
642
643void drop_ugid(const struct minijail *j)
644{
645	if (j->flags.usergroups) {
646		if (initgroups(j->user, j->usergid))
647			pdie("initgroups");
648	} else {
649		/* Only attempt to clear supplemental groups if we are changing
650		 * users. */
651		if ((j->uid || j->gid) && setgroups(0, NULL))
652			pdie("setgroups");
653	}
654
655	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
656		pdie("setresgid");
657
658	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
659		pdie("setresuid");
660}
661
/*
 * Report whether capability number |cap| exists on the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says).  If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).  So suck up the answer via /proc.
 *
 * The value read from /proc is cached in a static after the first call.
 * Aborts via pdie() if the file cannot be opened or parsed.
 */
static int run_cap_valid(unsigned int cap)
{
	static unsigned int last_cap;

	if (!last_cap) {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Without this check fscanf() would be handed a NULL stream. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}

	return cap <= last_cap;
}
683
684void drop_caps(const struct minijail *j)
685{
686#if defined(__ANDROID__)
687	/*
688	 * Temporarily disable capabilities support until Minijail can use
689	 * libcap-ng.
690	 */
691	(void) j;
692#else
693	cap_t caps = cap_get_proc();
694	cap_value_t flag[1];
695	const uint64_t one = 1;
696	unsigned int i;
697	if (!caps)
698		die("can't get process caps");
699	if (cap_clear_flag(caps, CAP_INHERITABLE))
700		die("can't clear inheritable caps");
701	if (cap_clear_flag(caps, CAP_EFFECTIVE))
702		die("can't clear effective caps");
703	if (cap_clear_flag(caps, CAP_PERMITTED))
704		die("can't clear permitted caps");
705	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
706		/* Keep CAP_SETPCAP for dropping bounding set bits. */
707		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
708			continue;
709		flag[0] = i;
710		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
711			die("can't add effective cap");
712		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
713			die("can't add permitted cap");
714		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
715			die("can't add inheritable cap");
716	}
717	if (cap_set_proc(caps))
718		die("can't apply initial cleaned capset");
719
720	/*
721	 * Instead of dropping bounding set first, do it here in case
722	 * the caller had a more permissive bounding set which could
723	 * have been used above to raise a capability that wasn't already
724	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
725	 */
726	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
727		if (j->caps & (one << i))
728			continue;
729		if (prctl(PR_CAPBSET_DROP, i))
730			pdie("prctl(PR_CAPBSET_DROP)");
731	}
732
733	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
734	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
735		flag[0] = CAP_SETPCAP;
736		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
737			die("can't clear effective cap");
738		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
739			die("can't clear permitted cap");
740		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
741			die("can't clear inheritable cap");
742	}
743
744	if (cap_set_proc(caps))
745		die("can't apply final cleaned capset");
746
747	cap_free(caps);
748#endif
749}
750
751void set_seccomp_filter(const struct minijail *j)
752{
753	/*
754	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
755	 * in the kernel source tree for an explanation of the parameters.
756	 */
757	if (j->flags.no_new_privs) {
758		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
759			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
760	}
761
762	/*
763	 * If we're logging seccomp filter failures,
764	 * install the SIGSYS handler first.
765	 */
766	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
767		if (install_sigsys_handler())
768			pdie("install SIGSYS handler");
769		warn("logging seccomp filter failures");
770	}
771
772	/*
773	 * Install the syscall filter.
774	 */
775	if (j->flags.seccomp_filter) {
776		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
777			if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
778				warn("seccomp not supported");
779				return;
780			}
781			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
782		}
783	}
784}
785
786void API minijail_enter(const struct minijail *j)
787{
788	if (j->flags.pids)
789		die("tried to enter a pid-namespaced jail;"
790		    " try minijail_run()?");
791
792	if (j->flags.usergroups && !j->user)
793		die("usergroup inheritance without username");
794
795	/*
796	 * We can't recover from failures if we've dropped privileges partially,
797	 * so we don't even try. If any of our operations fail, we abort() the
798	 * entire process.
799	 */
800	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
801		pdie("setns(CLONE_NEWNS)");
802
803	if (j->flags.vfs && unshare(CLONE_NEWNS))
804		pdie("unshare(vfs)");
805
806	if (j->flags.net && unshare(CLONE_NEWNET))
807		pdie("unshare(net)");
808
809	if (j->flags.chroot && enter_chroot(j))
810		pdie("chroot");
811
812	if (j->flags.mount_tmp && mount_tmp())
813		pdie("mount_tmp");
814
815	if (j->flags.readonly && remount_readonly())
816		pdie("remount");
817
818	if (j->flags.caps) {
819		/*
820		 * POSIX capabilities are a bit tricky. If we drop our
821		 * capability to change uids, our attempt to use setuid()
822		 * below will fail. Hang on to root caps across setuid(), then
823		 * lock securebits.
824		 */
825		if (prctl(PR_SET_KEEPCAPS, 1))
826			pdie("prctl(PR_SET_KEEPCAPS)");
827		if (prctl
828		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
829			pdie("prctl(PR_SET_SECUREBITS)");
830	}
831
832	/*
833	 * If we're setting no_new_privs, we can drop privileges
834	 * before setting seccomp filter. This way filter policies
835	 * don't need to allow privilege-dropping syscalls.
836	 */
837	if (j->flags.no_new_privs) {
838		drop_ugid(j);
839		if (j->flags.caps)
840			drop_caps(j);
841
842		set_seccomp_filter(j);
843	} else {
844		/*
845		 * If we're not setting no_new_privs,
846		 * we need to set seccomp filter *before* dropping privileges.
847		 * WARNING: this means that filter policies *must* allow
848		 * setgroups()/setresgid()/setresuid() for dropping root and
849		 * capget()/capset()/prctl() for dropping caps.
850		 */
851		set_seccomp_filter(j);
852
853		drop_ugid(j);
854		if (j->flags.caps)
855			drop_caps(j);
856	}
857
858	/*
859	 * seccomp has to come last since it cuts off all the other
860	 * privilege-dropping syscalls :)
861	 */
862	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
863		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
864			warn("seccomp not supported");
865			return;
866		}
867		pdie("prctl(PR_SET_SECCOMP)");
868	}
869}
870
/*
 * Wait status of the jail's root process as last recorded by init();
 * zero until that process has been reaped.
 * TODO(wad) will visibility affect this variable?
 */
static int init_exitstatus = 0;

/* SIGTERM handler for init(): exit at once with the recorded status. */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
878
879int init(pid_t rootpid)
880{
881	pid_t pid;
882	int status;
883	/* so that we exit with the right status */
884	signal(SIGTERM, init_term);
885	/* TODO(wad) self jail with seccomp_filters here. */
886	while ((pid = wait(&status)) > 0) {
887		/*
888		 * This loop will only end when either there are no processes
889		 * left inside our pid namespace or we get a signal.
890		 */
891		if (pid == rootpid)
892			init_exitstatus = status;
893	}
894	if (!WIFEXITED(init_exitstatus))
895		_exit(MINIJAIL_ERR_INIT);
896	_exit(WEXITSTATUS(init_exitstatus));
897}
898
899int API minijail_from_fd(int fd, struct minijail *j)
900{
901	size_t sz = 0;
902	size_t bytes = read(fd, &sz, sizeof(sz));
903	char *buf;
904	int r;
905	if (sizeof(sz) != bytes)
906		return -EINVAL;
907	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
908		return -E2BIG;
909	buf = malloc(sz);
910	if (!buf)
911		return -ENOMEM;
912	bytes = read(fd, buf, sz);
913	if (bytes != sz) {
914		free(buf);
915		return -EINVAL;
916	}
917	r = minijail_unmarshal(j, buf, sz);
918	free(buf);
919	return r;
920}
921
922int API minijail_to_fd(struct minijail *j, int fd)
923{
924	char *buf;
925	size_t sz = minijail_size(j);
926	ssize_t written;
927	int r;
928
929	if (!sz)
930		return -EINVAL;
931	buf = malloc(sz);
932	r = minijail_marshal(j, buf, sz);
933	if (r) {
934		free(buf);
935		return r;
936	}
937	/* Sends [size][minijail]. */
938	written = write(fd, &sz, sizeof(sz));
939	if (written != sizeof(sz)) {
940		free(buf);
941		return -EFAULT;
942	}
943	written = write(fd, buf, sz);
944	if (written < 0 || (size_t) written != sz) {
945		free(buf);
946		return -EFAULT;
947	}
948	free(buf);
949	return 0;
950}
951
952int setup_preload(void)
953{
954#if defined(__ANDROID__)
955	/* Don't use LDPRELOAD on Brillo. */
956	return 0;
957#else
958	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
959	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
960	if (!newenv)
961		return -ENOMEM;
962
963	/* Only insert a separating space if we have something to separate... */
964	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
965		PRELOADPATH);
966
967	/* setenv() makes a copy of the string we give it */
968	setenv(kLdPreloadEnvVar, newenv, 1);
969	free(newenv);
970	return 0;
971#endif
972}
973
974int setup_pipe(int fds[2])
975{
976	int r = pipe(fds);
977	char fd_buf[11];
978	if (r)
979		return r;
980	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
981	if (r <= 0)
982		return -EINVAL;
983	setenv(kFdEnvVar, fd_buf, 1);
984	return 0;
985}
986
/*
 * Keep one end of the pipe |fds| and close the other.
 * |index| selects the end to keep (0 = read end, 1 = write end).
 *
 * Returns the kept descriptor, or -1 if |index| is not 0 or 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	if (index == 0 || index == 1) {
		const size_t other = 1 - index;

		close(fds[other]);
		return fds[index];
	}
	return -1;
}
995
/*
 * Close the unused end of the pipe |fds| and duplicate the end selected by
 * |index| (0 = read end, 1 = write end) onto descriptor |fd|.
 *
 * Returns the result of dup2(2), or -1 if |index| is not 0 or 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	if (index != 0 && index != 1)
		return -1;

	close(fds[index == 0 ? 1 : 0]);

	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1005
1006int API minijail_run(struct minijail *j, const char *filename,
1007		     char *const argv[])
1008{
1009	return minijail_run_pid_pipes(j, filename, argv,
1010				      NULL, NULL, NULL, NULL);
1011}
1012
1013int API minijail_run_pid(struct minijail *j, const char *filename,
1014			 char *const argv[], pid_t *pchild_pid)
1015{
1016	return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
1017				      NULL, NULL, NULL);
1018}
1019
1020int API minijail_run_pipe(struct minijail *j, const char *filename,
1021			  char *const argv[], int *pstdin_fd)
1022{
1023	return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
1024				      NULL, NULL);
1025}
1026
1027int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
1028			      char *const argv[], pid_t *pchild_pid,
1029			      int *pstdin_fd)
1030{
1031	return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
1032				      NULL, NULL);
1033}
1034
1035int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1036			       char *const argv[], pid_t *pchild_pid,
1037			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1038{
1039	char *oldenv, *oldenv_copy = NULL;
1040	pid_t child_pid;
1041	int pipe_fds[2];
1042	int stdin_fds[2];
1043	int stdout_fds[2];
1044	int stderr_fds[2];
1045	int ret;
1046	/* We need to remember this across the minijail_preexec() call. */
1047	int pid_namespace = j->flags.pids;
1048
1049	oldenv = getenv(kLdPreloadEnvVar);
1050	if (oldenv) {
1051		oldenv_copy = strdup(oldenv);
1052		if (!oldenv_copy)
1053			return -ENOMEM;
1054	}
1055
1056	if (setup_preload())
1057		return -EFAULT;
1058
1059	/*
1060	 * Make the process group ID of this process equal to its PID, so that
1061	 * both the Minijail process and the jailed process can be killed
1062	 * together.
1063	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1064	 * the process is already a process group leader.
1065	 */
1066	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1067		if (errno != EPERM) {
1068			pdie("setpgid(0, 0)");
1069		}
1070	}
1071
1072	/*
1073	 * Before we fork(2) and execve(2) the child process, we need to open
1074	 * a pipe(2) to send the minijail configuration over.
1075	 */
1076	if (setup_pipe(pipe_fds))
1077		return -EFAULT;
1078
1079	/*
1080	 * If we want to write to the child process' standard input,
1081	 * create the pipe(2) now.
1082	 */
1083	if (pstdin_fd) {
1084		if (pipe(stdin_fds))
1085			return -EFAULT;
1086	}
1087
1088	/*
1089	 * If we want to read from the child process' standard output,
1090	 * create the pipe(2) now.
1091	 */
1092	if (pstdout_fd) {
1093		if (pipe(stdout_fds))
1094			return -EFAULT;
1095	}
1096
1097	/*
1098	 * If we want to read from the child process' standard error,
1099	 * create the pipe(2) now.
1100	 */
1101	if (pstderr_fd) {
1102		if (pipe(stderr_fds))
1103			return -EFAULT;
1104	}
1105
1106	/* Use sys_clone() if and only if we're creating a pid namespace.
1107	 *
1108	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1109	 *
1110	 * In multithreaded programs, there are a bunch of locks inside libc,
1111	 * some of which may be held by other threads at the time that we call
1112	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1113	 * ensure that we hold all of these locks before it calls clone()
1114	 * internally and drop them after clone() returns, but when we call
1115	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1116	 * child address space where some of libc's important locks are held by
1117	 * other threads (which did not get cloned, and hence will never release
1118	 * those locks). This is okay so long as we call exec() immediately
1119	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1120	 * take locks.
1121	 *
1122	 * Hence, only call sys_clone() if we need to, in order to get at pid
1123	 * namespacing. If we follow this path, the child's address space might
1124	 * have broken locks; you may only call functions that do not acquire
1125	 * any locks.
1126	 *
1127	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1128	 * previously detailed, so this function is highly likely to deadlock
1129	 * later on (see "deadlock here") if we're multithreaded.
1130	 *
1131	 * We might hack around this by having the clone()d child (init of the
1132	 * pid namespace) return directly, rather than leaving the clone()d
1133	 * process hanging around to be init for the new namespace (and having
1134	 * its fork()ed child return in turn), but that process would be crippled
1135	 * with its libc locks potentially broken. We might try fork()ing in the
1136	 * parent before we clone() to ensure that we own all the locks, but
1137	 * then we have to have the forked child hanging around consuming
1138	 * resources (and possibly having file descriptors / shared memory
1139	 * regions / etc attached). We'd need to keep the child around to avoid
1140	 * having its children get reparented to init.
1141	 *
1142	 * TODO(ellyjones): figure out if the "forked child hanging around"
1143	 * problem is fixable or not. It would be nice if we worked in this
1144	 * case.
1145	 */
1146	if (pid_namespace)
1147		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1148	else
1149		child_pid = fork();
1150
1151	if (child_pid < 0) {
1152		free(oldenv_copy);
1153		die("failed to fork child");
1154	}
1155
1156	if (child_pid) {
1157		/* Restore parent's LD_PRELOAD. */
1158		if (oldenv_copy) {
1159			setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1160			free(oldenv_copy);
1161		} else {
1162			unsetenv(kLdPreloadEnvVar);
1163		}
1164		unsetenv(kFdEnvVar);
1165
1166		j->initpid = child_pid;
1167
1168		/* Send marshalled minijail. */
1169		close(pipe_fds[0]);	/* read endpoint */
1170		ret = minijail_to_fd(j, pipe_fds[1]);
1171		close(pipe_fds[1]);	/* write endpoint */
1172		if (ret) {
1173			kill(j->initpid, SIGKILL);
1174			die("failed to send marshalled minijail");
1175		}
1176
1177		if (pchild_pid)
1178			*pchild_pid = child_pid;
1179
1180		/*
1181		 * If we want to write to the child process' standard input,
1182		 * set up the write end of the pipe.
1183		 */
1184		if (pstdin_fd)
1185			*pstdin_fd = setup_pipe_end(stdin_fds,
1186						    1	/* write end */);
1187
1188		/*
1189		 * If we want to read from the child process' standard output,
1190		 * set up the read end of the pipe.
1191		 */
1192		if (pstdout_fd)
1193			*pstdout_fd = setup_pipe_end(stdout_fds,
1194						     0	/* read end */);
1195
1196		/*
1197		 * If we want to read from the child process' standard error,
1198		 * set up the read end of the pipe.
1199		 */
1200		if (pstderr_fd)
1201			*pstderr_fd = setup_pipe_end(stderr_fds,
1202						     0	/* read end */);
1203
1204		return 0;
1205	}
1206	free(oldenv_copy);
1207
1208	/*
1209	 * If we want to write to the jailed process' standard input,
1210	 * set up the read end of the pipe.
1211	 */
1212	if (pstdin_fd) {
1213		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1214					    STDIN_FILENO) < 0)
1215			die("failed to set up stdin pipe");
1216	}
1217
1218	/*
1219	 * If we want to read from the jailed process' standard output,
1220	 * set up the write end of the pipe.
1221	 */
1222	if (pstdout_fd) {
1223		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1224					    STDOUT_FILENO) < 0)
1225			die("failed to set up stdout pipe");
1226	}
1227
1228	/*
1229	 * If we want to read from the jailed process' standard error,
1230	 * set up the write end of the pipe.
1231	 */
1232	if (pstderr_fd) {
1233		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1234					    STDERR_FILENO) < 0)
1235			die("failed to set up stderr pipe");
1236	}
1237
1238	/* Strip out flags that cannot be inherited across execve. */
1239	minijail_preexec(j);
1240	/* Jail this process and its descendants... */
1241	minijail_enter(j);
1242
1243	if (pid_namespace) {
1244		/*
1245		 * pid namespace: this process will become init inside the new
1246		 * namespace, so fork off a child to actually run the program
1247		 * (we don't want all programs we might exec to have to know
1248		 * how to be init).
1249		 *
1250		 * If we're multithreaded, we'll probably deadlock here. See
1251		 * WARNING above.
1252		 */
1253		child_pid = fork();
1254		if (child_pid < 0)
1255			_exit(child_pid);
1256		else if (child_pid > 0)
1257			init(child_pid);	/* never returns */
1258	}
1259
1260	/*
1261	 * If we aren't pid-namespaced:
1262	 *   calling process
1263	 *   -> execve()-ing process
1264	 * If we are:
1265	 *   calling process
1266	 *   -> init()-ing process
1267	 *      -> execve()-ing process
1268	 */
1269	_exit(execve(filename, argv, environ));
1270}
1271
1272int API minijail_run_static(struct minijail *j, const char *filename,
1273			    char *const argv[])
1274{
1275	pid_t child_pid;
1276	int pid_namespace = j->flags.pids;
1277
1278	if (j->flags.caps)
1279		die("caps not supported with static targets");
1280
1281	if (pid_namespace)
1282		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1283	else
1284		child_pid = fork();
1285
1286	if (child_pid < 0) {
1287		die("failed to fork child");
1288	}
1289	if (child_pid > 0 ) {
1290		j->initpid = child_pid;
1291		return 0;
1292	}
1293
1294	/*
1295	 * We can now drop this child into the sandbox
1296	 * then execve the target.
1297	 */
1298
1299	j->flags.pids = 0;
1300	minijail_enter(j);
1301
1302	if (pid_namespace) {
1303		/*
1304		 * pid namespace: this process will become init inside the new
1305		 * namespace, so fork off a child to actually run the program
1306		 * (we don't want all programs we might exec to have to know
1307		 * how to be init).
1308		 *
1309		 * If we're multithreaded, we'll probably deadlock here. See
1310		 * WARNING above.
1311		 */
1312		child_pid = fork();
1313		if (child_pid < 0)
1314			_exit(child_pid);
1315		else if (child_pid > 0)
1316			init(child_pid);	/* never returns */
1317	}
1318
1319	_exit(execve(filename, argv, environ));
1320}
1321
1322int API minijail_kill(struct minijail *j)
1323{
1324	int st;
1325	if (kill(j->initpid, SIGTERM))
1326		return -errno;
1327	if (waitpid(j->initpid, &st, 0) < 0)
1328		return -errno;
1329	return st;
1330}
1331
1332int API minijail_wait(struct minijail *j)
1333{
1334	int st;
1335	if (waitpid(j->initpid, &st, 0) < 0)
1336		return -errno;
1337
1338	if (!WIFEXITED(st)) {
1339		int error_status = st;
1340		if (WIFSIGNALED(st)) {
1341			int signum = WTERMSIG(st);
1342			warn("child process %d received signal %d",
1343			     j->initpid, signum);
1344			/*
1345			 * We return MINIJAIL_ERR_JAIL if the process received
1346			 * SIGSYS, which happens when a syscall is blocked by
1347			 * seccomp filters.
1348			 * If not, we do what bash(1) does:
1349			 * $? = 128 + signum
1350			 */
1351			if (signum == SIGSYS) {
1352				error_status = MINIJAIL_ERR_JAIL;
1353			} else {
1354				error_status = 128 + signum;
1355			}
1356		}
1357		return error_status;
1358	}
1359
1360	int exit_status = WEXITSTATUS(st);
1361	if (exit_status != 0)
1362		info("child process %d exited with status %d",
1363		     j->initpid, exit_status);
1364
1365	return exit_status;
1366}
1367
1368void API minijail_destroy(struct minijail *j)
1369{
1370	if (j->flags.seccomp_filter && j->filter_prog) {
1371		free(j->filter_prog->filter);
1372		free(j->filter_prog);
1373	}
1374	while (j->bindings_head) {
1375		struct binding *b = j->bindings_head;
1376		j->bindings_head = j->bindings_head->next;
1377		free(b->dest);
1378		free(b->src);
1379		free(b);
1380	}
1381	j->bindings_tail = NULL;
1382	if (j->user)
1383		free(j->user);
1384	if (j->chrootdir)
1385		free(j->chrootdir);
1386	free(j);
1387}
1388