libminijail.c revision b9a7b16859466043feeaf973e43c0efaba0de8c1
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _DEFAULT_SOURCE
8#define _GNU_SOURCE
9
10#include <asm/unistd.h>
11#include <dirent.h>
12#include <errno.h>
13#include <fcntl.h>
14#include <grp.h>
15#include <linux/capability.h>
16#include <pwd.h>
17#include <sched.h>
18#include <signal.h>
19#include <stdbool.h>
20#include <stddef.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <sys/capability.h>
25#include <sys/mount.h>
26#include <sys/param.h>
27#include <sys/prctl.h>
28#include <sys/stat.h>
29#include <sys/types.h>
30#include <sys/user.h>
31#include <sys/wait.h>
32#include <syscall.h>
33#include <unistd.h>
34
35#include "libminijail.h"
36#include "libminijail-private.h"
37
38#include "signal_handler.h"
39#include "syscall_filter.h"
40#include "syscall_wrapper.h"
41#include "system.h"
42#include "util.h"
43
/* Until these are reliably available in linux/prctl.h. */
#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/*
 * Seccomp filter related flags.
 * Every definition in this group is a fallback: it only takes effect when
 * the headers included above did not already define the name.
 */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif

#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2 /* Uses user-supplied filter. */
#endif

#ifndef SECCOMP_SET_MODE_STRICT
# define SECCOMP_SET_MODE_STRICT 0
#endif
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif

#ifndef SECCOMP_FILTER_FLAG_TSYNC
# define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
/* End seccomp filter related flags. */

/* New cgroup namespace might not be in linux-headers yet. */
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
#endif

/*
 * Capacity of the |cgroups| array in struct minijail; also the limit
 * enforced by minijail_add_to_cgroup().
 */
#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */

/* Keyctl commands. Defined unconditionally (no #ifndef guard). */
#define KEYCTL_JOIN_SESSION_KEYRING 1
79
/*
 * One node in a jail's singly linked list of mounts. Nodes are created by
 * minijail_mount_with_data(), which stores strdup(3) copies of the strings.
 */
struct mountpoint {
	char *src;		/* Mount source. */
	char *dest;		/* Mount target; begins with '/'. */
	char *type;		/* Filesystem type string. */
	char *data;		/* Mount data; NULL unless |has_data| is set. */
	int has_data;		/* Non-zero when |data| was supplied. */
	unsigned long flags;	/* Flags handed to mount(2). */
	struct mountpoint *next;	/* Next node, NULL at the tail. */
};
89
90struct minijail {
91	/*
92	 * WARNING: if you add a flag here you need to make sure it's
93	 * accounted for in minijail_pre{enter|exec}() below.
94	 */
95	struct {
96		int uid : 1;
97		int gid : 1;
98		int inherit_suppl_gids : 1;
99		int set_suppl_gids : 1;
100		int keep_suppl_gids : 1;
101		int use_caps : 1;
102		int capbset_drop : 1;
103		int set_ambient_caps : 1;
104		int vfs : 1;
105		int enter_vfs : 1;
106		int skip_remount_private : 1;
107		int pids : 1;
108		int ipc : 1;
109		int uts : 1;
110		int net : 1;
111		int enter_net : 1;
112		int ns_cgroups : 1;
113		int userns : 1;
114		int disable_setgroups : 1;
115		int seccomp : 1;
116		int remount_proc_ro : 1;
117		int no_new_privs : 1;
118		int seccomp_filter : 1;
119		int seccomp_filter_tsync : 1;
120		int seccomp_filter_logging : 1;
121		int chroot : 1;
122		int pivot_root : 1;
123		int mount_tmp : 1;
124		int do_init : 1;
125		int pid_file : 1;
126		int cgroups : 1;
127		int alt_syscall : 1;
128		int reset_signal_mask : 1;
129		int close_open_fds : 1;
130		int new_session_keyring : 1;
131		int forward_signals : 1;
132	} flags;
133	uid_t uid;
134	gid_t gid;
135	gid_t usergid;
136	char *user;
137	size_t suppl_gid_count;
138	gid_t *suppl_gid_list;
139	uint64_t caps;
140	uint64_t cap_bset;
141	pid_t initpid;
142	int mountns_fd;
143	int netns_fd;
144	char *chrootdir;
145	char *pid_file_path;
146	char *uidmap;
147	char *gidmap;
148	char *hostname;
149	size_t filter_len;
150	struct sock_fprog *filter_prog;
151	char *alt_syscall_table;
152	struct mountpoint *mounts_head;
153	struct mountpoint *mounts_tail;
154	size_t mounts_count;
155	size_t tmpfs_size;
156	char *cgroups[MAX_CGROUPS];
157	size_t cgroup_count;
158};
159
160/*
161 * Strip out flags meant for the parent.
162 * We keep things that are not inherited across execve(2) (e.g. capabilities),
163 * or are easier to set after execve(2) (e.g. seccomp filters).
164 */
165void minijail_preenter(struct minijail *j)
166{
167	j->flags.vfs = 0;
168	j->flags.enter_vfs = 0;
169	j->flags.skip_remount_private = 0;
170	j->flags.remount_proc_ro = 0;
171	j->flags.pids = 0;
172	j->flags.do_init = 0;
173	j->flags.pid_file = 0;
174	j->flags.cgroups = 0;
175	j->flags.forward_signals = 0;
176}
177
178/*
179 * Strip out flags meant for the child.
180 * We keep things that are inherited across execve(2).
181 */
182void minijail_preexec(struct minijail *j)
183{
184	int vfs = j->flags.vfs;
185	int enter_vfs = j->flags.enter_vfs;
186	int skip_remount_private = j->flags.skip_remount_private;
187	int remount_proc_ro = j->flags.remount_proc_ro;
188	int userns = j->flags.userns;
189	if (j->user)
190		free(j->user);
191	j->user = NULL;
192	if (j->suppl_gid_list)
193		free(j->suppl_gid_list);
194	j->suppl_gid_list = NULL;
195	memset(&j->flags, 0, sizeof(j->flags));
196	/* Now restore anything we meant to keep. */
197	j->flags.vfs = vfs;
198	j->flags.enter_vfs = enter_vfs;
199	j->flags.skip_remount_private = skip_remount_private;
200	j->flags.remount_proc_ro = remount_proc_ro;
201	j->flags.userns = userns;
202	/* Note, |pids| will already have been used before this call. */
203}
204
205/* Minijail API. */
206
207struct minijail API *minijail_new(void)
208{
209	return calloc(1, sizeof(struct minijail));
210}
211
212void API minijail_change_uid(struct minijail *j, uid_t uid)
213{
214	if (uid == 0)
215		die("useless change to uid 0");
216	j->uid = uid;
217	j->flags.uid = 1;
218}
219
220void API minijail_change_gid(struct minijail *j, gid_t gid)
221{
222	if (gid == 0)
223		die("useless change to gid 0");
224	j->gid = gid;
225	j->flags.gid = 1;
226}
227
228void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
229					 const gid_t *list)
230{
231	size_t i;
232
233	if (j->flags.inherit_suppl_gids)
234		die("cannot inherit *and* set supplementary groups");
235	if (j->flags.keep_suppl_gids)
236		die("cannot keep *and* set supplementary groups");
237
238	if (size == 0) {
239		/* Clear supplementary groups. */
240		j->suppl_gid_list = NULL;
241		j->suppl_gid_count = 0;
242		j->flags.set_suppl_gids = 1;
243		return;
244	}
245
246	/* Copy the gid_t array. */
247	j->suppl_gid_list = calloc(size, sizeof(gid_t));
248	if (!j->suppl_gid_list) {
249		die("failed to allocate internal supplementary group array");
250	}
251	for (i = 0; i < size; i++) {
252		j->suppl_gid_list[i] = list[i];
253	}
254	j->suppl_gid_count = size;
255	j->flags.set_suppl_gids = 1;
256}
257
258void API minijail_keep_supplementary_gids(struct minijail *j) {
259	j->flags.keep_suppl_gids = 1;
260}
261
262int API minijail_change_user(struct minijail *j, const char *user)
263{
264	char *buf = NULL;
265	struct passwd pw;
266	struct passwd *ppw = NULL;
267	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
268	if (sz == -1)
269		sz = 65536;	/* your guess is as good as mine... */
270
271	/*
272	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
273	 * the maximum needed size of the buffer, so we don't have to search.
274	 */
275	buf = malloc(sz);
276	if (!buf)
277		return -ENOMEM;
278	getpwnam_r(user, &pw, buf, sz, &ppw);
279	/*
280	 * We're safe to free the buffer here. The strings inside |pw| point
281	 * inside |buf|, but we don't use any of them; this leaves the pointers
282	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
283	 * succeeded.
284	 */
285	free(buf);
286	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
287	if (!ppw)
288		return -1;
289	minijail_change_uid(j, ppw->pw_uid);
290	j->user = strdup(user);
291	if (!j->user)
292		return -ENOMEM;
293	j->usergid = ppw->pw_gid;
294	return 0;
295}
296
297int API minijail_change_group(struct minijail *j, const char *group)
298{
299	char *buf = NULL;
300	struct group gr;
301	struct group *pgr = NULL;
302	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
303	if (sz == -1)
304		sz = 65536;	/* and mine is as good as yours, really */
305
306	/*
307	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
308	 * the maximum needed size of the buffer, so we don't have to search.
309	 */
310	buf = malloc(sz);
311	if (!buf)
312		return -ENOMEM;
313	getgrnam_r(group, &gr, buf, sz, &pgr);
314	/*
315	 * We're safe to free the buffer here. The strings inside gr point
316	 * inside buf, but we don't use any of them; this leaves the pointers
317	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
318	 */
319	free(buf);
320	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
321	if (!pgr)
322		return -1;
323	minijail_change_gid(j, pgr->gr_gid);
324	return 0;
325}
326
327void API minijail_use_seccomp(struct minijail *j)
328{
329	j->flags.seccomp = 1;
330}
331
332void API minijail_no_new_privs(struct minijail *j)
333{
334	j->flags.no_new_privs = 1;
335}
336
337void API minijail_use_seccomp_filter(struct minijail *j)
338{
339	j->flags.seccomp_filter = 1;
340}
341
342void API minijail_set_seccomp_filter_tsync(struct minijail *j)
343{
344	if (j->filter_len > 0 && j->filter_prog != NULL) {
345		die("minijail_set_seccomp_filter_tsync() must be called "
346		    "before minijail_parse_seccomp_filters()");
347	}
348	j->flags.seccomp_filter_tsync = 1;
349}
350
351void API minijail_log_seccomp_filter_failures(struct minijail *j)
352{
353	if (j->filter_len > 0 && j->filter_prog != NULL) {
354		die("minijail_log_seccomp_filter_failures() must be called "
355		    "before minijail_parse_seccomp_filters()");
356	}
357	j->flags.seccomp_filter_logging = 1;
358}
359
360void API minijail_use_caps(struct minijail *j, uint64_t capmask)
361{
362	/*
363	 * 'minijail_use_caps' configures a runtime-capabilities-only
364	 * environment, including a bounding set matching the thread's runtime
365	 * (permitted|inheritable|effective) sets.
366	 * Therefore, it will override any existing bounding set configurations
367	 * since the latter would allow gaining extra runtime capabilities from
368	 * file capabilities.
369	 */
370	if (j->flags.capbset_drop) {
371		warn("overriding bounding set configuration");
372		j->cap_bset = 0;
373		j->flags.capbset_drop = 0;
374	}
375	j->caps = capmask;
376	j->flags.use_caps = 1;
377}
378
379void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
380{
381	if (j->flags.use_caps) {
382		/*
383		 * 'minijail_use_caps' will have already configured a capability
384		 * bounding set matching the (permitted|inheritable|effective)
385		 * sets. Abort if the user tries to configure a separate
386		 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
387		 * are mutually exclusive.
388		 */
389		die("runtime capabilities already configured, can't drop "
390		    "bounding set separately");
391	}
392	j->cap_bset = capmask;
393	j->flags.capbset_drop = 1;
394}
395
396void API minijail_set_ambient_caps(struct minijail *j)
397{
398	j->flags.set_ambient_caps = 1;
399}
400
401void API minijail_reset_signal_mask(struct minijail *j)
402{
403	j->flags.reset_signal_mask = 1;
404}
405
406void API minijail_namespace_vfs(struct minijail *j)
407{
408	j->flags.vfs = 1;
409}
410
411void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
412{
413	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
414	if (ns_fd < 0) {
415		pdie("failed to open namespace '%s'", ns_path);
416	}
417	j->mountns_fd = ns_fd;
418	j->flags.enter_vfs = 1;
419}
420
421void API minijail_new_session_keyring(struct minijail *j)
422{
423	j->flags.new_session_keyring = 1;
424}
425
426void API minijail_skip_remount_private(struct minijail *j)
427{
428	j->flags.skip_remount_private = 1;
429}
430
431void API minijail_namespace_pids(struct minijail *j)
432{
433	j->flags.vfs = 1;
434	j->flags.remount_proc_ro = 1;
435	j->flags.pids = 1;
436	j->flags.do_init = 1;
437}
438
439void API minijail_namespace_ipc(struct minijail *j)
440{
441	j->flags.ipc = 1;
442}
443
444void API minijail_namespace_uts(struct minijail *j)
445{
446	j->flags.uts = 1;
447}
448
449int API minijail_namespace_set_hostname(struct minijail *j, const char *name)
450{
451	if (j->hostname)
452		return -EINVAL;
453	minijail_namespace_uts(j);
454	j->hostname = strdup(name);
455	if (!j->hostname)
456		return -ENOMEM;
457	return 0;
458}
459
460void API minijail_namespace_net(struct minijail *j)
461{
462	j->flags.net = 1;
463}
464
465void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
466{
467	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
468	if (ns_fd < 0) {
469		pdie("failed to open namespace '%s'", ns_path);
470	}
471	j->netns_fd = ns_fd;
472	j->flags.enter_net = 1;
473}
474
475void API minijail_namespace_cgroups(struct minijail *j)
476{
477	j->flags.ns_cgroups = 1;
478}
479
480void API minijail_close_open_fds(struct minijail *j)
481{
482	j->flags.close_open_fds = 1;
483}
484
485void API minijail_remount_proc_readonly(struct minijail *j)
486{
487	j->flags.vfs = 1;
488	j->flags.remount_proc_ro = 1;
489}
490
491void API minijail_namespace_user(struct minijail *j)
492{
493	j->flags.userns = 1;
494}
495
496void API minijail_namespace_user_disable_setgroups(struct minijail *j)
497{
498	j->flags.disable_setgroups = 1;
499}
500
501int API minijail_uidmap(struct minijail *j, const char *uidmap)
502{
503	j->uidmap = strdup(uidmap);
504	if (!j->uidmap)
505		return -ENOMEM;
506	char *ch;
507	for (ch = j->uidmap; *ch; ch++) {
508		if (*ch == ',')
509			*ch = '\n';
510	}
511	return 0;
512}
513
514int API minijail_gidmap(struct minijail *j, const char *gidmap)
515{
516	j->gidmap = strdup(gidmap);
517	if (!j->gidmap)
518		return -ENOMEM;
519	char *ch;
520	for (ch = j->gidmap; *ch; ch++) {
521		if (*ch == ',')
522			*ch = '\n';
523	}
524	return 0;
525}
526
527void API minijail_inherit_usergroups(struct minijail *j)
528{
529	j->flags.inherit_suppl_gids = 1;
530}
531
532void API minijail_run_as_init(struct minijail *j)
533{
534	/*
535	 * Since the jailed program will become 'init' in the new PID namespace,
536	 * Minijail does not need to fork an 'init' process.
537	 */
538	j->flags.do_init = 0;
539}
540
541int API minijail_enter_chroot(struct minijail *j, const char *dir)
542{
543	if (j->chrootdir)
544		return -EINVAL;
545	j->chrootdir = strdup(dir);
546	if (!j->chrootdir)
547		return -ENOMEM;
548	j->flags.chroot = 1;
549	return 0;
550}
551
552int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
553{
554	if (j->chrootdir)
555		return -EINVAL;
556	j->chrootdir = strdup(dir);
557	if (!j->chrootdir)
558		return -ENOMEM;
559	j->flags.pivot_root = 1;
560	return 0;
561}
562
563char API *minijail_get_original_path(struct minijail *j,
564				     const char *path_inside_chroot)
565{
566	struct mountpoint *b;
567
568	b = j->mounts_head;
569	while (b) {
570		/*
571		 * If |path_inside_chroot| is the exact destination of a
572		 * mount, then the original path is exactly the source of
573		 * the mount.
574		 *  for example: "-b /some/path/exe,/chroot/path/exe"
575		 *    mount source = /some/path/exe, mount dest =
576		 *    /chroot/path/exe Then when getting the original path of
577		 *    "/chroot/path/exe", the source of that mount,
578		 *    "/some/path/exe" is what should be returned.
579		 */
580		if (!strcmp(b->dest, path_inside_chroot))
581			return strdup(b->src);
582
583		/*
584		 * If |path_inside_chroot| is within the destination path of a
585		 * mount, take the suffix of the chroot path relative to the
586		 * mount destination path, and append it to the mount source
587		 * path.
588		 */
589		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
590			const char *relative_path =
591				path_inside_chroot + strlen(b->dest);
592			return path_join(b->src, relative_path);
593		}
594		b = b->next;
595	}
596
597	/* If there is a chroot path, append |path_inside_chroot| to that. */
598	if (j->chrootdir)
599		return path_join(j->chrootdir, path_inside_chroot);
600
601	/* No chroot, so the path outside is the same as it is inside. */
602	return strdup(path_inside_chroot);
603}
604
605size_t minijail_get_tmpfs_size(const struct minijail *j)
606{
607	return j->tmpfs_size;
608}
609
610void API minijail_mount_tmp(struct minijail *j)
611{
612	minijail_mount_tmp_size(j, 64 * 1024 * 1024);
613}
614
615void API minijail_mount_tmp_size(struct minijail *j, size_t size)
616{
617	j->tmpfs_size = size;
618	j->flags.mount_tmp = 1;
619}
620
621int API minijail_write_pid_file(struct minijail *j, const char *path)
622{
623	j->pid_file_path = strdup(path);
624	if (!j->pid_file_path)
625		return -ENOMEM;
626	j->flags.pid_file = 1;
627	return 0;
628}
629
630int API minijail_add_to_cgroup(struct minijail *j, const char *path)
631{
632	if (j->cgroup_count >= MAX_CGROUPS)
633		return -ENOMEM;
634	j->cgroups[j->cgroup_count] = strdup(path);
635	if (!j->cgroups[j->cgroup_count])
636		return -ENOMEM;
637	j->cgroup_count++;
638	j->flags.cgroups = 1;
639	return 0;
640}
641
642int API minijail_forward_signals(struct minijail *j)
643{
644	j->flags.forward_signals = 1;
645	return 0;
646}
647
648int API minijail_mount_with_data(struct minijail *j, const char *src,
649				 const char *dest, const char *type,
650				 unsigned long flags, const char *data)
651{
652	struct mountpoint *m;
653
654	if (*dest != '/')
655		return -EINVAL;
656	m = calloc(1, sizeof(*m));
657	if (!m)
658		return -ENOMEM;
659	m->dest = strdup(dest);
660	if (!m->dest)
661		goto error;
662	m->src = strdup(src);
663	if (!m->src)
664		goto error;
665	m->type = strdup(type);
666	if (!m->type)
667		goto error;
668	if (data) {
669		m->data = strdup(data);
670		if (!m->data)
671			goto error;
672		m->has_data = 1;
673	}
674	m->flags = flags;
675
676	info("mount %s -> %s type '%s'", src, dest, type);
677
678	/*
679	 * Force vfs namespacing so the mounts don't leak out into the
680	 * containing vfs namespace.
681	 */
682	minijail_namespace_vfs(j);
683
684	if (j->mounts_tail)
685		j->mounts_tail->next = m;
686	else
687		j->mounts_head = m;
688	j->mounts_tail = m;
689	j->mounts_count++;
690
691	return 0;
692
693error:
694	free(m->type);
695	free(m->src);
696	free(m->dest);
697	free(m);
698	return -ENOMEM;
699}
700
701int API minijail_mount(struct minijail *j, const char *src, const char *dest,
702		       const char *type, unsigned long flags)
703{
704	return minijail_mount_with_data(j, src, dest, type, flags, NULL);
705}
706
707int API minijail_bind(struct minijail *j, const char *src, const char *dest,
708		      int writeable)
709{
710	unsigned long flags = MS_BIND;
711
712	if (!writeable)
713		flags |= MS_RDONLY;
714
715	return minijail_mount(j, src, dest, "", flags);
716}
717
718static void clear_seccomp_options(struct minijail *j)
719{
720	j->flags.seccomp_filter = 0;
721	j->flags.seccomp_filter_tsync = 0;
722	j->flags.seccomp_filter_logging = 0;
723	j->filter_len = 0;
724	j->filter_prog = NULL;
725	j->flags.no_new_privs = 0;
726}
727
728static int seccomp_should_parse_filters(struct minijail *j)
729{
730	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
731		/*
732		 * |errno| will be set to EINVAL when seccomp has not been
733		 * compiled into the kernel. On certain platforms and kernel
734		 * versions this is not a fatal failure. In that case, and only
735		 * in that case, disable seccomp and skip loading the filters.
736		 */
737		if ((errno == EINVAL) && seccomp_can_softfail()) {
738			warn("not loading seccomp filters, seccomp filter not "
739			     "supported");
740			clear_seccomp_options(j);
741			return 0;
742		}
743		/*
744		 * If |errno| != EINVAL or seccomp_can_softfail() is false,
745		 * we can proceed. Worst case scenario minijail_enter() will
746		 * abort() if seccomp fails.
747		 */
748	}
749	if (j->flags.seccomp_filter_tsync) {
750		/* Are the seccomp(2) syscall and the TSYNC option supported? */
751		if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
752				SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
753			int saved_errno = errno;
754			if (saved_errno == ENOSYS && seccomp_can_softfail()) {
755				warn("seccomp(2) syscall not supported");
756				clear_seccomp_options(j);
757				return 0;
758			} else if (saved_errno == EINVAL &&
759				   seccomp_can_softfail()) {
760				warn(
761				    "seccomp filter thread sync not supported");
762				clear_seccomp_options(j);
763				return 0;
764			}
765			/*
766			 * Similar logic here. If seccomp_can_softfail() is
767			 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
768			 * we can proceed. Worst case scenario minijail_enter()
769			 * will abort() if seccomp or TSYNC fail.
770			 */
771		}
772	}
773	return 1;
774}
775
776static int parse_seccomp_filters(struct minijail *j, FILE *policy_file)
777{
778	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
779	int use_ret_trap =
780	    j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging;
781	int allow_logging = j->flags.seccomp_filter_logging;
782
783	if (compile_filter(policy_file, fprog, use_ret_trap, allow_logging)) {
784		free(fprog);
785		return -1;
786	}
787
788	j->filter_len = fprog->len;
789	j->filter_prog = fprog;
790	return 0;
791}
792
793void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
794{
795	if (!seccomp_should_parse_filters(j))
796		return;
797
798	FILE *file = fopen(path, "r");
799	if (!file) {
800		pdie("failed to open seccomp filter file '%s'", path);
801	}
802
803	if (parse_seccomp_filters(j, file) != 0) {
804		die("failed to compile seccomp filter BPF program in '%s'",
805		    path);
806	}
807	fclose(file);
808}
809
810void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
811{
812	if (!seccomp_should_parse_filters(j))
813		return;
814
815	FILE *file = fdopen(fd, "r");
816	if (!file) {
817		pdie("failed to associate stream with fd %d", fd);
818	}
819
820	if (parse_seccomp_filters(j, file) != 0) {
821		die("failed to compile seccomp filter BPF program from fd %d",
822		    fd);
823	}
824	fclose(file);
825}
826
827int API minijail_use_alt_syscall(struct minijail *j, const char *table)
828{
829	j->alt_syscall_table = strdup(table);
830	if (!j->alt_syscall_table)
831		return -ENOMEM;
832	j->flags.alt_syscall = 1;
833	return 0;
834}
835
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* Bytes still free at |buf|. */
	size_t total;		/* Bytes requested so far, written or not. */
	char *buf;		/* Next write position. */
};
841
842void marshal_state_init(struct marshal_state *state, char *buf,
843			size_t available)
844{
845	state->available = available;
846	state->buf = buf;
847	state->total = 0;
848}
849
850void marshal_append(struct marshal_state *state, void *src, size_t length)
851{
852	size_t copy_len = MIN(state->available, length);
853
854	/* Up to |available| will be written. */
855	if (copy_len) {
856		memcpy(state->buf, src, copy_len);
857		state->buf += copy_len;
858		state->available -= copy_len;
859	}
860	/* |total| will contain the expected length. */
861	state->total += length;
862}
863
864void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
865{
866	marshal_append(state, m->src, strlen(m->src) + 1);
867	marshal_append(state, m->dest, strlen(m->dest) + 1);
868	marshal_append(state, m->type, strlen(m->type) + 1);
869	marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
870	if (m->has_data)
871		marshal_append(state, m->data, strlen(m->data) + 1);
872	marshal_append(state, (char *)&m->flags, sizeof(m->flags));
873}
874
875void minijail_marshal_helper(struct marshal_state *state,
876			     const struct minijail *j)
877{
878	struct mountpoint *m = NULL;
879	size_t i;
880
881	marshal_append(state, (char *)j, sizeof(*j));
882	if (j->user)
883		marshal_append(state, j->user, strlen(j->user) + 1);
884	if (j->suppl_gid_list) {
885		marshal_append(state, j->suppl_gid_list,
886			       j->suppl_gid_count * sizeof(gid_t));
887	}
888	if (j->chrootdir)
889		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
890	if (j->hostname)
891		marshal_append(state, j->hostname, strlen(j->hostname) + 1);
892	if (j->alt_syscall_table) {
893		marshal_append(state, j->alt_syscall_table,
894			       strlen(j->alt_syscall_table) + 1);
895	}
896	if (j->flags.seccomp_filter && j->filter_prog) {
897		struct sock_fprog *fp = j->filter_prog;
898		marshal_append(state, (char *)fp->filter,
899			       fp->len * sizeof(struct sock_filter));
900	}
901	for (m = j->mounts_head; m; m = m->next) {
902		marshal_mount(state, m);
903	}
904	for (i = 0; i < j->cgroup_count; ++i)
905		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
906}
907
908size_t API minijail_size(const struct minijail *j)
909{
910	struct marshal_state state;
911	marshal_state_init(&state, NULL, 0);
912	minijail_marshal_helper(&state, j);
913	return state.total;
914}
915
916int minijail_marshal(const struct minijail *j, char *buf, size_t available)
917{
918	struct marshal_state state;
919	marshal_state_init(&state, buf, available);
920	minijail_marshal_helper(&state, j);
921	return (state.total > available);
922}
923
924int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
925{
926	size_t i;
927	size_t count;
928	int ret = -EINVAL;
929
930	if (length < sizeof(*j))
931		goto out;
932	memcpy((void *)j, serialized, sizeof(*j));
933	serialized += sizeof(*j);
934	length -= sizeof(*j);
935
936	/* Potentially stale pointers not used as signals. */
937	j->pid_file_path = NULL;
938	j->uidmap = NULL;
939	j->gidmap = NULL;
940	j->mounts_head = NULL;
941	j->mounts_tail = NULL;
942	j->filter_prog = NULL;
943
944	if (j->user) {		/* stale pointer */
945		char *user = consumestr(&serialized, &length);
946		if (!user)
947			goto clear_pointers;
948		j->user = strdup(user);
949		if (!j->user)
950			goto clear_pointers;
951	}
952
953	if (j->suppl_gid_list) {	/* stale pointer */
954		if (j->suppl_gid_count > NGROUPS_MAX) {
955			goto bad_gid_list;
956		}
957		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
958		void *gid_list_bytes =
959		    consumebytes(gid_list_size, &serialized, &length);
960		if (!gid_list_bytes)
961			goto bad_gid_list;
962
963		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
964		if (!j->suppl_gid_list)
965			goto bad_gid_list;
966
967		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
968	}
969
970	if (j->chrootdir) {	/* stale pointer */
971		char *chrootdir = consumestr(&serialized, &length);
972		if (!chrootdir)
973			goto bad_chrootdir;
974		j->chrootdir = strdup(chrootdir);
975		if (!j->chrootdir)
976			goto bad_chrootdir;
977	}
978
979	if (j->hostname) {	/* stale pointer */
980		char *hostname = consumestr(&serialized, &length);
981		if (!hostname)
982			goto bad_hostname;
983		j->hostname = strdup(hostname);
984		if (!j->hostname)
985			goto bad_hostname;
986	}
987
988	if (j->alt_syscall_table) {	/* stale pointer */
989		char *alt_syscall_table = consumestr(&serialized, &length);
990		if (!alt_syscall_table)
991			goto bad_syscall_table;
992		j->alt_syscall_table = strdup(alt_syscall_table);
993		if (!j->alt_syscall_table)
994			goto bad_syscall_table;
995	}
996
997	if (j->flags.seccomp_filter && j->filter_len > 0) {
998		size_t ninstrs = j->filter_len;
999		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
1000		    ninstrs > USHRT_MAX)
1001			goto bad_filters;
1002
1003		size_t program_len = ninstrs * sizeof(struct sock_filter);
1004		void *program = consumebytes(program_len, &serialized, &length);
1005		if (!program)
1006			goto bad_filters;
1007
1008		j->filter_prog = malloc(sizeof(struct sock_fprog));
1009		if (!j->filter_prog)
1010			goto bad_filters;
1011
1012		j->filter_prog->len = ninstrs;
1013		j->filter_prog->filter = malloc(program_len);
1014		if (!j->filter_prog->filter)
1015			goto bad_filter_prog_instrs;
1016
1017		memcpy(j->filter_prog->filter, program, program_len);
1018	}
1019
1020	count = j->mounts_count;
1021	j->mounts_count = 0;
1022	for (i = 0; i < count; ++i) {
1023		unsigned long *flags;
1024		int *has_data;
1025		const char *dest;
1026		const char *type;
1027		const char *data = NULL;
1028		const char *src = consumestr(&serialized, &length);
1029		if (!src)
1030			goto bad_mounts;
1031		dest = consumestr(&serialized, &length);
1032		if (!dest)
1033			goto bad_mounts;
1034		type = consumestr(&serialized, &length);
1035		if (!type)
1036			goto bad_mounts;
1037		has_data = consumebytes(sizeof(*has_data), &serialized,
1038					&length);
1039		if (!has_data)
1040			goto bad_mounts;
1041		if (*has_data) {
1042			data = consumestr(&serialized, &length);
1043			if (!data)
1044				goto bad_mounts;
1045		}
1046		flags = consumebytes(sizeof(*flags), &serialized, &length);
1047		if (!flags)
1048			goto bad_mounts;
1049		if (minijail_mount_with_data(j, src, dest, type, *flags, data))
1050			goto bad_mounts;
1051	}
1052
1053	count = j->cgroup_count;
1054	j->cgroup_count = 0;
1055	for (i = 0; i < count; ++i) {
1056		char *cgroup = consumestr(&serialized, &length);
1057		if (!cgroup)
1058			goto bad_cgroups;
1059		j->cgroups[i] = strdup(cgroup);
1060		if (!j->cgroups[i])
1061			goto bad_cgroups;
1062		++j->cgroup_count;
1063	}
1064
1065	return 0;
1066
1067bad_cgroups:
1068	while (j->mounts_head) {
1069		struct mountpoint *m = j->mounts_head;
1070		j->mounts_head = j->mounts_head->next;
1071		free(m->data);
1072		free(m->type);
1073		free(m->dest);
1074		free(m->src);
1075		free(m);
1076	}
1077	for (i = 0; i < j->cgroup_count; ++i)
1078		free(j->cgroups[i]);
1079bad_mounts:
1080	if (j->flags.seccomp_filter && j->filter_len > 0) {
1081		free(j->filter_prog->filter);
1082		free(j->filter_prog);
1083	}
1084bad_filter_prog_instrs:
1085	if (j->filter_prog)
1086		free(j->filter_prog);
1087bad_filters:
1088	if (j->alt_syscall_table)
1089		free(j->alt_syscall_table);
1090bad_syscall_table:
1091	if (j->chrootdir)
1092		free(j->chrootdir);
1093bad_chrootdir:
1094	if (j->hostname)
1095		free(j->hostname);
1096bad_hostname:
1097	if (j->suppl_gid_list)
1098		free(j->suppl_gid_list);
1099bad_gid_list:
1100	if (j->user)
1101		free(j->user);
1102clear_pointers:
1103	j->user = NULL;
1104	j->suppl_gid_list = NULL;
1105	j->chrootdir = NULL;
1106	j->hostname = NULL;
1107	j->alt_syscall_table = NULL;
1108	j->cgroup_count = 0;
1109out:
1110	return ret;
1111}
1112
1113/*
1114 * mount_one: Applies mounts from @m for @j, recursing as needed.
1115 * @j Minijail these mounts are for
1116 * @m Head of list of mounts
1117 *
1118 * Returns 0 for success.
1119 */
1120static int mount_one(const struct minijail *j, struct mountpoint *m)
1121{
1122	int ret;
1123	char *dest;
1124	int remount_ro = 0;
1125
1126	/* |dest| has a leading "/". */
1127	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
1128		return -ENOMEM;
1129
1130	if (setup_mount_destination(m->src, dest, j->uid, j->gid))
1131		pdie("creating mount target '%s' failed", dest);
1132
1133	/*
1134	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1135	 * can't both be specified in the original bind mount.
1136	 * Remount R/O after the initial mount.
1137	 */
1138	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1139		remount_ro = 1;
1140		m->flags &= ~MS_RDONLY;
1141	}
1142
1143	ret = mount(m->src, dest, m->type, m->flags, m->data);
1144	if (ret)
1145		pdie("mount: %s -> %s", m->src, dest);
1146
1147	if (remount_ro) {
1148		m->flags |= MS_RDONLY;
1149		ret = mount(m->src, dest, NULL,
1150			    m->flags | MS_REMOUNT, m->data);
1151		if (ret)
1152			pdie("bind ro: %s -> %s", m->src, dest);
1153	}
1154
1155	free(dest);
1156	if (m->next)
1157		return mount_one(j, m->next);
1158	return ret;
1159}
1160
1161static int enter_chroot(const struct minijail *j)
1162{
1163	int ret;
1164
1165	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1166		return ret;
1167
1168	if (chroot(j->chrootdir))
1169		return -errno;
1170
1171	if (chdir("/"))
1172		return -errno;
1173
1174	return 0;
1175}
1176
1177static int enter_pivot_root(const struct minijail *j)
1178{
1179	int ret, oldroot, newroot;
1180
1181	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1182		return ret;
1183
1184	/*
1185	 * Keep the fd for both old and new root.
1186	 * It will be used in fchdir(2) later.
1187	 */
1188	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1189	if (oldroot < 0)
1190		pdie("failed to open / for fchdir");
1191	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1192	if (newroot < 0)
1193		pdie("failed to open %s for fchdir", j->chrootdir);
1194
1195	/*
1196	 * To ensure j->chrootdir is the root of a filesystem,
1197	 * do a self bind mount.
1198	 */
1199	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1200		pdie("failed to bind mount '%s'", j->chrootdir);
1201	if (chdir(j->chrootdir))
1202		return -errno;
1203	if (syscall(SYS_pivot_root, ".", "."))
1204		pdie("pivot_root");
1205
1206	/*
1207	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
1208	 * change to the old root and unmount it.
1209	 */
1210	if (fchdir(oldroot))
1211		pdie("failed to fchdir to old /");
1212
1213	/*
1214	 * If j->flags.skip_remount_private was enabled for minijail_enter(),
1215	 * there could be a shared mount point under |oldroot|. In that case,
1216	 * mounts under this shared mount point will be unmounted below, and
1217	 * this unmounting will propagate to the original mount namespace
1218	 * (because the mount point is shared). To prevent this unexpected
1219	 * unmounting, remove these mounts from their peer groups by recursively
1220	 * remounting them as MS_PRIVATE.
1221	 */
1222	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
1223		pdie("failed to mount(/, private) before umount(/)");
1224	/* The old root might be busy, so use lazy unmount. */
1225	if (umount2(".", MNT_DETACH))
1226		pdie("umount(/)");
1227	/* Change back to the new root. */
1228	if (fchdir(newroot))
1229		return -errno;
1230	if (close(oldroot))
1231		return -errno;
1232	if (close(newroot))
1233		return -errno;
1234	if (chroot("/"))
1235		return -errno;
1236	/* Set correct CWD for getcwd(3). */
1237	if (chdir("/"))
1238		return -errno;
1239
1240	return 0;
1241}
1242
1243static int mount_tmp(const struct minijail *j)
1244{
1245	const char fmt[] = "size=%zu,mode=1777";
1246	/* Count for the user storing ULLONG_MAX literally + extra space. */
1247	char data[sizeof(fmt) + sizeof("18446744073709551615ULL")];
1248	int ret;
1249
1250	ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size);
1251
1252	if (ret <= 0)
1253		pdie("tmpfs size spec error");
1254	else if ((size_t)ret >= sizeof(data))
1255		pdie("tmpfs size spec too large");
1256	return mount("none", "/tmp", "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID,
1257		     data);
1258}
1259
1260static int remount_proc_readonly(const struct minijail *j)
1261{
1262	const char *kProcPath = "/proc";
1263	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1264	/*
1265	 * Right now, we're holding a reference to our parent's old mount of
1266	 * /proc in our namespace, which means using MS_REMOUNT here would
1267	 * mutate our parent's mount as well, even though we're in a VFS
1268	 * namespace (!). Instead, remove their mount from our namespace lazily
1269	 * (MNT_DETACH) and make our own.
1270	 */
1271	if (umount2(kProcPath, MNT_DETACH)) {
1272		/*
1273		 * If we are in a new user namespace, umount(2) will fail.
1274		 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
1275		 */
1276		if (j->flags.userns) {
1277			info("umount(/proc, MNT_DETACH) failed, "
1278			     "this is expected when using user namespaces");
1279		} else {
1280			return -errno;
1281		}
1282	}
1283	if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1284		return -errno;
1285	return 0;
1286}
1287
1288static void kill_child_and_die(const struct minijail *j, const char *msg)
1289{
1290	kill(j->initpid, SIGKILL);
1291	die("%s", msg);
1292}
1293
1294static void write_pid_file_or_die(const struct minijail *j)
1295{
1296	if (write_pid_to_path(j->initpid, j->pid_file_path))
1297		kill_child_and_die(j, "failed to write pid file");
1298}
1299
1300static void add_to_cgroups_or_die(const struct minijail *j)
1301{
1302	size_t i;
1303
1304	for (i = 0; i < j->cgroup_count; ++i) {
1305		if (write_pid_to_path(j->initpid, j->cgroups[i]))
1306			kill_child_and_die(j, "failed to add to cgroups");
1307	}
1308}
1309
1310static void write_ugid_maps_or_die(const struct minijail *j)
1311{
1312	if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
1313		kill_child_and_die(j, "failed to write uid_map");
1314	if (j->gidmap && j->flags.disable_setgroups) {
1315		/* Older kernels might not have the /proc/<pid>/setgroups files. */
1316		int ret = write_proc_file(j->initpid, "deny", "setgroups");
1317		if (ret != 0) {
1318			if (ret == -ENOENT) {
1319				/* See http://man7.org/linux/man-pages/man7/user_namespaces.7.html. */
1320				warn("could not disable setgroups(2)");
1321			} else
1322				kill_child_and_die(j, "failed to disable setgroups(2)");
1323		}
1324	}
1325	if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
1326		kill_child_and_die(j, "failed to write gid_map");
1327}
1328
1329static void enter_user_namespace(const struct minijail *j)
1330{
1331	if (j->uidmap && setresuid(0, 0, 0))
1332		pdie("user_namespaces: setresuid(0, 0, 0) failed");
1333	if (j->gidmap && setresgid(0, 0, 0))
1334		pdie("user_namespaces: setresgid(0, 0, 0) failed");
1335}
1336
/*
 * parent_setup_complete: Closes both ends of the sync pipe in @pipe_fds.
 * The reader in wait_for_parent_setup() treats the resulting end-of-file
 * as the signal to continue.
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	/* Read end (index 0) first, then write end (index 1). */
	for (end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
1342
/*
 * wait_for_parent_setup: Called by the child process to block until the
 * parent has finished its side of the setup. The parent signals
 * completion by closing the sync pipe in @pipe_fds.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char unused_byte;
	ssize_t nread;

	/*
	 * Close our own copy of the write end; while it stays open the
	 * read below can never see end-of-file.
	 */
	close(pipe_fds[1]);

	/* Only end-of-file (a zero-byte read) counts as a successful sync. */
	nread = read(pipe_fds[0], &unused_byte, 1);
	if (nread != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
1358
1359static void drop_ugid(const struct minijail *j)
1360{
1361	if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
1362	    j->flags.set_suppl_gids > 1) {
1363		die("can only do one of inherit, keep, or set supplementary "
1364		    "groups");
1365	}
1366
1367	if (j->flags.inherit_suppl_gids) {
1368		if (initgroups(j->user, j->usergid))
1369			pdie("initgroups(%s, %d) failed", j->user, j->usergid);
1370	} else if (j->flags.set_suppl_gids) {
1371		if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
1372			pdie("setgroups(suppl_gids) failed");
1373	} else if (!j->flags.keep_suppl_gids) {
1374		/*
1375		 * Only attempt to clear supplementary groups if we are changing
1376		 * users or groups.
1377		 */
1378		if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
1379			pdie("setgroups(0, NULL) failed");
1380	}
1381
1382	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1383		pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);
1384
1385	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1386		pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
1387}
1388
/*
 * drop_capbset: Removes from the capability bounding set every capability
 * in the range 0..@last_valid_cap whose bit is not set in @keep_mask.
 * A failing prctl(PR_CAPBSET_DROP) is handed to pdie().
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	const unsigned int mask_bits = sizeof(keep_mask) * 8;
	unsigned int cap;

	for (cap = 0; cap < mask_bits && cap <= last_valid_cap; ++cap) {
		const uint64_t cap_bit = (uint64_t)1 << cap;

		if (!(keep_mask & cap_bit) && prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
1400
1401static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
1402{
1403	if (!j->flags.use_caps)
1404		return;
1405
1406	cap_t caps = cap_get_proc();
1407	cap_value_t flag[1];
1408	const size_t ncaps = sizeof(j->caps) * 8;
1409	const uint64_t one = 1;
1410	unsigned int i;
1411	if (!caps)
1412		die("can't get process caps");
1413	if (cap_clear(caps))
1414		die("can't clear caps");
1415
1416	for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
1417		/* Keep CAP_SETPCAP for dropping bounding set bits. */
1418		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
1419			continue;
1420		flag[0] = i;
1421		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
1422			die("can't add effective cap");
1423		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
1424			die("can't add permitted cap");
1425		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
1426			die("can't add inheritable cap");
1427	}
1428	if (cap_set_proc(caps))
1429		die("can't apply initial cleaned capset");
1430
1431	/*
1432	 * Instead of dropping bounding set first, do it here in case
1433	 * the caller had a more permissive bounding set which could
1434	 * have been used above to raise a capability that wasn't already
1435	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1436	 */
1437	drop_capbset(j->caps, last_valid_cap);
1438
1439	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
1440	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
1441		flag[0] = CAP_SETPCAP;
1442		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1443			die("can't clear effective cap");
1444		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1445			die("can't clear permitted cap");
1446		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1447			die("can't clear inheritable cap");
1448	}
1449
1450	if (cap_set_proc(caps))
1451		die("can't apply final cleaned capset");
1452
1453	/*
1454	 * If ambient capabilities are supported, clear all capabilities first,
1455	 * then raise the requested ones.
1456	 */
1457	if (j->flags.set_ambient_caps) {
1458		if (!cap_ambient_supported()) {
1459			pdie("ambient capabilities not supported");
1460		}
1461		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
1462		    0) {
1463			pdie("can't clear ambient capabilities");
1464		}
1465
1466		for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
1467			if (!(j->caps & (one << i)))
1468				continue;
1469
1470			if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
1471				  0) != 0) {
1472				pdie("prctl(PR_CAP_AMBIENT, "
1473				     "PR_CAP_AMBIENT_RAISE, %u) failed",
1474				     i);
1475			}
1476		}
1477	}
1478
1479	cap_free(caps);
1480}
1481
1482static void set_seccomp_filter(const struct minijail *j)
1483{
1484	/*
1485	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1486	 * in the kernel source tree for an explanation of the parameters.
1487	 */
1488	if (j->flags.no_new_privs) {
1489		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1490			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1491	}
1492
1493	/*
1494	 * Code running with ASan
1495	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1496	 * will make system calls not included in the syscall filter policy,
1497	 * which will likely crash the program. Skip setting seccomp filter in
1498	 * that case.
1499	 * 'running_with_asan()' has no inputs and is completely defined at
1500	 * build time, so this cannot be used by an attacker to skip setting
1501	 * seccomp filter.
1502	 */
1503	if (j->flags.seccomp_filter && running_with_asan()) {
1504		warn("running with ASan, not setting seccomp filter");
1505		return;
1506	}
1507
1508	if (j->flags.seccomp_filter) {
1509		if (j->flags.seccomp_filter_logging) {
1510			/*
1511			 * If logging seccomp filter failures,
1512			 * install the SIGSYS handler first.
1513			 */
1514			if (install_sigsys_handler())
1515				pdie("failed to install SIGSYS handler");
1516			warn("logging seccomp filter failures");
1517		} else if (j->flags.seccomp_filter_tsync) {
1518			/*
1519			 * If setting thread sync,
1520			 * reset the SIGSYS signal handler so that
1521			 * the entire thread group is killed.
1522			 */
1523			if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
1524				pdie("failed to reset SIGSYS disposition");
1525			info("reset SIGSYS disposition");
1526		}
1527	}
1528
1529	/*
1530	 * Install the syscall filter.
1531	 */
1532	if (j->flags.seccomp_filter) {
1533		if (j->flags.seccomp_filter_tsync) {
1534			if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
1535					SECCOMP_FILTER_FLAG_TSYNC,
1536					j->filter_prog)) {
1537				pdie("seccomp(tsync) failed");
1538			}
1539		} else {
1540			if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1541				  j->filter_prog)) {
1542				pdie("prctl(seccomp_filter) failed");
1543			}
1544		}
1545	}
1546}
1547
/*
 * PID that forward_signal() relays signals to, or -1 while no target is
 * registered. It is assigned from normal code and read from signal
 * context, hence volatile.
 */
static volatile pid_t forward_pid = -1;

/*
 * forward_signal: Signal handler (SA_SIGINFO signature) that re-sends the
 * received signal @nr to forward_pid. Does nothing while forward_pid is
 * -1. @siginfo and @void_context are not used.
 *
 * kill(2) may modify errno; the handler can interrupt code that is about
 * to inspect errno, so the previous value is restored before returning.
 */
static void forward_signal(int nr,
			   __attribute__((unused)) siginfo_t *siginfo,
			   __attribute__((unused)) void *void_context)
{
	const int saved_errno = errno;
	const pid_t target = forward_pid;

	if (target != -1)
		kill(target, nr);

	errno = saved_errno;
}
1558
1559static void install_signal_handlers(void)
1560{
1561	struct sigaction act;
1562
1563	memset(&act, 0, sizeof(act));
1564	act.sa_sigaction = &forward_signal;
1565	act.sa_flags = SA_SIGINFO | SA_RESTART;
1566
1567	/* Handle all signals, except SIGCHLD. */
1568	for (int nr = 1; nr < NSIG; nr++) {
1569		/*
1570		 * We don't care if we get EINVAL: that just means that we
1571		 * can't handle this signal, so let's skip it and continue.
1572		 */
1573		sigaction(nr, &act, NULL);
1574	}
1575	/* Reset SIGCHLD's handler. */
1576	signal(SIGCHLD, SIG_DFL);
1577
1578	/* Handle real-time signals. */
1579	for (int nr = SIGRTMIN; nr <= SIGRTMAX; nr++) {
1580		sigaction(nr, &act, NULL);
1581	}
1582}
1583
1584void API minijail_enter(const struct minijail *j)
1585{
1586	/*
1587	 * If we're dropping caps, get the last valid cap from /proc now,
1588	 * since /proc can be unmounted before drop_caps() is called.
1589	 */
1590	unsigned int last_valid_cap = 0;
1591	if (j->flags.capbset_drop || j->flags.use_caps)
1592		last_valid_cap = get_last_valid_cap();
1593
1594	if (j->flags.pids)
1595		die("tried to enter a pid-namespaced jail;"
1596		    " try minijail_run()?");
1597
1598	if (j->flags.inherit_suppl_gids && !j->user)
1599		die("cannot inherit supplementary groups without setting a "
1600		    "username");
1601
1602	/*
1603	 * We can't recover from failures if we've dropped privileges partially,
1604	 * so we don't even try. If any of our operations fail, we abort() the
1605	 * entire process.
1606	 */
1607	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1608		pdie("setns(CLONE_NEWNS) failed");
1609
1610	if (j->flags.vfs) {
1611		if (unshare(CLONE_NEWNS))
1612			pdie("unshare(CLONE_NEWNS) failed");
1613		/*
1614		 * Unless asked not to, remount all filesystems as private.
1615		 * If they are shared, new bind mounts will creep out of our
1616		 * namespace.
1617		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1618		 */
1619		if (!j->flags.skip_remount_private) {
1620			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1621				pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE,"
1622				     " NULL) failed");
1623		}
1624	}
1625
1626	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1627		pdie("unshare(CLONE_NEWIPC) failed");
1628	}
1629
1630	if (j->flags.uts) {
1631		if (unshare(CLONE_NEWUTS))
1632			pdie("unshare(CLONE_NEWUTS) failed");
1633
1634		if (j->hostname && sethostname(j->hostname, strlen(j->hostname)))
1635			pdie("sethostname(%s) failed", j->hostname);
1636	}
1637
1638	if (j->flags.enter_net) {
1639		if (setns(j->netns_fd, CLONE_NEWNET))
1640			pdie("setns(CLONE_NEWNET) failed");
1641	} else if (j->flags.net) {
1642		if (unshare(CLONE_NEWNET))
1643			pdie("unshare(CLONE_NEWNET) failed");
1644		config_net_loopback();
1645	}
1646
1647	if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
1648		pdie("unshare(CLONE_NEWCGROUP) failed");
1649
1650	if (j->flags.new_session_keyring) {
1651		if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
1652			pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
1653	}
1654
1655	if (j->flags.chroot && enter_chroot(j))
1656		pdie("chroot");
1657
1658	if (j->flags.pivot_root && enter_pivot_root(j))
1659		pdie("pivot_root");
1660
1661	if (j->flags.mount_tmp && mount_tmp(j))
1662		pdie("mount_tmp");
1663
1664	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
1665		pdie("remount");
1666
1667	/*
1668	 * If we're only dropping capabilities from the bounding set, but not
1669	 * from the thread's (permitted|inheritable|effective) sets, do it now.
1670	 */
1671	if (j->flags.capbset_drop) {
1672		drop_capbset(j->cap_bset, last_valid_cap);
1673	}
1674
1675	if (j->flags.use_caps) {
1676		/*
1677		 * POSIX capabilities are a bit tricky. If we drop our
1678		 * capability to change uids, our attempt to use setuid()
1679		 * below will fail. Hang on to root caps across setuid(), then
1680		 * lock securebits.
1681		 */
1682		if (prctl(PR_SET_KEEPCAPS, 1))
1683			pdie("prctl(PR_SET_KEEPCAPS) failed");
1684
1685		if (lock_securebits() < 0) {
1686			pdie("locking securebits failed");
1687		}
1688	}
1689
1690	if (j->flags.no_new_privs) {
1691		/*
1692		 * If we're setting no_new_privs, we can drop privileges
1693		 * before setting seccomp filter. This way filter policies
1694		 * don't need to allow privilege-dropping syscalls.
1695		 */
1696		drop_ugid(j);
1697		drop_caps(j, last_valid_cap);
1698		set_seccomp_filter(j);
1699	} else {
1700		/*
1701		 * If we're not setting no_new_privs,
1702		 * we need to set seccomp filter *before* dropping privileges.
1703		 * WARNING: this means that filter policies *must* allow
1704		 * setgroups()/setresgid()/setresuid() for dropping root and
1705		 * capget()/capset()/prctl() for dropping caps.
1706		 */
1707		set_seccomp_filter(j);
1708		drop_ugid(j);
1709		drop_caps(j, last_valid_cap);
1710	}
1711
1712	/*
1713	 * Select the specified alternate syscall table.  The table must not
1714	 * block prctl(2) if we're using seccomp as well.
1715	 */
1716	if (j->flags.alt_syscall) {
1717		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1718			pdie("prctl(PR_ALT_SYSCALL) failed");
1719	}
1720
1721	/*
1722	 * seccomp has to come last since it cuts off all the other
1723	 * privilege-dropping syscalls :)
1724	 */
1725	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1726		if ((errno == EINVAL) && seccomp_can_softfail()) {
1727			warn("seccomp not supported");
1728			return;
1729		}
1730		pdie("prctl(PR_SET_SECCOMP) failed");
1731	}
1732}
1733
/*
 * Wait status that init() records for the root process of the jail and
 * that init_term() exits with. The object is written by normal code and
 * read from the SIGTERM handler, so it is declared volatile sig_atomic_t.
 *
 * TODO(wad): will visibility affect this variable?
 */
static volatile sig_atomic_t init_exitstatus = 0;

/*
 * init_term: SIGTERM handler installed by init(). Terminates the process
 * immediately with the status currently stored in init_exitstatus.
 * @sig is not used.
 */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
1741
1742void init(pid_t rootpid)
1743{
1744	pid_t pid;
1745	int status;
1746	/* So that we exit with the right status. */
1747	signal(SIGTERM, init_term);
1748	/* TODO(wad): self jail with seccomp filters here. */
1749	while ((pid = wait(&status)) > 0) {
1750		/*
1751		 * This loop will only end when either there are no processes
1752		 * left inside our pid namespace or we get a signal.
1753		 */
1754		if (pid == rootpid)
1755			init_exitstatus = status;
1756	}
1757	if (!WIFEXITED(init_exitstatus))
1758		_exit(MINIJAIL_ERR_INIT);
1759	_exit(WEXITSTATUS(init_exitstatus));
1760}
1761
1762int API minijail_from_fd(int fd, struct minijail *j)
1763{
1764	size_t sz = 0;
1765	size_t bytes = read(fd, &sz, sizeof(sz));
1766	char *buf;
1767	int r;
1768	if (sizeof(sz) != bytes)
1769		return -EINVAL;
1770	if (sz > USHRT_MAX)	/* arbitrary sanity check */
1771		return -E2BIG;
1772	buf = malloc(sz);
1773	if (!buf)
1774		return -ENOMEM;
1775	bytes = read(fd, buf, sz);
1776	if (bytes != sz) {
1777		free(buf);
1778		return -EINVAL;
1779	}
1780	r = minijail_unmarshal(j, buf, sz);
1781	free(buf);
1782	return r;
1783}
1784
1785int API minijail_to_fd(struct minijail *j, int fd)
1786{
1787	char *buf;
1788	size_t sz = minijail_size(j);
1789	ssize_t written;
1790	int r;
1791
1792	if (!sz)
1793		return -EINVAL;
1794	buf = malloc(sz);
1795	r = minijail_marshal(j, buf, sz);
1796	if (r) {
1797		free(buf);
1798		return r;
1799	}
1800	/* Sends [size][minijail]. */
1801	written = write(fd, &sz, sizeof(sz));
1802	if (written != sizeof(sz)) {
1803		free(buf);
1804		return -EFAULT;
1805	}
1806	written = write(fd, buf, sz);
1807	if (written < 0 || (size_t) written != sz) {
1808		free(buf);
1809		return -EFAULT;
1810	}
1811	free(buf);
1812	return 0;
1813}
1814
1815int setup_preload(void)
1816{
1817#if defined(__ANDROID__)
1818	/* Don't use LDPRELOAD on Android. */
1819	return 0;
1820#else
1821	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1822	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1823	if (!newenv)
1824		return -ENOMEM;
1825
1826	/* Only insert a separating space if we have something to separate... */
1827	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1828		PRELOADPATH);
1829
1830	/* setenv() makes a copy of the string we give it. */
1831	setenv(kLdPreloadEnvVar, newenv, 1);
1832	free(newenv);
1833	return 0;
1834#endif
1835}
1836
1837static int setup_pipe(int fds[2])
1838{
1839	int r = pipe(fds);
1840	char fd_buf[11];
1841	if (r)
1842		return r;
1843	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1844	if (r <= 0)
1845		return -EINVAL;
1846	setenv(kFdEnvVar, fd_buf, 1);
1847	return 0;
1848}
1849
/*
 * close_open_fds: Closes every file descriptor listed in /proc/self/fd
 * except the @size descriptors in @inheritable_fds and the descriptor
 * used for reading the directory itself.
 *
 * Returns 0 on success, -1 if /proc/self/fd cannot be opened.
 */
static int close_open_fds(int *inheritable_fds, size_t size)
{
	const char *self_fd_dir = "/proc/self/fd";
	DIR *listing = opendir(self_fd_dir);
	struct dirent *entry;
	int listing_fd;

	if (!listing)
		return -1;

	listing_fd = dirfd(listing);
	for (entry = readdir(listing); entry; entry = readdir(listing)) {
		char *tail;
		const int fd = strtol(entry->d_name, &tail, 10);
		size_t pos = 0;

		/* Skip entries whose name is not purely numeric. */
		if (*tail != '\0')
			continue;

		/*
		 * Pipes set up for sharing with the parent process must
		 * stay open; look for fd in the inheritable list.
		 */
		while (pos < size && inheritable_fds[pos] != fd)
			++pos;

		/*
		 * pos == size means fd is not inheritable. The directory
		 * fd is spared too, since it is still being read from.
		 */
		if (pos == size && fd != listing_fd)
			close(fd);
	}
	closedir(listing);
	return 0;
}
1886
1887int minijail_run_internal(struct minijail *j, const char *filename,
1888			  char *const argv[], pid_t *pchild_pid,
1889			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1890			  int use_preload);
1891
1892int API minijail_run(struct minijail *j, const char *filename,
1893		     char *const argv[])
1894{
1895	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1896				     true);
1897}
1898
1899int API minijail_run_pid(struct minijail *j, const char *filename,
1900			 char *const argv[], pid_t *pchild_pid)
1901{
1902	return minijail_run_internal(j, filename, argv, pchild_pid,
1903				     NULL, NULL, NULL, true);
1904}
1905
1906int API minijail_run_pipe(struct minijail *j, const char *filename,
1907			  char *const argv[], int *pstdin_fd)
1908{
1909	return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1910				     NULL, NULL, true);
1911}
1912
1913int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1914			       char *const argv[], pid_t *pchild_pid,
1915			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1916{
1917	return minijail_run_internal(j, filename, argv, pchild_pid,
1918				     pstdin_fd, pstdout_fd, pstderr_fd, true);
1919}
1920
1921int API minijail_run_no_preload(struct minijail *j, const char *filename,
1922				char *const argv[])
1923{
1924	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1925				     false);
1926}
1927
1928int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1929					  const char *filename,
1930					  char *const argv[],
1931					  pid_t *pchild_pid,
1932					  int *pstdin_fd, int *pstdout_fd,
1933					  int *pstderr_fd)
1934{
1935	return minijail_run_internal(j, filename, argv, pchild_pid,
1936				     pstdin_fd, pstdout_fd, pstderr_fd, false);
1937}
1938
1939int minijail_run_internal(struct minijail *j, const char *filename,
1940			  char *const argv[], pid_t *pchild_pid,
1941			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1942			  int use_preload)
1943{
1944	char *oldenv, *oldenv_copy = NULL;
1945	pid_t child_pid;
1946	int pipe_fds[2];
1947	int stdin_fds[2];
1948	int stdout_fds[2];
1949	int stderr_fds[2];
1950	int child_sync_pipe_fds[2];
1951	int sync_child = 0;
1952	int ret;
1953	/* We need to remember this across the minijail_preexec() call. */
1954	int pid_namespace = j->flags.pids;
1955	int do_init = j->flags.do_init;
1956
1957	if (use_preload) {
1958		oldenv = getenv(kLdPreloadEnvVar);
1959		if (oldenv) {
1960			oldenv_copy = strdup(oldenv);
1961			if (!oldenv_copy)
1962				return -ENOMEM;
1963		}
1964
1965		if (setup_preload())
1966			return -EFAULT;
1967	}
1968
1969	if (!use_preload) {
1970		if (j->flags.use_caps && j->caps != 0)
1971			die("non-empty capabilities are not supported without "
1972			    "LD_PRELOAD");
1973	}
1974
1975	/*
1976	 * Make the process group ID of this process equal to its PID.
1977	 * In the non-interactive case (e.g. when the parent process is started
1978	 * from init) this ensures the parent process and the jailed process
1979	 * can be killed together.
1980	 * When the parent process is started from the console this ensures
1981	 * the call to setsid(2) in the jailed process succeeds.
1982	 *
1983	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1984	 * the process is already a process group leader.
1985	 */
1986	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1987		if (errno != EPERM) {
1988			pdie("setpgid(0, 0) failed");
1989		}
1990	}
1991
1992	if (use_preload) {
1993		/*
1994		 * Before we fork(2) and execve(2) the child process, we need
1995		 * to open a pipe(2) to send the minijail configuration over.
1996		 */
1997		if (setup_pipe(pipe_fds))
1998			return -EFAULT;
1999	}
2000
2001	/*
2002	 * If we want to write to the child process' standard input,
2003	 * create the pipe(2) now.
2004	 */
2005	if (pstdin_fd) {
2006		if (pipe(stdin_fds))
2007			return -EFAULT;
2008	}
2009
2010	/*
2011	 * If we want to read from the child process' standard output,
2012	 * create the pipe(2) now.
2013	 */
2014	if (pstdout_fd) {
2015		if (pipe(stdout_fds))
2016			return -EFAULT;
2017	}
2018
2019	/*
2020	 * If we want to read from the child process' standard error,
2021	 * create the pipe(2) now.
2022	 */
2023	if (pstderr_fd) {
2024		if (pipe(stderr_fds))
2025			return -EFAULT;
2026	}
2027
2028	/*
2029	 * If we want to set up a new uid/gid map in the user namespace,
2030	 * or if we need to add the child process to cgroups, create the pipe(2)
2031	 * to sync between parent and child.
2032	 */
2033	if (j->flags.userns || j->flags.cgroups) {
2034		sync_child = 1;
2035		if (pipe(child_sync_pipe_fds))
2036			return -EFAULT;
2037	}
2038
2039	/*
2040	 * Use sys_clone() if and only if we're creating a pid namespace.
2041	 *
2042	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
2043	 *
2044	 * In multithreaded programs, there are a bunch of locks inside libc,
2045	 * some of which may be held by other threads at the time that we call
2046	 * minijail_run_pid(). If we call fork(), glibc does its level best to
2047	 * ensure that we hold all of these locks before it calls clone()
2048	 * internally and drop them after clone() returns, but when we call
2049	 * sys_clone(2) directly, all that gets bypassed and we end up with a
2050	 * child address space where some of libc's important locks are held by
2051	 * other threads (which did not get cloned, and hence will never release
2052	 * those locks). This is okay so long as we call exec() immediately
2053	 * after, but a bunch of seemingly-innocent libc functions like setenv()
2054	 * take locks.
2055	 *
2056	 * Hence, only call sys_clone() if we need to, in order to get at pid
2057	 * namespacing. If we follow this path, the child's address space might
2058	 * have broken locks; you may only call functions that do not acquire
2059	 * any locks.
2060	 *
2061	 * Unfortunately, fork() acquires every lock it can get its hands on, as
2062	 * previously detailed, so this function is highly likely to deadlock
2063	 * later on (see "deadlock here") if we're multithreaded.
2064	 *
2065	 * We might hack around this by having the clone()d child (init of the
2066	 * pid namespace) return directly, rather than leaving the clone()d
2067	 * process hanging around to be init for the new namespace (and having
2068	 * its fork()ed child return in turn), but that process would be
2069	 * crippled with its libc locks potentially broken. We might try
2070	 * fork()ing in the parent before we clone() to ensure that we own all
2071	 * the locks, but then we have to have the forked child hanging around
2072	 * consuming resources (and possibly having file descriptors / shared
2073	 * memory regions / etc attached). We'd need to keep the child around to
2074	 * avoid having its children get reparented to init.
2075	 *
2076	 * TODO(ellyjones): figure out if the "forked child hanging around"
2077	 * problem is fixable or not. It would be nice if we worked in this
2078	 * case.
2079	 */
2080	if (pid_namespace) {
2081		int clone_flags = CLONE_NEWPID | SIGCHLD;
2082		if (j->flags.userns)
2083			clone_flags |= CLONE_NEWUSER;
2084		child_pid = syscall(SYS_clone, clone_flags, NULL);
2085	} else {
2086		child_pid = fork();
2087	}
2088
2089	if (child_pid < 0) {
2090		if (use_preload) {
2091			free(oldenv_copy);
2092		}
2093		die("failed to fork child");
2094	}
2095
2096	if (child_pid) {
2097		if (use_preload) {
2098			/* Restore parent's LD_PRELOAD. */
2099			if (oldenv_copy) {
2100				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
2101				free(oldenv_copy);
2102			} else {
2103				unsetenv(kLdPreloadEnvVar);
2104			}
2105			unsetenv(kFdEnvVar);
2106		}
2107
2108		j->initpid = child_pid;
2109
2110		if (j->flags.forward_signals) {
2111			forward_pid = child_pid;
2112			install_signal_handlers();
2113		}
2114
2115		if (j->flags.pid_file)
2116			write_pid_file_or_die(j);
2117
2118		if (j->flags.cgroups)
2119			add_to_cgroups_or_die(j);
2120
2121		if (j->flags.userns)
2122			write_ugid_maps_or_die(j);
2123
2124		if (sync_child)
2125			parent_setup_complete(child_sync_pipe_fds);
2126
2127		if (use_preload) {
2128			/* Send marshalled minijail. */
2129			close(pipe_fds[0]);	/* read endpoint */
2130			ret = minijail_to_fd(j, pipe_fds[1]);
2131			close(pipe_fds[1]);	/* write endpoint */
2132			if (ret) {
2133				kill(j->initpid, SIGKILL);
2134				die("failed to send marshalled minijail");
2135			}
2136		}
2137
2138		if (pchild_pid)
2139			*pchild_pid = child_pid;
2140
2141		/*
2142		 * If we want to write to the child process' standard input,
2143		 * set up the write end of the pipe.
2144		 */
2145		if (pstdin_fd)
2146			*pstdin_fd = setup_pipe_end(stdin_fds,
2147						    1 /* write end */);
2148
2149		/*
2150		 * If we want to read from the child process' standard output,
2151		 * set up the read end of the pipe.
2152		 */
2153		if (pstdout_fd)
2154			*pstdout_fd = setup_pipe_end(stdout_fds,
2155						     0 /* read end */);
2156
2157		/*
2158		 * If we want to read from the child process' standard error,
2159		 * set up the read end of the pipe.
2160		 */
2161		if (pstderr_fd)
2162			*pstderr_fd = setup_pipe_end(stderr_fds,
2163						     0 /* read end */);
2164
2165		return 0;
2166	}
2167	/* Child process. */
2168	free(oldenv_copy);
2169
2170	if (j->flags.reset_signal_mask) {
2171		sigset_t signal_mask;
2172		if (sigemptyset(&signal_mask) != 0)
2173			pdie("sigemptyset failed");
2174		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
2175			pdie("sigprocmask failed");
2176	}
2177
2178	if (j->flags.close_open_fds) {
2179		const size_t kMaxInheritableFdsSize = 10;
2180		int inheritable_fds[kMaxInheritableFdsSize];
2181		size_t size = 0;
2182		if (use_preload) {
2183			inheritable_fds[size++] = pipe_fds[0];
2184			inheritable_fds[size++] = pipe_fds[1];
2185		}
2186		if (sync_child) {
2187			inheritable_fds[size++] = child_sync_pipe_fds[0];
2188			inheritable_fds[size++] = child_sync_pipe_fds[1];
2189		}
2190		if (pstdin_fd) {
2191			inheritable_fds[size++] = stdin_fds[0];
2192			inheritable_fds[size++] = stdin_fds[1];
2193		}
2194		if (pstdout_fd) {
2195			inheritable_fds[size++] = stdout_fds[0];
2196			inheritable_fds[size++] = stdout_fds[1];
2197		}
2198		if (pstderr_fd) {
2199			inheritable_fds[size++] = stderr_fds[0];
2200			inheritable_fds[size++] = stderr_fds[1];
2201		}
2202
2203		if (close_open_fds(inheritable_fds, size) < 0)
2204			die("failed to close open file descriptors");
2205	}
2206
2207	if (sync_child)
2208		wait_for_parent_setup(child_sync_pipe_fds);
2209
2210	if (j->flags.userns)
2211		enter_user_namespace(j);
2212
2213	/*
2214	 * If we want to write to the jailed process' standard input,
2215	 * set up the read end of the pipe.
2216	 */
2217	if (pstdin_fd) {
2218		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
2219					    STDIN_FILENO) < 0)
2220			die("failed to set up stdin pipe");
2221	}
2222
2223	/*
2224	 * If we want to read from the jailed process' standard output,
2225	 * set up the write end of the pipe.
2226	 */
2227	if (pstdout_fd) {
2228		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
2229					    STDOUT_FILENO) < 0)
2230			die("failed to set up stdout pipe");
2231	}
2232
2233	/*
2234	 * If we want to read from the jailed process' standard error,
2235	 * set up the write end of the pipe.
2236	 */
2237	if (pstderr_fd) {
2238		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
2239					    STDERR_FILENO) < 0)
2240			die("failed to set up stderr pipe");
2241	}
2242
2243	/*
2244	 * If any of stdin, stdout, or stderr are TTYs, create a new session.
2245	 * This prevents the jailed process from using the TIOCSTI ioctl
2246	 * to push characters into the parent process terminal's input buffer,
2247	 * therefore escaping the jail.
2248	 */
2249	if (isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
2250	    isatty(STDERR_FILENO)) {
2251		if (setsid() < 0) {
2252			pdie("setsid() failed");
2253		}
2254	}
2255
2256	/* If running an init program, let it decide when/how to mount /proc. */
2257	if (pid_namespace && !do_init)
2258		j->flags.remount_proc_ro = 0;
2259
2260	if (use_preload) {
2261		/* Strip out flags that cannot be inherited across execve(2). */
2262		minijail_preexec(j);
2263	} else {
2264		/*
2265		 * If not using LD_PRELOAD, do all jailing before execve(2).
2266		 * Note that PID namespaces can only be entered on fork(2),
2267		 * so that flag is still cleared.
2268		 */
2269		j->flags.pids = 0;
2270	}
2271	/* Jail this process, then execve(2) the target. */
2272	minijail_enter(j);
2273
2274	if (pid_namespace && do_init) {
2275		/*
2276		 * pid namespace: this process will become init inside the new
2277		 * namespace. We don't want all programs we might exec to have
2278		 * to know how to be init. Normally (do_init == 1) we fork off
2279		 * a child to actually run the program. If |do_init == 0|, we
2280		 * let the program keep pid 1 and be init.
2281		 *
2282		 * If we're multithreaded, we'll probably deadlock here. See
2283		 * WARNING above.
2284		 */
2285		child_pid = fork();
2286		if (child_pid < 0) {
2287			_exit(child_pid);
2288		} else if (child_pid > 0) {
2289			/*
2290			 * Best effort. Don't bother checking the return value.
2291			 */
2292			prctl(PR_SET_NAME, "minijail-init");
2293			init(child_pid);	/* Never returns. */
2294		}
2295	}
2296
2297	/*
2298	 * If we aren't pid-namespaced, or the jailed program asked to be init:
2299	 *   calling process
2300	 *   -> execve()-ing process
2301	 * If we are:
2302	 *   calling process
2303	 *   -> init()-ing process
2304	 *      -> execve()-ing process
2305	 */
2306	ret = execve(filename, argv, environ);
2307	if (ret == -1) {
2308		pwarn("execve(%s) failed", filename);
2309	}
2310	_exit(ret);
2311}
2312
2313int API minijail_kill(struct minijail *j)
2314{
2315	int st;
2316	if (kill(j->initpid, SIGTERM))
2317		return -errno;
2318	if (waitpid(j->initpid, &st, 0) < 0)
2319		return -errno;
2320	return st;
2321}
2322
2323int API minijail_wait(struct minijail *j)
2324{
2325	int st;
2326	if (waitpid(j->initpid, &st, 0) < 0)
2327		return -errno;
2328
2329	if (!WIFEXITED(st)) {
2330		int error_status = st;
2331		if (WIFSIGNALED(st)) {
2332			int signum = WTERMSIG(st);
2333			warn("child process %d received signal %d",
2334			     j->initpid, signum);
2335			/*
2336			 * We return MINIJAIL_ERR_JAIL if the process received
2337			 * SIGSYS, which happens when a syscall is blocked by
2338			 * seccomp filters.
2339			 * If not, we do what bash(1) does:
2340			 * $? = 128 + signum
2341			 */
2342			if (signum == SIGSYS) {
2343				error_status = MINIJAIL_ERR_JAIL;
2344			} else {
2345				error_status = 128 + signum;
2346			}
2347		}
2348		return error_status;
2349	}
2350
2351	int exit_status = WEXITSTATUS(st);
2352	if (exit_status != 0)
2353		info("child process %d exited with status %d",
2354		     j->initpid, exit_status);
2355
2356	return exit_status;
2357}
2358
2359void API minijail_destroy(struct minijail *j)
2360{
2361	size_t i;
2362
2363	if (j->flags.seccomp_filter && j->filter_prog) {
2364		free(j->filter_prog->filter);
2365		free(j->filter_prog);
2366	}
2367	while (j->mounts_head) {
2368		struct mountpoint *m = j->mounts_head;
2369		j->mounts_head = j->mounts_head->next;
2370		free(m->data);
2371		free(m->type);
2372		free(m->dest);
2373		free(m->src);
2374		free(m);
2375	}
2376	j->mounts_tail = NULL;
2377	if (j->user)
2378		free(j->user);
2379	if (j->suppl_gid_list)
2380		free(j->suppl_gid_list);
2381	if (j->chrootdir)
2382		free(j->chrootdir);
2383	if (j->pid_file_path)
2384		free(j->pid_file_path);
2385	if (j->uidmap)
2386		free(j->uidmap);
2387	if (j->gidmap)
2388		free(j->gidmap);
2389	if (j->hostname)
2390		free(j->hostname);
2391	if (j->alt_syscall_table)
2392		free(j->alt_syscall_table);
2393	for (i = 0; i < j->cgroup_count; ++i)
2394		free(j->cgroups[i]);
2395	free(j);
2396}
2397