libminijail.c revision 4b276a6c643cee568b9b623b1ce00fd41db9e8b9
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _GNU_SOURCE
8
9#include <asm/unistd.h>
10#include <ctype.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <grp.h>
14#include <inttypes.h>
15#include <limits.h>
16#include <linux/capability.h>
17#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
20#include <stdarg.h>
21#include <stdbool.h>
22#include <stddef.h>
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
29#include <sys/param.h>
30#include <sys/prctl.h>
31#include <sys/stat.h>
32#include <sys/types.h>
33#include <sys/user.h>
34#include <sys/wait.h>
35#include <unistd.h>
36
37#include "libminijail.h"
38#include "libminijail-private.h"
39
40#include "signal_handler.h"
41#include "syscall_filter.h"
42#include "util.h"
43
/*
 * Securebits constants: taken from the kernel header when the build
 * defines HAVE_SECUREBITS_H, otherwise defined locally.
 */
#ifdef HAVE_SECUREBITS_H
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS         0x15
#define SECURE_ALL_LOCKS        (SECURE_ALL_BITS << 1)
#endif

/* Until these are reliably available in linux/prctl.h */
#ifndef PR_SET_SECCOMP
# define PR_SET_SECCOMP 22
#endif

#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/* For seccomp_filter using BPF. */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif
#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

/*
 * SECCOMP_SOFTFAIL is 1 when the build defines USE_SECCOMP_SOFTFAIL and 0
 * otherwise. When it is 1, an ENOSYS failure of prctl(PR_SET_SECCOMP, ...)
 * is reported with warn() instead of being treated as fatal.
 */
#ifdef USE_SECCOMP_SOFTFAIL
# define SECCOMP_SOFTFAIL 1
#else
# define SECCOMP_SOFTFAIL 0
#endif
73
/*
 * One node in a jail's singly linked list of mounts. minijail_mount()
 * allocates the node and strdup()s the three strings into it.
 */
struct mountpoint {
	char *src;		/* source argument handed to mount(2) */
	char *dest;		/* destination; must start with '/' */
	char *type;		/* filesystem type argument for mount(2) */
	unsigned long flags;	/* MS_* mount flags */
	struct mountpoint *next;	/* next mount, or NULL at the tail */
};
81
/*
 * Full description of a jail: which restrictions were requested (|flags|)
 * and the parameters that go with them. The struct is copied byte-for-byte
 * by minijail_marshal_helper() / minijail_unmarshal(), so field order and
 * sizes are part of the serialized format.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned: a signed 1-bit field can only represent
	 * 0 and -1, so storing 1 into it is an implementation-defined
	 * conversion. The storage layout is unchanged.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int usergroups:1;
		unsigned int suppl_gids:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int ipc:1;
		unsigned int net:1;
		unsigned int enter_net:1;
		unsigned int userns:1;
		unsigned int seccomp:1;
		unsigned int remount_proc_ro:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int pivot_root:1;
		unsigned int mount_tmp:1;
		unsigned int do_init:1;
		unsigned int pid_file:1;
		unsigned int alt_syscall:1;
		unsigned int reset_signal_mask:1;
	} flags;
	uid_t uid;			/* target uid (flags.uid) */
	gid_t gid;			/* target gid (flags.gid) */
	gid_t usergid;			/* primary gid of |user| */
	char *user;			/* heap copy of the user name */
	size_t suppl_gid_count;		/* entries in |suppl_gid_list| */
	gid_t *suppl_gid_list;		/* heap array of supplementary gids */
	uint64_t caps;			/* capability mask (flags.caps) */
	pid_t initpid;
	int mountns_fd;			/* fd opened by minijail_namespace_enter_vfs() */
	int netns_fd;			/* fd opened by minijail_namespace_enter_net() */
	char *chrootdir;		/* root for chroot or pivot_root */
	char *pid_file_path;
	char *uidmap;			/* newline-separated uid map text */
	char *gidmap;			/* newline-separated gid map text */
	size_t filter_len;		/* instruction count of |filter_prog| */
	struct sock_fprog *filter_prog;
	char *alt_syscall_table;
	struct mountpoint *mounts_head;
	struct mountpoint *mounts_tail;
	size_t mounts_count;		/* nodes in the mounts list */
};
134
135/*
136 * Strip out flags meant for the parent.
137 * We keep things that are not inherited across execve(2) (e.g. capabilities),
138 * or are easier to set after execve(2) (e.g. seccomp filters).
139 */
140void minijail_preenter(struct minijail *j)
141{
142	j->flags.vfs = 0;
143	j->flags.enter_vfs = 0;
144	j->flags.remount_proc_ro = 0;
145	j->flags.pids = 0;
146	j->flags.do_init = 0;
147	j->flags.pid_file = 0;
148}
149
150/*
151 * Strip out flags meant for the child.
152 * We keep things that are inherited across execve(2).
153 */
154void minijail_preexec(struct minijail *j)
155{
156	int vfs = j->flags.vfs;
157	int enter_vfs = j->flags.enter_vfs;
158	int remount_proc_ro = j->flags.remount_proc_ro;
159	int userns = j->flags.userns;
160	if (j->user)
161		free(j->user);
162	j->user = NULL;
163	if (j->suppl_gid_list)
164		free(j->suppl_gid_list);
165	j->suppl_gid_list = NULL;
166	memset(&j->flags, 0, sizeof(j->flags));
167	/* Now restore anything we meant to keep. */
168	j->flags.vfs = vfs;
169	j->flags.enter_vfs = enter_vfs;
170	j->flags.remount_proc_ro = remount_proc_ro;
171	j->flags.userns = userns;
172	/* Note, |pids| will already have been used before this call. */
173}
174
175/* Minijail API. */
176
177struct minijail API *minijail_new(void)
178{
179	return calloc(1, sizeof(struct minijail));
180}
181
182void API minijail_change_uid(struct minijail *j, uid_t uid)
183{
184	if (uid == 0)
185		die("useless change to uid 0");
186	j->uid = uid;
187	j->flags.uid = 1;
188}
189
190void API minijail_change_gid(struct minijail *j, gid_t gid)
191{
192	if (gid == 0)
193		die("useless change to gid 0");
194	j->gid = gid;
195	j->flags.gid = 1;
196}
197
198int API minijail_set_supplementary_gids(struct minijail *j, size_t size,
199					const gid_t *list)
200{
201	size_t i;
202
203	if (j->flags.usergroups)
204		die("cannot inherit *and* set supplementary groups");
205
206	if (size == 0)
207		return -EINVAL;
208
209	/* Copy the gid_t array. */
210	j->suppl_gid_list = calloc(size, sizeof(gid_t));
211	if (!j->suppl_gid_list) {
212		return -ENOMEM;
213	}
214	for (i = 0; i < size; i++) {
215		j->suppl_gid_list[i] = list[i];
216	}
217	j->suppl_gid_count = size;
218	j->flags.suppl_gids = 1;
219	return 0;
220}
221
222int API minijail_change_user(struct minijail *j, const char *user)
223{
224	char *buf = NULL;
225	struct passwd pw;
226	struct passwd *ppw = NULL;
227	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
228	if (sz == -1)
229		sz = 65536;	/* your guess is as good as mine... */
230
231	/*
232	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
233	 * the maximum needed size of the buffer, so we don't have to search.
234	 */
235	buf = malloc(sz);
236	if (!buf)
237		return -ENOMEM;
238	getpwnam_r(user, &pw, buf, sz, &ppw);
239	/*
240	 * We're safe to free the buffer here. The strings inside pw point
241	 * inside buf, but we don't use any of them; this leaves the pointers
242	 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
243	 */
244	free(buf);
245	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
246	if (!ppw)
247		return -1;
248	minijail_change_uid(j, ppw->pw_uid);
249	j->user = strdup(user);
250	if (!j->user)
251		return -ENOMEM;
252	j->usergid = ppw->pw_gid;
253	return 0;
254}
255
256int API minijail_change_group(struct minijail *j, const char *group)
257{
258	char *buf = NULL;
259	struct group gr;
260	struct group *pgr = NULL;
261	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
262	if (sz == -1)
263		sz = 65536;	/* and mine is as good as yours, really */
264
265	/*
266	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
267	 * the maximum needed size of the buffer, so we don't have to search.
268	 */
269	buf = malloc(sz);
270	if (!buf)
271		return -ENOMEM;
272	getgrnam_r(group, &gr, buf, sz, &pgr);
273	/*
274	 * We're safe to free the buffer here. The strings inside gr point
275	 * inside buf, but we don't use any of them; this leaves the pointers
276	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
277	 */
278	free(buf);
279	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
280	if (!pgr)
281		return -1;
282	minijail_change_gid(j, pgr->gr_gid);
283	return 0;
284}
285
286void API minijail_use_seccomp(struct minijail *j)
287{
288	j->flags.seccomp = 1;
289}
290
291void API minijail_no_new_privs(struct minijail *j)
292{
293	j->flags.no_new_privs = 1;
294}
295
296void API minijail_use_seccomp_filter(struct minijail *j)
297{
298	j->flags.seccomp_filter = 1;
299}
300
301void API minijail_log_seccomp_filter_failures(struct minijail *j)
302{
303	j->flags.log_seccomp_filter = 1;
304}
305
306void API minijail_use_caps(struct minijail *j, uint64_t capmask)
307{
308	j->caps = capmask;
309	j->flags.caps = 1;
310}
311
312void API minijail_reset_signal_mask(struct minijail* j) {
313	j->flags.reset_signal_mask = 1;
314}
315
316void API minijail_namespace_vfs(struct minijail *j)
317{
318	j->flags.vfs = 1;
319}
320
321void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
322{
323	int ns_fd = open(ns_path, O_RDONLY);
324	if (ns_fd < 0) {
325		pdie("failed to open namespace '%s'", ns_path);
326	}
327	j->mountns_fd = ns_fd;
328	j->flags.enter_vfs = 1;
329}
330
331void API minijail_namespace_pids(struct minijail *j)
332{
333	j->flags.vfs = 1;
334	j->flags.remount_proc_ro = 1;
335	j->flags.pids = 1;
336	j->flags.do_init = 1;
337}
338
339void API minijail_namespace_ipc(struct minijail *j)
340{
341	j->flags.ipc = 1;
342}
343
344void API minijail_namespace_net(struct minijail *j)
345{
346	j->flags.net = 1;
347}
348
349void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
350{
351	int ns_fd = open(ns_path, O_RDONLY);
352	if (ns_fd < 0) {
353		pdie("failed to open namespace '%s'", ns_path);
354	}
355	j->netns_fd = ns_fd;
356	j->flags.enter_net = 1;
357}
358
359void API minijail_remount_proc_readonly(struct minijail *j)
360{
361	j->flags.vfs = 1;
362	j->flags.remount_proc_ro = 1;
363}
364
365void API minijail_namespace_user(struct minijail *j)
366{
367	j->flags.userns = 1;
368}
369
370int API minijail_uidmap(struct minijail *j, const char *uidmap)
371{
372	j->uidmap = strdup(uidmap);
373	if (!j->uidmap)
374		return -ENOMEM;
375	char *ch;
376	for (ch = j->uidmap; *ch; ch++) {
377		if (*ch == ',')
378			*ch = '\n';
379	}
380	return 0;
381}
382
383int API minijail_gidmap(struct minijail *j, const char *gidmap)
384{
385	j->gidmap = strdup(gidmap);
386	if (!j->gidmap)
387		return -ENOMEM;
388	char *ch;
389	for (ch = j->gidmap; *ch; ch++) {
390		if (*ch == ',')
391			*ch = '\n';
392	}
393	return 0;
394}
395
396void API minijail_inherit_usergroups(struct minijail *j)
397{
398	j->flags.usergroups = 1;
399}
400
401void API minijail_run_as_init(struct minijail *j)
402{
403	/*
404	 * Since the jailed program will become 'init' in the new PID namespace,
405	 * Minijail does not need to fork an 'init' process.
406	 */
407	j->flags.do_init = 0;
408}
409
410int API minijail_enter_chroot(struct minijail *j, const char *dir)
411{
412	if (j->chrootdir)
413		return -EINVAL;
414	j->chrootdir = strdup(dir);
415	if (!j->chrootdir)
416		return -ENOMEM;
417	j->flags.chroot = 1;
418	return 0;
419}
420
421int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
422{
423	if (j->chrootdir)
424		return -EINVAL;
425	j->chrootdir = strdup(dir);
426	if (!j->chrootdir)
427		return -ENOMEM;
428	j->flags.pivot_root = 1;
429	return 0;
430}
431
/*
 * append_external_path: joins two path pieces with a '/' between them
 * @external_path      Left-hand piece
 * @path_inside_chroot Right-hand piece
 *
 * Returns a newly allocated "<external_path>/<path_inside_chroot>" string
 * that the caller must free(), or NULL if the allocation fails. No
 * normalisation is done, so a right-hand piece that already starts with
 * '/' yields a doubled separator.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
445
446char API *minijail_get_original_path(struct minijail *j,
447				     const char *path_inside_chroot)
448{
449	struct mountpoint *b;
450
451	b = j->mounts_head;
452	while (b) {
453		/*
454		 * If |path_inside_chroot| is the exact destination of a
455		 * mount, then the original path is exactly the source of
456		 * the mount.
457		 *  for example: "-b /some/path/exe,/chroot/path/exe"
458		 *    mount source = /some/path/exe, mount dest =
459		 *    /chroot/path/exe Then when getting the original path of
460		 *    "/chroot/path/exe", the source of that mount,
461		 *    "/some/path/exe" is what should be returned.
462		 */
463		if (!strcmp(b->dest, path_inside_chroot))
464			return strdup(b->src);
465
466		/*
467		 * If |path_inside_chroot| is within the destination path of a
468		 * mount, take the suffix of the chroot path relative to the
469		 * mount destination path, and append it to the mount source
470		 * path.
471		 */
472		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
473			const char *relative_path =
474				path_inside_chroot + strlen(b->dest);
475			return append_external_path(b->src, relative_path);
476		}
477		b = b->next;
478	}
479
480	/* If there is a chroot path, append |path_inside_chroot| to that. */
481	if (j->chrootdir)
482		return append_external_path(j->chrootdir, path_inside_chroot);
483
484	/* No chroot, so the path outside is the same as it is inside. */
485	return strdup(path_inside_chroot);
486}
487
488void API minijail_mount_tmp(struct minijail *j)
489{
490	j->flags.mount_tmp = 1;
491}
492
493int API minijail_write_pid_file(struct minijail *j, const char *path)
494{
495	j->pid_file_path = strdup(path);
496	if (!j->pid_file_path)
497		return -ENOMEM;
498	j->flags.pid_file = 1;
499	return 0;
500}
501
502int API minijail_mount(struct minijail *j, const char *src, const char *dest,
503		       const char *type, unsigned long flags)
504{
505	struct mountpoint *m;
506
507	if (*dest != '/')
508		return -EINVAL;
509	m = calloc(1, sizeof(*m));
510	if (!m)
511		return -ENOMEM;
512	m->dest = strdup(dest);
513	if (!m->dest)
514		goto error;
515	m->src = strdup(src);
516	if (!m->src)
517		goto error;
518	m->type = strdup(type);
519	if (!m->type)
520		goto error;
521	m->flags = flags;
522
523	info("mount %s -> %s type %s", src, dest, type);
524
525	/*
526	 * Force vfs namespacing so the mounts don't leak out into the
527	 * containing vfs namespace.
528	 */
529	minijail_namespace_vfs(j);
530
531	if (j->mounts_tail)
532		j->mounts_tail->next = m;
533	else
534		j->mounts_head = m;
535	j->mounts_tail = m;
536	j->mounts_count++;
537
538	return 0;
539
540error:
541	free(m->src);
542	free(m->dest);
543	free(m);
544	return -ENOMEM;
545}
546
547int API minijail_bind(struct minijail *j, const char *src, const char *dest,
548		      int writeable)
549{
550	unsigned long flags = MS_BIND;
551
552	if (!writeable)
553		flags |= MS_RDONLY;
554
555	return minijail_mount(j, src, dest, "", flags);
556}
557
558void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
559{
560	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
561		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
562			warn("not loading seccomp filter,"
563			     " seccomp not supported");
564			return;
565		}
566	}
567	FILE *file = fopen(path, "r");
568	if (!file) {
569		pdie("failed to open seccomp filter file '%s'", path);
570	}
571
572	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
573	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
574		die("failed to compile seccomp filter BPF program in '%s'",
575		    path);
576	}
577
578	j->filter_len = fprog->len;
579	j->filter_prog = fprog;
580
581	fclose(file);
582}
583
584int API minijail_use_alt_syscall(struct minijail *j, const char *table)
585{
586	j->alt_syscall_table = strdup(table);
587	if (!j->alt_syscall_table)
588		return -ENOMEM;
589	j->flags.alt_syscall = 1;
590	return 0;
591}
592
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
598
599void marshal_state_init(struct marshal_state *state,
600			char *buf, size_t available)
601{
602	state->available = available;
603	state->buf = buf;
604	state->total = 0;
605}
606
607void marshal_append(struct marshal_state *state,
608		    void *src, size_t length)
609{
610	size_t copy_len = MIN(state->available, length);
611
612	/* Up to |available| will be written. */
613	if (copy_len) {
614		memcpy(state->buf, src, copy_len);
615		state->buf += copy_len;
616		state->available -= copy_len;
617	}
618	/* |total| will contain the expected length. */
619	state->total += length;
620}
621
622void minijail_marshal_helper(struct marshal_state *state,
623			     const struct minijail *j)
624{
625	struct mountpoint *m = NULL;
626	marshal_append(state, (char *)j, sizeof(*j));
627	if (j->user)
628		marshal_append(state, j->user, strlen(j->user) + 1);
629	if (j->suppl_gid_list) {
630		marshal_append(state, j->suppl_gid_list,
631			       j->suppl_gid_count * sizeof(gid_t));
632	}
633	if (j->chrootdir)
634		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
635	if (j->alt_syscall_table) {
636		marshal_append(state, j->alt_syscall_table,
637			       strlen(j->alt_syscall_table) + 1);
638	}
639	if (j->flags.seccomp_filter && j->filter_prog) {
640		struct sock_fprog *fp = j->filter_prog;
641		marshal_append(state, (char *)fp->filter,
642				fp->len * sizeof(struct sock_filter));
643	}
644	for (m = j->mounts_head; m; m = m->next) {
645		marshal_append(state, m->src, strlen(m->src) + 1);
646		marshal_append(state, m->dest, strlen(m->dest) + 1);
647		marshal_append(state, m->type, strlen(m->type) + 1);
648		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
649	}
650}
651
652size_t API minijail_size(const struct minijail *j)
653{
654	struct marshal_state state;
655	marshal_state_init(&state, NULL, 0);
656	minijail_marshal_helper(&state, j);
657	return state.total;
658}
659
660int minijail_marshal(const struct minijail *j, char *buf, size_t available)
661{
662	struct marshal_state state;
663	marshal_state_init(&state, buf, available);
664	minijail_marshal_helper(&state, j);
665	return (state.total > available);
666}
667
/*
 * consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       In/out cursor into the buffer; advanced on success
 * @buflength In/out count of bytes left at *@buf; reduced on success
 *
 * Returns the old cursor position, i.e. the start of the bytes taken, or
 * NULL without touching either argument if fewer than @length bytes
 * remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buflength -= length;
	*buf = start + length;
	return start;
}
685
/*
 * consumestr: takes one NUL-terminated string off the front of a buffer
 * @buf       In/out cursor into the buffer; advanced past the NUL
 * @buflength In/out count of bytes left at *@buf
 *
 * Returns the start of the string, or NULL if no terminator is found
 * within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t limit = *buflength;
	const size_t text_len = strnlen(*buf, limit);

	/* Reaching the limit means the terminator is missing. */
	if (text_len == limit)
		return NULL;

	/* Take the text together with its NUL. */
	return consumebytes(text_len + 1, buf, buflength);
}
701
702int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
703{
704	size_t i;
705	size_t count;
706	int ret = -EINVAL;
707
708	if (length < sizeof(*j))
709		goto out;
710	memcpy((void *)j, serialized, sizeof(*j));
711	serialized += sizeof(*j);
712	length -= sizeof(*j);
713
714	/* Potentially stale pointers not used as signals. */
715	j->mounts_head = NULL;
716	j->mounts_tail = NULL;
717	j->filter_prog = NULL;
718
719	if (j->user) {		/* stale pointer */
720		char *user = consumestr(&serialized, &length);
721		if (!user)
722			goto clear_pointers;
723		j->user = strdup(user);
724		if (!j->user)
725			goto clear_pointers;
726	}
727
728	if (j->suppl_gid_list) {	/* stale pointer */
729		if (j->suppl_gid_count > NGROUPS_MAX) {
730			goto bad_gid_list;
731		}
732		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
733		void *gid_list_bytes =
734		    consumebytes(gid_list_size, &serialized, &length);
735		if (!gid_list_bytes)
736			goto bad_gid_list;
737
738		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
739		if (!j->suppl_gid_list)
740			goto bad_gid_list;
741
742		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
743	}
744
745	if (j->chrootdir) {	/* stale pointer */
746		char *chrootdir = consumestr(&serialized, &length);
747		if (!chrootdir)
748			goto bad_chrootdir;
749		j->chrootdir = strdup(chrootdir);
750		if (!j->chrootdir)
751			goto bad_chrootdir;
752	}
753
754	if (j->alt_syscall_table) {	/* stale pointer */
755		char *alt_syscall_table = consumestr(&serialized, &length);
756		if (!alt_syscall_table)
757			goto bad_syscall_table;
758		j->alt_syscall_table = strdup(alt_syscall_table);
759		if (!j->alt_syscall_table)
760			goto bad_syscall_table;
761	}
762
763	if (j->flags.seccomp_filter && j->filter_len > 0) {
764		size_t ninstrs = j->filter_len;
765		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
766		    ninstrs > USHRT_MAX)
767			goto bad_filters;
768
769		size_t program_len = ninstrs * sizeof(struct sock_filter);
770		void *program = consumebytes(program_len, &serialized, &length);
771		if (!program)
772			goto bad_filters;
773
774		j->filter_prog = malloc(sizeof(struct sock_fprog));
775		if (!j->filter_prog)
776			goto bad_filters;
777
778		j->filter_prog->len = ninstrs;
779		j->filter_prog->filter = malloc(program_len);
780		if (!j->filter_prog->filter)
781			goto bad_filter_prog_instrs;
782
783		memcpy(j->filter_prog->filter, program, program_len);
784	}
785
786	count = j->mounts_count;
787	j->mounts_count = 0;
788	for (i = 0; i < count; ++i) {
789		unsigned long *flags;
790		const char *dest;
791		const char *type;
792		const char *src = consumestr(&serialized, &length);
793		if (!src)
794			goto bad_mounts;
795		dest = consumestr(&serialized, &length);
796		if (!dest)
797			goto bad_mounts;
798		type = consumestr(&serialized, &length);
799		if (!type)
800			goto bad_mounts;
801		flags = consumebytes(sizeof(*flags), &serialized, &length);
802		if (!flags)
803			goto bad_mounts;
804		if (minijail_mount(j, src, dest, type, *flags))
805			goto bad_mounts;
806	}
807
808	return 0;
809
810bad_mounts:
811	if (j->flags.seccomp_filter && j->filter_len > 0) {
812		free(j->filter_prog->filter);
813		free(j->filter_prog);
814	}
815bad_filter_prog_instrs:
816	if (j->filter_prog)
817		free(j->filter_prog);
818bad_filters:
819	if (j->alt_syscall_table)
820		free(j->alt_syscall_table);
821bad_syscall_table:
822	if (j->chrootdir)
823		free(j->chrootdir);
824bad_chrootdir:
825	if (j->suppl_gid_list)
826		free(j->suppl_gid_list);
827bad_gid_list:
828	if (j->user)
829		free(j->user);
830clear_pointers:
831	j->user = NULL;
832	j->suppl_gid_list = NULL;
833	j->chrootdir = NULL;
834	j->alt_syscall_table = NULL;
835out:
836	return ret;
837}
838
839static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
840{
841	int fd, ret, len;
842	size_t sz;
843	char fname[32];
844	close(pipe_fds[0]);
845
846	sz = sizeof(fname);
847	if (j->uidmap) {
848		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
849		if (ret < 0 || (size_t)ret >= sz)
850			die("failed to write file name of uid_map");
851		fd = open(fname, O_WRONLY);
852		if (fd < 0)
853			pdie("failed to open '%s'", fname);
854		len = strlen(j->uidmap);
855		if (write(fd, j->uidmap, len) < len)
856			die("failed to set uid_map");
857		close(fd);
858	}
859	if (j->gidmap) {
860		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
861		if (ret < 0 || (size_t)ret >= sz)
862			die("failed to write file name of gid_map");
863		fd = open(fname, O_WRONLY);
864		if (fd < 0)
865			pdie("failed to open '%s'", fname);
866		len = strlen(j->gidmap);
867		if (write(fd, j->gidmap, len) < len)
868			die("failed to set gid_map");
869		close(fd);
870	}
871
872	close(pipe_fds[1]);
873}
874
875static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
876{
877	char buf;
878
879	close(pipe_fds[1]);
880
881	/* Wait for parent to set up uid/gid mappings. */
882	if (read(pipe_fds[0], &buf, 1) != 0)
883		die("failed to sync with parent");
884	close(pipe_fds[0]);
885
886	if (j->uidmap && setresuid(0, 0, 0))
887		pdie("setresuid");
888	if (j->gidmap && setresgid(0, 0, 0))
889		pdie("setresgid");
890}
891
892/*
893 * mount_one: Applies mounts from @m for @j, recursing as needed.
894 * @j Minijail these mounts are for
895 * @m Head of list of mounts
896 *
897 * Returns 0 for success.
898 */
899static int mount_one(const struct minijail *j, struct mountpoint *m)
900{
901	int ret;
902	char *dest;
903	int remount_ro = 0;
904
905	/* dest has a leading "/" */
906	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
907		return -ENOMEM;
908
909	/*
910	 * R/O bind mounts have to be remounted since bind and ro can't both be
911	 * specified in the original bind mount. Remount R/O after the initial
912	 * mount.
913	 */
914	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
915		remount_ro = 1;
916		m->flags &= ~MS_RDONLY;
917	}
918
919	ret = mount(m->src, dest, m->type, m->flags, NULL);
920	if (ret)
921		pdie("mount: %s -> %s", m->src, dest);
922
923	if (remount_ro) {
924		m->flags |= MS_RDONLY;
925		ret = mount(m->src, dest, NULL,
926			    m->flags | MS_REMOUNT, NULL);
927		if (ret)
928			pdie("bind ro: %s -> %s", m->src, dest);
929	}
930
931	free(dest);
932	if (m->next)
933		return mount_one(j, m->next);
934	return ret;
935}
936
937int enter_chroot(const struct minijail *j)
938{
939	int ret;
940
941	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
942		return ret;
943
944	if (chroot(j->chrootdir))
945		return -errno;
946
947	if (chdir("/"))
948		return -errno;
949
950	return 0;
951}
952
953int enter_pivot_root(const struct minijail *j)
954{
955	int ret, oldroot, newroot;
956
957	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
958		return ret;
959
960	/*
961	 * Keep the fd for both old and new root.
962	 * It will be used in fchdir later.
963	 */
964	oldroot = open("/", O_DIRECTORY | O_RDONLY);
965	if (oldroot < 0)
966		pdie("failed to open / for fchdir");
967	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
968	if (newroot < 0)
969		pdie("failed to open %s for fchdir", j->chrootdir);
970
971	/*
972	 * To ensure chrootdir is the root of a file system,
973	 * do a self bind mount.
974	 */
975	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
976		pdie("failed to bind mount '%s'", j->chrootdir);
977	if (chdir(j->chrootdir))
978		return -errno;
979	if (syscall(SYS_pivot_root, ".", "."))
980		pdie("pivot_root");
981
982	/*
983	 * Now the old root is mounted on top of the new root. Use fchdir to
984	 * change to the old root and unmount it.
985	 */
986	if (fchdir(oldroot))
987		pdie("failed to fchdir to old /");
988	/* The old root might be busy, so use lazy unmount. */
989	if (umount2(".", MNT_DETACH))
990		pdie("umount(/)");
991	/* Change back to the new root. */
992	if (fchdir(newroot))
993		return -errno;
994	if (chroot("/"))
995		return -errno;
996	/* Set correct CWD for getcwd(3). */
997	if (chdir("/"))
998		return -errno;
999
1000	return 0;
1001}
1002
/*
 * Mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const int rc = mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");

	return rc;
}
1007
1008int remount_proc_readonly(const struct minijail *j)
1009{
1010	const char *kProcPath = "/proc";
1011	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1012	/*
1013	 * Right now, we're holding a reference to our parent's old mount of
1014	 * /proc in our namespace, which means using MS_REMOUNT here would
1015	 * mutate our parent's mount as well, even though we're in a VFS
1016	 * namespace (!). Instead, remove their mount from our namespace
1017	 * and make our own. However, if we are in a new user namespace, /proc
1018	 * is not seen as mounted, so don't return error if umount() fails.
1019	 */
1020	if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
1021		return -errno;
1022	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1023		return -errno;
1024	return 0;
1025}
1026
1027static void write_pid_file(const struct minijail *j)
1028{
1029	FILE *fp = fopen(j->pid_file_path, "w");
1030
1031	if (!fp)
1032		pdie("failed to open '%s'", j->pid_file_path);
1033	if (fprintf(fp, "%d\n", (int)j->initpid) < 0)
1034		pdie("fprintf(%s)", j->pid_file_path);
1035	if (fclose(fp))
1036		pdie("fclose(%s)", j->pid_file_path);
1037}
1038
1039void drop_ugid(const struct minijail *j)
1040{
1041	if (j->flags.usergroups && j->flags.suppl_gids) {
1042		die("tried to inherit *and* set supplementary groups;"
1043		    " can only do one");
1044	}
1045
1046	if (j->flags.usergroups) {
1047		if (initgroups(j->user, j->usergid))
1048			pdie("initgroups");
1049	} else if (j->flags.suppl_gids) {
1050		if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1051			pdie("setgroups");
1052		}
1053	} else {
1054		/*
1055		 * Only attempt to clear supplementary groups if we are changing
1056		 * users.
1057		 */
1058		if ((j->uid || j->gid) && setgroups(0, NULL))
1059			pdie("setgroups");
1060	}
1061
1062	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1063		pdie("setresgid");
1064
1065	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1066		pdie("setresuid");
1067}
1068
/*
 * get_last_valid_cap: finds the highest capability number the running
 * kernel knows about.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failure to open or parse the /proc file is reported through pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap);

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Passing a NULL stream to fscanf() is undefined. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
1099
1100void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
1101{
1102	cap_t caps = cap_get_proc();
1103	cap_value_t flag[1];
1104	const uint64_t one = 1;
1105	unsigned int i;
1106	if (!caps)
1107		die("can't get process caps");
1108	if (cap_clear_flag(caps, CAP_INHERITABLE))
1109		die("can't clear inheritable caps");
1110	if (cap_clear_flag(caps, CAP_EFFECTIVE))
1111		die("can't clear effective caps");
1112	if (cap_clear_flag(caps, CAP_PERMITTED))
1113		die("can't clear permitted caps");
1114	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1115		/* Keep CAP_SETPCAP for dropping bounding set bits. */
1116		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
1117			continue;
1118		flag[0] = i;
1119		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
1120			die("can't add effective cap");
1121		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
1122			die("can't add permitted cap");
1123		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
1124			die("can't add inheritable cap");
1125	}
1126	if (cap_set_proc(caps))
1127		die("can't apply initial cleaned capset");
1128
1129	/*
1130	 * Instead of dropping bounding set first, do it here in case
1131	 * the caller had a more permissive bounding set which could
1132	 * have been used above to raise a capability that wasn't already
1133	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1134	 */
1135	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1136		if (j->caps & (one << i))
1137			continue;
1138		if (prctl(PR_CAPBSET_DROP, i))
1139			pdie("prctl(PR_CAPBSET_DROP)");
1140	}
1141
1142	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
1143	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
1144		flag[0] = CAP_SETPCAP;
1145		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1146			die("can't clear effective cap");
1147		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1148			die("can't clear permitted cap");
1149		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1150			die("can't clear inheritable cap");
1151	}
1152
1153	if (cap_set_proc(caps))
1154		die("can't apply final cleaned capset");
1155
1156	cap_free(caps);
1157}
1158
1159void set_seccomp_filter(const struct minijail *j)
1160{
1161	/*
1162	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1163	 * in the kernel source tree for an explanation of the parameters.
1164	 */
1165	if (j->flags.no_new_privs) {
1166		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1167			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1168	}
1169
1170	/*
1171	 * If we're logging seccomp filter failures,
1172	 * install the SIGSYS handler first.
1173	 */
1174	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1175		if (install_sigsys_handler())
1176			pdie("install SIGSYS handler");
1177		warn("logging seccomp filter failures");
1178	}
1179
1180	/*
1181	 * Install the syscall filter.
1182	 */
1183	if (j->flags.seccomp_filter) {
1184		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1185			  j->filter_prog)) {
1186			if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1187				warn("seccomp not supported");
1188				return;
1189			}
1190			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
1191		}
1192	}
1193}
1194
1195void API minijail_enter(const struct minijail *j)
1196{
1197	/*
1198	 * If we're dropping caps, get the last valid cap from /proc now,
1199	 * since /proc can be unmounted before drop_caps() is called.
1200	 */
1201	unsigned int last_valid_cap = 0;
1202	if (j->flags.caps)
1203		last_valid_cap = get_last_valid_cap();
1204
1205	if (j->flags.pids)
1206		die("tried to enter a pid-namespaced jail;"
1207		    " try minijail_run()?");
1208
1209	if (j->flags.usergroups && !j->user)
1210		die("usergroup inheritance without username");
1211
1212	/*
1213	 * We can't recover from failures if we've dropped privileges partially,
1214	 * so we don't even try. If any of our operations fail, we abort() the
1215	 * entire process.
1216	 */
1217	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1218		pdie("setns(CLONE_NEWNS)");
1219
1220	if (j->flags.vfs) {
1221		if (unshare(CLONE_NEWNS))
1222			pdie("unshare(vfs)");
1223		/*
1224		 * Remount all filesystems as private. If they are shared
1225		 * new bind mounts will creep out of our namespace.
1226		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1227		 */
1228		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1229			pdie("mount(/, private)");
1230	}
1231
1232	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1233		pdie("unshare(ipc)");
1234	}
1235
1236	if (j->flags.enter_net) {
1237		if (setns(j->netns_fd, CLONE_NEWNET))
1238			pdie("setns(CLONE_NEWNET)");
1239	} else if (j->flags.net && unshare(CLONE_NEWNET)) {
1240		pdie("unshare(net)");
1241	}
1242
1243	if (j->flags.chroot && enter_chroot(j))
1244		pdie("chroot");
1245
1246	if (j->flags.pivot_root && enter_pivot_root(j))
1247		pdie("pivot_root");
1248
1249	if (j->flags.mount_tmp && mount_tmp())
1250		pdie("mount_tmp");
1251
1252	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
1253		pdie("remount");
1254
1255	if (j->flags.caps) {
1256		/*
1257		 * POSIX capabilities are a bit tricky. If we drop our
1258		 * capability to change uids, our attempt to use setuid()
1259		 * below will fail. Hang on to root caps across setuid(), then
1260		 * lock securebits.
1261		 */
1262		if (prctl(PR_SET_KEEPCAPS, 1))
1263			pdie("prctl(PR_SET_KEEPCAPS)");
1264		if (prctl
1265		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1266			pdie("prctl(PR_SET_SECUREBITS)");
1267	}
1268
1269	/*
1270	 * If we're setting no_new_privs, we can drop privileges
1271	 * before setting seccomp filter. This way filter policies
1272	 * don't need to allow privilege-dropping syscalls.
1273	 */
1274	if (j->flags.no_new_privs) {
1275		drop_ugid(j);
1276		if (j->flags.caps)
1277			drop_caps(j, last_valid_cap);
1278
1279		set_seccomp_filter(j);
1280	} else {
1281		/*
1282		 * If we're not setting no_new_privs,
1283		 * we need to set seccomp filter *before* dropping privileges.
1284		 * WARNING: this means that filter policies *must* allow
1285		 * setgroups()/setresgid()/setresuid() for dropping root and
1286		 * capget()/capset()/prctl() for dropping caps.
1287		 */
1288		set_seccomp_filter(j);
1289
1290		drop_ugid(j);
1291		if (j->flags.caps)
1292			drop_caps(j, last_valid_cap);
1293	}
1294
1295	/*
1296	 * Select the specified alternate syscall table.  The table must not
1297	 * block prctl(2) if we're using seccomp as well.
1298	 */
1299	if (j->flags.alt_syscall) {
1300		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1301			pdie("prctl(PR_ALT_SYSCALL)");
1302	}
1303
1304	/*
1305	 * seccomp has to come last since it cuts off all the other
1306	 * privilege-dropping syscalls :)
1307	 */
1308	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1309		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1310			warn("seccomp not supported");
1311			return;
1312		}
1313		pdie("prctl(PR_SET_SECCOMP)");
1314	}
1315}
1316
/*
 * Wait status of the root child, recorded by init() and read by the
 * SIGTERM handler init_term(). Objects shared with a signal handler must be
 * volatile sig_atomic_t for the access to be well defined.
 *
 * TODO(wad) will visibility affect this variable?
 */
static volatile sig_atomic_t init_exitstatus = 0;
1319
1320void init_term(int __attribute__ ((unused)) sig)
1321{
1322	_exit(init_exitstatus);
1323}
1324
1325int init(pid_t rootpid)
1326{
1327	pid_t pid;
1328	int status;
1329	/* so that we exit with the right status */
1330	signal(SIGTERM, init_term);
1331	/* TODO(wad) self jail with seccomp_filters here. */
1332	while ((pid = wait(&status)) > 0) {
1333		/*
1334		 * This loop will only end when either there are no processes
1335		 * left inside our pid namespace or we get a signal.
1336		 */
1337		if (pid == rootpid)
1338			init_exitstatus = status;
1339	}
1340	if (!WIFEXITED(init_exitstatus))
1341		_exit(MINIJAIL_ERR_INIT);
1342	_exit(WEXITSTATUS(init_exitstatus));
1343}
1344
1345int API minijail_from_fd(int fd, struct minijail *j)
1346{
1347	size_t sz = 0;
1348	size_t bytes = read(fd, &sz, sizeof(sz));
1349	char *buf;
1350	int r;
1351	if (sizeof(sz) != bytes)
1352		return -EINVAL;
1353	if (sz > USHRT_MAX)	/* arbitrary sanity check */
1354		return -E2BIG;
1355	buf = malloc(sz);
1356	if (!buf)
1357		return -ENOMEM;
1358	bytes = read(fd, buf, sz);
1359	if (bytes != sz) {
1360		free(buf);
1361		return -EINVAL;
1362	}
1363	r = minijail_unmarshal(j, buf, sz);
1364	free(buf);
1365	return r;
1366}
1367
1368int API minijail_to_fd(struct minijail *j, int fd)
1369{
1370	char *buf;
1371	size_t sz = minijail_size(j);
1372	ssize_t written;
1373	int r;
1374
1375	if (!sz)
1376		return -EINVAL;
1377	buf = malloc(sz);
1378	r = minijail_marshal(j, buf, sz);
1379	if (r) {
1380		free(buf);
1381		return r;
1382	}
1383	/* Sends [size][minijail]. */
1384	written = write(fd, &sz, sizeof(sz));
1385	if (written != sizeof(sz)) {
1386		free(buf);
1387		return -EFAULT;
1388	}
1389	written = write(fd, buf, sz);
1390	if (written < 0 || (size_t) written != sz) {
1391		free(buf);
1392		return -EFAULT;
1393	}
1394	free(buf);
1395	return 0;
1396}
1397
1398int setup_preload(void)
1399{
1400#if defined(__ANDROID__)
1401	/* Don't use LDPRELOAD on Brillo. */
1402	return 0;
1403#else
1404	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1405	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1406	if (!newenv)
1407		return -ENOMEM;
1408
1409	/* Only insert a separating space if we have something to separate... */
1410	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1411		PRELOADPATH);
1412
1413	/* setenv() makes a copy of the string we give it. */
1414	setenv(kLdPreloadEnvVar, newenv, 1);
1415	free(newenv);
1416	return 0;
1417#endif
1418}
1419
1420int setup_pipe(int fds[2])
1421{
1422	int r = pipe(fds);
1423	char fd_buf[11];
1424	if (r)
1425		return r;
1426	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1427	if (r <= 0)
1428		return -EINVAL;
1429	setenv(kFdEnvVar, fd_buf, 1);
1430	return 0;
1431}
1432
/*
 * setup_pipe_end: keeps end |index| of the pipe |fds| (0 = read end,
 * 1 = write end) and closes the other one.
 *
 * Returns the kept file descriptor, or -1 (touching nothing) if |index|
 * is not 0 or 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);
	return fds[index];
}
1441
/*
 * setup_and_dupe_pipe_end: closes the end of pipe |fds| that is NOT
 * |index| (0 = read end, 1 = write end) and duplicates the kept end onto
 * |fd| with dup2(2).
 *
 * Returns the result of dup2(2), or -1 (touching nothing) if |index| is
 * not 0 or 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);

	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1451
1452int minijail_run_internal(struct minijail *j, const char *filename,
1453			  char *const argv[], pid_t *pchild_pid,
1454			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1455			  int use_preload);
1456
1457int API minijail_run(struct minijail *j, const char *filename,
1458		     char *const argv[])
1459{
1460	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1461				     true);
1462}
1463
1464int API minijail_run_pid(struct minijail *j, const char *filename,
1465			 char *const argv[], pid_t *pchild_pid)
1466{
1467	return minijail_run_internal(j, filename, argv, pchild_pid,
1468				     NULL, NULL, NULL, true);
1469}
1470
1471int API minijail_run_pipe(struct minijail *j, const char *filename,
1472			  char *const argv[], int *pstdin_fd)
1473{
1474	return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1475				     NULL, NULL, true);
1476}
1477
1478int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1479			       char *const argv[], pid_t *pchild_pid,
1480			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1481{
1482	return minijail_run_internal(j, filename, argv, pchild_pid,
1483				     pstdin_fd, pstdout_fd, pstderr_fd, true);
1484}
1485
1486int API minijail_run_no_preload(struct minijail *j, const char *filename,
1487				char *const argv[])
1488{
1489	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1490				     false);
1491}
1492
1493int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1494					  const char *filename,
1495					  char *const argv[],
1496					  pid_t *pchild_pid,
1497					  int *pstdin_fd, int *pstdout_fd,
1498					  int *pstderr_fd) {
1499	return minijail_run_internal(j, filename, argv, pchild_pid,
1500				     pstdin_fd, pstdout_fd, pstderr_fd, false);
1501}
1502
1503int minijail_run_internal(struct minijail *j, const char *filename,
1504			  char *const argv[], pid_t *pchild_pid,
1505			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1506			  int use_preload)
1507{
1508	char *oldenv, *oldenv_copy = NULL;
1509	pid_t child_pid;
1510	int pipe_fds[2];
1511	int stdin_fds[2];
1512	int stdout_fds[2];
1513	int stderr_fds[2];
1514	int userns_pipe_fds[2];
1515	int ret;
1516	/* We need to remember this across the minijail_preexec() call. */
1517	int pid_namespace = j->flags.pids;
1518	int do_init = j->flags.do_init;
1519
1520	if (use_preload) {
1521		oldenv = getenv(kLdPreloadEnvVar);
1522		if (oldenv) {
1523			oldenv_copy = strdup(oldenv);
1524			if (!oldenv_copy)
1525				return -ENOMEM;
1526		}
1527
1528		if (setup_preload())
1529			return -EFAULT;
1530	}
1531
1532	if (!use_preload) {
1533		if (j->flags.caps)
1534			die("Capabilities are not supported without "
1535			    "LD_PRELOAD");
1536	}
1537
1538	/*
1539	 * Make the process group ID of this process equal to its PID, so that
1540	 * both the Minijail process and the jailed process can be killed
1541	 * together.
1542	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1543	 * the process is already a process group leader.
1544	 */
1545	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1546		if (errno != EPERM) {
1547			pdie("setpgid(0, 0)");
1548		}
1549	}
1550
1551	if (use_preload) {
1552		/*
1553		 * Before we fork(2) and execve(2) the child process, we need
1554		 * to open a pipe(2) to send the minijail configuration over.
1555		 */
1556		if (setup_pipe(pipe_fds))
1557			return -EFAULT;
1558	}
1559
1560	/*
1561	 * If we want to write to the child process' standard input,
1562	 * create the pipe(2) now.
1563	 */
1564	if (pstdin_fd) {
1565		if (pipe(stdin_fds))
1566			return -EFAULT;
1567	}
1568
1569	/*
1570	 * If we want to read from the child process' standard output,
1571	 * create the pipe(2) now.
1572	 */
1573	if (pstdout_fd) {
1574		if (pipe(stdout_fds))
1575			return -EFAULT;
1576	}
1577
1578	/*
1579	 * If we want to read from the child process' standard error,
1580	 * create the pipe(2) now.
1581	 */
1582	if (pstderr_fd) {
1583		if (pipe(stderr_fds))
1584			return -EFAULT;
1585	}
1586
1587	/*
1588	 * If we want to set up a new uid/gid mapping in the user namespace,
1589	 * create the pipe(2) to sync between parent and child.
1590	 */
1591	if (j->flags.userns) {
1592		if (pipe(userns_pipe_fds))
1593			return -EFAULT;
1594	}
1595
1596	/*
1597	 * Use sys_clone() if and only if we're creating a pid namespace.
1598	 *
1599	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1600	 *
1601	 * In multithreaded programs, there are a bunch of locks inside libc,
1602	 * some of which may be held by other threads at the time that we call
1603	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1604	 * ensure that we hold all of these locks before it calls clone()
1605	 * internally and drop them after clone() returns, but when we call
1606	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1607	 * child address space where some of libc's important locks are held by
1608	 * other threads (which did not get cloned, and hence will never release
1609	 * those locks). This is okay so long as we call exec() immediately
1610	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1611	 * take locks.
1612	 *
1613	 * Hence, only call sys_clone() if we need to, in order to get at pid
1614	 * namespacing. If we follow this path, the child's address space might
1615	 * have broken locks; you may only call functions that do not acquire
1616	 * any locks.
1617	 *
1618	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1619	 * previously detailed, so this function is highly likely to deadlock
1620	 * later on (see "deadlock here") if we're multithreaded.
1621	 *
1622	 * We might hack around this by having the clone()d child (init of the
1623	 * pid namespace) return directly, rather than leaving the clone()d
1624	 * process hanging around to be init for the new namespace (and having
1625	 * its fork()ed child return in turn), but that process would be crippled
1626	 * with its libc locks potentially broken. We might try fork()ing in the
1627	 * parent before we clone() to ensure that we own all the locks, but
1628	 * then we have to have the forked child hanging around consuming
1629	 * resources (and possibly having file descriptors / shared memory
1630	 * regions / etc attached). We'd need to keep the child around to avoid
1631	 * having its children get reparented to init.
1632	 *
1633	 * TODO(ellyjones): figure out if the "forked child hanging around"
1634	 * problem is fixable or not. It would be nice if we worked in this
1635	 * case.
1636	 */
1637	if (pid_namespace) {
1638		int clone_flags = CLONE_NEWPID | SIGCHLD;
1639		if (j->flags.userns)
1640			clone_flags |= CLONE_NEWUSER;
1641		child_pid = syscall(SYS_clone, clone_flags, NULL);
1642	} else {
1643		child_pid = fork();
1644	}
1645
1646	if (child_pid < 0) {
1647		if (use_preload) {
1648			free(oldenv_copy);
1649		}
1650		die("failed to fork child");
1651	}
1652
1653	if (child_pid) {
1654		if (use_preload) {
1655			/* Restore parent's LD_PRELOAD. */
1656			if (oldenv_copy) {
1657				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1658				free(oldenv_copy);
1659			} else {
1660				unsetenv(kLdPreloadEnvVar);
1661			}
1662			unsetenv(kFdEnvVar);
1663		}
1664
1665		j->initpid = child_pid;
1666
1667		if (j->flags.pid_file)
1668			write_pid_file(j);
1669
1670		if (j->flags.userns)
1671			write_ugid_mappings(j, userns_pipe_fds);
1672
1673		if (use_preload) {
1674			/* Send marshalled minijail. */
1675			close(pipe_fds[0]);	/* read endpoint */
1676			ret = minijail_to_fd(j, pipe_fds[1]);
1677			close(pipe_fds[1]);	/* write endpoint */
1678			if (ret) {
1679				kill(j->initpid, SIGKILL);
1680				die("failed to send marshalled minijail");
1681			}
1682		}
1683
1684		if (pchild_pid)
1685			*pchild_pid = child_pid;
1686
1687		/*
1688		 * If we want to write to the child process' standard input,
1689		 * set up the write end of the pipe.
1690		 */
1691		if (pstdin_fd)
1692			*pstdin_fd = setup_pipe_end(stdin_fds,
1693						    1 /* write end */);
1694
1695		/*
1696		 * If we want to read from the child process' standard output,
1697		 * set up the read end of the pipe.
1698		 */
1699		if (pstdout_fd)
1700			*pstdout_fd = setup_pipe_end(stdout_fds,
1701						     0 /* read end */);
1702
1703		/*
1704		 * If we want to read from the child process' standard error,
1705		 * set up the read end of the pipe.
1706		 */
1707		if (pstderr_fd)
1708			*pstderr_fd = setup_pipe_end(stderr_fds,
1709						     0 /* read end */);
1710
1711		return 0;
1712	}
1713	free(oldenv_copy);
1714
1715	if (j->flags.reset_signal_mask) {
1716		sigset_t signal_mask;
1717		if (sigemptyset(&signal_mask) != 0)
1718			pdie("sigemptyset failed");
1719		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1720			pdie("sigprocmask failed");
1721	}
1722
1723	if (j->flags.userns)
1724		enter_user_namespace(j, userns_pipe_fds);
1725
1726	/*
1727	 * If we want to write to the jailed process' standard input,
1728	 * set up the read end of the pipe.
1729	 */
1730	if (pstdin_fd) {
1731		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1732					    STDIN_FILENO) < 0)
1733			die("failed to set up stdin pipe");
1734	}
1735
1736	/*
1737	 * If we want to read from the jailed process' standard output,
1738	 * set up the write end of the pipe.
1739	 */
1740	if (pstdout_fd) {
1741		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1742					    STDOUT_FILENO) < 0)
1743			die("failed to set up stdout pipe");
1744	}
1745
1746	/*
1747	 * If we want to read from the jailed process' standard error,
1748	 * set up the write end of the pipe.
1749	 */
1750	if (pstderr_fd) {
1751		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1752					    STDERR_FILENO) < 0)
1753			die("failed to set up stderr pipe");
1754	}
1755
1756	/* If running an init program, let it decide when/how to mount /proc. */
1757	if (pid_namespace && !do_init)
1758		j->flags.remount_proc_ro = 0;
1759
1760	if (use_preload) {
1761		/* Strip out flags that cannot be inherited across execve(2). */
1762		minijail_preexec(j);
1763	} else {
1764		j->flags.pids = 0;
1765	}
1766	/* Jail this process, then execve() the target. */
1767	minijail_enter(j);
1768
1769	if (pid_namespace && do_init) {
1770		/*
1771		 * pid namespace: this process will become init inside the new
1772		 * namespace. We don't want all programs we might exec to have
1773		 * to know how to be init. Normally (do_init == 1) we fork off
1774		 * a child to actually run the program. If |do_init == 0|, we
1775		 * let the program keep pid 1 and be init.
1776		 *
1777		 * If we're multithreaded, we'll probably deadlock here. See
1778		 * WARNING above.
1779		 */
1780		child_pid = fork();
1781		if (child_pid < 0)
1782			_exit(child_pid);
1783		else if (child_pid > 0)
1784			init(child_pid);	/* never returns */
1785	}
1786
1787	/*
1788	 * If we aren't pid-namespaced, or the jailed program asked to be init:
1789	 *   calling process
1790	 *   -> execve()-ing process
1791	 * If we are:
1792	 *   calling process
1793	 *   -> init()-ing process
1794	 *      -> execve()-ing process
1795	 */
1796	_exit(execve(filename, argv, environ));
1797}
1798
1799int API minijail_kill(struct minijail *j)
1800{
1801	int st;
1802	if (kill(j->initpid, SIGTERM))
1803		return -errno;
1804	if (waitpid(j->initpid, &st, 0) < 0)
1805		return -errno;
1806	return st;
1807}
1808
1809int API minijail_wait(struct minijail *j)
1810{
1811	int st;
1812	if (waitpid(j->initpid, &st, 0) < 0)
1813		return -errno;
1814
1815	if (!WIFEXITED(st)) {
1816		int error_status = st;
1817		if (WIFSIGNALED(st)) {
1818			int signum = WTERMSIG(st);
1819			warn("child process %d received signal %d",
1820			     j->initpid, signum);
1821			/*
1822			 * We return MINIJAIL_ERR_JAIL if the process received
1823			 * SIGSYS, which happens when a syscall is blocked by
1824			 * seccomp filters.
1825			 * If not, we do what bash(1) does:
1826			 * $? = 128 + signum
1827			 */
1828			if (signum == SIGSYS) {
1829				error_status = MINIJAIL_ERR_JAIL;
1830			} else {
1831				error_status = 128 + signum;
1832			}
1833		}
1834		return error_status;
1835	}
1836
1837	int exit_status = WEXITSTATUS(st);
1838	if (exit_status != 0)
1839		info("child process %d exited with status %d",
1840		     j->initpid, exit_status);
1841
1842	return exit_status;
1843}
1844
1845void API minijail_destroy(struct minijail *j)
1846{
1847	if (j->flags.seccomp_filter && j->filter_prog) {
1848		free(j->filter_prog->filter);
1849		free(j->filter_prog);
1850	}
1851	while (j->mounts_head) {
1852		struct mountpoint *m = j->mounts_head;
1853		j->mounts_head = j->mounts_head->next;
1854		free(m->type);
1855		free(m->dest);
1856		free(m->src);
1857		free(m);
1858	}
1859	j->mounts_tail = NULL;
1860	if (j->user)
1861		free(j->user);
1862	if (j->suppl_gid_list)
1863		free(j->suppl_gid_list);
1864	if (j->chrootdir)
1865		free(j->chrootdir);
1866	if (j->alt_syscall_table)
1867		free(j->alt_syscall_table);
1868	free(j);
1869}
1870