libminijail.c revision 2860c4693ea5f40b44e4b2eb2f0b6970ffcd7f27
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _GNU_SOURCE
8
9#include <asm/unistd.h>
10#include <ctype.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <grp.h>
14#include <inttypes.h>
15#include <limits.h>
16#include <linux/capability.h>
17#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
20#include <stdarg.h>
21#include <stdbool.h>
22#include <stddef.h>
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
29#include <sys/param.h>
30#include <sys/prctl.h>
31#include <sys/stat.h>
32#include <sys/types.h>
33#include <sys/user.h>
34#include <sys/wait.h>
35#include <unistd.h>
36
37#include "libminijail.h"
38#include "libminijail-private.h"
39
40#include "signal_handler.h"
41#include "syscall_filter.h"
42#include "util.h"
43
44#ifdef HAVE_SECUREBITS_H
45#include <linux/securebits.h>
46#else
47#define SECURE_ALL_BITS         0x15
48#define SECURE_ALL_LOCKS        (SECURE_ALL_BITS << 1)
49#endif
50
51/* Until these are reliably available in linux/prctl.h */
52#ifndef PR_SET_SECCOMP
53# define PR_SET_SECCOMP 22
54#endif
55
56#ifndef PR_ALT_SYSCALL
57# define PR_ALT_SYSCALL 0x43724f53
58#endif
59
60/* For seccomp_filter using BPF. */
61#ifndef PR_SET_NO_NEW_PRIVS
62# define PR_SET_NO_NEW_PRIVS 38
63#endif
64#ifndef SECCOMP_MODE_FILTER
65# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
66#endif
67
68#ifdef USE_SECCOMP_SOFTFAIL
69# define SECCOMP_SOFTFAIL 1
70#else
71# define SECCOMP_SOFTFAIL 0
72#endif
73
/*
 * One entry in a jail's singly linked list of mounts.
 * Entries are created by minijail_mount() and applied by mount_one().
 */
struct mountpoint {
	/* Mount source; heap copy owned by this entry. */
	char *src;
	/* Mount target; minijail_mount() requires it to start with '/'. */
	char *dest;
	/* Filesystem type string; minijail_bind() passes "" here. */
	char *type;
	/* Flags handed to mount(2), e.g. MS_BIND | MS_RDONLY. */
	unsigned long flags;
	/* Next entry in the list, or NULL for the last one. */
	struct mountpoint *next;
};
81
/*
 * Complete description of a jail: which restrictions to apply (|flags|)
 * and the data each restriction needs.
 *
 * The struct is copied byte-for-byte by minijail_marshal_helper() and
 * minijail_unmarshal(); the pointer members are therefore only meaningful
 * in the process that allocated them.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned: a one-bit *signed* bit-field can only hold
	 * 0 and -1, so storing 1 in it would read back as -1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int usergroups:1;
		unsigned int suppl_gids:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int ipc:1;
		unsigned int net:1;
		unsigned int enter_net:1;
		unsigned int userns:1;
		unsigned int seccomp:1;
		unsigned int remount_proc_ro:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int pivot_root:1;
		unsigned int mount_tmp:1;
		unsigned int do_init:1;
		unsigned int pid_file:1;
		unsigned int alt_syscall:1;
		unsigned int reset_signal_mask:1;
	} flags;
	/* Target uid/gid, valid when flags.uid / flags.gid are set. */
	uid_t uid;
	gid_t gid;
	/* Primary gid of |user|, recorded by minijail_change_user(). */
	gid_t usergid;
	/* Heap copy of the user name given to minijail_change_user(). */
	char *user;
	/* Explicit supplementary group list and its element count. */
	size_t suppl_gid_count;
	gid_t *suppl_gid_list;
	/* Capability bitmask to keep, valid when flags.caps is set. */
	uint64_t caps;
	/* Pid used to build the /proc/<pid>/{uid,gid}_map and pid-file data. */
	pid_t initpid;
	/* Descriptors of existing namespaces to join with setns(2). */
	int mountns_fd;
	int netns_fd;
	/* New root directory for chroot or pivot_root. */
	char *chrootdir;
	/* Where write_pid_file() stores |initpid|. */
	char *pid_file_path;
	/* Newline-separated id mappings written to /proc/<pid>/{uid,gid}_map. */
	char *uidmap;
	char *gidmap;
	/* Seccomp BPF program and its instruction count. */
	size_t filter_len;
	struct sock_fprog *filter_prog;
	/* Name of the alternate syscall table, when flags.alt_syscall is set. */
	char *alt_syscall_table;
	/* Singly linked list of mounts to perform, with cached tail and size. */
	struct mountpoint *mounts_head;
	struct mountpoint *mounts_tail;
	size_t mounts_count;
};
134
135/*
136 * Strip out flags meant for the parent.
137 * We keep things that are not inherited across execve(2) (e.g. capabilities),
138 * or are easier to set after execve(2) (e.g. seccomp filters).
139 */
140void minijail_preenter(struct minijail *j)
141{
142	j->flags.vfs = 0;
143	j->flags.enter_vfs = 0;
144	j->flags.remount_proc_ro = 0;
145	j->flags.pids = 0;
146	j->flags.do_init = 0;
147	j->flags.pid_file = 0;
148}
149
150/*
151 * Strip out flags meant for the child.
152 * We keep things that are inherited across execve(2).
153 */
154void minijail_preexec(struct minijail *j)
155{
156	int vfs = j->flags.vfs;
157	int enter_vfs = j->flags.enter_vfs;
158	int remount_proc_ro = j->flags.remount_proc_ro;
159	int userns = j->flags.userns;
160	if (j->user)
161		free(j->user);
162	j->user = NULL;
163	if (j->suppl_gid_list)
164		free(j->suppl_gid_list);
165	j->suppl_gid_list = NULL;
166	memset(&j->flags, 0, sizeof(j->flags));
167	/* Now restore anything we meant to keep. */
168	j->flags.vfs = vfs;
169	j->flags.enter_vfs = enter_vfs;
170	j->flags.remount_proc_ro = remount_proc_ro;
171	j->flags.userns = userns;
172	/* Note, |pids| will already have been used before this call. */
173}
174
175/* Minijail API. */
176
177struct minijail API *minijail_new(void)
178{
179	return calloc(1, sizeof(struct minijail));
180}
181
182void API minijail_change_uid(struct minijail *j, uid_t uid)
183{
184	if (uid == 0)
185		die("useless change to uid 0");
186	j->uid = uid;
187	j->flags.uid = 1;
188}
189
190void API minijail_change_gid(struct minijail *j, gid_t gid)
191{
192	if (gid == 0)
193		die("useless change to gid 0");
194	j->gid = gid;
195	j->flags.gid = 1;
196}
197
198int API minijail_set_supplementary_gids(struct minijail *j, size_t size,
199					const gid_t *list)
200{
201	size_t i;
202
203	if (j->flags.usergroups)
204		die("cannot inherit *and* set supplementary groups");
205
206	if (size == 0)
207		return -EINVAL;
208
209	/* Copy the gid_t array. */
210	j->suppl_gid_list = calloc(size, sizeof(gid_t));
211	if (!j->suppl_gid_list) {
212		return -ENOMEM;
213	}
214	for (i = 0; i < size; i++) {
215		j->suppl_gid_list[i] = list[i];
216	}
217	j->suppl_gid_count = size;
218	j->flags.suppl_gids = 1;
219	return 0;
220}
221
222int API minijail_change_user(struct minijail *j, const char *user)
223{
224	char *buf = NULL;
225	struct passwd pw;
226	struct passwd *ppw = NULL;
227	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
228	if (sz == -1)
229		sz = 65536;	/* your guess is as good as mine... */
230
231	/*
232	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
233	 * the maximum needed size of the buffer, so we don't have to search.
234	 */
235	buf = malloc(sz);
236	if (!buf)
237		return -ENOMEM;
238	getpwnam_r(user, &pw, buf, sz, &ppw);
239	/*
240	 * We're safe to free the buffer here. The strings inside pw point
241	 * inside buf, but we don't use any of them; this leaves the pointers
242	 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
243	 */
244	free(buf);
245	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
246	if (!ppw)
247		return -1;
248	minijail_change_uid(j, ppw->pw_uid);
249	j->user = strdup(user);
250	if (!j->user)
251		return -ENOMEM;
252	j->usergid = ppw->pw_gid;
253	return 0;
254}
255
256int API minijail_change_group(struct minijail *j, const char *group)
257{
258	char *buf = NULL;
259	struct group gr;
260	struct group *pgr = NULL;
261	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
262	if (sz == -1)
263		sz = 65536;	/* and mine is as good as yours, really */
264
265	/*
266	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
267	 * the maximum needed size of the buffer, so we don't have to search.
268	 */
269	buf = malloc(sz);
270	if (!buf)
271		return -ENOMEM;
272	getgrnam_r(group, &gr, buf, sz, &pgr);
273	/*
274	 * We're safe to free the buffer here. The strings inside gr point
275	 * inside buf, but we don't use any of them; this leaves the pointers
276	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
277	 */
278	free(buf);
279	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
280	if (!pgr)
281		return -1;
282	minijail_change_gid(j, pgr->gr_gid);
283	return 0;
284}
285
286void API minijail_use_seccomp(struct minijail *j)
287{
288	j->flags.seccomp = 1;
289}
290
291void API minijail_no_new_privs(struct minijail *j)
292{
293	j->flags.no_new_privs = 1;
294}
295
296void API minijail_use_seccomp_filter(struct minijail *j)
297{
298	j->flags.seccomp_filter = 1;
299}
300
301void API minijail_log_seccomp_filter_failures(struct minijail *j)
302{
303	j->flags.log_seccomp_filter = 1;
304}
305
306void API minijail_use_caps(struct minijail *j, uint64_t capmask)
307{
308	j->caps = capmask;
309	j->flags.caps = 1;
310}
311
312void API minijail_reset_signal_mask(struct minijail* j) {
313	j->flags.reset_signal_mask = 1;
314}
315
316void API minijail_namespace_vfs(struct minijail *j)
317{
318	j->flags.vfs = 1;
319}
320
321void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
322{
323	int ns_fd = open(ns_path, O_RDONLY);
324	if (ns_fd < 0) {
325		pdie("failed to open namespace '%s'", ns_path);
326	}
327	j->mountns_fd = ns_fd;
328	j->flags.enter_vfs = 1;
329}
330
331void API minijail_namespace_pids(struct minijail *j)
332{
333	j->flags.vfs = 1;
334	j->flags.remount_proc_ro = 1;
335	j->flags.pids = 1;
336	j->flags.do_init = 1;
337}
338
339void API minijail_namespace_ipc(struct minijail *j)
340{
341	j->flags.ipc = 1;
342}
343
344void API minijail_namespace_net(struct minijail *j)
345{
346	j->flags.net = 1;
347}
348
349void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
350{
351	int ns_fd = open(ns_path, O_RDONLY);
352	if (ns_fd < 0) {
353		pdie("failed to open namespace '%s'", ns_path);
354	}
355	j->netns_fd = ns_fd;
356	j->flags.enter_net = 1;
357}
358
359void API minijail_remount_proc_readonly(struct minijail *j)
360{
361	j->flags.vfs = 1;
362	j->flags.remount_proc_ro = 1;
363}
364
365void API minijail_namespace_user(struct minijail *j)
366{
367	j->flags.userns = 1;
368}
369
370int API minijail_uidmap(struct minijail *j, const char *uidmap)
371{
372	j->uidmap = strdup(uidmap);
373	if (!j->uidmap)
374		return -ENOMEM;
375	char *ch;
376	for (ch = j->uidmap; *ch; ch++) {
377		if (*ch == ',')
378			*ch = '\n';
379	}
380	return 0;
381}
382
383int API minijail_gidmap(struct minijail *j, const char *gidmap)
384{
385	j->gidmap = strdup(gidmap);
386	if (!j->gidmap)
387		return -ENOMEM;
388	char *ch;
389	for (ch = j->gidmap; *ch; ch++) {
390		if (*ch == ',')
391			*ch = '\n';
392	}
393	return 0;
394}
395
396void API minijail_inherit_usergroups(struct minijail *j)
397{
398	j->flags.usergroups = 1;
399}
400
401void API minijail_run_as_init(struct minijail *j)
402{
403	/*
404	 * Since the jailed program will become 'init' in the new PID namespace,
405	 * Minijail does not need to fork an 'init' process.
406	 */
407	j->flags.do_init = 0;
408}
409
410int API minijail_enter_chroot(struct minijail *j, const char *dir)
411{
412	if (j->chrootdir)
413		return -EINVAL;
414	j->chrootdir = strdup(dir);
415	if (!j->chrootdir)
416		return -ENOMEM;
417	j->flags.chroot = 1;
418	return 0;
419}
420
421int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
422{
423	if (j->chrootdir)
424		return -EINVAL;
425	j->chrootdir = strdup(dir);
426	if (!j->chrootdir)
427		return -ENOMEM;
428	j->flags.pivot_root = 1;
429	return 0;
430}
431
/*
 * append_external_path: joins two path pieces with a '/' in between.
 * @external_path      Leading part (a path outside the chroot)
 * @path_inside_chroot Trailing part
 *
 * No separator normalization is done, so a trailing part that already
 * starts with '/' yields a doubled slash.
 *
 * Returns a newly allocated string the caller must free(), or NULL if
 * the allocation fails.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
445
446char API *minijail_get_original_path(struct minijail *j,
447				     const char *path_inside_chroot)
448{
449	struct mountpoint *b;
450
451	b = j->mounts_head;
452	while (b) {
453		/*
454		 * If |path_inside_chroot| is the exact destination of a
455		 * mount, then the original path is exactly the source of
456		 * the mount.
457		 *  for example: "-b /some/path/exe,/chroot/path/exe"
458		 *    mount source = /some/path/exe, mount dest =
459		 *    /chroot/path/exe Then when getting the original path of
460		 *    "/chroot/path/exe", the source of that mount,
461		 *    "/some/path/exe" is what should be returned.
462		 */
463		if (!strcmp(b->dest, path_inside_chroot))
464			return strdup(b->src);
465
466		/*
467		 * If |path_inside_chroot| is within the destination path of a
468		 * mount, take the suffix of the chroot path relative to the
469		 * mount destination path, and append it to the mount source
470		 * path.
471		 */
472		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
473			const char *relative_path =
474				path_inside_chroot + strlen(b->dest);
475			return append_external_path(b->src, relative_path);
476		}
477		b = b->next;
478	}
479
480	/* If there is a chroot path, append |path_inside_chroot| to that. */
481	if (j->chrootdir)
482		return append_external_path(j->chrootdir, path_inside_chroot);
483
484	/* No chroot, so the path outside is the same as it is inside. */
485	return strdup(path_inside_chroot);
486}
487
488void API minijail_mount_tmp(struct minijail *j)
489{
490	j->flags.mount_tmp = 1;
491}
492
493int API minijail_write_pid_file(struct minijail *j, const char *path)
494{
495	j->pid_file_path = strdup(path);
496	if (!j->pid_file_path)
497		return -ENOMEM;
498	j->flags.pid_file = 1;
499	return 0;
500}
501
502int API minijail_mount(struct minijail *j, const char *src, const char *dest,
503		       const char *type, unsigned long flags)
504{
505	struct mountpoint *m;
506
507	if (*dest != '/')
508		return -EINVAL;
509	m = calloc(1, sizeof(*m));
510	if (!m)
511		return -ENOMEM;
512	m->dest = strdup(dest);
513	if (!m->dest)
514		goto error;
515	m->src = strdup(src);
516	if (!m->src)
517		goto error;
518	m->type = strdup(type);
519	if (!m->type)
520		goto error;
521	m->flags = flags;
522
523	info("mount %s -> %s type %s", src, dest, type);
524
525	/*
526	 * Force vfs namespacing so the mounts don't leak out into the
527	 * containing vfs namespace.
528	 */
529	minijail_namespace_vfs(j);
530
531	if (j->mounts_tail)
532		j->mounts_tail->next = m;
533	else
534		j->mounts_head = m;
535	j->mounts_tail = m;
536	j->mounts_count++;
537
538	return 0;
539
540error:
541	free(m->src);
542	free(m->dest);
543	free(m);
544	return -ENOMEM;
545}
546
547int API minijail_bind(struct minijail *j, const char *src, const char *dest,
548		      int writeable)
549{
550	unsigned long flags = MS_BIND;
551
552	if (!writeable)
553		flags |= MS_RDONLY;
554
555	return minijail_mount(j, src, dest, "", flags);
556}
557
558void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
559{
560	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
561		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
562			warn("not loading seccomp filter,"
563			     " seccomp not supported");
564			return;
565		}
566	}
567	FILE *file = fopen(path, "r");
568	if (!file) {
569		pdie("failed to open seccomp filter file '%s'", path);
570	}
571
572	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
573	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
574		die("failed to compile seccomp filter BPF program in '%s'",
575		    path);
576	}
577
578	j->filter_len = fprog->len;
579	j->filter_prog = fprog;
580
581	fclose(file);
582}
583
584int API minijail_use_alt_syscall(struct minijail *j, const char *table)
585{
586	j->alt_syscall_table = strdup(table);
587	if (!j->alt_syscall_table)
588		return -ENOMEM;
589	j->flags.alt_syscall = 1;
590	return 0;
591}
592
/* Cursor used while serializing a minijail with marshal_append(). */
struct marshal_state {
	/* Bytes of space still free at |buf|. */
	size_t available;
	/* Bytes the full serialization needs, counted even when out of space. */
	size_t total;
	/* Next write position in the output buffer. */
	char *buf;
};
598
599void marshal_state_init(struct marshal_state *state,
600			char *buf, size_t available)
601{
602	state->available = available;
603	state->buf = buf;
604	state->total = 0;
605}
606
607void marshal_append(struct marshal_state *state,
608		    void *src, size_t length)
609{
610	size_t copy_len = MIN(state->available, length);
611
612	/* Up to |available| will be written. */
613	if (copy_len) {
614		memcpy(state->buf, src, copy_len);
615		state->buf += copy_len;
616		state->available -= copy_len;
617	}
618	/* |total| will contain the expected length. */
619	state->total += length;
620}
621
622void minijail_marshal_helper(struct marshal_state *state,
623			     const struct minijail *j)
624{
625	struct mountpoint *m = NULL;
626	marshal_append(state, (char *)j, sizeof(*j));
627	if (j->user)
628		marshal_append(state, j->user, strlen(j->user) + 1);
629	if (j->suppl_gid_list) {
630		marshal_append(state, j->suppl_gid_list,
631			       j->suppl_gid_count * sizeof(gid_t));
632	}
633	if (j->chrootdir)
634		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
635	if (j->alt_syscall_table) {
636		marshal_append(state, j->alt_syscall_table,
637			       strlen(j->alt_syscall_table) + 1);
638	}
639	if (j->flags.seccomp_filter && j->filter_prog) {
640		struct sock_fprog *fp = j->filter_prog;
641		marshal_append(state, (char *)fp->filter,
642				fp->len * sizeof(struct sock_filter));
643	}
644	for (m = j->mounts_head; m; m = m->next) {
645		marshal_append(state, m->src, strlen(m->src) + 1);
646		marshal_append(state, m->dest, strlen(m->dest) + 1);
647		marshal_append(state, m->type, strlen(m->type) + 1);
648		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
649	}
650}
651
652size_t API minijail_size(const struct minijail *j)
653{
654	struct marshal_state state;
655	marshal_state_init(&state, NULL, 0);
656	minijail_marshal_helper(&state, j);
657	return state.total;
658}
659
660int minijail_marshal(const struct minijail *j, char *buf, size_t available)
661{
662	struct marshal_state state;
663	marshal_state_init(&state, buf, available);
664	minijail_marshal_helper(&state, j);
665	return (state.total > available);
666}
667
/*
 * consumebytes: takes @length bytes off the front of a buffer.
 * @length    Number of bytes to take
 * @buf       In/out: current read position; advanced on success
 * @buflength In/out: bytes left at *@buf; reduced on success
 *
 * Returns a pointer to the first of the taken bytes, or NULL (leaving
 * both cursors untouched) when fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buf = start + length;
	*buflength -= length;
	return start;
}
685
/*
 * consumestr: takes a NUL-terminated string off the front of a buffer.
 * @buf       In/out: current read position; advanced past the NUL on success
 * @buflength In/out: bytes left at *@buf; reduced on success
 *
 * Returns a pointer to the start of the string, or NULL when no
 * terminator is found within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t remaining = *buflength;
	const size_t len = strnlen(*buf, remaining);

	/* strnlen() stopping short of the limit means it found a NUL. */
	if (len < remaining)
		return consumebytes(len + 1, buf, buflength);

	return NULL;
}
701
702int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
703{
704	size_t i;
705	size_t count;
706	int ret = -EINVAL;
707
708	if (length < sizeof(*j))
709		goto out;
710	memcpy((void *)j, serialized, sizeof(*j));
711	serialized += sizeof(*j);
712	length -= sizeof(*j);
713
714	/* Potentially stale pointers not used as signals. */
715	j->mounts_head = NULL;
716	j->mounts_tail = NULL;
717	j->filter_prog = NULL;
718
719	if (j->user) {		/* stale pointer */
720		char *user = consumestr(&serialized, &length);
721		if (!user)
722			goto clear_pointers;
723		j->user = strdup(user);
724		if (!j->user)
725			goto clear_pointers;
726	}
727
728	if (j->suppl_gid_list) {	/* stale pointer */
729		if (j->suppl_gid_count > NGROUPS_MAX) {
730			goto bad_gid_list;
731		}
732		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
733		void *gid_list_bytes =
734		    consumebytes(gid_list_size, &serialized, &length);
735		if (!gid_list_bytes)
736			goto bad_gid_list;
737
738		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
739		if (!j->suppl_gid_list)
740			goto bad_gid_list;
741
742		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
743	}
744
745	if (j->chrootdir) {	/* stale pointer */
746		char *chrootdir = consumestr(&serialized, &length);
747		if (!chrootdir)
748			goto bad_chrootdir;
749		j->chrootdir = strdup(chrootdir);
750		if (!j->chrootdir)
751			goto bad_chrootdir;
752	}
753
754	if (j->alt_syscall_table) {	/* stale pointer */
755		char *alt_syscall_table = consumestr(&serialized, &length);
756		if (!alt_syscall_table)
757			goto bad_syscall_table;
758		j->alt_syscall_table = strdup(alt_syscall_table);
759		if (!j->alt_syscall_table)
760			goto bad_syscall_table;
761	}
762
763	if (j->flags.seccomp_filter && j->filter_len > 0) {
764		size_t ninstrs = j->filter_len;
765		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
766		    ninstrs > USHRT_MAX)
767			goto bad_filters;
768
769		size_t program_len = ninstrs * sizeof(struct sock_filter);
770		void *program = consumebytes(program_len, &serialized, &length);
771		if (!program)
772			goto bad_filters;
773
774		j->filter_prog = malloc(sizeof(struct sock_fprog));
775		if (!j->filter_prog)
776			goto bad_filters;
777
778		j->filter_prog->len = ninstrs;
779		j->filter_prog->filter = malloc(program_len);
780		if (!j->filter_prog->filter)
781			goto bad_filter_prog_instrs;
782
783		memcpy(j->filter_prog->filter, program, program_len);
784	}
785
786	count = j->mounts_count;
787	j->mounts_count = 0;
788	for (i = 0; i < count; ++i) {
789		unsigned long *flags;
790		const char *dest;
791		const char *type;
792		const char *src = consumestr(&serialized, &length);
793		if (!src)
794			goto bad_mounts;
795		dest = consumestr(&serialized, &length);
796		if (!dest)
797			goto bad_mounts;
798		type = consumestr(&serialized, &length);
799		if (!type)
800			goto bad_mounts;
801		flags = consumebytes(sizeof(*flags), &serialized, &length);
802		if (!flags)
803			goto bad_mounts;
804		if (minijail_mount(j, src, dest, type, *flags))
805			goto bad_mounts;
806	}
807
808	return 0;
809
810bad_mounts:
811	if (j->flags.seccomp_filter && j->filter_len > 0) {
812		free(j->filter_prog->filter);
813		free(j->filter_prog);
814	}
815bad_filter_prog_instrs:
816	if (j->filter_prog)
817		free(j->filter_prog);
818bad_filters:
819	if (j->alt_syscall_table)
820		free(j->alt_syscall_table);
821bad_syscall_table:
822	if (j->chrootdir)
823		free(j->chrootdir);
824bad_chrootdir:
825	if (j->suppl_gid_list)
826		free(j->suppl_gid_list);
827bad_gid_list:
828	if (j->user)
829		free(j->user);
830clear_pointers:
831	j->user = NULL;
832	j->suppl_gid_list = NULL;
833	j->chrootdir = NULL;
834	j->alt_syscall_table = NULL;
835out:
836	return ret;
837}
838
839static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
840{
841	int fd, ret, len;
842	size_t sz;
843	char fname[32];
844	close(pipe_fds[0]);
845
846	sz = sizeof(fname);
847	if (j->uidmap) {
848		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
849		if (ret < 0 || (size_t)ret >= sz)
850			die("failed to write file name of uid_map");
851		fd = open(fname, O_WRONLY);
852		if (fd < 0)
853			pdie("failed to open '%s'", fname);
854		len = strlen(j->uidmap);
855		if (write(fd, j->uidmap, len) < len)
856			die("failed to set uid_map");
857		close(fd);
858	}
859	if (j->gidmap) {
860		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
861		if (ret < 0 || (size_t)ret >= sz)
862			die("failed to write file name of gid_map");
863		fd = open(fname, O_WRONLY);
864		if (fd < 0)
865			pdie("failed to open '%s'", fname);
866		len = strlen(j->gidmap);
867		if (write(fd, j->gidmap, len) < len)
868			die("failed to set gid_map");
869		close(fd);
870	}
871
872	close(pipe_fds[1]);
873}
874
875static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
876{
877	char buf;
878
879	close(pipe_fds[1]);
880
881	/* Wait for parent to set up uid/gid mappings. */
882	if (read(pipe_fds[0], &buf, 1) != 0)
883		die("failed to sync with parent");
884	close(pipe_fds[0]);
885
886	if (j->uidmap && setresuid(0, 0, 0))
887		pdie("setresuid");
888	if (j->gidmap && setresgid(0, 0, 0))
889		pdie("setresgid");
890}
891
892/*
893 * mount_one: Applies mounts from @m for @j, recursing as needed.
894 * @j Minijail these mounts are for
895 * @m Head of list of mounts
896 *
897 * Returns 0 for success.
898 */
899static int mount_one(const struct minijail *j, struct mountpoint *m)
900{
901	int ret;
902	char *dest;
903	int remount_ro = 0;
904
905	/* dest has a leading "/" */
906	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
907		return -ENOMEM;
908
909	/*
910	 * R/O bind mounts have to be remounted since bind and ro can't both be
911	 * specified in the original bind mount. Remount R/O after the initial
912	 * mount.
913	 */
914	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
915		remount_ro = 1;
916		m->flags &= ~MS_RDONLY;
917	}
918
919	ret = mount(m->src, dest, m->type, m->flags, NULL);
920	if (ret)
921		pdie("mount: %s -> %s", m->src, dest);
922
923	if (remount_ro) {
924		m->flags |= MS_RDONLY;
925		ret = mount(m->src, dest, NULL,
926			    m->flags | MS_REMOUNT, NULL);
927		if (ret)
928			pdie("bind ro: %s -> %s", m->src, dest);
929	}
930
931	free(dest);
932	if (m->next)
933		return mount_one(j, m->next);
934	return ret;
935}
936
937int enter_chroot(const struct minijail *j)
938{
939	int ret;
940
941	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
942		return ret;
943
944	if (chroot(j->chrootdir))
945		return -errno;
946
947	if (chdir("/"))
948		return -errno;
949
950	return 0;
951}
952
953int enter_pivot_root(const struct minijail *j)
954{
955	int ret, oldroot, newroot;
956
957	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
958		return ret;
959
960	/*
961	 * Keep the fd for both old and new root.
962	 * It will be used in fchdir later.
963	 */
964	oldroot = open("/", O_DIRECTORY | O_RDONLY);
965	if (oldroot < 0)
966		pdie("failed to open / for fchdir");
967	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
968	if (newroot < 0)
969		pdie("failed to open %s for fchdir", j->chrootdir);
970
971	/*
972	 * To ensure chrootdir is the root of a file system,
973	 * do a self bind mount.
974	 */
975	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
976		pdie("failed to bind mount '%s'", j->chrootdir);
977	if (chdir(j->chrootdir))
978		return -errno;
979	if (syscall(SYS_pivot_root, ".", "."))
980		pdie("pivot_root");
981
982	/*
983	 * Now the old root is mounted on top of the new root. Use fchdir to
984	 * change to the old root and unmount it.
985	 */
986	if (fchdir(oldroot))
987		pdie("failed to fchdir to old /");
988	/* The old root might be busy, so use lazy unmount. */
989	if (umount2(".", MNT_DETACH))
990		pdie("umount(/)");
991	/* Change back to the new root. */
992	if (fchdir(newroot))
993		return -errno;
994	if (chroot("/"))
995		return -errno;
996	/* Set correct CWD for getcwd(3). */
997	if (chdir("/"))
998		return -errno;
999
1000	return 0;
1001}
1002
/*
 * mount_tmp: mounts a 64M tmpfs with mode 777 on /tmp.
 *
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *tmpfs_opts = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_opts);
}
1007
1008int remount_proc_readonly(const struct minijail *j)
1009{
1010	const char *kProcPath = "/proc";
1011	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1012	/*
1013	 * Right now, we're holding a reference to our parent's old mount of
1014	 * /proc in our namespace, which means using MS_REMOUNT here would
1015	 * mutate our parent's mount as well, even though we're in a VFS
1016	 * namespace (!). Instead, remove their mount from our namespace
1017	 * and make our own. However, if we are in a new user namespace, /proc
1018	 * is not seen as mounted, so don't return error if umount() fails.
1019	 */
1020	if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
1021		return -errno;
1022	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1023		return -errno;
1024	return 0;
1025}
1026
1027static void write_pid_file(const struct minijail *j)
1028{
1029	FILE *fp = fopen(j->pid_file_path, "w");
1030
1031	if (!fp)
1032		pdie("failed to open '%s'", j->pid_file_path);
1033	if (fprintf(fp, "%d\n", (int)j->initpid) < 0)
1034		pdie("fprintf(%s)", j->pid_file_path);
1035	if (fclose(fp))
1036		pdie("fclose(%s)", j->pid_file_path);
1037}
1038
1039void drop_ugid(const struct minijail *j)
1040{
1041	if (j->flags.usergroups && j->flags.suppl_gids) {
1042		die("tried to inherit *and* set supplementary groups;"
1043		    " can only do one");
1044	}
1045
1046	if (j->flags.usergroups) {
1047		if (initgroups(j->user, j->usergid))
1048			pdie("initgroups");
1049	} else if (j->flags.suppl_gids) {
1050		if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1051			pdie("setgroups");
1052		}
1053	} else {
1054		/*
1055		 * Only attempt to clear supplementary groups if we are changing
1056		 * users.
1057		 */
1058		if ((j->uid || j->gid) && setgroups(0, NULL))
1059			pdie("setgroups");
1060	}
1061
1062	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1063		pdie("setresgid");
1064
1065	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1066		pdie("setresuid");
1067}
1068
/*
 * get_last_valid_cap: reads the highest capability number supported by
 * the running kernel from /proc/sys/kernel/cap_last_cap.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says).  If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).  So suck up the answer via /proc.
 *
 * Failing to open or parse the file is fatal (pdie()).
 */
static unsigned int get_last_valid_cap(void)
{
	const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
	FILE *fp = fopen(cap_file, "re");
	unsigned int last_valid_cap;

	/* Without this check fscanf() would be handed a NULL stream. */
	if (!fp)
		pdie("fopen(%s)", cap_file);
	if (fscanf(fp, "%u", &last_valid_cap) != 1)
		pdie("fscanf(%s)", cap_file);
	fclose(fp);

	return last_valid_cap;
}
1088
1089void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
1090{
1091	cap_t caps = cap_get_proc();
1092	cap_value_t flag[1];
1093	const uint64_t one = 1;
1094	unsigned int i;
1095	if (!caps)
1096		die("can't get process caps");
1097	if (cap_clear_flag(caps, CAP_INHERITABLE))
1098		die("can't clear inheritable caps");
1099	if (cap_clear_flag(caps, CAP_EFFECTIVE))
1100		die("can't clear effective caps");
1101	if (cap_clear_flag(caps, CAP_PERMITTED))
1102		die("can't clear permitted caps");
1103	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1104		/* Keep CAP_SETPCAP for dropping bounding set bits. */
1105		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
1106			continue;
1107		flag[0] = i;
1108		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
1109			die("can't add effective cap");
1110		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
1111			die("can't add permitted cap");
1112		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
1113			die("can't add inheritable cap");
1114	}
1115	if (cap_set_proc(caps))
1116		die("can't apply initial cleaned capset");
1117
1118	/*
1119	 * Instead of dropping bounding set first, do it here in case
1120	 * the caller had a more permissive bounding set which could
1121	 * have been used above to raise a capability that wasn't already
1122	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1123	 */
1124	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1125		if (j->caps & (one << i))
1126			continue;
1127		if (prctl(PR_CAPBSET_DROP, i))
1128			pdie("prctl(PR_CAPBSET_DROP)");
1129	}
1130
1131	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
1132	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
1133		flag[0] = CAP_SETPCAP;
1134		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1135			die("can't clear effective cap");
1136		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1137			die("can't clear permitted cap");
1138		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1139			die("can't clear inheritable cap");
1140	}
1141
1142	if (cap_set_proc(caps))
1143		die("can't apply final cleaned capset");
1144
1145	cap_free(caps);
1146}
1147
1148void set_seccomp_filter(const struct minijail *j)
1149{
1150	/*
1151	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1152	 * in the kernel source tree for an explanation of the parameters.
1153	 */
1154	if (j->flags.no_new_privs) {
1155		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1156			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1157	}
1158
1159	/*
1160	 * If we're logging seccomp filter failures,
1161	 * install the SIGSYS handler first.
1162	 */
1163	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1164		if (install_sigsys_handler())
1165			pdie("install SIGSYS handler");
1166		warn("logging seccomp filter failures");
1167	}
1168
1169	/*
1170	 * Install the syscall filter.
1171	 */
1172	if (j->flags.seccomp_filter) {
1173		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1174			  j->filter_prog)) {
1175			if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1176				warn("seccomp not supported");
1177				return;
1178			}
1179			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
1180		}
1181	}
1182}
1183
1184void API minijail_enter(const struct minijail *j)
1185{
1186	/*
1187	 * If we're dropping caps, get the last valid cap from /proc now,
1188	 * since /proc can be unmounted before drop_caps() is called.
1189	 */
1190	unsigned int last_valid_cap = 0;
1191	if (j->flags.caps)
1192		last_valid_cap = get_last_valid_cap();
1193
1194	if (j->flags.pids)
1195		die("tried to enter a pid-namespaced jail;"
1196		    " try minijail_run()?");
1197
1198	if (j->flags.usergroups && !j->user)
1199		die("usergroup inheritance without username");
1200
1201	/*
1202	 * We can't recover from failures if we've dropped privileges partially,
1203	 * so we don't even try. If any of our operations fail, we abort() the
1204	 * entire process.
1205	 */
1206	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1207		pdie("setns(CLONE_NEWNS)");
1208
1209	if (j->flags.vfs) {
1210		if (unshare(CLONE_NEWNS))
1211			pdie("unshare(vfs)");
1212		/*
1213		 * Remount all filesystems as private. If they are shared
1214		 * new bind mounts will creep out of our namespace.
1215		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1216		 */
1217		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1218			pdie("mount(/, private)");
1219	}
1220
1221	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1222		pdie("unshare(ipc)");
1223	}
1224
1225	if (j->flags.enter_net) {
1226		if (setns(j->netns_fd, CLONE_NEWNET))
1227			pdie("setns(CLONE_NEWNET)");
1228	} else if (j->flags.net && unshare(CLONE_NEWNET)) {
1229		pdie("unshare(net)");
1230	}
1231
1232	if (j->flags.chroot && enter_chroot(j))
1233		pdie("chroot");
1234
1235	if (j->flags.pivot_root && enter_pivot_root(j))
1236		pdie("pivot_root");
1237
1238	if (j->flags.mount_tmp && mount_tmp())
1239		pdie("mount_tmp");
1240
1241	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
1242		pdie("remount");
1243
1244	if (j->flags.caps) {
1245		/*
1246		 * POSIX capabilities are a bit tricky. If we drop our
1247		 * capability to change uids, our attempt to use setuid()
1248		 * below will fail. Hang on to root caps across setuid(), then
1249		 * lock securebits.
1250		 */
1251		if (prctl(PR_SET_KEEPCAPS, 1))
1252			pdie("prctl(PR_SET_KEEPCAPS)");
1253		if (prctl
1254		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1255			pdie("prctl(PR_SET_SECUREBITS)");
1256	}
1257
1258	/*
1259	 * If we're setting no_new_privs, we can drop privileges
1260	 * before setting seccomp filter. This way filter policies
1261	 * don't need to allow privilege-dropping syscalls.
1262	 */
1263	if (j->flags.no_new_privs) {
1264		drop_ugid(j);
1265		if (j->flags.caps)
1266			drop_caps(j, last_valid_cap);
1267
1268		set_seccomp_filter(j);
1269	} else {
1270		/*
1271		 * If we're not setting no_new_privs,
1272		 * we need to set seccomp filter *before* dropping privileges.
1273		 * WARNING: this means that filter policies *must* allow
1274		 * setgroups()/setresgid()/setresuid() for dropping root and
1275		 * capget()/capset()/prctl() for dropping caps.
1276		 */
1277		set_seccomp_filter(j);
1278
1279		drop_ugid(j);
1280		if (j->flags.caps)
1281			drop_caps(j, last_valid_cap);
1282	}
1283
1284	/*
1285	 * Select the specified alternate syscall table.  The table must not
1286	 * block prctl(2) if we're using seccomp as well.
1287	 */
1288	if (j->flags.alt_syscall) {
1289		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1290			pdie("prctl(PR_ALT_SYSCALL)");
1291	}
1292
1293	/*
1294	 * seccomp has to come last since it cuts off all the other
1295	 * privilege-dropping syscalls :)
1296	 */
1297	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1298		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1299			warn("seccomp not supported");
1300			return;
1301		}
1302		pdie("prctl(PR_SET_SECCOMP)");
1303	}
1304}
1305
/* TODO(wad) will visibility affect this variable? */
/*
 * Wait status of the root process, as last recorded by init().
 * It is written from normal control flow and read from the SIGTERM handler
 * below, so it must be a volatile sig_atomic_t for that access to be
 * well-defined.
 */
static volatile sig_atomic_t init_exitstatus = 0;

/*
 * SIGTERM handler for the namespace init process: exits immediately, passing
 * the currently recorded value of |init_exitstatus| to _exit(). The signal
 * number is ignored.
 */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
1313
1314int init(pid_t rootpid)
1315{
1316	pid_t pid;
1317	int status;
1318	/* so that we exit with the right status */
1319	signal(SIGTERM, init_term);
1320	/* TODO(wad) self jail with seccomp_filters here. */
1321	while ((pid = wait(&status)) > 0) {
1322		/*
1323		 * This loop will only end when either there are no processes
1324		 * left inside our pid namespace or we get a signal.
1325		 */
1326		if (pid == rootpid)
1327			init_exitstatus = status;
1328	}
1329	if (!WIFEXITED(init_exitstatus))
1330		_exit(MINIJAIL_ERR_INIT);
1331	_exit(WEXITSTATUS(init_exitstatus));
1332}
1333
1334int API minijail_from_fd(int fd, struct minijail *j)
1335{
1336	size_t sz = 0;
1337	size_t bytes = read(fd, &sz, sizeof(sz));
1338	char *buf;
1339	int r;
1340	if (sizeof(sz) != bytes)
1341		return -EINVAL;
1342	if (sz > USHRT_MAX)	/* arbitrary sanity check */
1343		return -E2BIG;
1344	buf = malloc(sz);
1345	if (!buf)
1346		return -ENOMEM;
1347	bytes = read(fd, buf, sz);
1348	if (bytes != sz) {
1349		free(buf);
1350		return -EINVAL;
1351	}
1352	r = minijail_unmarshal(j, buf, sz);
1353	free(buf);
1354	return r;
1355}
1356
1357int API minijail_to_fd(struct minijail *j, int fd)
1358{
1359	char *buf;
1360	size_t sz = minijail_size(j);
1361	ssize_t written;
1362	int r;
1363
1364	if (!sz)
1365		return -EINVAL;
1366	buf = malloc(sz);
1367	r = minijail_marshal(j, buf, sz);
1368	if (r) {
1369		free(buf);
1370		return r;
1371	}
1372	/* Sends [size][minijail]. */
1373	written = write(fd, &sz, sizeof(sz));
1374	if (written != sizeof(sz)) {
1375		free(buf);
1376		return -EFAULT;
1377	}
1378	written = write(fd, buf, sz);
1379	if (written < 0 || (size_t) written != sz) {
1380		free(buf);
1381		return -EFAULT;
1382	}
1383	free(buf);
1384	return 0;
1385}
1386
1387int setup_preload(void)
1388{
1389#if defined(__ANDROID__)
1390	/* Don't use LDPRELOAD on Brillo. */
1391	return 0;
1392#else
1393	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1394	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1395	if (!newenv)
1396		return -ENOMEM;
1397
1398	/* Only insert a separating space if we have something to separate... */
1399	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1400		PRELOADPATH);
1401
1402	/* setenv() makes a copy of the string we give it. */
1403	setenv(kLdPreloadEnvVar, newenv, 1);
1404	free(newenv);
1405	return 0;
1406#endif
1407}
1408
1409int setup_pipe(int fds[2])
1410{
1411	int r = pipe(fds);
1412	char fd_buf[11];
1413	if (r)
1414		return r;
1415	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1416	if (r <= 0)
1417		return -EINVAL;
1418	setenv(kFdEnvVar, fd_buf, 1);
1419	return 0;
1420}
1421
/*
 * setup_pipe_end: keeps one end of the pipe |fds| and closes the other.
 *
 * |index| selects the end to keep (0 or 1). Returns the kept descriptor,
 * or -1 without closing anything if |index| is out of range.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t unused_end;

	if (index != 0 && index != 1)
		return -1;

	unused_end = (index == 0) ? 1 : 0;
	close(fds[unused_end]);
	return fds[index];
}
1430
/*
 * setup_and_dupe_pipe_end: makes |fd| refer to one end of the pipe |fds|.
 *
 * |index| selects the pipe end (0 or 1). The other end is closed, the
 * selected end is duplicated onto |fd| with dup2(2), and the original
 * descriptor of the selected end is then closed so that only |fd| remains
 * open across a later exec.
 *
 * Returns |fd| on success, or -1 if |index| is out of range or dup2() fails.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	int ret;

	if (index > 1)
		return -1;

	close(fds[1 - index]);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	ret = dup2(fds[index], fd);
	/*
	 * Drop the now-redundant original descriptor. When it already equals
	 * |fd|, dup2() was a no-op and closing it would close the target.
	 */
	if (ret >= 0 && fds[index] != fd)
		close(fds[index]);
	return ret;
}
1440
1441int minijail_run_internal(struct minijail *j, const char *filename,
1442			  char *const argv[], pid_t *pchild_pid,
1443			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1444			  int use_preload);
1445
1446int API minijail_run(struct minijail *j, const char *filename,
1447		     char *const argv[])
1448{
1449	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1450				     true);
1451}
1452
1453int API minijail_run_pid(struct minijail *j, const char *filename,
1454			 char *const argv[], pid_t *pchild_pid)
1455{
1456	return minijail_run_internal(j, filename, argv, pchild_pid,
1457				     NULL, NULL, NULL, true);
1458}
1459
1460int API minijail_run_pipe(struct minijail *j, const char *filename,
1461			  char *const argv[], int *pstdin_fd)
1462{
1463	return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1464				     NULL, NULL, true);
1465}
1466
1467int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1468			       char *const argv[], pid_t *pchild_pid,
1469			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1470{
1471	return minijail_run_internal(j, filename, argv, pchild_pid,
1472				     pstdin_fd, pstdout_fd, pstderr_fd, true);
1473}
1474
1475int API minijail_run_no_preload(struct minijail *j, const char *filename,
1476				char *const argv[])
1477{
1478	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1479				     false);
1480}
1481
1482int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1483					  const char *filename,
1484					  char *const argv[],
1485					  pid_t *pchild_pid,
1486					  int *pstdin_fd, int *pstdout_fd,
1487					  int *pstderr_fd) {
1488	return minijail_run_internal(j, filename, argv, pchild_pid,
1489				     pstdin_fd, pstdout_fd, pstderr_fd, false);
1490}
1491
1492int minijail_run_internal(struct minijail *j, const char *filename,
1493			  char *const argv[], pid_t *pchild_pid,
1494			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1495			  int use_preload)
1496{
1497	char *oldenv, *oldenv_copy = NULL;
1498	pid_t child_pid;
1499	int pipe_fds[2];
1500	int stdin_fds[2];
1501	int stdout_fds[2];
1502	int stderr_fds[2];
1503	int userns_pipe_fds[2];
1504	int ret;
1505	/* We need to remember this across the minijail_preexec() call. */
1506	int pid_namespace = j->flags.pids;
1507	int do_init = j->flags.do_init;
1508
1509	if (use_preload) {
1510		oldenv = getenv(kLdPreloadEnvVar);
1511		if (oldenv) {
1512			oldenv_copy = strdup(oldenv);
1513			if (!oldenv_copy)
1514				return -ENOMEM;
1515		}
1516
1517		if (setup_preload())
1518			return -EFAULT;
1519	}
1520
1521	if (!use_preload) {
1522		if (j->flags.caps)
1523			die("Capabilities are not supported without "
1524			    "LD_PRELOAD");
1525	}
1526
1527	/*
1528	 * Make the process group ID of this process equal to its PID, so that
1529	 * both the Minijail process and the jailed process can be killed
1530	 * together.
1531	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1532	 * the process is already a process group leader.
1533	 */
1534	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1535		if (errno != EPERM) {
1536			pdie("setpgid(0, 0)");
1537		}
1538	}
1539
1540	if (use_preload) {
1541		/*
1542		 * Before we fork(2) and execve(2) the child process, we need
1543		 * to open a pipe(2) to send the minijail configuration over.
1544		 */
1545		if (setup_pipe(pipe_fds))
1546			return -EFAULT;
1547	}
1548
1549	/*
1550	 * If we want to write to the child process' standard input,
1551	 * create the pipe(2) now.
1552	 */
1553	if (pstdin_fd) {
1554		if (pipe(stdin_fds))
1555			return -EFAULT;
1556	}
1557
1558	/*
1559	 * If we want to read from the child process' standard output,
1560	 * create the pipe(2) now.
1561	 */
1562	if (pstdout_fd) {
1563		if (pipe(stdout_fds))
1564			return -EFAULT;
1565	}
1566
1567	/*
1568	 * If we want to read from the child process' standard error,
1569	 * create the pipe(2) now.
1570	 */
1571	if (pstderr_fd) {
1572		if (pipe(stderr_fds))
1573			return -EFAULT;
1574	}
1575
1576	/*
1577	 * If we want to set up a new uid/gid mapping in the user namespace,
1578	 * create the pipe(2) to sync between parent and child.
1579	 */
1580	if (j->flags.userns) {
1581		if (pipe(userns_pipe_fds))
1582			return -EFAULT;
1583	}
1584
1585	/*
1586	 * Use sys_clone() if and only if we're creating a pid namespace.
1587	 *
1588	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1589	 *
1590	 * In multithreaded programs, there are a bunch of locks inside libc,
1591	 * some of which may be held by other threads at the time that we call
1592	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1593	 * ensure that we hold all of these locks before it calls clone()
1594	 * internally and drop them after clone() returns, but when we call
1595	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1596	 * child address space where some of libc's important locks are held by
1597	 * other threads (which did not get cloned, and hence will never release
1598	 * those locks). This is okay so long as we call exec() immediately
1599	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1600	 * take locks.
1601	 *
1602	 * Hence, only call sys_clone() if we need to, in order to get at pid
1603	 * namespacing. If we follow this path, the child's address space might
1604	 * have broken locks; you may only call functions that do not acquire
1605	 * any locks.
1606	 *
1607	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1608	 * previously detailed, so this function is highly likely to deadlock
1609	 * later on (see "deadlock here") if we're multithreaded.
1610	 *
1611	 * We might hack around this by having the clone()d child (init of the
1612	 * pid namespace) return directly, rather than leaving the clone()d
1613	 * process hanging around to be init for the new namespace (and having
1614	 * its fork()ed child return in turn), but that process would be crippled
1615	 * with its libc locks potentially broken. We might try fork()ing in the
1616	 * parent before we clone() to ensure that we own all the locks, but
1617	 * then we have to have the forked child hanging around consuming
1618	 * resources (and possibly having file descriptors / shared memory
1619	 * regions / etc attached). We'd need to keep the child around to avoid
1620	 * having its children get reparented to init.
1621	 *
1622	 * TODO(ellyjones): figure out if the "forked child hanging around"
1623	 * problem is fixable or not. It would be nice if we worked in this
1624	 * case.
1625	 */
1626	if (pid_namespace) {
1627		int clone_flags = CLONE_NEWPID | SIGCHLD;
1628		if (j->flags.userns)
1629			clone_flags |= CLONE_NEWUSER;
1630		child_pid = syscall(SYS_clone, clone_flags, NULL);
1631	} else {
1632		child_pid = fork();
1633	}
1634
1635	if (child_pid < 0) {
1636		if (use_preload) {
1637			free(oldenv_copy);
1638		}
1639		die("failed to fork child");
1640	}
1641
1642	if (child_pid) {
1643		if (use_preload) {
1644			/* Restore parent's LD_PRELOAD. */
1645			if (oldenv_copy) {
1646				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1647				free(oldenv_copy);
1648			} else {
1649				unsetenv(kLdPreloadEnvVar);
1650			}
1651			unsetenv(kFdEnvVar);
1652		}
1653
1654		j->initpid = child_pid;
1655
1656		if (j->flags.pid_file)
1657			write_pid_file(j);
1658
1659		if (j->flags.userns)
1660			write_ugid_mappings(j, userns_pipe_fds);
1661
1662		if (use_preload) {
1663			/* Send marshalled minijail. */
1664			close(pipe_fds[0]);	/* read endpoint */
1665			ret = minijail_to_fd(j, pipe_fds[1]);
1666			close(pipe_fds[1]);	/* write endpoint */
1667			if (ret) {
1668				kill(j->initpid, SIGKILL);
1669				die("failed to send marshalled minijail");
1670			}
1671		}
1672
1673		if (pchild_pid)
1674			*pchild_pid = child_pid;
1675
1676		/*
1677		 * If we want to write to the child process' standard input,
1678		 * set up the write end of the pipe.
1679		 */
1680		if (pstdin_fd)
1681			*pstdin_fd = setup_pipe_end(stdin_fds,
1682						    1 /* write end */);
1683
1684		/*
1685		 * If we want to read from the child process' standard output,
1686		 * set up the read end of the pipe.
1687		 */
1688		if (pstdout_fd)
1689			*pstdout_fd = setup_pipe_end(stdout_fds,
1690						     0 /* read end */);
1691
1692		/*
1693		 * If we want to read from the child process' standard error,
1694		 * set up the read end of the pipe.
1695		 */
1696		if (pstderr_fd)
1697			*pstderr_fd = setup_pipe_end(stderr_fds,
1698						     0 /* read end */);
1699
1700		return 0;
1701	}
1702	free(oldenv_copy);
1703
1704	if (j->flags.reset_signal_mask) {
1705		sigset_t signal_mask;
1706		if (sigemptyset(&signal_mask) != 0)
1707			pdie("sigemptyset failed");
1708		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1709			pdie("sigprocmask failed");
1710	}
1711
1712	if (j->flags.userns)
1713		enter_user_namespace(j, userns_pipe_fds);
1714
1715	/*
1716	 * If we want to write to the jailed process' standard input,
1717	 * set up the read end of the pipe.
1718	 */
1719	if (pstdin_fd) {
1720		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1721					    STDIN_FILENO) < 0)
1722			die("failed to set up stdin pipe");
1723	}
1724
1725	/*
1726	 * If we want to read from the jailed process' standard output,
1727	 * set up the write end of the pipe.
1728	 */
1729	if (pstdout_fd) {
1730		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1731					    STDOUT_FILENO) < 0)
1732			die("failed to set up stdout pipe");
1733	}
1734
1735	/*
1736	 * If we want to read from the jailed process' standard error,
1737	 * set up the write end of the pipe.
1738	 */
1739	if (pstderr_fd) {
1740		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1741					    STDERR_FILENO) < 0)
1742			die("failed to set up stderr pipe");
1743	}
1744
1745	/* If running an init program, let it decide when/how to mount /proc. */
1746	if (pid_namespace && !do_init)
1747		j->flags.remount_proc_ro = 0;
1748
1749	if (use_preload) {
1750		/* Strip out flags that cannot be inherited across execve(2). */
1751		minijail_preexec(j);
1752	} else {
1753		j->flags.pids = 0;
1754	}
1755	/* Jail this process, then execve() the target. */
1756	minijail_enter(j);
1757
1758	if (pid_namespace && do_init) {
1759		/*
1760		 * pid namespace: this process will become init inside the new
1761		 * namespace. We don't want all programs we might exec to have
1762		 * to know how to be init. Normally (do_init == 1) we fork off
1763		 * a child to actually run the program. If |do_init == 0|, we
1764		 * let the program keep pid 1 and be init.
1765		 *
1766		 * If we're multithreaded, we'll probably deadlock here. See
1767		 * WARNING above.
1768		 */
1769		child_pid = fork();
1770		if (child_pid < 0)
1771			_exit(child_pid);
1772		else if (child_pid > 0)
1773			init(child_pid);	/* never returns */
1774	}
1775
1776	/*
1777	 * If we aren't pid-namespaced, or the jailed program asked to be init:
1778	 *   calling process
1779	 *   -> execve()-ing process
1780	 * If we are:
1781	 *   calling process
1782	 *   -> init()-ing process
1783	 *      -> execve()-ing process
1784	 */
1785	_exit(execve(filename, argv, environ));
1786}
1787
1788int API minijail_kill(struct minijail *j)
1789{
1790	int st;
1791	if (kill(j->initpid, SIGTERM))
1792		return -errno;
1793	if (waitpid(j->initpid, &st, 0) < 0)
1794		return -errno;
1795	return st;
1796}
1797
1798int API minijail_wait(struct minijail *j)
1799{
1800	int st;
1801	if (waitpid(j->initpid, &st, 0) < 0)
1802		return -errno;
1803
1804	if (!WIFEXITED(st)) {
1805		int error_status = st;
1806		if (WIFSIGNALED(st)) {
1807			int signum = WTERMSIG(st);
1808			warn("child process %d received signal %d",
1809			     j->initpid, signum);
1810			/*
1811			 * We return MINIJAIL_ERR_JAIL if the process received
1812			 * SIGSYS, which happens when a syscall is blocked by
1813			 * seccomp filters.
1814			 * If not, we do what bash(1) does:
1815			 * $? = 128 + signum
1816			 */
1817			if (signum == SIGSYS) {
1818				error_status = MINIJAIL_ERR_JAIL;
1819			} else {
1820				error_status = 128 + signum;
1821			}
1822		}
1823		return error_status;
1824	}
1825
1826	int exit_status = WEXITSTATUS(st);
1827	if (exit_status != 0)
1828		info("child process %d exited with status %d",
1829		     j->initpid, exit_status);
1830
1831	return exit_status;
1832}
1833
1834void API minijail_destroy(struct minijail *j)
1835{
1836	if (j->flags.seccomp_filter && j->filter_prog) {
1837		free(j->filter_prog->filter);
1838		free(j->filter_prog);
1839	}
1840	while (j->mounts_head) {
1841		struct mountpoint *m = j->mounts_head;
1842		j->mounts_head = j->mounts_head->next;
1843		free(m->type);
1844		free(m->dest);
1845		free(m->src);
1846		free(m);
1847	}
1848	j->mounts_tail = NULL;
1849	if (j->user)
1850		free(j->user);
1851	if (j->suppl_gid_list)
1852		free(j->suppl_gid_list);
1853	if (j->chrootdir)
1854		free(j->chrootdir);
1855	if (j->alt_syscall_table)
1856		free(j->alt_syscall_table);
1857	free(j);
1858}
1859