libminijail.c revision 4cbc2a522e1bc88424905bee32199af1c0fdbd20
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _DEFAULT_SOURCE
8#define _GNU_SOURCE
9
10#include <asm/unistd.h>
11#include <ctype.h>
12#include <errno.h>
13#include <fcntl.h>
14#include <grp.h>
15#include <inttypes.h>
16#include <limits.h>
17#include <linux/capability.h>
18#include <pwd.h>
19#include <sched.h>
20#include <signal.h>
21#include <stdarg.h>
22#include <stdbool.h>
23#include <stddef.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <syscall.h>
28#include <sys/capability.h>
29#include <sys/mount.h>
30#include <sys/param.h>
31#include <sys/prctl.h>
32#include <sys/stat.h>
33#include <sys/types.h>
34#include <sys/user.h>
35#include <sys/utsname.h>
36#include <sys/wait.h>
37#include <unistd.h>
38
39#include "libminijail.h"
40#include "libminijail-private.h"
41
42#include "signal_handler.h"
43#include "syscall_filter.h"
44#include "util.h"
45
/*
 * Securebits masks: use the kernel headers when the build provides them,
 * otherwise define the values locally.
 */
#ifdef HAVE_SECUREBITS_H
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS 0x55
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif

/* Masks for kernels older than 4.3, which have one securebit fewer. */
#define OLD_SECURE_ALL_BITS 0x15
#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1)

/*
 * Brillo devices are built against 4.4 kernel headers, and kernel 4.3 added a
 * securebit. Whenever a securebit is added, the new SECURE_ALL_BITS mask is
 * rejected with EPERM by older kernels, so pin the expected value here and
 * let the build fail when the headers change it.
 */
#ifdef __BRILLO__
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
#endif

/* prctl(2) options that linux/prctl.h may not provide yet. */
#ifndef PR_SET_SECCOMP
#define PR_SET_SECCOMP 22
#endif

#ifndef PR_ALT_SYSCALL
#define PR_ALT_SYSCALL 0x43724f53
#endif

/* Needed for seccomp filters written in BPF. */
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif
#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

/* Build-time switch: SECCOMP_SOFTFAIL is 1 iff USE_SECCOMP_SOFTFAIL is set. */
#ifdef USE_SECCOMP_SOFTFAIL
#define SECCOMP_SOFTFAIL 1
#else
#define SECCOMP_SOFTFAIL 0
#endif

/* The cgroup namespace flag may be missing from older linux-headers. */
#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000
#endif

#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
97
/* One entry in a jail's singly-linked list of requested mounts. */
struct mountpoint {
	char *src;		/* mount source, handed to mount(2) unchanged */
	char *dest;		/* target path; appended to the chroot dir */
	char *type;		/* filesystem type string for mount(2) */
	unsigned long flags;	/* MS_* mount flags */
	struct mountpoint *next;	/* following entry, NULL at the tail */
};
105
106struct minijail {
107	/*
108	 * WARNING: if you add a flag here you need to make sure it's
109	 * accounted for in minijail_pre{enter|exec}() below.
110	 */
111	struct {
112		int uid:1;
113		int gid:1;
114		int usergroups:1;
115		int suppl_gids:1;
116		int use_caps:1;
117		int capbset_drop:1;
118		int vfs:1;
119		int enter_vfs:1;
120		int skip_remount_private:1;
121		int pids:1;
122		int ipc:1;
123		int net:1;
124		int enter_net:1;
125		int ns_cgroups:1;
126		int userns:1;
127		int seccomp:1;
128		int remount_proc_ro:1;
129		int no_new_privs:1;
130		int seccomp_filter:1;
131		int log_seccomp_filter:1;
132		int chroot:1;
133		int pivot_root:1;
134		int mount_tmp:1;
135		int do_init:1;
136		int pid_file:1;
137		int cgroups:1;
138		int alt_syscall:1;
139		int reset_signal_mask:1;
140	} flags;
141	uid_t uid;
142	gid_t gid;
143	gid_t usergid;
144	char *user;
145	size_t suppl_gid_count;
146	gid_t *suppl_gid_list;
147	uint64_t caps;
148	uint64_t cap_bset;
149	pid_t initpid;
150	int mountns_fd;
151	int netns_fd;
152	char *chrootdir;
153	char *pid_file_path;
154	char *uidmap;
155	char *gidmap;
156	size_t filter_len;
157	struct sock_fprog *filter_prog;
158	char *alt_syscall_table;
159	struct mountpoint *mounts_head;
160	struct mountpoint *mounts_tail;
161	size_t mounts_count;
162	char *cgroups[MAX_CGROUPS];
163	size_t cgroup_count;
164};
165
166/*
167 * Strip out flags meant for the parent.
168 * We keep things that are not inherited across execve(2) (e.g. capabilities),
169 * or are easier to set after execve(2) (e.g. seccomp filters).
170 */
171void minijail_preenter(struct minijail *j)
172{
173	j->flags.vfs = 0;
174	j->flags.enter_vfs = 0;
175	j->flags.skip_remount_private = 0;
176	j->flags.remount_proc_ro = 0;
177	j->flags.pids = 0;
178	j->flags.do_init = 0;
179	j->flags.pid_file = 0;
180	j->flags.cgroups = 0;
181}
182
183/*
184 * Strip out flags meant for the child.
185 * We keep things that are inherited across execve(2).
186 */
187void minijail_preexec(struct minijail *j)
188{
189	int vfs = j->flags.vfs;
190	int enter_vfs = j->flags.enter_vfs;
191	int skip_remount_private = j->flags.skip_remount_private;
192	int remount_proc_ro = j->flags.remount_proc_ro;
193	int userns = j->flags.userns;
194	if (j->user)
195		free(j->user);
196	j->user = NULL;
197	if (j->suppl_gid_list)
198		free(j->suppl_gid_list);
199	j->suppl_gid_list = NULL;
200	memset(&j->flags, 0, sizeof(j->flags));
201	/* Now restore anything we meant to keep. */
202	j->flags.vfs = vfs;
203	j->flags.enter_vfs = enter_vfs;
204	j->flags.skip_remount_private = skip_remount_private;
205	j->flags.remount_proc_ro = remount_proc_ro;
206	j->flags.userns = userns;
207	/* Note, |pids| will already have been used before this call. */
208}
209
/*
 * seccomp_kernel_support_not_required: reports whether the running kernel
 * is older than 3.8.
 *
 * Returns 1 when uname(2) succeeds and the release string starts with
 * "<major>.<minor>" denoting a version below 3.8. Returns 0 otherwise,
 * including when the version cannot be determined.
 */
int seccomp_kernel_support_not_required(void)
{
	struct utsname uts;
	int major = 0;
	int minor = 0;

	if (uname(&uts) == -1)
		return 0;
	if (sscanf(uts.release, "%d.%d", &major, &minor) != 2)
		return 0;

	return major < 3 || (major == 3 && minor < 8);
}
219
/*
 * can_softfail: reports whether a missing seccomp may be tolerated.
 *
 * When the build sets SECCOMP_SOFTFAIL, soft-fail is allowed everywhere
 * except on Android, where it is allowed only on kernels older than 3.8.
 * When SECCOMP_SOFTFAIL is 0 this always returns 0.
 *
 * Returns 1 if soft-fail is allowed, 0 otherwise.
 */
int can_softfail(void)
{
#if SECCOMP_SOFTFAIL
	if (is_android())
		return seccomp_kernel_support_not_required() ? 1 : 0;
	return 1;
#else
	return 0;
#endif
}
235
236/* Minijail API. */
237
238struct minijail API *minijail_new(void)
239{
240	return calloc(1, sizeof(struct minijail));
241}
242
243void API minijail_change_uid(struct minijail *j, uid_t uid)
244{
245	if (uid == 0)
246		die("useless change to uid 0");
247	j->uid = uid;
248	j->flags.uid = 1;
249}
250
251void API minijail_change_gid(struct minijail *j, gid_t gid)
252{
253	if (gid == 0)
254		die("useless change to gid 0");
255	j->gid = gid;
256	j->flags.gid = 1;
257}
258
259void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
260					 const gid_t *list)
261{
262	size_t i;
263
264	if (j->flags.usergroups)
265		die("cannot inherit *and* set supplementary groups");
266
267	if (size == 0) {
268		/* Clear supplementary groups. */
269		j->suppl_gid_list = NULL;
270		j->suppl_gid_count = 0;
271		j->flags.suppl_gids = 1;
272		return;
273	}
274
275	/* Copy the gid_t array. */
276	j->suppl_gid_list = calloc(size, sizeof(gid_t));
277	if (!j->suppl_gid_list) {
278		die("failed to allocate internal supplementary group array");
279	}
280	for (i = 0; i < size; i++) {
281		j->suppl_gid_list[i] = list[i];
282	}
283	j->suppl_gid_count = size;
284	j->flags.suppl_gids = 1;
285}
286
287int API minijail_change_user(struct minijail *j, const char *user)
288{
289	char *buf = NULL;
290	struct passwd pw;
291	struct passwd *ppw = NULL;
292	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
293	if (sz == -1)
294		sz = 65536;	/* your guess is as good as mine... */
295
296	/*
297	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
298	 * the maximum needed size of the buffer, so we don't have to search.
299	 */
300	buf = malloc(sz);
301	if (!buf)
302		return -ENOMEM;
303	getpwnam_r(user, &pw, buf, sz, &ppw);
304	/*
305	 * We're safe to free the buffer here. The strings inside |pw| point
306	 * inside |buf|, but we don't use any of them; this leaves the pointers
307	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
308	 * succeeded.
309	 */
310	free(buf);
311	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
312	if (!ppw)
313		return -1;
314	minijail_change_uid(j, ppw->pw_uid);
315	j->user = strdup(user);
316	if (!j->user)
317		return -ENOMEM;
318	j->usergid = ppw->pw_gid;
319	return 0;
320}
321
322int API minijail_change_group(struct minijail *j, const char *group)
323{
324	char *buf = NULL;
325	struct group gr;
326	struct group *pgr = NULL;
327	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
328	if (sz == -1)
329		sz = 65536;	/* and mine is as good as yours, really */
330
331	/*
332	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
333	 * the maximum needed size of the buffer, so we don't have to search.
334	 */
335	buf = malloc(sz);
336	if (!buf)
337		return -ENOMEM;
338	getgrnam_r(group, &gr, buf, sz, &pgr);
339	/*
340	 * We're safe to free the buffer here. The strings inside gr point
341	 * inside buf, but we don't use any of them; this leaves the pointers
342	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
343	 */
344	free(buf);
345	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
346	if (!pgr)
347		return -1;
348	minijail_change_gid(j, pgr->gr_gid);
349	return 0;
350}
351
352void API minijail_use_seccomp(struct minijail *j)
353{
354	j->flags.seccomp = 1;
355}
356
357void API minijail_no_new_privs(struct minijail *j)
358{
359	j->flags.no_new_privs = 1;
360}
361
362void API minijail_use_seccomp_filter(struct minijail *j)
363{
364	j->flags.seccomp_filter = 1;
365}
366
367void API minijail_log_seccomp_filter_failures(struct minijail *j)
368{
369	j->flags.log_seccomp_filter = 1;
370}
371
372void API minijail_use_caps(struct minijail *j, uint64_t capmask)
373{
374	/*
375	 * 'minijail_use_caps' configures a runtime-capabilities-only
376	 * environment, including a bounding set matching the thread's runtime
377	 * (permitted|inheritable|effective) sets.
378	 * Therefore, it will override any existing bounding set configurations
379	 * since the latter would allow gaining extra runtime capabilities from
380	 * file capabilities.
381	 */
382	if (j->flags.capbset_drop) {
383		warn("overriding bounding set configuration");
384		j->cap_bset = 0;
385		j->flags.capbset_drop = 0;
386	}
387	j->caps = capmask;
388	j->flags.use_caps = 1;
389}
390
391void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
392{
393	if (j->flags.use_caps) {
394		/*
395		 * 'minijail_use_caps' will have already configured a capability
396		 * bounding set matching the (permitted|inheritable|effective)
397		 * sets. Abort if the user tries to configure a separate
398		 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
399		 * are mutually exclusive.
400		 */
401		die("runtime capabilities already configured, can't drop "
402		    "bounding set separately");
403	}
404	j->cap_bset = capmask;
405	j->flags.capbset_drop = 1;
406}
407
408void API minijail_reset_signal_mask(struct minijail *j)
409{
410	j->flags.reset_signal_mask = 1;
411}
412
413void API minijail_namespace_vfs(struct minijail *j)
414{
415	j->flags.vfs = 1;
416}
417
418void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
419{
420	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
421	if (ns_fd < 0) {
422		pdie("failed to open namespace '%s'", ns_path);
423	}
424	j->mountns_fd = ns_fd;
425	j->flags.enter_vfs = 1;
426}
427
428void API minijail_skip_remount_private(struct minijail *j)
429{
430	j->flags.skip_remount_private = 1;
431}
432
433void API minijail_namespace_pids(struct minijail *j)
434{
435	j->flags.vfs = 1;
436	j->flags.remount_proc_ro = 1;
437	j->flags.pids = 1;
438	j->flags.do_init = 1;
439}
440
441void API minijail_namespace_ipc(struct minijail *j)
442{
443	j->flags.ipc = 1;
444}
445
446void API minijail_namespace_net(struct minijail *j)
447{
448	j->flags.net = 1;
449}
450
451void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
452{
453	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
454	if (ns_fd < 0) {
455		pdie("failed to open namespace '%s'", ns_path);
456	}
457	j->netns_fd = ns_fd;
458	j->flags.enter_net = 1;
459}
460
461void API minijail_namespace_cgroups(struct minijail *j)
462{
463	j->flags.ns_cgroups = 1;
464}
465
466void API minijail_remount_proc_readonly(struct minijail *j)
467{
468	j->flags.vfs = 1;
469	j->flags.remount_proc_ro = 1;
470}
471
472void API minijail_namespace_user(struct minijail *j)
473{
474	j->flags.userns = 1;
475}
476
477int API minijail_uidmap(struct minijail *j, const char *uidmap)
478{
479	j->uidmap = strdup(uidmap);
480	if (!j->uidmap)
481		return -ENOMEM;
482	char *ch;
483	for (ch = j->uidmap; *ch; ch++) {
484		if (*ch == ',')
485			*ch = '\n';
486	}
487	return 0;
488}
489
490int API minijail_gidmap(struct minijail *j, const char *gidmap)
491{
492	j->gidmap = strdup(gidmap);
493	if (!j->gidmap)
494		return -ENOMEM;
495	char *ch;
496	for (ch = j->gidmap; *ch; ch++) {
497		if (*ch == ',')
498			*ch = '\n';
499	}
500	return 0;
501}
502
503void API minijail_inherit_usergroups(struct minijail *j)
504{
505	j->flags.usergroups = 1;
506}
507
508void API minijail_run_as_init(struct minijail *j)
509{
510	/*
511	 * Since the jailed program will become 'init' in the new PID namespace,
512	 * Minijail does not need to fork an 'init' process.
513	 */
514	j->flags.do_init = 0;
515}
516
517int API minijail_enter_chroot(struct minijail *j, const char *dir)
518{
519	if (j->chrootdir)
520		return -EINVAL;
521	j->chrootdir = strdup(dir);
522	if (!j->chrootdir)
523		return -ENOMEM;
524	j->flags.chroot = 1;
525	return 0;
526}
527
528int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
529{
530	if (j->chrootdir)
531		return -EINVAL;
532	j->chrootdir = strdup(dir);
533	if (!j->chrootdir)
534		return -ENOMEM;
535	j->flags.pivot_root = 1;
536	return 0;
537}
538
/*
 * append_external_path: joins two path fragments with a '/'
 * @external_path      Leading part of the result
 * @path_inside_chroot Trailing part of the result
 *
 * Returns a newly allocated "<external_path>/<path_inside_chroot>" string
 * that the caller must free(), or NULL if the allocation fails.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
552
553char API *minijail_get_original_path(struct minijail *j,
554				     const char *path_inside_chroot)
555{
556	struct mountpoint *b;
557
558	b = j->mounts_head;
559	while (b) {
560		/*
561		 * If |path_inside_chroot| is the exact destination of a
562		 * mount, then the original path is exactly the source of
563		 * the mount.
564		 *  for example: "-b /some/path/exe,/chroot/path/exe"
565		 *    mount source = /some/path/exe, mount dest =
566		 *    /chroot/path/exe Then when getting the original path of
567		 *    "/chroot/path/exe", the source of that mount,
568		 *    "/some/path/exe" is what should be returned.
569		 */
570		if (!strcmp(b->dest, path_inside_chroot))
571			return strdup(b->src);
572
573		/*
574		 * If |path_inside_chroot| is within the destination path of a
575		 * mount, take the suffix of the chroot path relative to the
576		 * mount destination path, and append it to the mount source
577		 * path.
578		 */
579		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
580			const char *relative_path =
581				path_inside_chroot + strlen(b->dest);
582			return append_external_path(b->src, relative_path);
583		}
584		b = b->next;
585	}
586
587	/* If there is a chroot path, append |path_inside_chroot| to that. */
588	if (j->chrootdir)
589		return append_external_path(j->chrootdir, path_inside_chroot);
590
591	/* No chroot, so the path outside is the same as it is inside. */
592	return strdup(path_inside_chroot);
593}
594
595void API minijail_mount_tmp(struct minijail *j)
596{
597	j->flags.mount_tmp = 1;
598}
599
600int API minijail_write_pid_file(struct minijail *j, const char *path)
601{
602	j->pid_file_path = strdup(path);
603	if (!j->pid_file_path)
604		return -ENOMEM;
605	j->flags.pid_file = 1;
606	return 0;
607}
608
609int API minijail_add_to_cgroup(struct minijail *j, const char *path)
610{
611	if (j->cgroup_count >= MAX_CGROUPS)
612		return -ENOMEM;
613	j->cgroups[j->cgroup_count] = strdup(path);
614	if (!j->cgroups[j->cgroup_count])
615		return -ENOMEM;
616	j->cgroup_count++;
617	j->flags.cgroups = 1;
618	return 0;
619}
620
621int API minijail_mount(struct minijail *j, const char *src, const char *dest,
622		       const char *type, unsigned long flags)
623{
624	struct mountpoint *m;
625
626	if (*dest != '/')
627		return -EINVAL;
628	m = calloc(1, sizeof(*m));
629	if (!m)
630		return -ENOMEM;
631	m->dest = strdup(dest);
632	if (!m->dest)
633		goto error;
634	m->src = strdup(src);
635	if (!m->src)
636		goto error;
637	m->type = strdup(type);
638	if (!m->type)
639		goto error;
640	m->flags = flags;
641
642	info("mount %s -> %s type '%s'", src, dest, type);
643
644	/*
645	 * Force vfs namespacing so the mounts don't leak out into the
646	 * containing vfs namespace.
647	 */
648	minijail_namespace_vfs(j);
649
650	if (j->mounts_tail)
651		j->mounts_tail->next = m;
652	else
653		j->mounts_head = m;
654	j->mounts_tail = m;
655	j->mounts_count++;
656
657	return 0;
658
659error:
660	free(m->src);
661	free(m->dest);
662	free(m);
663	return -ENOMEM;
664}
665
666int API minijail_bind(struct minijail *j, const char *src, const char *dest,
667		      int writeable)
668{
669	unsigned long flags = MS_BIND;
670
671	if (!writeable)
672		flags |= MS_RDONLY;
673
674	return minijail_mount(j, src, dest, "", flags);
675}
676
677void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
678{
679	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
680		if ((errno == EINVAL) && can_softfail()) {
681			warn("not loading seccomp filter,"
682			     " seccomp not supported");
683			j->flags.seccomp_filter = 0;
684			j->flags.log_seccomp_filter = 0;
685			j->filter_len = 0;
686			j->filter_prog = NULL;
687			j->flags.no_new_privs = 0;
688		}
689	}
690	FILE *file = fopen(path, "r");
691	if (!file) {
692		pdie("failed to open seccomp filter file '%s'", path);
693	}
694
695	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
696	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
697		die("failed to compile seccomp filter BPF program in '%s'",
698		    path);
699	}
700
701	j->filter_len = fprog->len;
702	j->filter_prog = fprog;
703
704	fclose(file);
705}
706
707int API minijail_use_alt_syscall(struct minijail *j, const char *table)
708{
709	j->alt_syscall_table = strdup(table);
710	if (!j->alt_syscall_table)
711		return -ENOMEM;
712	j->flags.alt_syscall = 1;
713	return 0;
714}
715
/* Cursor over an output buffer used while serializing a jail. */
struct marshal_state {
	size_t available;	/* bytes that still fit at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
721
722void marshal_state_init(struct marshal_state *state, char *buf,
723			size_t available)
724{
725	state->available = available;
726	state->buf = buf;
727	state->total = 0;
728}
729
730void marshal_append(struct marshal_state *state, void *src, size_t length)
731{
732	size_t copy_len = MIN(state->available, length);
733
734	/* Up to |available| will be written. */
735	if (copy_len) {
736		memcpy(state->buf, src, copy_len);
737		state->buf += copy_len;
738		state->available -= copy_len;
739	}
740	/* |total| will contain the expected length. */
741	state->total += length;
742}
743
744void minijail_marshal_helper(struct marshal_state *state,
745			     const struct minijail *j)
746{
747	struct mountpoint *m = NULL;
748	size_t i;
749
750	marshal_append(state, (char *)j, sizeof(*j));
751	if (j->user)
752		marshal_append(state, j->user, strlen(j->user) + 1);
753	if (j->suppl_gid_list) {
754		marshal_append(state, j->suppl_gid_list,
755			       j->suppl_gid_count * sizeof(gid_t));
756	}
757	if (j->chrootdir)
758		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
759	if (j->alt_syscall_table) {
760		marshal_append(state, j->alt_syscall_table,
761			       strlen(j->alt_syscall_table) + 1);
762	}
763	if (j->flags.seccomp_filter && j->filter_prog) {
764		struct sock_fprog *fp = j->filter_prog;
765		marshal_append(state, (char *)fp->filter,
766			       fp->len * sizeof(struct sock_filter));
767	}
768	for (m = j->mounts_head; m; m = m->next) {
769		marshal_append(state, m->src, strlen(m->src) + 1);
770		marshal_append(state, m->dest, strlen(m->dest) + 1);
771		marshal_append(state, m->type, strlen(m->type) + 1);
772		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
773	}
774	for (i = 0; i < j->cgroup_count; ++i)
775		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
776}
777
778size_t API minijail_size(const struct minijail *j)
779{
780	struct marshal_state state;
781	marshal_state_init(&state, NULL, 0);
782	minijail_marshal_helper(&state, j);
783	return state.total;
784}
785
786int minijail_marshal(const struct minijail *j, char *buf, size_t available)
787{
788	struct marshal_state state;
789	marshal_state_init(&state, buf, available);
790	minijail_marshal_helper(&state, j);
791	return (state.total > available);
792}
793
/*
 * consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       In/out: current read position, advanced on success
 * @buflength In/out: bytes left at *@buf, reduced on success
 *
 * Returns the position the bytes start at, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes are left.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buflength -= length;
	*buf = start + length;
	return start;
}
811
/*
 * consumestr: takes a NUL-terminated string off the front of a buffer
 * @buf       In/out: current read position, moved past the NUL on success
 * @buflength In/out: bytes left at *@buf, reduced on success
 *
 * Returns the start of the string, or NULL (leaving @buf and @buflength
 * untouched) if no NUL byte occurs within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	char *str = *buf;
	const char *terminator = memchr(str, '\0', *buflength);
	size_t consumed;

	if (!terminator)
		/* There's no null-terminator. */
		return NULL;

	/* The string plus its terminator; known to fit in the buffer. */
	consumed = (size_t)(terminator - str) + 1;
	*buf = str + consumed;
	*buflength -= consumed;
	return str;
}
827
828int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
829{
830	size_t i;
831	size_t count;
832	int ret = -EINVAL;
833
834	if (length < sizeof(*j))
835		goto out;
836	memcpy((void *)j, serialized, sizeof(*j));
837	serialized += sizeof(*j);
838	length -= sizeof(*j);
839
840	/* Potentially stale pointers not used as signals. */
841	j->mounts_head = NULL;
842	j->mounts_tail = NULL;
843	j->filter_prog = NULL;
844
845	if (j->user) {		/* stale pointer */
846		char *user = consumestr(&serialized, &length);
847		if (!user)
848			goto clear_pointers;
849		j->user = strdup(user);
850		if (!j->user)
851			goto clear_pointers;
852	}
853
854	if (j->suppl_gid_list) {	/* stale pointer */
855		if (j->suppl_gid_count > NGROUPS_MAX) {
856			goto bad_gid_list;
857		}
858		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
859		void *gid_list_bytes =
860		    consumebytes(gid_list_size, &serialized, &length);
861		if (!gid_list_bytes)
862			goto bad_gid_list;
863
864		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
865		if (!j->suppl_gid_list)
866			goto bad_gid_list;
867
868		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
869	}
870
871	if (j->chrootdir) {	/* stale pointer */
872		char *chrootdir = consumestr(&serialized, &length);
873		if (!chrootdir)
874			goto bad_chrootdir;
875		j->chrootdir = strdup(chrootdir);
876		if (!j->chrootdir)
877			goto bad_chrootdir;
878	}
879
880	if (j->alt_syscall_table) {	/* stale pointer */
881		char *alt_syscall_table = consumestr(&serialized, &length);
882		if (!alt_syscall_table)
883			goto bad_syscall_table;
884		j->alt_syscall_table = strdup(alt_syscall_table);
885		if (!j->alt_syscall_table)
886			goto bad_syscall_table;
887	}
888
889	if (j->flags.seccomp_filter && j->filter_len > 0) {
890		size_t ninstrs = j->filter_len;
891		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
892		    ninstrs > USHRT_MAX)
893			goto bad_filters;
894
895		size_t program_len = ninstrs * sizeof(struct sock_filter);
896		void *program = consumebytes(program_len, &serialized, &length);
897		if (!program)
898			goto bad_filters;
899
900		j->filter_prog = malloc(sizeof(struct sock_fprog));
901		if (!j->filter_prog)
902			goto bad_filters;
903
904		j->filter_prog->len = ninstrs;
905		j->filter_prog->filter = malloc(program_len);
906		if (!j->filter_prog->filter)
907			goto bad_filter_prog_instrs;
908
909		memcpy(j->filter_prog->filter, program, program_len);
910	}
911
912	count = j->mounts_count;
913	j->mounts_count = 0;
914	for (i = 0; i < count; ++i) {
915		unsigned long *flags;
916		const char *dest;
917		const char *type;
918		const char *src = consumestr(&serialized, &length);
919		if (!src)
920			goto bad_mounts;
921		dest = consumestr(&serialized, &length);
922		if (!dest)
923			goto bad_mounts;
924		type = consumestr(&serialized, &length);
925		if (!type)
926			goto bad_mounts;
927		flags = consumebytes(sizeof(*flags), &serialized, &length);
928		if (!flags)
929			goto bad_mounts;
930		if (minijail_mount(j, src, dest, type, *flags))
931			goto bad_mounts;
932	}
933
934	count = j->cgroup_count;
935	j->cgroup_count = 0;
936	for (i = 0; i < count; ++i) {
937		char *cgroup = consumestr(&serialized, &length);
938		if (!cgroup)
939			goto bad_cgroups;
940		j->cgroups[i] = strdup(cgroup);
941		if (!j->cgroups[i])
942			goto bad_cgroups;
943		++j->cgroup_count;
944	}
945
946	return 0;
947
948bad_cgroups:
949	while (j->mounts_head) {
950		struct mountpoint *m = j->mounts_head;
951		j->mounts_head = j->mounts_head->next;
952		free(m->type);
953		free(m->dest);
954		free(m->src);
955		free(m);
956	}
957	for (i = 0; i < j->cgroup_count; ++i)
958		free(j->cgroups[i]);
959bad_mounts:
960	if (j->flags.seccomp_filter && j->filter_len > 0) {
961		free(j->filter_prog->filter);
962		free(j->filter_prog);
963	}
964bad_filter_prog_instrs:
965	if (j->filter_prog)
966		free(j->filter_prog);
967bad_filters:
968	if (j->alt_syscall_table)
969		free(j->alt_syscall_table);
970bad_syscall_table:
971	if (j->chrootdir)
972		free(j->chrootdir);
973bad_chrootdir:
974	if (j->suppl_gid_list)
975		free(j->suppl_gid_list);
976bad_gid_list:
977	if (j->user)
978		free(j->user);
979clear_pointers:
980	j->user = NULL;
981	j->suppl_gid_list = NULL;
982	j->chrootdir = NULL;
983	j->alt_syscall_table = NULL;
984	j->cgroup_count = 0;
985out:
986	return ret;
987}
988
989static void write_ugid_mappings(const struct minijail *j)
990{
991	int fd, ret, len;
992	size_t sz;
993	char fname[32];
994
995	sz = sizeof(fname);
996	if (j->uidmap) {
997		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
998		if (ret < 0 || (size_t)ret >= sz)
999			die("failed to write file name of uid_map");
1000		fd = open(fname, O_WRONLY | O_CLOEXEC);
1001		if (fd < 0)
1002			pdie("failed to open '%s'", fname);
1003		len = strlen(j->uidmap);
1004		if (write(fd, j->uidmap, len) < len)
1005			die("failed to set uid_map");
1006		close(fd);
1007	}
1008	if (j->gidmap) {
1009		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
1010		if (ret < 0 || (size_t)ret >= sz)
1011			die("failed to write file name of gid_map");
1012		fd = open(fname, O_WRONLY | O_CLOEXEC);
1013		if (fd < 0)
1014			pdie("failed to open '%s'", fname);
1015		len = strlen(j->gidmap);
1016		if (write(fd, j->gidmap, len) < len)
1017			die("failed to set gid_map");
1018		close(fd);
1019	}
1020}
1021
/*
 * parent_setup_complete: closes both ends of the synchronization pipe
 * @pipe_fds The two descriptors of the pipe (read end first)
 *
 * wait_for_parent_setup() blocks in read(2) until it sees end-of-file on
 * this pipe.
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	for (end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
1027
/*
 * wait_for_parent_setup: called by the child process to block until the
 * parent has finished its share of the setup
 * @pipe_fds The two descriptors of the synchronization pipe (read end first)
 *
 * The child closes its copy of the write end and then reads. The parent is
 * expected to close the pipe without writing, so anything but end-of-file
 * (a byte or a read error) is reported through die().
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char unused;
	const int read_end = pipe_fds[0];
	const int write_end = pipe_fds[1];

	close(write_end);

	/* A return value of 0 means the parent closed the pipe. */
	if (read(read_end, &unused, 1) != 0)
		die("failed to sync with parent");

	close(read_end);
}
1043
1044static void enter_user_namespace(const struct minijail *j)
1045{
1046	if (j->uidmap && setresuid(0, 0, 0))
1047		pdie("setresuid");
1048	if (j->gidmap && setresgid(0, 0, 0))
1049		pdie("setresgid");
1050}
1051
1052/*
1053 * mount_one: Applies mounts from @m for @j, recursing as needed.
1054 * @j Minijail these mounts are for
1055 * @m Head of list of mounts
1056 *
1057 * Returns 0 for success.
1058 */
1059static int mount_one(const struct minijail *j, struct mountpoint *m)
1060{
1061	int ret;
1062	char *dest;
1063	int remount_ro = 0;
1064
1065	/* |dest| has a leading "/". */
1066	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
1067		return -ENOMEM;
1068
1069	/*
1070	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1071	 * can't both be specified in the original bind mount.
1072	 * Remount R/O after the initial mount.
1073	 */
1074	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1075		remount_ro = 1;
1076		m->flags &= ~MS_RDONLY;
1077	}
1078
1079	ret = mount(m->src, dest, m->type, m->flags, NULL);
1080	if (ret)
1081		pdie("mount: %s -> %s", m->src, dest);
1082
1083	if (remount_ro) {
1084		m->flags |= MS_RDONLY;
1085		ret = mount(m->src, dest, NULL,
1086			    m->flags | MS_REMOUNT, NULL);
1087		if (ret)
1088			pdie("bind ro: %s -> %s", m->src, dest);
1089	}
1090
1091	free(dest);
1092	if (m->next)
1093		return mount_one(j, m->next);
1094	return ret;
1095}
1096
1097int enter_chroot(const struct minijail *j)
1098{
1099	int ret;
1100
1101	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1102		return ret;
1103
1104	if (chroot(j->chrootdir))
1105		return -errno;
1106
1107	if (chdir("/"))
1108		return -errno;
1109
1110	return 0;
1111}
1112
1113int enter_pivot_root(const struct minijail *j)
1114{
1115	int ret, oldroot, newroot;
1116
1117	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1118		return ret;
1119
1120	/*
1121	 * Keep the fd for both old and new root.
1122	 * It will be used in fchdir(2) later.
1123	 */
1124	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1125	if (oldroot < 0)
1126		pdie("failed to open / for fchdir");
1127	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1128	if (newroot < 0)
1129		pdie("failed to open %s for fchdir", j->chrootdir);
1130
1131	/*
1132	 * To ensure j->chrootdir is the root of a filesystem,
1133	 * do a self bind mount.
1134	 */
1135	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1136		pdie("failed to bind mount '%s'", j->chrootdir);
1137	if (chdir(j->chrootdir))
1138		return -errno;
1139	if (syscall(SYS_pivot_root, ".", "."))
1140		pdie("pivot_root");
1141
1142	/*
1143	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
1144	 * change to the old root and unmount it.
1145	 */
1146	if (fchdir(oldroot))
1147		pdie("failed to fchdir to old /");
1148
1149	/*
1150	 * If j->flags.skip_remount_private was enabled for minijail_enter(),
1151	 * there could be a shared mount point under |oldroot|. In that case,
1152	 * mounts under this shared mount point will be unmounted below, and
1153	 * this unmounting will propagate to the original mount namespace
1154	 * (because the mount point is shared). To prevent this unexpected
1155	 * unmounting, remove these mounts from their peer groups by recursively
1156	 * remounting them as MS_PRIVATE.
1157	 */
1158	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
1159		pdie("failed to mount(/, private) before umount(/)");
1160	/* The old root might be busy, so use lazy unmount. */
1161	if (umount2(".", MNT_DETACH))
1162		pdie("umount(/)");
1163	/* Change back to the new root. */
1164	if (fchdir(newroot))
1165		return -errno;
1166	if (close(oldroot))
1167		return -errno;
1168	if (close(newroot))
1169		return -errno;
1170	if (chroot("/"))
1171		return -errno;
1172	/* Set correct CWD for getcwd(3). */
1173	if (chdir("/"))
1174		return -errno;
1175
1176	return 0;
1177}
1178
/*
 * Mounts a tmpfs limited to 64M, with mode 777, on /tmp.
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *tmpfs_options = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_options);
}
1183
1184int remount_proc_readonly(const struct minijail *j)
1185{
1186	const char *kProcPath = "/proc";
1187	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1188	/*
1189	 * Right now, we're holding a reference to our parent's old mount of
1190	 * /proc in our namespace, which means using MS_REMOUNT here would
1191	 * mutate our parent's mount as well, even though we're in a VFS
1192	 * namespace (!). Instead, remove their mount from our namespace lazily
1193	 * (MNT_DETACH) and make our own.
1194	 */
1195	if (umount2(kProcPath, MNT_DETACH)) {
1196		/*
1197		 * If we are in a new user namespace, umount(2) will fail.
1198		 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
1199		 */
1200		if (j->flags.userns) {
1201			info("umount(/proc, MNT_DETACH) failed, "
1202			     "this is expected when using user namespaces");
1203		} else {
1204			return -errno;
1205		}
1206	}
1207	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1208		return -errno;
1209	return 0;
1210}
1211
/*
 * Writes |pid| in decimal, followed by a newline, to the file at |path|
 * (opened with fopen(3) mode "w"). A failure to open, write or close the
 * file is reported through pdie().
 */
static void write_pid_to_path(pid_t pid, const char *path)
{
	FILE *out = fopen(path, "w");
	if (out == NULL)
		pdie("failed to open '%s'", path);

	const int printed = fprintf(out, "%d\n", (int)pid);
	if (printed < 0)
		pdie("fprintf(%s)", path);

	/* fclose() flushes, so a failure here means the pid was not stored. */
	if (fclose(out) != 0)
		pdie("fclose(%s)", path);
}
1223
1224static void write_pid_file(const struct minijail *j)
1225{
1226	write_pid_to_path(j->initpid, j->pid_file_path);
1227}
1228
1229static void add_to_cgroups(const struct minijail *j)
1230{
1231	size_t i;
1232
1233	for (i = 0; i < j->cgroup_count; ++i)
1234		write_pid_to_path(j->initpid, j->cgroups[i]);
1235}
1236
1237void drop_ugid(const struct minijail *j)
1238{
1239	if (j->flags.usergroups && j->flags.suppl_gids) {
1240		die("tried to inherit *and* set supplementary groups;"
1241		    " can only do one");
1242	}
1243
1244	if (j->flags.usergroups) {
1245		if (initgroups(j->user, j->usergid))
1246			pdie("initgroups");
1247	} else if (j->flags.suppl_gids) {
1248		if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1249			pdie("setgroups");
1250		}
1251	} else {
1252		/*
1253		 * Only attempt to clear supplementary groups if we are changing
1254		 * users.
1255		 */
1256		if ((j->uid || j->gid) && setgroups(0, NULL))
1257			pdie("setgroups");
1258	}
1259
1260	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1261		pdie("setresgid");
1262
1263	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1264		pdie("setresuid");
1265}
1266
/*
 * Returns the number of the last capability known to the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * On the /proc path, failing to open or parse the file is reported through
 * pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		/* Probe upwards until the kernel rejects the cap number. */
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap)
			;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/*
		 * /proc may be missing or unreadable; passing a NULL stream to
		 * fscanf() would crash instead of producing a diagnostic.
		 */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
1297
/*
 * Drops from the capability bounding set every capability numbered
 * 0..|last_valid_cap| (limited to the width of |keep_mask|) whose bit is not
 * set in |keep_mask|. A failing prctl(PR_CAPBSET_DROP) is reported through
 * pdie().
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	unsigned int cap;

	for (cap = 0; cap < sizeof(keep_mask) * 8 && cap <= last_valid_cap;
	     ++cap) {
		const uint64_t cap_bit = (uint64_t)1 << cap;

		if ((keep_mask & cap_bit) == 0 && prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
1309
1310void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
1311{
1312	if (!j->flags.use_caps)
1313		return;
1314
1315	cap_t caps = cap_get_proc();
1316	cap_value_t flag[1];
1317	const uint64_t one = 1;
1318	unsigned int i;
1319	if (!caps)
1320		die("can't get process caps");
1321	if (cap_clear_flag(caps, CAP_INHERITABLE))
1322		die("can't clear inheritable caps");
1323	if (cap_clear_flag(caps, CAP_EFFECTIVE))
1324		die("can't clear effective caps");
1325	if (cap_clear_flag(caps, CAP_PERMITTED))
1326		die("can't clear permitted caps");
1327	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1328		/* Keep CAP_SETPCAP for dropping bounding set bits. */
1329		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
1330			continue;
1331		flag[0] = i;
1332		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
1333			die("can't add effective cap");
1334		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
1335			die("can't add permitted cap");
1336		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
1337			die("can't add inheritable cap");
1338	}
1339	if (cap_set_proc(caps))
1340		die("can't apply initial cleaned capset");
1341
1342	/*
1343	 * Instead of dropping bounding set first, do it here in case
1344	 * the caller had a more permissive bounding set which could
1345	 * have been used above to raise a capability that wasn't already
1346	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1347	 */
1348	drop_capbset(j->caps, last_valid_cap);
1349
1350	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
1351	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
1352		flag[0] = CAP_SETPCAP;
1353		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1354			die("can't clear effective cap");
1355		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1356			die("can't clear permitted cap");
1357		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1358			die("can't clear inheritable cap");
1359	}
1360
1361	if (cap_set_proc(caps))
1362		die("can't apply final cleaned capset");
1363
1364	cap_free(caps);
1365}
1366
1367void set_seccomp_filter(const struct minijail *j)
1368{
1369	/*
1370	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1371	 * in the kernel source tree for an explanation of the parameters.
1372	 */
1373	if (j->flags.no_new_privs) {
1374		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1375			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1376	}
1377
1378	/*
1379	 * Code running with ASan
1380	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1381	 * will make system calls not included in the syscall filter policy,
1382	 * which will likely crash the program. Skip setting seccomp filter in
1383	 * that case.
1384	 * 'running_with_asan()' has no inputs and is completely defined at
1385	 * build time, so this cannot be used by an attacker to skip setting
1386	 * seccomp filter.
1387	 */
1388	if (j->flags.seccomp_filter && running_with_asan()) {
1389		warn("running with ASan, not setting seccomp filter");
1390		return;
1391	}
1392
1393	/*
1394	 * If we're logging seccomp filter failures,
1395	 * install the SIGSYS handler first.
1396	 */
1397	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1398		if (install_sigsys_handler())
1399			pdie("install SIGSYS handler");
1400		warn("logging seccomp filter failures");
1401	}
1402
1403	/*
1404	 * Install the syscall filter.
1405	 */
1406	if (j->flags.seccomp_filter) {
1407		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1408			  j->filter_prog)) {
1409			if ((errno == EINVAL) && can_softfail()) {
1410				warn("seccomp not supported");
1411				return;
1412			}
1413			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
1414		}
1415	}
1416}
1417
1418void API minijail_enter(const struct minijail *j)
1419{
1420	/*
1421	 * If we're dropping caps, get the last valid cap from /proc now,
1422	 * since /proc can be unmounted before drop_caps() is called.
1423	 */
1424	unsigned int last_valid_cap = 0;
1425	if (j->flags.capbset_drop || j->flags.use_caps)
1426		last_valid_cap = get_last_valid_cap();
1427
1428	if (j->flags.pids)
1429		die("tried to enter a pid-namespaced jail;"
1430		    " try minijail_run()?");
1431
1432	if (j->flags.usergroups && !j->user)
1433		die("usergroup inheritance without username");
1434
1435	/*
1436	 * We can't recover from failures if we've dropped privileges partially,
1437	 * so we don't even try. If any of our operations fail, we abort() the
1438	 * entire process.
1439	 */
1440	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1441		pdie("setns(CLONE_NEWNS)");
1442
1443	if (j->flags.vfs) {
1444		if (unshare(CLONE_NEWNS))
1445			pdie("unshare(vfs)");
1446		/*
1447		 * Unless asked not to, remount all filesystems as private.
1448		 * If they are shared, new bind mounts will creep out of our
1449		 * namespace.
1450		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1451		 */
1452		if (!j->flags.skip_remount_private) {
1453			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1454				pdie("mount(/, private)");
1455		}
1456	}
1457
1458	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1459		pdie("unshare(ipc)");
1460	}
1461
1462	if (j->flags.enter_net) {
1463		if (setns(j->netns_fd, CLONE_NEWNET))
1464			pdie("setns(CLONE_NEWNET)");
1465	} else if (j->flags.net && unshare(CLONE_NEWNET)) {
1466		pdie("unshare(net)");
1467	}
1468
1469	if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
1470		pdie("unshare(cgroups)");
1471
1472	if (j->flags.chroot && enter_chroot(j))
1473		pdie("chroot");
1474
1475	if (j->flags.pivot_root && enter_pivot_root(j))
1476		pdie("pivot_root");
1477
1478	if (j->flags.mount_tmp && mount_tmp())
1479		pdie("mount_tmp");
1480
1481	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
1482		pdie("remount");
1483
1484	/*
1485	 * If we're only dropping capabilities from the bounding set, but not
1486	 * from the thread's (permitted|inheritable|effective) sets, do it now.
1487	 */
1488	if (j->flags.capbset_drop) {
1489		drop_capbset(j->cap_bset, last_valid_cap);
1490	}
1491
1492	if (j->flags.use_caps) {
1493		/*
1494		 * POSIX capabilities are a bit tricky. If we drop our
1495		 * capability to change uids, our attempt to use setuid()
1496		 * below will fail. Hang on to root caps across setuid(), then
1497		 * lock securebits.
1498		 */
1499		if (prctl(PR_SET_KEEPCAPS, 1))
1500			pdie("prctl(PR_SET_KEEPCAPS)");
1501
1502		/*
1503		 * Kernels 4.3+ define a new securebit
1504		 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS
1505		 * and SECURE_ALL_LOCKS masks from newer kernel headers will
1506		 * return EPERM on older kernels. Detect this, and retry with
1507		 * the right mask for older (2.6.26-4.2) kernels.
1508		 */
1509		int securebits_ret = prctl(PR_SET_SECUREBITS,
1510					   SECURE_ALL_BITS | SECURE_ALL_LOCKS);
1511		if (securebits_ret < 0) {
1512			if (errno == EPERM) {
1513				/* Possibly running on kernel < 4.3. */
1514				securebits_ret = prctl(
1515				    PR_SET_SECUREBITS,
1516				    OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS);
1517			}
1518		}
1519		if (securebits_ret < 0)
1520			pdie("prctl(PR_SET_SECUREBITS)");
1521	}
1522
1523	if (j->flags.no_new_privs) {
1524		/*
1525		 * If we're setting no_new_privs, we can drop privileges
1526		 * before setting seccomp filter. This way filter policies
1527		 * don't need to allow privilege-dropping syscalls.
1528		 */
1529		drop_ugid(j);
1530		drop_caps(j, last_valid_cap);
1531		set_seccomp_filter(j);
1532	} else {
1533		/*
1534		 * If we're not setting no_new_privs,
1535		 * we need to set seccomp filter *before* dropping privileges.
1536		 * WARNING: this means that filter policies *must* allow
1537		 * setgroups()/setresgid()/setresuid() for dropping root and
1538		 * capget()/capset()/prctl() for dropping caps.
1539		 */
1540		set_seccomp_filter(j);
1541		drop_ugid(j);
1542		drop_caps(j, last_valid_cap);
1543	}
1544
1545	/*
1546	 * Select the specified alternate syscall table.  The table must not
1547	 * block prctl(2) if we're using seccomp as well.
1548	 */
1549	if (j->flags.alt_syscall) {
1550		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1551			pdie("prctl(PR_ALT_SYSCALL)");
1552	}
1553
1554	/*
1555	 * seccomp has to come last since it cuts off all the other
1556	 * privilege-dropping syscalls :)
1557	 */
1558	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1559		if ((errno == EINVAL) && can_softfail()) {
1560			warn("seccomp not supported");
1561			return;
1562		}
1563		pdie("prctl(PR_SET_SECCOMP)");
1564	}
1565}
1566
/*
 * Wait status of the root jailed process as recorded by init(); init_term()
 * passes it to _exit() when SIGTERM arrives. Starts out as zero.
 * TODO(wad) will visibility affect this variable?
 */
static int init_exitstatus;
1569
1570void init_term(int __attribute__ ((unused)) sig)
1571{
1572	_exit(init_exitstatus);
1573}
1574
1575int init(pid_t rootpid)
1576{
1577	pid_t pid;
1578	int status;
1579	/* so that we exit with the right status */
1580	signal(SIGTERM, init_term);
1581	/* TODO(wad) self jail with seccomp_filters here. */
1582	while ((pid = wait(&status)) > 0) {
1583		/*
1584		 * This loop will only end when either there are no processes
1585		 * left inside our pid namespace or we get a signal.
1586		 */
1587		if (pid == rootpid)
1588			init_exitstatus = status;
1589	}
1590	if (!WIFEXITED(init_exitstatus))
1591		_exit(MINIJAIL_ERR_INIT);
1592	_exit(WEXITSTATUS(init_exitstatus));
1593}
1594
1595int API minijail_from_fd(int fd, struct minijail *j)
1596{
1597	size_t sz = 0;
1598	size_t bytes = read(fd, &sz, sizeof(sz));
1599	char *buf;
1600	int r;
1601	if (sizeof(sz) != bytes)
1602		return -EINVAL;
1603	if (sz > USHRT_MAX)	/* arbitrary sanity check */
1604		return -E2BIG;
1605	buf = malloc(sz);
1606	if (!buf)
1607		return -ENOMEM;
1608	bytes = read(fd, buf, sz);
1609	if (bytes != sz) {
1610		free(buf);
1611		return -EINVAL;
1612	}
1613	r = minijail_unmarshal(j, buf, sz);
1614	free(buf);
1615	return r;
1616}
1617
1618int API minijail_to_fd(struct minijail *j, int fd)
1619{
1620	char *buf;
1621	size_t sz = minijail_size(j);
1622	ssize_t written;
1623	int r;
1624
1625	if (!sz)
1626		return -EINVAL;
1627	buf = malloc(sz);
1628	r = minijail_marshal(j, buf, sz);
1629	if (r) {
1630		free(buf);
1631		return r;
1632	}
1633	/* Sends [size][minijail]. */
1634	written = write(fd, &sz, sizeof(sz));
1635	if (written != sizeof(sz)) {
1636		free(buf);
1637		return -EFAULT;
1638	}
1639	written = write(fd, buf, sz);
1640	if (written < 0 || (size_t) written != sz) {
1641		free(buf);
1642		return -EFAULT;
1643	}
1644	free(buf);
1645	return 0;
1646}
1647
1648int setup_preload(void)
1649{
1650#if defined(__ANDROID__)
1651	/* Don't use LDPRELOAD on Brillo. */
1652	return 0;
1653#else
1654	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1655	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1656	if (!newenv)
1657		return -ENOMEM;
1658
1659	/* Only insert a separating space if we have something to separate... */
1660	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1661		PRELOADPATH);
1662
1663	/* setenv() makes a copy of the string we give it. */
1664	setenv(kLdPreloadEnvVar, newenv, 1);
1665	free(newenv);
1666	return 0;
1667#endif
1668}
1669
1670int setup_pipe(int fds[2])
1671{
1672	int r = pipe(fds);
1673	char fd_buf[11];
1674	if (r)
1675		return r;
1676	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1677	if (r <= 0)
1678		return -EINVAL;
1679	setenv(kFdEnvVar, fd_buf, 1);
1680	return 0;
1681}
1682
/*
 * Keeps end |index| of the pipe |fds| (0 = read end, 1 = write end) and
 * closes the other end.
 * Returns the kept descriptor, or -1 if |index| is neither 0 nor 1 (in which
 * case nothing is closed).
 */
int setup_pipe_end(int fds[2], size_t index)
{
	if (index != 0 && index != 1)
		return -1;

	const size_t other_end = (index == 0) ? 1 : 0;

	close(fds[other_end]);
	return fds[index];
}
1691
/*
 * Closes the end of the pipe |fds| that is NOT selected by |index|
 * (0 = read end, 1 = write end) and duplicates the selected end onto |fd|
 * with dup2(2).
 * Returns the result of dup2(2), or -1 if |index| is neither 0 nor 1 (in
 * which case nothing is closed).
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	switch (index) {
	case 0:
		close(fds[1]);
		/* dup2(2) the read end of the pipe into |fd|. */
		return dup2(fds[0], fd);
	case 1:
		close(fds[0]);
		/* dup2(2) the write end of the pipe into |fd|. */
		return dup2(fds[1], fd);
	default:
		return -1;
	}
}
1701
/*
 * Forward declaration of the shared implementation used by all of the
 * minijail_run*() entry points below. Optional outputs (|pchild_pid| and the
 * three |p*_fd| pointers) may be NULL; |use_preload| selects whether the
 * LD_PRELOAD mechanism is used.
 */
int minijail_run_internal(struct minijail *j,
			  const char *filename,
			  char *const argv[],
			  pid_t *pchild_pid,
			  int *pstdin_fd,
			  int *pstdout_fd,
			  int *pstderr_fd,
			  int use_preload);
1706
1707int API minijail_run(struct minijail *j, const char *filename,
1708		     char *const argv[])
1709{
1710	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1711				     true);
1712}
1713
1714int API minijail_run_pid(struct minijail *j, const char *filename,
1715			 char *const argv[], pid_t *pchild_pid)
1716{
1717	return minijail_run_internal(j, filename, argv, pchild_pid,
1718				     NULL, NULL, NULL, true);
1719}
1720
1721int API minijail_run_pipe(struct minijail *j, const char *filename,
1722			  char *const argv[], int *pstdin_fd)
1723{
1724	return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1725				     NULL, NULL, true);
1726}
1727
1728int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1729			       char *const argv[], pid_t *pchild_pid,
1730			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1731{
1732	return minijail_run_internal(j, filename, argv, pchild_pid,
1733				     pstdin_fd, pstdout_fd, pstderr_fd, true);
1734}
1735
1736int API minijail_run_no_preload(struct minijail *j, const char *filename,
1737				char *const argv[])
1738{
1739	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1740				     false);
1741}
1742
1743int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1744					  const char *filename,
1745					  char *const argv[],
1746					  pid_t *pchild_pid,
1747					  int *pstdin_fd, int *pstdout_fd,
1748					  int *pstderr_fd)
1749{
1750	return minijail_run_internal(j, filename, argv, pchild_pid,
1751				     pstdin_fd, pstdout_fd, pstderr_fd, false);
1752}
1753
1754int minijail_run_internal(struct minijail *j, const char *filename,
1755			  char *const argv[], pid_t *pchild_pid,
1756			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1757			  int use_preload)
1758{
1759	char *oldenv, *oldenv_copy = NULL;
1760	pid_t child_pid;
1761	int pipe_fds[2];
1762	int stdin_fds[2];
1763	int stdout_fds[2];
1764	int stderr_fds[2];
1765	int child_sync_pipe_fds[2];
1766	int sync_child = 0;
1767	int ret;
1768	/* We need to remember this across the minijail_preexec() call. */
1769	int pid_namespace = j->flags.pids;
1770	int do_init = j->flags.do_init;
1771
1772	if (use_preload) {
1773		oldenv = getenv(kLdPreloadEnvVar);
1774		if (oldenv) {
1775			oldenv_copy = strdup(oldenv);
1776			if (!oldenv_copy)
1777				return -ENOMEM;
1778		}
1779
1780		if (setup_preload())
1781			return -EFAULT;
1782	}
1783
1784	if (!use_preload) {
1785		if (j->flags.use_caps)
1786			die("capabilities are not supported without "
1787			    "LD_PRELOAD");
1788	}
1789
1790	/*
1791	 * Make the process group ID of this process equal to its PID, so that
1792	 * both the Minijail process and the jailed process can be killed
1793	 * together.
1794	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1795	 * the process is already a process group leader.
1796	 */
1797	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1798		if (errno != EPERM) {
1799			pdie("setpgid(0, 0)");
1800		}
1801	}
1802
1803	if (use_preload) {
1804		/*
1805		 * Before we fork(2) and execve(2) the child process, we need
1806		 * to open a pipe(2) to send the minijail configuration over.
1807		 */
1808		if (setup_pipe(pipe_fds))
1809			return -EFAULT;
1810	}
1811
1812	/*
1813	 * If we want to write to the child process' standard input,
1814	 * create the pipe(2) now.
1815	 */
1816	if (pstdin_fd) {
1817		if (pipe(stdin_fds))
1818			return -EFAULT;
1819	}
1820
1821	/*
1822	 * If we want to read from the child process' standard output,
1823	 * create the pipe(2) now.
1824	 */
1825	if (pstdout_fd) {
1826		if (pipe(stdout_fds))
1827			return -EFAULT;
1828	}
1829
1830	/*
1831	 * If we want to read from the child process' standard error,
1832	 * create the pipe(2) now.
1833	 */
1834	if (pstderr_fd) {
1835		if (pipe(stderr_fds))
1836			return -EFAULT;
1837	}
1838
1839	/*
1840	 * If we want to set up a new uid/gid mapping in the user namespace,
1841	 * or if we need to add the child process to cgroups, create the pipe(2)
1842	 * to sync between parent and child.
1843	 */
1844	if (j->flags.userns || j->flags.cgroups) {
1845		sync_child = 1;
1846		if (pipe(child_sync_pipe_fds))
1847			return -EFAULT;
1848	}
1849
1850	/*
1851	 * Use sys_clone() if and only if we're creating a pid namespace.
1852	 *
1853	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1854	 *
1855	 * In multithreaded programs, there are a bunch of locks inside libc,
1856	 * some of which may be held by other threads at the time that we call
1857	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1858	 * ensure that we hold all of these locks before it calls clone()
1859	 * internally and drop them after clone() returns, but when we call
1860	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1861	 * child address space where some of libc's important locks are held by
1862	 * other threads (which did not get cloned, and hence will never release
1863	 * those locks). This is okay so long as we call exec() immediately
1864	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1865	 * take locks.
1866	 *
1867	 * Hence, only call sys_clone() if we need to, in order to get at pid
1868	 * namespacing. If we follow this path, the child's address space might
1869	 * have broken locks; you may only call functions that do not acquire
1870	 * any locks.
1871	 *
1872	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1873	 * previously detailed, so this function is highly likely to deadlock
1874	 * later on (see "deadlock here") if we're multithreaded.
1875	 *
1876	 * We might hack around this by having the clone()d child (init of the
1877	 * pid namespace) return directly, rather than leaving the clone()d
1878	 * process hanging around to be init for the new namespace (and having
1879	 * its fork()ed child return in turn), but that process would be
1880	 * crippled with its libc locks potentially broken. We might try
1881	 * fork()ing in the parent before we clone() to ensure that we own all
1882	 * the locks, but then we have to have the forked child hanging around
1883	 * consuming resources (and possibly having file descriptors / shared
1884	 * memory regions / etc attached). We'd need to keep the child around to
1885	 * avoid having its children get reparented to init.
1886	 *
1887	 * TODO(ellyjones): figure out if the "forked child hanging around"
1888	 * problem is fixable or not. It would be nice if we worked in this
1889	 * case.
1890	 */
1891	if (pid_namespace) {
1892		int clone_flags = CLONE_NEWPID | SIGCHLD;
1893		if (j->flags.userns)
1894			clone_flags |= CLONE_NEWUSER;
1895		child_pid = syscall(SYS_clone, clone_flags, NULL);
1896	} else {
1897		child_pid = fork();
1898	}
1899
1900	if (child_pid < 0) {
1901		if (use_preload) {
1902			free(oldenv_copy);
1903		}
1904		die("failed to fork child");
1905	}
1906
1907	if (child_pid) {
1908		if (use_preload) {
1909			/* Restore parent's LD_PRELOAD. */
1910			if (oldenv_copy) {
1911				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1912				free(oldenv_copy);
1913			} else {
1914				unsetenv(kLdPreloadEnvVar);
1915			}
1916			unsetenv(kFdEnvVar);
1917		}
1918
1919		j->initpid = child_pid;
1920
1921		if (j->flags.pid_file)
1922			write_pid_file(j);
1923
1924		if (j->flags.cgroups)
1925			add_to_cgroups(j);
1926
1927		if (j->flags.userns)
1928			write_ugid_mappings(j);
1929
1930		if (sync_child)
1931			parent_setup_complete(child_sync_pipe_fds);
1932
1933		if (use_preload) {
1934			/* Send marshalled minijail. */
1935			close(pipe_fds[0]);	/* read endpoint */
1936			ret = minijail_to_fd(j, pipe_fds[1]);
1937			close(pipe_fds[1]);	/* write endpoint */
1938			if (ret) {
1939				kill(j->initpid, SIGKILL);
1940				die("failed to send marshalled minijail");
1941			}
1942		}
1943
1944		if (pchild_pid)
1945			*pchild_pid = child_pid;
1946
1947		/*
1948		 * If we want to write to the child process' standard input,
1949		 * set up the write end of the pipe.
1950		 */
1951		if (pstdin_fd)
1952			*pstdin_fd = setup_pipe_end(stdin_fds,
1953						    1 /* write end */);
1954
1955		/*
1956		 * If we want to read from the child process' standard output,
1957		 * set up the read end of the pipe.
1958		 */
1959		if (pstdout_fd)
1960			*pstdout_fd = setup_pipe_end(stdout_fds,
1961						     0 /* read end */);
1962
1963		/*
1964		 * If we want to read from the child process' standard error,
1965		 * set up the read end of the pipe.
1966		 */
1967		if (pstderr_fd)
1968			*pstderr_fd = setup_pipe_end(stderr_fds,
1969						     0 /* read end */);
1970
1971		return 0;
1972	}
1973	free(oldenv_copy);
1974
1975	if (j->flags.reset_signal_mask) {
1976		sigset_t signal_mask;
1977		if (sigemptyset(&signal_mask) != 0)
1978			pdie("sigemptyset failed");
1979		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1980			pdie("sigprocmask failed");
1981	}
1982
1983	if (sync_child)
1984		wait_for_parent_setup(child_sync_pipe_fds);
1985
1986	if (j->flags.userns)
1987		enter_user_namespace(j);
1988
1989	/*
1990	 * If we want to write to the jailed process' standard input,
1991	 * set up the read end of the pipe.
1992	 */
1993	if (pstdin_fd) {
1994		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1995					    STDIN_FILENO) < 0)
1996			die("failed to set up stdin pipe");
1997	}
1998
1999	/*
2000	 * If we want to read from the jailed process' standard output,
2001	 * set up the write end of the pipe.
2002	 */
2003	if (pstdout_fd) {
2004		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
2005					    STDOUT_FILENO) < 0)
2006			die("failed to set up stdout pipe");
2007	}
2008
2009	/*
2010	 * If we want to read from the jailed process' standard error,
2011	 * set up the write end of the pipe.
2012	 */
2013	if (pstderr_fd) {
2014		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
2015					    STDERR_FILENO) < 0)
2016			die("failed to set up stderr pipe");
2017	}
2018
2019	/* If running an init program, let it decide when/how to mount /proc. */
2020	if (pid_namespace && !do_init)
2021		j->flags.remount_proc_ro = 0;
2022
2023	if (use_preload) {
2024		/* Strip out flags that cannot be inherited across execve(2). */
2025		minijail_preexec(j);
2026	} else {
2027		j->flags.pids = 0;
2028	}
2029	/* Jail this process, then execve() the target. */
2030	minijail_enter(j);
2031
2032	if (pid_namespace && do_init) {
2033		/*
2034		 * pid namespace: this process will become init inside the new
2035		 * namespace. We don't want all programs we might exec to have
2036		 * to know how to be init. Normally (do_init == 1) we fork off
2037		 * a child to actually run the program. If |do_init == 0|, we
2038		 * let the program keep pid 1 and be init.
2039		 *
2040		 * If we're multithreaded, we'll probably deadlock here. See
2041		 * WARNING above.
2042		 */
2043		child_pid = fork();
2044		if (child_pid < 0)
2045			_exit(child_pid);
2046		else if (child_pid > 0)
2047			init(child_pid);	/* never returns */
2048	}
2049
2050	/*
2051	 * If we aren't pid-namespaced, or the jailed program asked to be init:
2052	 *   calling process
2053	 *   -> execve()-ing process
2054	 * If we are:
2055	 *   calling process
2056	 *   -> init()-ing process
2057	 *      -> execve()-ing process
2058	 */
2059	_exit(execve(filename, argv, environ));
2060}
2061
2062int API minijail_kill(struct minijail *j)
2063{
2064	int st;
2065	if (kill(j->initpid, SIGTERM))
2066		return -errno;
2067	if (waitpid(j->initpid, &st, 0) < 0)
2068		return -errno;
2069	return st;
2070}
2071
2072int API minijail_wait(struct minijail *j)
2073{
2074	int st;
2075	if (waitpid(j->initpid, &st, 0) < 0)
2076		return -errno;
2077
2078	if (!WIFEXITED(st)) {
2079		int error_status = st;
2080		if (WIFSIGNALED(st)) {
2081			int signum = WTERMSIG(st);
2082			warn("child process %d received signal %d",
2083			     j->initpid, signum);
2084			/*
2085			 * We return MINIJAIL_ERR_JAIL if the process received
2086			 * SIGSYS, which happens when a syscall is blocked by
2087			 * seccomp filters.
2088			 * If not, we do what bash(1) does:
2089			 * $? = 128 + signum
2090			 */
2091			if (signum == SIGSYS) {
2092				error_status = MINIJAIL_ERR_JAIL;
2093			} else {
2094				error_status = 128 + signum;
2095			}
2096		}
2097		return error_status;
2098	}
2099
2100	int exit_status = WEXITSTATUS(st);
2101	if (exit_status != 0)
2102		info("child process %d exited with status %d",
2103		     j->initpid, exit_status);
2104
2105	return exit_status;
2106}
2107
2108void API minijail_destroy(struct minijail *j)
2109{
2110	size_t i;
2111
2112	if (j->flags.seccomp_filter && j->filter_prog) {
2113		free(j->filter_prog->filter);
2114		free(j->filter_prog);
2115	}
2116	while (j->mounts_head) {
2117		struct mountpoint *m = j->mounts_head;
2118		j->mounts_head = j->mounts_head->next;
2119		free(m->type);
2120		free(m->dest);
2121		free(m->src);
2122		free(m);
2123	}
2124	j->mounts_tail = NULL;
2125	if (j->user)
2126		free(j->user);
2127	if (j->suppl_gid_list)
2128		free(j->suppl_gid_list);
2129	if (j->chrootdir)
2130		free(j->chrootdir);
2131	if (j->alt_syscall_table)
2132		free(j->alt_syscall_table);
2133	for (i = 0; i < j->cgroup_count; ++i)
2134		free(j->cgroups[i]);
2135	free(j);
2136}
2137