libminijail.c revision 7a569073b95af7532892dc726c2f33cd40edfb57
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _DEFAULT_SOURCE
8#define _GNU_SOURCE
9
10#include <asm/unistd.h>
11#include <ctype.h>
12#include <errno.h>
13#include <fcntl.h>
14#include <grp.h>
15#include <inttypes.h>
16#include <limits.h>
17#include <linux/capability.h>
18#include <pwd.h>
19#include <sched.h>
20#include <signal.h>
21#include <stdarg.h>
22#include <stdbool.h>
23#include <stddef.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <syscall.h>
28#include <sys/capability.h>
29#include <sys/mount.h>
30#include <sys/param.h>
31#include <sys/prctl.h>
32#include <sys/stat.h>
33#include <sys/types.h>
34#include <sys/user.h>
35#include <sys/utsname.h>
36#include <sys/wait.h>
37#include <unistd.h>
38
39#include "libminijail.h"
40#include "libminijail-private.h"
41
42#include "signal_handler.h"
43#include "syscall_filter.h"
44#include "util.h"
45
46#ifdef HAVE_SECUREBITS_H
47# include <linux/securebits.h>
48#else
49# define SECURE_ALL_BITS	0x55
50# define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
51#endif
52/* For kernels < 4.3. */
53#define OLD_SECURE_ALL_BITS	0x15
54#define OLD_SECURE_ALL_LOCKS	(OLD_SECURE_ALL_BITS << 1)
55
56/*
57 * Assert the value of SECURE_ALL_BITS at compile time.
58 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3
59 * added a new securebit, and whenever that happens the expanded SECURE_ALL_BITS
60 * mask makes prctl(PR_SET_SECUREBITS) return EPERM on older kernels.
61 * This assert catches any newly added securebit at compile time so the EPERM
62 * fallback to OLD_SECURE_ALL_BITS in minijail_enter() can be kept in sync.
63 */
64#ifdef __BRILLO__
65_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
66#endif
67
68/* Until these are reliably available in linux/prctl.h. */
69#ifndef PR_SET_SECCOMP
70# define PR_SET_SECCOMP 22
71#endif
72
73#ifndef PR_ALT_SYSCALL
74# define PR_ALT_SYSCALL 0x43724f53
75#endif
76
77/* For seccomp_filter using BPF. */
78#ifndef PR_SET_NO_NEW_PRIVS
79# define PR_SET_NO_NEW_PRIVS 38
80#endif
81#ifndef SECCOMP_MODE_FILTER
82# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
83#endif
84
85#ifdef USE_SECCOMP_SOFTFAIL
86# define SECCOMP_SOFTFAIL 1
87#else
88# define SECCOMP_SOFTFAIL 0
89#endif
90
91#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
92
93struct mountpoint {
94	char *src;
95	char *dest;
96	char *type;
97	unsigned long flags;
98	struct mountpoint *next;
99};
100
101struct minijail {
102	/*
103	 * WARNING: if you add a flag here you need to make sure it's
104	 * accounted for in minijail_pre{enter|exec}() below.
105	 */
106	struct {
107		int uid:1;
108		int gid:1;
109		int usergroups:1;
110		int suppl_gids:1;
111		int use_caps:1;
112		int capbset_drop:1;
113		int vfs:1;
114		int enter_vfs:1;
115		int skip_remount_private:1;
116		int pids:1;
117		int ipc:1;
118		int net:1;
119		int enter_net:1;
120		int userns:1;
121		int seccomp:1;
122		int remount_proc_ro:1;
123		int no_new_privs:1;
124		int seccomp_filter:1;
125		int log_seccomp_filter:1;
126		int chroot:1;
127		int pivot_root:1;
128		int mount_tmp:1;
129		int do_init:1;
130		int pid_file:1;
131		int cgroups:1;
132		int alt_syscall:1;
133		int reset_signal_mask:1;
134	} flags;
135	uid_t uid;
136	gid_t gid;
137	gid_t usergid;
138	char *user;
139	size_t suppl_gid_count;
140	gid_t *suppl_gid_list;
141	uint64_t caps;
142	uint64_t cap_bset;
143	pid_t initpid;
144	int mountns_fd;
145	int netns_fd;
146	char *chrootdir;
147	char *pid_file_path;
148	char *uidmap;
149	char *gidmap;
150	size_t filter_len;
151	struct sock_fprog *filter_prog;
152	char *alt_syscall_table;
153	struct mountpoint *mounts_head;
154	struct mountpoint *mounts_tail;
155	size_t mounts_count;
156	char *cgroups[MAX_CGROUPS];
157	size_t cgroup_count;
158};
159
160/*
161 * Strip out flags meant for the parent.
162 * We keep things that are not inherited across execve(2) (e.g. capabilities),
163 * or are easier to set after execve(2) (e.g. seccomp filters).
164 */
165void minijail_preenter(struct minijail *j)
166{
167	j->flags.vfs = 0;
168	j->flags.enter_vfs = 0;
169	j->flags.skip_remount_private = 0;
170	j->flags.remount_proc_ro = 0;
171	j->flags.pids = 0;
172	j->flags.do_init = 0;
173	j->flags.pid_file = 0;
174	j->flags.cgroups = 0;
175}
176
177/*
178 * Strip out flags meant for the child.
179 * We keep things that are inherited across execve(2).
180 */
181void minijail_preexec(struct minijail *j)
182{
183	int vfs = j->flags.vfs;
184	int enter_vfs = j->flags.enter_vfs;
185	int skip_remount_private = j->flags.skip_remount_private;
186	int remount_proc_ro = j->flags.remount_proc_ro;
187	int userns = j->flags.userns;
188	if (j->user)
189		free(j->user);
190	j->user = NULL;
191	if (j->suppl_gid_list)
192		free(j->suppl_gid_list);
193	j->suppl_gid_list = NULL;
194	memset(&j->flags, 0, sizeof(j->flags));
195	/* Now restore anything we meant to keep. */
196	j->flags.vfs = vfs;
197	j->flags.enter_vfs = enter_vfs;
198	j->flags.skip_remount_private = skip_remount_private;
199	j->flags.remount_proc_ro = remount_proc_ro;
200	j->flags.userns = userns;
201	/* Note, |pids| will already have been used before this call. */
202}
203
204/* Returns true if the kernel version is less than 3.8. */
205int seccomp_kernel_support_not_required()
206{
207	int major, minor;
208	struct utsname uts;
209	return (uname(&uts) != -1 &&
210			sscanf(uts.release, "%d.%d", &major, &minor) == 2 &&
211			((major < 3) || ((major == 3) && (minor < 8))));
212}
213
214/* Allow seccomp soft-fail on Android devices with kernel version < 3.8. */
215int can_softfail()
216{
217#if SECCOMP_SOFTFAIL
218	if (is_android()) {
219		if (seccomp_kernel_support_not_required())
220			return 1;
221		else
222			return 0;
223	} else {
224		return 1;
225	}
226#endif
227	return 0;
228}
229
230/* Minijail API. */
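
/*
 * Typical usage (a sketch only, not part of the library): build a jail,
 * configure it, run the target, and wait for it. Every call below is an API
 * declared in libminijail.h; the specific user and flags are illustrative.
 *
 *   struct minijail *j = minijail_new();
 *   minijail_change_user(j, "nobody");
 *   minijail_namespace_pids(j);
 *   minijail_no_new_privs(j);
 *   minijail_run(j, argv[0], argv);
 *   minijail_wait(j);
 *   minijail_destroy(j);
 */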
231
232struct minijail API *minijail_new(void)
233{
234	return calloc(1, sizeof(struct minijail));
235}
236
237void API minijail_change_uid(struct minijail *j, uid_t uid)
238{
239	if (uid == 0)
240		die("useless change to uid 0");
241	j->uid = uid;
242	j->flags.uid = 1;
243}
244
245void API minijail_change_gid(struct minijail *j, gid_t gid)
246{
247	if (gid == 0)
248		die("useless change to gid 0");
249	j->gid = gid;
250	j->flags.gid = 1;
251}
252
253void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
254					 const gid_t *list)
255{
256	size_t i;
257
258	if (j->flags.usergroups)
259		die("cannot inherit *and* set supplementary groups");
260
261	if (size == 0) {
262		/* Clear supplementary groups. */
263		j->suppl_gid_list = NULL;
264		j->suppl_gid_count = 0;
265		j->flags.suppl_gids = 1;
266		return;
267	}
268
269	/* Copy the gid_t array. */
270	j->suppl_gid_list = calloc(size, sizeof(gid_t));
271	if (!j->suppl_gid_list) {
272		die("failed to allocate internal supplementary group array");
273	}
274	for (i = 0; i < size; i++) {
275		j->suppl_gid_list[i] = list[i];
276	}
277	j->suppl_gid_count = size;
278	j->flags.suppl_gids = 1;
279}
280
281int API minijail_change_user(struct minijail *j, const char *user)
282{
283	char *buf = NULL;
284	struct passwd pw;
285	struct passwd *ppw = NULL;
286	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
287	if (sz == -1)
288		sz = 65536;	/* your guess is as good as mine... */
289
290	/*
291	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
292	 * the maximum needed size of the buffer, so we don't have to search.
293	 */
294	buf = malloc(sz);
295	if (!buf)
296		return -ENOMEM;
297	getpwnam_r(user, &pw, buf, sz, &ppw);
298	/*
299	 * We're safe to free the buffer here. The strings inside |pw| point
300	 * inside |buf|, but we don't use any of them; this leaves the pointers
301	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
302	 * succeeded.
303	 */
304	free(buf);
305	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
306	if (!ppw)
307		return -1;
308	minijail_change_uid(j, ppw->pw_uid);
309	j->user = strdup(user);
310	if (!j->user)
311		return -ENOMEM;
312	j->usergid = ppw->pw_gid;
313	return 0;
314}
315
316int API minijail_change_group(struct minijail *j, const char *group)
317{
318	char *buf = NULL;
319	struct group gr;
320	struct group *pgr = NULL;
321	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
322	if (sz == -1)
323		sz = 65536;	/* and mine is as good as yours, really */
324
325	/*
326	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
327	 * the maximum needed size of the buffer, so we don't have to search.
328	 */
329	buf = malloc(sz);
330	if (!buf)
331		return -ENOMEM;
332	getgrnam_r(group, &gr, buf, sz, &pgr);
333	/*
334	 * We're safe to free the buffer here. The strings inside gr point
335	 * inside buf, but we don't use any of them; this leaves the pointers
336	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
337	 */
338	free(buf);
339	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
340	if (!pgr)
341		return -1;
342	minijail_change_gid(j, pgr->gr_gid);
343	return 0;
344}
345
346void API minijail_use_seccomp(struct minijail *j)
347{
348	j->flags.seccomp = 1;
349}
350
351void API minijail_no_new_privs(struct minijail *j)
352{
353	j->flags.no_new_privs = 1;
354}
355
356void API minijail_use_seccomp_filter(struct minijail *j)
357{
358	j->flags.seccomp_filter = 1;
359}
360
361void API minijail_log_seccomp_filter_failures(struct minijail *j)
362{
363	j->flags.log_seccomp_filter = 1;
364}
365
366void API minijail_use_caps(struct minijail *j, uint64_t capmask)
367{
368	/*
369	 * 'minijail_use_caps' configures a runtime-capabilities-only
370	 * environment, including a bounding set matching the thread's runtime
371	 * (permitted|inheritable|effective) sets.
372	 * Therefore, it will override any existing bounding set configurations
373	 * since the latter would allow gaining extra runtime capabilities from
374	 * file capabilities.
375	 */
376	if (j->flags.capbset_drop) {
377		warn("overriding bounding set configuration");
378		j->cap_bset = 0;
379		j->flags.capbset_drop = 0;
380	}
381	j->caps = capmask;
382	j->flags.use_caps = 1;
383}
384
385void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
386{
387	if (j->flags.use_caps) {
388		/*
389		 * 'minijail_use_caps' will have already configured a capability
390		 * bounding set matching the (permitted|inheritable|effective)
391		 * sets. Abort if the user tries to configure a separate
392		 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
393		 * are mutually exclusive.
394		 */
395		die("runtime capabilities already configured, can't drop "
396		    "bounding set separately");
397	}
398	j->cap_bset = capmask;
399	j->flags.capbset_drop = 1;
400}
401
402void API minijail_reset_signal_mask(struct minijail *j)
403{
404	j->flags.reset_signal_mask = 1;
405}
406
407void API minijail_namespace_vfs(struct minijail *j)
408{
409	j->flags.vfs = 1;
410}
411
412void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
413{
414	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
415	if (ns_fd < 0) {
416		pdie("failed to open namespace '%s'", ns_path);
417	}
418	j->mountns_fd = ns_fd;
419	j->flags.enter_vfs = 1;
420}
421
422void API minijail_skip_remount_private(struct minijail *j)
423{
424	j->flags.skip_remount_private = 1;
425}
426
427void API minijail_namespace_pids(struct minijail *j)
428{
429	j->flags.vfs = 1;
430	j->flags.remount_proc_ro = 1;
431	j->flags.pids = 1;
432	j->flags.do_init = 1;
433}
434
435void API minijail_namespace_ipc(struct minijail *j)
436{
437	j->flags.ipc = 1;
438}
439
440void API minijail_namespace_net(struct minijail *j)
441{
442	j->flags.net = 1;
443}
444
445void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
446{
447	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
448	if (ns_fd < 0) {
449		pdie("failed to open namespace '%s'", ns_path);
450	}
451	j->netns_fd = ns_fd;
452	j->flags.enter_net = 1;
453}
454
455void API minijail_remount_proc_readonly(struct minijail *j)
456{
457	j->flags.vfs = 1;
458	j->flags.remount_proc_ro = 1;
459}
460
461void API minijail_namespace_user(struct minijail *j)
462{
463	j->flags.userns = 1;
464}
465
466int API minijail_uidmap(struct minijail *j, const char *uidmap)
467{
468	j->uidmap = strdup(uidmap);
469	if (!j->uidmap)
470		return -ENOMEM;
471	char *ch;
472	for (ch = j->uidmap; *ch; ch++) {
473		if (*ch == ',')
474			*ch = '\n';
475	}
476	return 0;
477}
478
479int API minijail_gidmap(struct minijail *j, const char *gidmap)
480{
481	j->gidmap = strdup(gidmap);
482	if (!j->gidmap)
483		return -ENOMEM;
484	char *ch;
485	for (ch = j->gidmap; *ch; ch++) {
486		if (*ch == ',')
487			*ch = '\n';
488	}
489	return 0;
490}
491
492void API minijail_inherit_usergroups(struct minijail *j)
493{
494	j->flags.usergroups = 1;
495}
496
497void API minijail_run_as_init(struct minijail *j)
498{
499	/*
500	 * Since the jailed program will become 'init' in the new PID namespace,
501	 * Minijail does not need to fork an 'init' process.
502	 */
503	j->flags.do_init = 0;
504}
505
506int API minijail_enter_chroot(struct minijail *j, const char *dir)
507{
508	if (j->chrootdir)
509		return -EINVAL;
510	j->chrootdir = strdup(dir);
511	if (!j->chrootdir)
512		return -ENOMEM;
513	j->flags.chroot = 1;
514	return 0;
515}
516
517int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
518{
519	if (j->chrootdir)
520		return -EINVAL;
521	j->chrootdir = strdup(dir);
522	if (!j->chrootdir)
523		return -ENOMEM;
524	j->flags.pivot_root = 1;
525	return 0;
526}
527
528static char *append_external_path(const char *external_path,
529				  const char *path_inside_chroot)
530{
531	char *path;
532	size_t pathlen;
533
534	/* One extra char for '/' and one for '\0', hence + 2. */
535	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
536	path = malloc(pathlen);
537	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);
538
539	return path;
540}
541
542char API *minijail_get_original_path(struct minijail *j,
543				     const char *path_inside_chroot)
544{
545	struct mountpoint *b;
546
547	b = j->mounts_head;
548	while (b) {
549		/*
550		 * If |path_inside_chroot| is the exact destination of a
551		 * mount, then the original path is exactly the source of
552		 * that mount.
553		 * For example, given "-b /some/path/exe,/chroot/path/exe",
554		 * the mount source is /some/path/exe and the mount dest is
555		 * /chroot/path/exe. Asking for the original path of
556		 * "/chroot/path/exe" should therefore return the source of
557		 * that mount, "/some/path/exe".
558		 */
559		if (!strcmp(b->dest, path_inside_chroot))
560			return strdup(b->src);
561
562		/*
563		 * If |path_inside_chroot| is within the destination path of a
564		 * mount, take the suffix of the chroot path relative to the
565		 * mount destination path, and append it to the mount source
566		 * path.
567		 */
568		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
569			const char *relative_path =
570				path_inside_chroot + strlen(b->dest);
571			return append_external_path(b->src, relative_path);
572		}
573		b = b->next;
574	}
575
576	/* If there is a chroot path, append |path_inside_chroot| to that. */
577	if (j->chrootdir)
578		return append_external_path(j->chrootdir, path_inside_chroot);
579
580	/* No chroot, so the path outside is the same as it is inside. */
581	return strdup(path_inside_chroot);
582}
583
584void API minijail_mount_tmp(struct minijail *j)
585{
586	j->flags.mount_tmp = 1;
587}
588
589int API minijail_write_pid_file(struct minijail *j, const char *path)
590{
591	j->pid_file_path = strdup(path);
592	if (!j->pid_file_path)
593		return -ENOMEM;
594	j->flags.pid_file = 1;
595	return 0;
596}
597
598int API minijail_add_to_cgroup(struct minijail *j, const char *path)
599{
600	if (j->cgroup_count >= MAX_CGROUPS)
601		return -ENOMEM;
602	j->cgroups[j->cgroup_count] = strdup(path);
603	if (!j->cgroups[j->cgroup_count])
604		return -ENOMEM;
605	j->cgroup_count++;
606	j->flags.cgroups = 1;
607	return 0;
608}
609
610int API minijail_mount(struct minijail *j, const char *src, const char *dest,
611		       const char *type, unsigned long flags)
612{
613	struct mountpoint *m;
614
615	if (*dest != '/')
616		return -EINVAL;
617	m = calloc(1, sizeof(*m));
618	if (!m)
619		return -ENOMEM;
620	m->dest = strdup(dest);
621	if (!m->dest)
622		goto error;
623	m->src = strdup(src);
624	if (!m->src)
625		goto error;
626	m->type = strdup(type);
627	if (!m->type)
628		goto error;
629	m->flags = flags;
630
631	info("mount %s -> %s type '%s'", src, dest, type);
632
633	/*
634	 * Force vfs namespacing so the mounts don't leak out into the
635	 * containing vfs namespace.
636	 */
637	minijail_namespace_vfs(j);
638
639	if (j->mounts_tail)
640		j->mounts_tail->next = m;
641	else
642		j->mounts_head = m;
643	j->mounts_tail = m;
644	j->mounts_count++;
645
646	return 0;
647
648error:
649	free(m->src);
650	free(m->dest);
651	free(m);
652	return -ENOMEM;
653}
654
655int API minijail_bind(struct minijail *j, const char *src, const char *dest,
656		      int writeable)
657{
658	unsigned long flags = MS_BIND;
659
660	if (!writeable)
661		flags |= MS_RDONLY;
662
663	return minijail_mount(j, src, dest, "", flags);
664}
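
/*
 * Example (a sketch; the paths and flags are illustrative only): a caller
 * using minijail_enter_pivot_root() or minijail_enter_chroot() that wants a
 * writable tmpfs on /data and a read-only view of /etc could queue:
 *
 *   minijail_mount(j, "tmpfs", "/data", "tmpfs", MS_NODEV | MS_NOEXEC);
 *   minijail_bind(j, "/etc", "/etc", 0);    // 0 => read-only
 *
 * Both helpers force a new VFS namespace (via minijail_namespace_vfs()) so
 * the mounts stay contained in the jail.
 */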
665
666void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
667{
668	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
669		if ((errno == EINVAL) && can_softfail()) {
670			warn("not loading seccomp filter,"
671			     " seccomp not supported");
672			j->flags.seccomp_filter = 0;
673			j->flags.log_seccomp_filter = 0;
674			j->filter_len = 0;
675			j->filter_prog = NULL;
676			j->flags.no_new_privs = 0;
			/* Don't compile a filter that will never be installed. */
			return;
677		}
678	}
679	FILE *file = fopen(path, "r");
680	if (!file) {
681		pdie("failed to open seccomp filter file '%s'", path);
682	}
683
684	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
685	if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
686		die("failed to compile seccomp filter BPF program in '%s'",
687		    path);
688	}
689
690	j->filter_len = fprog->len;
691	j->filter_prog = fprog;
692
693	fclose(file);
694}
695
696int API minijail_use_alt_syscall(struct minijail *j, const char *table)
697{
698	j->alt_syscall_table = strdup(table);
699	if (!j->alt_syscall_table)
700		return -ENOMEM;
701	j->flags.alt_syscall = 1;
702	return 0;
703}
704
705struct marshal_state {
706	size_t available;
707	size_t total;
708	char *buf;
709};
710
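/*
 * Prepares |state| to write at most |available| bytes into |buf|. |buf| may
 * be NULL (with |available| == 0) when only the total size is needed, as in
 * minijail_size().
 */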
711void marshal_state_init(struct marshal_state *state, char *buf,
712			size_t available)
713{
714	state->available = available;
715	state->buf = buf;
716	state->total = 0;
717}
718
719void marshal_append(struct marshal_state *state, void *src, size_t length)
720{
721	size_t copy_len = MIN(state->available, length);
722
723	/* Up to |available| will be written. */
724	if (copy_len) {
725		memcpy(state->buf, src, copy_len);
726		state->buf += copy_len;
727		state->available -= copy_len;
728	}
729	/* |total| will contain the expected length. */
730	state->total += length;
731}
732
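/*
 * Appends the serialized form of |j| to |state|: the struct itself followed
 * by its variable-length members in a fixed order (user, supplementary gids,
 * chrootdir, alt_syscall table, filter program, mounts, cgroup paths).
 * minijail_unmarshal() must consume the fields in exactly this order.
 */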
733void minijail_marshal_helper(struct marshal_state *state,
734			     const struct minijail *j)
735{
736	struct mountpoint *m = NULL;
737	size_t i;
738
739	marshal_append(state, (char *)j, sizeof(*j));
740	if (j->user)
741		marshal_append(state, j->user, strlen(j->user) + 1);
742	if (j->suppl_gid_list) {
743		marshal_append(state, j->suppl_gid_list,
744			       j->suppl_gid_count * sizeof(gid_t));
745	}
746	if (j->chrootdir)
747		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
748	if (j->alt_syscall_table) {
749		marshal_append(state, j->alt_syscall_table,
750			       strlen(j->alt_syscall_table) + 1);
751	}
752	if (j->flags.seccomp_filter && j->filter_prog) {
753		struct sock_fprog *fp = j->filter_prog;
754		marshal_append(state, (char *)fp->filter,
755			       fp->len * sizeof(struct sock_filter));
756	}
757	for (m = j->mounts_head; m; m = m->next) {
758		marshal_append(state, m->src, strlen(m->src) + 1);
759		marshal_append(state, m->dest, strlen(m->dest) + 1);
760		marshal_append(state, m->type, strlen(m->type) + 1);
761		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
762	}
763	for (i = 0; i < j->cgroup_count; ++i)
764		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
765}
766
767size_t API minijail_size(const struct minijail *j)
768{
769	struct marshal_state state;
770	marshal_state_init(&state, NULL, 0);
771	minijail_marshal_helper(&state, j);
772	return state.total;
773}
774
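/*
 * Serializes |j| into |buf|. Returns 0 on success, or nonzero if |available|
 * bytes were not enough; size the buffer with minijail_size() first.
 */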
775int minijail_marshal(const struct minijail *j, char *buf, size_t available)
776{
777	struct marshal_state state;
778	marshal_state_init(&state, buf, available);
779	minijail_marshal_helper(&state, j);
780	return (state.total > available);
781}
782
783/*
784 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength
785 * @length    Number of bytes to consume
786 * @buf       Buffer to consume from
787 * @buflength Size of @buf
788 *
789 * Returns a pointer to the base of the bytes, or NULL for errors.
790 */
791void *consumebytes(size_t length, char **buf, size_t *buflength)
792{
793	char *p = *buf;
794	if (length > *buflength)
795		return NULL;
796	*buf += length;
797	*buflength -= length;
798	return p;
799}
800
801/*
802 * consumestr: consumes a C string from a buffer @buf of length @length
803 * @buf    Buffer to consume
804 * @length Length of buffer
805 *
806 * Returns a pointer to the base of the string, or NULL for errors.
807 */
808char *consumestr(char **buf, size_t *buflength)
809{
810	size_t len = strnlen(*buf, *buflength);
811	if (len == *buflength)
812		/* There's no null-terminator. */
813		return NULL;
814	return consumebytes(len + 1, buf, buflength);
815}
816
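/*
 * Inverse of minijail_marshal(): restores |j| from |serialized|. The stale
 * pointers copied in with the struct are replaced with freshly allocated
 * copies; returns -EINVAL if the input is truncated or malformed, or if an
 * allocation fails.
 */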
817int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
818{
819	size_t i;
820	size_t count;
821	int ret = -EINVAL;
822
823	if (length < sizeof(*j))
824		goto out;
825	memcpy((void *)j, serialized, sizeof(*j));
826	serialized += sizeof(*j);
827	length -= sizeof(*j);
828
829	/* Clear stale pointers that aren't used as markers for serialized data. */
830	j->mounts_head = NULL;
831	j->mounts_tail = NULL;
832	j->filter_prog = NULL;
833
834	if (j->user) {		/* stale pointer */
835		char *user = consumestr(&serialized, &length);
836		if (!user)
837			goto clear_pointers;
838		j->user = strdup(user);
839		if (!j->user)
840			goto clear_pointers;
841	}
842
843	if (j->suppl_gid_list) {	/* stale pointer */
844		if (j->suppl_gid_count > NGROUPS_MAX) {
845			goto bad_gid_list;
846		}
847		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
848		void *gid_list_bytes =
849		    consumebytes(gid_list_size, &serialized, &length);
850		if (!gid_list_bytes)
851			goto bad_gid_list;
852
853		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
854		if (!j->suppl_gid_list)
855			goto bad_gid_list;
856
857		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
858	}
859
860	if (j->chrootdir) {	/* stale pointer */
861		char *chrootdir = consumestr(&serialized, &length);
862		if (!chrootdir)
863			goto bad_chrootdir;
864		j->chrootdir = strdup(chrootdir);
865		if (!j->chrootdir)
866			goto bad_chrootdir;
867	}
868
869	if (j->alt_syscall_table) {	/* stale pointer */
870		char *alt_syscall_table = consumestr(&serialized, &length);
871		if (!alt_syscall_table)
872			goto bad_syscall_table;
873		j->alt_syscall_table = strdup(alt_syscall_table);
874		if (!j->alt_syscall_table)
875			goto bad_syscall_table;
876	}
877
878	if (j->flags.seccomp_filter && j->filter_len > 0) {
879		size_t ninstrs = j->filter_len;
880		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
881		    ninstrs > USHRT_MAX)
882			goto bad_filters;
883
884		size_t program_len = ninstrs * sizeof(struct sock_filter);
885		void *program = consumebytes(program_len, &serialized, &length);
886		if (!program)
887			goto bad_filters;
888
889		j->filter_prog = malloc(sizeof(struct sock_fprog));
890		if (!j->filter_prog)
891			goto bad_filters;
892
893		j->filter_prog->len = ninstrs;
894		j->filter_prog->filter = malloc(program_len);
895		if (!j->filter_prog->filter)
896			goto bad_filter_prog_instrs;
897
898		memcpy(j->filter_prog->filter, program, program_len);
899	}
900
901	count = j->mounts_count;
902	j->mounts_count = 0;
903	for (i = 0; i < count; ++i) {
904		unsigned long *flags;
905		const char *dest;
906		const char *type;
907		const char *src = consumestr(&serialized, &length);
908		if (!src)
909			goto bad_mounts;
910		dest = consumestr(&serialized, &length);
911		if (!dest)
912			goto bad_mounts;
913		type = consumestr(&serialized, &length);
914		if (!type)
915			goto bad_mounts;
916		flags = consumebytes(sizeof(*flags), &serialized, &length);
917		if (!flags)
918			goto bad_mounts;
919		if (minijail_mount(j, src, dest, type, *flags))
920			goto bad_mounts;
921	}
922
923	count = j->cgroup_count;
924	j->cgroup_count = 0;
925	for (i = 0; i < count; ++i) {
926		char *cgroup = consumestr(&serialized, &length);
927		if (!cgroup)
928			goto bad_cgroups;
929		j->cgroups[i] = strdup(cgroup);
930		if (!j->cgroups[i])
931			goto bad_cgroups;
932		++j->cgroup_count;
933	}
934
935	return 0;
936
937bad_cgroups:
938	while (j->mounts_head) {
939		struct mountpoint *m = j->mounts_head;
940		j->mounts_head = j->mounts_head->next;
941		free(m->type);
942		free(m->dest);
943		free(m->src);
944		free(m);
945	}
946	for (i = 0; i < j->cgroup_count; ++i)
947		free(j->cgroups[i]);
948bad_mounts:
949	if (j->flags.seccomp_filter && j->filter_len > 0) {
950		free(j->filter_prog->filter);
951		free(j->filter_prog);
		/* Avoid a double free when falling through to the label below. */
		j->filter_prog = NULL;
952	}
953bad_filter_prog_instrs:
954	if (j->filter_prog)
955		free(j->filter_prog);
956bad_filters:
957	if (j->alt_syscall_table)
958		free(j->alt_syscall_table);
959bad_syscall_table:
960	if (j->chrootdir)
961		free(j->chrootdir);
962bad_chrootdir:
963	if (j->suppl_gid_list)
964		free(j->suppl_gid_list);
965bad_gid_list:
966	if (j->user)
967		free(j->user);
968clear_pointers:
969	j->user = NULL;
970	j->suppl_gid_list = NULL;
971	j->chrootdir = NULL;
972	j->alt_syscall_table = NULL;
973	j->cgroup_count = 0;
974out:
975	return ret;
976}
977
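/*
 * Writes the requested uid/gid mappings into /proc/<initpid>/uid_map and
 * gid_map. Runs in the parent so the mappings can be written from outside
 * the new user namespace.
 */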
978static void write_ugid_mappings(const struct minijail *j)
979{
980	int fd, ret, len;
981	size_t sz;
982	char fname[32];
983
984	sz = sizeof(fname);
985	if (j->uidmap) {
986		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
987		if (ret < 0 || (size_t)ret >= sz)
988			die("failed to write file name of uid_map");
989		fd = open(fname, O_WRONLY | O_CLOEXEC);
990		if (fd < 0)
991			pdie("failed to open '%s'", fname);
992		len = strlen(j->uidmap);
993		if (write(fd, j->uidmap, len) < len)
994			die("failed to set uid_map");
995		close(fd);
996	}
997	if (j->gidmap) {
998		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
999		if (ret < 0 || (size_t)ret >= sz)
1000			die("failed to write file name of gid_map");
1001		fd = open(fname, O_WRONLY | O_CLOEXEC);
1002		if (fd < 0)
1003			pdie("failed to open '%s'", fname);
1004		len = strlen(j->gidmap);
1005		if (write(fd, j->gidmap, len) < len)
1006			die("failed to set gid_map");
1007		close(fd);
1008	}
1009}
1010
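/*
 * parent_setup_complete: Closes both ends of the sync pipe. The resulting
 * EOF is what wait_for_parent_setup() in the child is waiting for.
 */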
1011static void parent_setup_complete(int *pipe_fds)
1012{
1013	close(pipe_fds[0]);
1014	close(pipe_fds[1]);
1015}
1016
1017/*
1018 * wait_for_parent_setup: Called by the child process to wait for any
1019 * further parent-side setup to complete before continuing.
1020 */
1021static void wait_for_parent_setup(int *pipe_fds)
1022{
1023	char buf;
1024
1025	close(pipe_fds[1]);
1026
1027	/* Wait for parent to complete setup and close the pipe. */
1028	if (read(pipe_fds[0], &buf, 1) != 0)
1029		die("failed to sync with parent");
1030	close(pipe_fds[0]);
1031}
1032
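/*
 * enter_user_namespace: Called by the child to switch to uid/gid 0 inside
 * the new user namespace once the parent has written the corresponding
 * uid/gid mappings.
 */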
1033static void enter_user_namespace(const struct minijail *j)
1034{
1035	if (j->uidmap && setresuid(0, 0, 0))
1036		pdie("setresuid");
1037	if (j->gidmap && setresgid(0, 0, 0))
1038		pdie("setresgid");
1039}
1040
1041/*
1042 * mount_one: Applies mounts from @m for @j, recursing as needed.
1043 * @j Minijail these mounts are for
1044 * @m Head of list of mounts
1045 *
1046 * Returns 0 for success.
1047 */
1048static int mount_one(const struct minijail *j, struct mountpoint *m)
1049{
1050	int ret;
1051	char *dest;
1052	int remount_ro = 0;
1053
1054	/* |dest| has a leading "/". */
1055	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
1056		return -ENOMEM;
1057
1058	/*
1059	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1060	 * can't both be specified in the original bind mount.
1061	 * Remount R/O after the initial mount.
1062	 */
1063	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1064		remount_ro = 1;
1065		m->flags &= ~MS_RDONLY;
1066	}
1067
1068	ret = mount(m->src, dest, m->type, m->flags, NULL);
1069	if (ret)
1070		pdie("mount: %s -> %s", m->src, dest);
1071
1072	if (remount_ro) {
1073		m->flags |= MS_RDONLY;
1074		ret = mount(m->src, dest, NULL,
1075			    m->flags | MS_REMOUNT, NULL);
1076		if (ret)
1077			pdie("bind ro: %s -> %s", m->src, dest);
1078	}
1079
1080	free(dest);
1081	if (m->next)
1082		return mount_one(j, m->next);
1083	return ret;
1084}
1085
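/* Applies any queued mounts under |j->chrootdir|, then chroot(2)s into it. */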
1086int enter_chroot(const struct minijail *j)
1087{
1088	int ret;
1089
1090	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1091		return ret;
1092
1093	if (chroot(j->chrootdir))
1094		return -errno;
1095
1096	if (chdir("/"))
1097		return -errno;
1098
1099	return 0;
1100}
1101
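/*
 * Like enter_chroot(), but uses pivot_root(2) so the old root can be
 * detached from the jail's mount namespace entirely rather than merely
 * hidden by chroot(2).
 */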
1102int enter_pivot_root(const struct minijail *j)
1103{
1104	int ret, oldroot, newroot;
1105
1106	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1107		return ret;
1108
1109	/*
1110	 * Keep the fd for both old and new root.
1111	 * It will be used in fchdir(2) later.
1112	 */
1113	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1114	if (oldroot < 0)
1115		pdie("failed to open / for fchdir");
1116	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1117	if (newroot < 0)
1118		pdie("failed to open %s for fchdir", j->chrootdir);
1119
1120	/*
1121	 * To ensure j->chrootdir is the root of a filesystem,
1122	 * do a self bind mount.
1123	 */
1124	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1125		pdie("failed to bind mount '%s'", j->chrootdir);
1126	if (chdir(j->chrootdir))
1127		return -errno;
1128	if (syscall(SYS_pivot_root, ".", "."))
1129		pdie("pivot_root");
1130
1131	/*
1132	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
1133	 * change to the old root and unmount it.
1134	 */
1135	if (fchdir(oldroot))
1136		pdie("failed to fchdir to old /");
1137
1138	/*
1139	 * If j->flags.skip_remount_private was enabled for minijail_enter(), there
1140	 * could be a shared mount point under |oldroot|. In that case, mounts
1141	 * under this shared mount point will be unmounted below, and this
1142	 * unmounting will propagate to the original mount namespace (because the
1143	 * mount point is shared). To prevent this unexpected unmounting, remove
1144	 * these mounts from their peer groups by recursively remounting them as
1145	 * MS_PRIVATE.
1146	 */
1147	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
1148		pdie("failed to mount(/, private) before umount(/)");
1149	/* The old root might be busy, so use lazy unmount. */
1150	if (umount2(".", MNT_DETACH))
1151		pdie("umount(/)");
1152	/* Change back to the new root. */
1153	if (fchdir(newroot))
1154		return -errno;
1155	if (close(oldroot))
1156		return -errno;
1157	if (close(newroot))
1158		return -errno;
1159	if (chroot("/"))
1160		return -errno;
1161	/* Set correct CWD for getcwd(3). */
1162	if (chdir("/"))
1163		return -errno;
1164
1165	return 0;
1166}
1167
1168int mount_tmp(void)
1169{
1170	return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");
1171}
1172
1173int remount_proc_readonly(const struct minijail *j)
1174{
1175	const char *kProcPath = "/proc";
1176	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1177	/*
1178	 * Right now, we're holding a reference to our parent's old mount of
1179	 * /proc in our namespace, which means using MS_REMOUNT here would
1180	 * mutate our parent's mount as well, even though we're in a VFS
1181	 * namespace (!). Instead, remove their mount from our namespace
1182	 * and make our own. However, if we are in a new user namespace, /proc
1183	 * is not seen as mounted, so don't return error if umount() fails.
1184	 */
1185	if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
1186		return -errno;
1187	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1188		return -errno;
1189	return 0;
1190}
1191
1192static void write_pid_to_path(pid_t pid, const char *path)
1193{
1194	FILE *fp = fopen(path, "w");
1195
1196	if (!fp)
1197		pdie("failed to open '%s'", path);
1198	if (fprintf(fp, "%d\n", (int)pid) < 0)
1199		pdie("fprintf(%s)", path);
1200	if (fclose(fp))
1201		pdie("fclose(%s)", path);
1202}
1203
1204static void write_pid_file(const struct minijail *j)
1205{
1206	write_pid_to_path(j->initpid, j->pid_file_path);
1207}
1208
1209static void add_to_cgroups(const struct minijail *j)
1210{
1211	size_t i;
1212
1213	for (i = 0; i < j->cgroup_count; ++i)
1214		write_pid_to_path(j->initpid, j->cgroups[i]);
1215}
1216
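/*
 * Drops supplementary groups (either inheriting the user's groups or setting
 * the explicit list), then the gid, then the uid, in that order.
 */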
1217void drop_ugid(const struct minijail *j)
1218{
1219	if (j->flags.usergroups && j->flags.suppl_gids) {
1220		die("tried to inherit *and* set supplementary groups;"
1221		    " can only do one");
1222	}
1223
1224	if (j->flags.usergroups) {
1225		if (initgroups(j->user, j->usergid))
1226			pdie("initgroups");
1227	} else if (j->flags.suppl_gids) {
1228		if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1229			pdie("setgroups");
1230		}
1231	} else {
1232		/*
1233		 * Only attempt to clear supplementary groups if we are changing
1234		 * users.
1235		 */
1236		if ((j->uid || j->gid) && setgroups(0, NULL))
1237			pdie("setgroups");
1238	}
1239
1240	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1241		pdie("setresgid");
1242
1243	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1244		pdie("setresuid");
1245}
1246
1247/*
1248 * We specifically do not use cap_valid() as that only tells us the last
1249 * valid cap we were *compiled* against (i.e. what the version of kernel
1250 * headers says). If we run on a different kernel version, then it's not
1251 * uncommon for that to be less (if an older kernel) or more (if a newer
1252 * kernel).
1253 * Normally, we suck up the answer via /proc. On Android, not all processes are
1254 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
1255 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
1256 */
1257static unsigned int get_last_valid_cap()
1258{
1259	unsigned int last_valid_cap = 0;
1260	if (is_android()) {
1261		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
1262		     ++last_valid_cap);
1263
1264		/* |last_valid_cap| will be the first failing value. */
1265		if (last_valid_cap > 0) {
1266			last_valid_cap--;
1267		}
1268	} else {
1269		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
1270		FILE *fp = fopen(cap_file, "re");
		if (!fp)
			pdie("fopen(%s)", cap_file);
1271		if (fscanf(fp, "%u", &last_valid_cap) != 1)
1272			pdie("fscanf(%s)", cap_file);
1273		fclose(fp);
1274	}
1275	return last_valid_cap;
1276}
1277
1278static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
1279{
1280	const uint64_t one = 1;
1281	unsigned int i;
1282	for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) {
1283		if (keep_mask & (one << i))
1284			continue;
1285		if (prctl(PR_CAPBSET_DROP, i))
1286			pdie("could not drop capability from bounding set");
1287	}
1288}
1289
1290void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
1291{
1292	if (!j->flags.use_caps)
1293		return;
1294
1295	cap_t caps = cap_get_proc();
1296	cap_value_t flag[1];
1297	const uint64_t one = 1;
1298	unsigned int i;
1299	if (!caps)
1300		die("can't get process caps");
1301	if (cap_clear_flag(caps, CAP_INHERITABLE))
1302		die("can't clear inheritable caps");
1303	if (cap_clear_flag(caps, CAP_EFFECTIVE))
1304		die("can't clear effective caps");
1305	if (cap_clear_flag(caps, CAP_PERMITTED))
1306		die("can't clear permitted caps");
1307	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
1308		/* Keep CAP_SETPCAP for dropping bounding set bits. */
1309		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
1310			continue;
1311		flag[0] = i;
1312		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
1313			die("can't add effective cap");
1314		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
1315			die("can't add permitted cap");
1316		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
1317			die("can't add inheritable cap");
1318	}
1319	if (cap_set_proc(caps))
1320		die("can't apply initial cleaned capset");
1321
1322	/*
1323	 * Instead of dropping bounding set first, do it here in case
1324	 * the caller had a more permissive bounding set which could
1325	 * have been used above to raise a capability that wasn't already
1326	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1327	 */
1328	drop_capbset(j->caps, last_valid_cap);
1329
1330	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
1331	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
1332		flag[0] = CAP_SETPCAP;
1333		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1334			die("can't clear effective cap");
1335		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1336			die("can't clear permitted cap");
1337		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1338			die("can't clear inheritable cap");
1339	}
1340
1341	if (cap_set_proc(caps))
1342		die("can't apply final cleaned capset");
1343
1344	cap_free(caps);
1345}
1346
1347void set_seccomp_filter(const struct minijail *j)
1348{
1349	/*
1350	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1351	 * in the kernel source tree for an explanation of the parameters.
1352	 */
1353	if (j->flags.no_new_privs) {
1354		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1355			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1356	}
1357
1358	/*
1359	 * Code running with ASan
1360	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1361	 * will make system calls not included in the syscall filter policy,
1362	 * which will likely crash the program. Skip setting seccomp filter in
1363	 * that case.
1364	 * 'running_with_asan()' has no inputs and is completely defined at
1365	 * build time, so this cannot be used by an attacker to skip setting
1366	 * seccomp filter.
1367	 */
1368	if (j->flags.seccomp_filter && running_with_asan()) {
1369		warn("running with ASan, not setting seccomp filter");
1370		return;
1371	}
1372
1373	/*
1374	 * If we're logging seccomp filter failures,
1375	 * install the SIGSYS handler first.
1376	 */
1377	if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1378		if (install_sigsys_handler())
1379			pdie("install SIGSYS handler");
1380		warn("logging seccomp filter failures");
1381	}
1382
1383	/*
1384	 * Install the syscall filter.
1385	 */
1386	if (j->flags.seccomp_filter) {
1387		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1388			  j->filter_prog)) {
1389			if ((errno == EINVAL) && can_softfail()) {
1390				warn("seccomp not supported");
1391				return;
1392			}
1393			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
1394		}
1395	}
1396}
1397
1398void API minijail_enter(const struct minijail *j)
1399{
1400	/*
1401	 * If we're dropping caps, get the last valid cap from /proc now,
1402	 * since /proc can be unmounted before drop_caps() is called.
1403	 */
1404	unsigned int last_valid_cap = 0;
1405	if (j->flags.capbset_drop || j->flags.use_caps)
1406		last_valid_cap = get_last_valid_cap();
1407
1408	if (j->flags.pids)
1409		die("tried to enter a pid-namespaced jail;"
1410		    " try minijail_run()?");
1411
1412	if (j->flags.usergroups && !j->user)
1413		die("usergroup inheritance without username");
1414
1415	/*
1416	 * We can't recover from failures if we've dropped privileges partially,
1417	 * so we don't even try. If any of our operations fail, we abort() the
1418	 * entire process.
1419	 */
1420	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1421		pdie("setns(CLONE_NEWNS)");
1422
1423	if (j->flags.vfs) {
1424		if (unshare(CLONE_NEWNS))
1425			pdie("unshare(vfs)");
1426		/*
1427		 * Unless asked not to, remount all filesystems as private.
1428		 * If they are shared, new bind mounts will creep out of our
1429		 * namespace.
1430		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1431		 */
1432		if (!j->flags.skip_remount_private) {
1433			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1434				pdie("mount(/, private)");
1435		}
1436	}
1437
1438	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1439		pdie("unshare(ipc)");
1440	}
1441
1442	if (j->flags.enter_net) {
1443		if (setns(j->netns_fd, CLONE_NEWNET))
1444			pdie("setns(CLONE_NEWNET)");
1445	} else if (j->flags.net && unshare(CLONE_NEWNET)) {
1446		pdie("unshare(net)");
1447	}
1448
1449	if (j->flags.chroot && enter_chroot(j))
1450		pdie("chroot");
1451
1452	if (j->flags.pivot_root && enter_pivot_root(j))
1453		pdie("pivot_root");
1454
1455	if (j->flags.mount_tmp && mount_tmp())
1456		pdie("mount_tmp");
1457
1458	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
1459		pdie("remount");
1460
1461	/*
1462	 * If we're only dropping capabilities from the bounding set, but not
1463	 * from the thread's (permitted|inheritable|effective) sets, do it now.
1464	 */
1465	if (j->flags.capbset_drop) {
1466		drop_capbset(j->cap_bset, last_valid_cap);
1467	}
1468
1469	if (j->flags.use_caps) {
1470		/*
1471		 * POSIX capabilities are a bit tricky. If we drop our
1472		 * capability to change uids, our attempt to use setuid()
1473		 * below will fail. Hang on to root caps across setuid(), then
1474		 * lock securebits.
1475		 */
1476		if (prctl(PR_SET_KEEPCAPS, 1))
1477			pdie("prctl(PR_SET_KEEPCAPS)");
1478
1479		/*
1480		 * Kernels 4.3+ define a new securebit
1481		 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS
1482		 * and SECURE_ALL_LOCKS masks from newer kernel headers will
1483		 * return EPERM on older kernels. Detect this, and retry with
1484		 * the right mask for older (2.6.26-4.2) kernels.
1485		 */
1486		int securebits_ret = prctl(PR_SET_SECUREBITS,
1487					   SECURE_ALL_BITS | SECURE_ALL_LOCKS);
1488		if (securebits_ret < 0) {
1489			if (errno == EPERM) {
1490				/* Possibly running on kernel < 4.3. */
1491				securebits_ret = prctl(
1492				    PR_SET_SECUREBITS,
1493				    OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS);
1494			}
1495		}
1496		if (securebits_ret < 0)
1497			pdie("prctl(PR_SET_SECUREBITS)");
1498	}
1499
1500	if (j->flags.no_new_privs) {
1501		/*
1502		 * If we're setting no_new_privs, we can drop privileges
1503		 * before setting seccomp filter. This way filter policies
1504		 * don't need to allow privilege-dropping syscalls.
1505		 */
1506		drop_ugid(j);
1507		drop_caps(j, last_valid_cap);
1508		set_seccomp_filter(j);
1509	} else {
1510		/*
1511		 * If we're not setting no_new_privs,
1512		 * we need to set seccomp filter *before* dropping privileges.
1513		 * WARNING: this means that filter policies *must* allow
1514		 * setgroups()/setresgid()/setresuid() for dropping root and
1515		 * capget()/capset()/prctl() for dropping caps.
1516		 */
1517		set_seccomp_filter(j);
1518		drop_ugid(j);
1519		drop_caps(j, last_valid_cap);
1520	}
1521
1522	/*
1523	 * Select the specified alternate syscall table.  The table must not
1524	 * block prctl(2) if we're using seccomp as well.
1525	 */
1526	if (j->flags.alt_syscall) {
1527		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1528			pdie("prctl(PR_ALT_SYSCALL)");
1529	}
1530
1531	/*
1532	 * seccomp has to come last since it cuts off all the other
1533	 * privilege-dropping syscalls :)
1534	 */
1535	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1536		if ((errno == EINVAL) && can_softfail()) {
1537			warn("seccomp not supported");
1538			return;
1539		}
1540		pdie("prctl(PR_SET_SECCOMP)");
1541	}
1542}
1543
1544/* TODO(wad) will visibility affect this variable? */
1545static int init_exitstatus = 0;
1546
1547void init_term(int __attribute__ ((unused)) sig)
1548{
1549	_exit(init_exitstatus);
1550}
1551
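/*
 * init: Minimal 'init' for the new pid namespace. Reaps children until none
 * remain, then exits with the root child's status (or MINIJAIL_ERR_INIT if
 * it did not exit normally).
 */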
1552int init(pid_t rootpid)
1553{
1554	pid_t pid;
1555	int status;
1556	/* so that we exit with the right status */
1557	signal(SIGTERM, init_term);
1558	/* TODO(wad) self jail with seccomp_filters here. */
1559	while ((pid = wait(&status)) > 0) {
1560		/*
1561		 * This loop will only end when either there are no processes
1562		 * left inside our pid namespace or we get a signal.
1563		 */
1564		if (pid == rootpid)
1565			init_exitstatus = status;
1566	}
1567	if (!WIFEXITED(init_exitstatus))
1568		_exit(MINIJAIL_ERR_INIT);
1569	_exit(WEXITSTATUS(init_exitstatus));
1570}
1571
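/*
 * Reads a [size][marshalled minijail] pair, as written by minijail_to_fd(),
 * from |fd| and unmarshals it into |j|.
 */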
1572int API minijail_from_fd(int fd, struct minijail *j)
1573{
1574	size_t sz = 0;
1575	size_t bytes = read(fd, &sz, sizeof(sz));
1576	char *buf;
1577	int r;
1578	if (sizeof(sz) != bytes)
1579		return -EINVAL;
1580	if (sz > USHRT_MAX)	/* arbitrary sanity check */
1581		return -E2BIG;
1582	buf = malloc(sz);
1583	if (!buf)
1584		return -ENOMEM;
1585	bytes = read(fd, buf, sz);
1586	if (bytes != sz) {
1587		free(buf);
1588		return -EINVAL;
1589	}
1590	r = minijail_unmarshal(j, buf, sz);
1591	free(buf);
1592	return r;
1593}
1594
1595int API minijail_to_fd(struct minijail *j, int fd)
1596{
1597	char *buf;
1598	size_t sz = minijail_size(j);
1599	ssize_t written;
1600	int r;
1601
1602	if (!sz)
1603		return -EINVAL;
1604	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
1605	r = minijail_marshal(j, buf, sz);
1606	if (r) {
1607		free(buf);
1608		return r;
1609	}
1610	/* Sends [size][minijail]. */
1611	written = write(fd, &sz, sizeof(sz));
1612	if (written != sizeof(sz)) {
1613		free(buf);
1614		return -EFAULT;
1615	}
1616	written = write(fd, buf, sz);
1617	if (written < 0 || (size_t) written != sz) {
1618		free(buf);
1619		return -EFAULT;
1620	}
1621	free(buf);
1622	return 0;
1623}
1624
1625int setup_preload(void)
1626{
1627#if defined(__ANDROID__)
1628	/* Don't use LD_PRELOAD on Brillo. */
1629	return 0;
1630#else
1631	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1632	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1633	if (!newenv)
1634		return -ENOMEM;
1635
1636	/* Only insert a separating space if we have something to separate... */
1637	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1638		PRELOADPATH);
1639
1640	/* setenv() makes a copy of the string we give it. */
1641	setenv(kLdPreloadEnvVar, newenv, 1);
1642	free(newenv);
1643	return 0;
1644#endif
1645}
1646
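/*
 * Creates the pipe used to pass the marshalled minijail to the LD_PRELOADed
 * child, and publishes the read end's fd number in the environment
 * (kFdEnvVar) so the preloaded library can find it.
 */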
1647int setup_pipe(int fds[2])
1648{
1649	int r = pipe(fds);
1650	char fd_buf[11];
1651	if (r)
1652		return r;
1653	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1654	if (r <= 0)
1655		return -EINVAL;
1656	setenv(kFdEnvVar, fd_buf, 1);
1657	return 0;
1658}
1659
1660int setup_pipe_end(int fds[2], size_t index)
1661{
1662	if (index > 1)
1663		return -1;
1664
1665	close(fds[1 - index]);
1666	return fds[index];
1667}
1668
1669int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
1670{
1671	if (index > 1)
1672		return -1;
1673
1674	close(fds[1 - index]);
1675	/* dup2(2) the corresponding end of the pipe into |fd|. */
1676	return dup2(fds[index], fd);
1677}
1678
1679int minijail_run_internal(struct minijail *j, const char *filename,
1680			  char *const argv[], pid_t *pchild_pid,
1681			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1682			  int use_preload);
1683
1684int API minijail_run(struct minijail *j, const char *filename,
1685		     char *const argv[])
1686{
1687	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1688				     true);
1689}
1690
1691int API minijail_run_pid(struct minijail *j, const char *filename,
1692			 char *const argv[], pid_t *pchild_pid)
1693{
1694	return minijail_run_internal(j, filename, argv, pchild_pid,
1695				     NULL, NULL, NULL, true);
1696}
1697
1698int API minijail_run_pipe(struct minijail *j, const char *filename,
1699			  char *const argv[], int *pstdin_fd)
1700{
1701	return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1702				     NULL, NULL, true);
1703}
1704
1705int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1706			       char *const argv[], pid_t *pchild_pid,
1707			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1708{
1709	return minijail_run_internal(j, filename, argv, pchild_pid,
1710				     pstdin_fd, pstdout_fd, pstderr_fd, true);
1711}
1712
1713int API minijail_run_no_preload(struct minijail *j, const char *filename,
1714				char *const argv[])
1715{
1716	return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1717				     false);
1718}
1719
1720int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1721					  const char *filename,
1722					  char *const argv[],
1723					  pid_t *pchild_pid,
1724					  int *pstdin_fd, int *pstdout_fd,
1725					  int *pstderr_fd)
1726{
1727	return minijail_run_internal(j, filename, argv, pchild_pid,
1728				     pstdin_fd, pstdout_fd, pstderr_fd, false);
1729}
1730
1731int minijail_run_internal(struct minijail *j, const char *filename,
1732			  char *const argv[], pid_t *pchild_pid,
1733			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1734			  int use_preload)
1735{
1736	char *oldenv, *oldenv_copy = NULL;
1737	pid_t child_pid;
1738	int pipe_fds[2];
1739	int stdin_fds[2];
1740	int stdout_fds[2];
1741	int stderr_fds[2];
1742	int child_sync_pipe_fds[2];
1743	int sync_child = 0;
1744	int ret;
1745	/* We need to remember this across the minijail_preexec() call. */
1746	int pid_namespace = j->flags.pids;
1747	int do_init = j->flags.do_init;
1748
1749	if (use_preload) {
1750		oldenv = getenv(kLdPreloadEnvVar);
1751		if (oldenv) {
1752			oldenv_copy = strdup(oldenv);
1753			if (!oldenv_copy)
1754				return -ENOMEM;
1755		}
1756
1757		if (setup_preload())
1758			return -EFAULT;
1759	}
1760
1761	if (!use_preload) {
1762		if (j->flags.use_caps)
1763			die("capabilities are not supported without "
1764			    "LD_PRELOAD");
1765	}
1766
1767	/*
1768	 * Make the process group ID of this process equal to its PID, so that
1769	 * both the Minijail process and the jailed process can be killed
1770	 * together.
1771	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1772	 * Don't fail on EPERM, since setpgid(0, 0) can only fail with EPERM when
1773	 */
1774	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1775		if (errno != EPERM) {
1776			pdie("setpgid(0, 0)");
1777		}
1778	}
1779
1780	if (use_preload) {
1781		/*
1782		 * Before we fork(2) and execve(2) the child process, we need
1783		 * to open a pipe(2) to send the minijail configuration over.
1784		 */
1785		if (setup_pipe(pipe_fds))
1786			return -EFAULT;
1787	}
1788
1789	/*
1790	 * If we want to write to the child process' standard input,
1791	 * create the pipe(2) now.
1792	 */
1793	if (pstdin_fd) {
1794		if (pipe(stdin_fds))
1795			return -EFAULT;
1796	}
1797
1798	/*
1799	 * If we want to read from the child process' standard output,
1800	 * create the pipe(2) now.
1801	 */
1802	if (pstdout_fd) {
1803		if (pipe(stdout_fds))
1804			return -EFAULT;
1805	}
1806
1807	/*
1808	 * If we want to read from the child process' standard error,
1809	 * create the pipe(2) now.
1810	 */
1811	if (pstderr_fd) {
1812		if (pipe(stderr_fds))
1813			return -EFAULT;
1814	}
1815
1816	/*
1817	 * If we want to set up a new uid/gid mapping in the user namespace,
1818	 * or if we need to add the child process to cgroups, create the pipe(2)
1819	 * to sync between parent and child.
1820	 */
1821	if (j->flags.userns || j->flags.cgroups) {
1822		sync_child = 1;
1823		if (pipe(child_sync_pipe_fds))
1824			return -EFAULT;
1825	}
1826
1827	/*
1828	 * Use sys_clone() if and only if we're creating a pid namespace.
1829	 *
1830	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1831	 *
1832	 * In multithreaded programs, there are a bunch of locks inside libc,
1833	 * some of which may be held by other threads at the time that we call
1834	 * minijail_run_pid(). If we call fork(), glibc does its level best to
1835	 * ensure that we hold all of these locks before it calls clone()
1836	 * internally and drop them after clone() returns, but when we call
1837	 * sys_clone(2) directly, all that gets bypassed and we end up with a
1838	 * child address space where some of libc's important locks are held by
1839	 * other threads (which did not get cloned, and hence will never release
1840	 * those locks). This is okay so long as we call exec() immediately
1841	 * after, but a bunch of seemingly-innocent libc functions like setenv()
1842	 * take locks.
1843	 *
1844	 * Hence, only call sys_clone() if we need to, in order to get at pid
1845	 * namespacing. If we follow this path, the child's address space might
1846	 * have broken locks; you may only call functions that do not acquire
1847	 * any locks.
1848	 *
1849	 * Unfortunately, fork() acquires every lock it can get its hands on, as
1850	 * previously detailed, so this function is highly likely to deadlock
1851	 * later on (see "deadlock here") if we're multithreaded.
1852	 *
1853	 * We might hack around this by having the clone()d child (init of the
1854	 * pid namespace) return directly, rather than leaving the clone()d
1855	 * process hanging around to be init for the new namespace (and having
1856	 * its fork()ed child return in turn), but that process would be
1857	 * crippled with its libc locks potentially broken. We might try
1858	 * fork()ing in the parent before we clone() to ensure that we own all
1859	 * the locks, but then we have to have the forked child hanging around
1860	 * consuming resources (and possibly having file descriptors / shared
1861	 * memory regions / etc attached). We'd need to keep the child around to
1862	 * avoid having its children get reparented to init.
1863	 *
1864	 * TODO(ellyjones): figure out if the "forked child hanging around"
1865	 * problem is fixable or not. It would be nice if we worked in this
1866	 * case.
1867	 */
1868	if (pid_namespace) {
1869		int clone_flags = CLONE_NEWPID | SIGCHLD;
1870		if (j->flags.userns)
1871			clone_flags |= CLONE_NEWUSER;
1872		child_pid = syscall(SYS_clone, clone_flags, NULL);
1873	} else {
1874		child_pid = fork();
1875	}
1876
1877	if (child_pid < 0) {
1878		if (use_preload) {
1879			free(oldenv_copy);
1880		}
1881		die("failed to fork child");
1882	}
1883
1884	if (child_pid) {
1885		if (use_preload) {
1886			/* Restore parent's LD_PRELOAD. */
1887			if (oldenv_copy) {
1888				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1889				free(oldenv_copy);
1890			} else {
1891				unsetenv(kLdPreloadEnvVar);
1892			}
1893			unsetenv(kFdEnvVar);
1894		}
1895
1896		j->initpid = child_pid;
1897
1898		if (j->flags.pid_file)
1899			write_pid_file(j);
1900
1901		if (j->flags.cgroups)
1902			add_to_cgroups(j);
1903
1904		if (j->flags.userns)
1905			write_ugid_mappings(j);
1906
1907		if (sync_child)
1908			parent_setup_complete(child_sync_pipe_fds);
1909
1910		if (use_preload) {
1911			/* Send marshalled minijail. */
1912			close(pipe_fds[0]);	/* read endpoint */
1913			ret = minijail_to_fd(j, pipe_fds[1]);
1914			close(pipe_fds[1]);	/* write endpoint */
1915			if (ret) {
1916				kill(j->initpid, SIGKILL);
1917				die("failed to send marshalled minijail");
1918			}
1919		}
1920
1921		if (pchild_pid)
1922			*pchild_pid = child_pid;
1923
1924		/*
1925		 * If we want to write to the child process' standard input,
1926		 * set up the write end of the pipe.
1927		 */
1928		if (pstdin_fd)
1929			*pstdin_fd = setup_pipe_end(stdin_fds,
1930						    1 /* write end */);
1931
1932		/*
1933		 * If we want to read from the child process' standard output,
1934		 * set up the read end of the pipe.
1935		 */
1936		if (pstdout_fd)
1937			*pstdout_fd = setup_pipe_end(stdout_fds,
1938						     0 /* read end */);
1939
1940		/*
1941		 * If we want to read from the child process' standard error,
1942		 * set up the read end of the pipe.
1943		 */
1944		if (pstderr_fd)
1945			*pstderr_fd = setup_pipe_end(stderr_fds,
1946						     0 /* read end */);
1947
1948		return 0;
1949	}
1950	free(oldenv_copy);
1951
1952	if (j->flags.reset_signal_mask) {
1953		sigset_t signal_mask;
1954		if (sigemptyset(&signal_mask) != 0)
1955			pdie("sigemptyset failed");
1956		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1957			pdie("sigprocmask failed");
1958	}
1959
1960	if (sync_child)
1961		wait_for_parent_setup(child_sync_pipe_fds);
1962
1963	if (j->flags.userns)
1964		enter_user_namespace(j);
1965
1966	/*
1967	 * If we want to write to the jailed process' standard input,
1968	 * set up the read end of the pipe.
1969	 */
1970	if (pstdin_fd) {
1971		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1972					    STDIN_FILENO) < 0)
1973			die("failed to set up stdin pipe");
1974	}
1975
1976	/*
1977	 * If we want to read from the jailed process' standard output,
1978	 * set up the write end of the pipe.
1979	 */
1980	if (pstdout_fd) {
1981		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1982					    STDOUT_FILENO) < 0)
1983			die("failed to set up stdout pipe");
1984	}
1985
1986	/*
1987	 * If we want to read from the jailed process' standard error,
1988	 * set up the write end of the pipe.
1989	 */
1990	if (pstderr_fd) {
1991		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1992					    STDERR_FILENO) < 0)
1993			die("failed to set up stderr pipe");
1994	}
1995
1996	/* If running an init program, let it decide when/how to mount /proc. */
1997	if (pid_namespace && !do_init)
1998		j->flags.remount_proc_ro = 0;
1999
2000	if (use_preload) {
2001		/* Strip out flags that cannot be inherited across execve(2). */
2002		minijail_preexec(j);
2003	} else {
2004		j->flags.pids = 0;
2005	}
2006	/* Jail this process, then execve() the target. */
2007	minijail_enter(j);
2008
2009	if (pid_namespace && do_init) {
2010		/*
2011		 * pid namespace: this process will become init inside the new
2012		 * namespace. We don't want all programs we might exec to have
2013		 * to know how to be init. Normally (do_init == 1) we fork off
2014		 * a child to actually run the program. If |do_init == 0|, we
2015		 * let the program keep pid 1 and be init.
2016		 *
2017		 * If we're multithreaded, we'll probably deadlock here. See
2018		 * WARNING above.
2019		 */
2020		child_pid = fork();
2021		if (child_pid < 0)
2022			_exit(child_pid);
2023		else if (child_pid > 0)
2024			init(child_pid);	/* never returns */
2025	}
2026
2027	/*
2028	 * If we aren't pid-namespaced, or the jailed program asked to be init:
2029	 *   calling process
2030	 *   -> execve()-ing process
2031	 * If we are:
2032	 *   calling process
2033	 *   -> init()-ing process
2034	 *      -> execve()-ing process
2035	 */
2036	_exit(execve(filename, argv, environ));
2037}
2038
2039int API minijail_kill(struct minijail *j)
2040{
2041	int st;
2042	if (kill(j->initpid, SIGTERM))
2043		return -errno;
2044	if (waitpid(j->initpid, &st, 0) < 0)
2045		return -errno;
2046	return st;
2047}
2048
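/*
 * Waits for the jailed process and maps its status onto an exit code:
 * WEXITSTATUS() for normal exits, MINIJAIL_ERR_JAIL if it was killed by
 * SIGSYS (i.e. a seccomp violation), or 128 + signum for other signals.
 */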
2049int API minijail_wait(struct minijail *j)
2050{
2051	int st;
2052	if (waitpid(j->initpid, &st, 0) < 0)
2053		return -errno;
2054
2055	if (!WIFEXITED(st)) {
2056		int error_status = st;
2057		if (WIFSIGNALED(st)) {
2058			int signum = WTERMSIG(st);
2059			warn("child process %d received signal %d",
2060			     j->initpid, signum);
2061			/*
2062			 * We return MINIJAIL_ERR_JAIL if the process received
2063			 * SIGSYS, which happens when a syscall is blocked by
2064			 * seccomp filters.
2065			 * If not, we do what bash(1) does:
2066			 * $? = 128 + signum
2067			 */
2068			if (signum == SIGSYS) {
2069				error_status = MINIJAIL_ERR_JAIL;
2070			} else {
2071				error_status = 128 + signum;
2072			}
2073		}
2074		return error_status;
2075	}
2076
2077	int exit_status = WEXITSTATUS(st);
2078	if (exit_status != 0)
2079		info("child process %d exited with status %d",
2080		     j->initpid, exit_status);
2081
2082	return exit_status;
2083}
2084
2085void API minijail_destroy(struct minijail *j)
2086{
2087	size_t i;
2088
2089	if (j->flags.seccomp_filter && j->filter_prog) {
2090		free(j->filter_prog->filter);
2091		free(j->filter_prog);
2092	}
2093	while (j->mounts_head) {
2094		struct mountpoint *m = j->mounts_head;
2095		j->mounts_head = j->mounts_head->next;
2096		free(m->type);
2097		free(m->dest);
2098		free(m->src);
2099		free(m);
2100	}
2101	j->mounts_tail = NULL;
2102	if (j->user)
2103		free(j->user);
2104	if (j->suppl_gid_list)
2105		free(j->suppl_gid_list);
2106	if (j->chrootdir)
2107		free(j->chrootdir);
2108	if (j->alt_syscall_table)
2109		free(j->alt_syscall_table);
2110	for (i = 0; i < j->cgroup_count; ++i)
2111		free(j->cgroups[i]);
2112	free(j);
2113}
2114