libminijail.c revision 1d697933d1f5c07c0cbad6a79118e67e6e043881
1/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _BSD_SOURCE
7#define _GNU_SOURCE
8#include <ctype.h>
9#include <errno.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <limits.h>
13#include <linux/capability.h>
14#include <linux/securebits.h>
15#include <pwd.h>
16#include <sched.h>
17#include <signal.h>
18#include <stdarg.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <syscall.h>
23#include <sys/capability.h>
24#include <sys/mount.h>
25#include <sys/param.h>
26#include <sys/prctl.h>
27#include <sys/wait.h>
28#include <syslog.h>
29#include <unistd.h>
30
31#include "libminijail.h"
32#include "libsyscalls.h"
33#include "libminijail-private.h"
34
/* Until these are reliably available in linux/prctl.h.
 *
 * Fallback values for the seccomp-filter prctl(2) options, defined only when
 * the headers included above did not already provide PR_SET_SECCOMP_FILTER.
 */
#ifndef PR_SET_SECCOMP_FILTER
# define PR_SECCOMP_FILTER_SYSCALL 0
# define PR_SECCOMP_FILTER_EVENT 1
# define PR_GET_SECCOMP_FILTER 35
# define PR_SET_SECCOMP_FILTER 36
# define PR_CLEAR_SECCOMP_FILTER 37
#endif
43
/* die: logs a printf-style message at LOG_ERR, prefixed with "libminijail: ",
 * and then abort()s the process.
 * @_msg Format string; must be a string literal because it is concatenated
 *       with the prefix at compile time
 * @...  Arguments for the format string (may be omitted)
 */
#define die(_msg, ...) do { \
	syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
	abort(); \
} while (0)
48
/* pdie: like die(), but appends ": " followed by strerror(errno) to the
 * message, so it should be used right after a call that set errno.
 */
#define pdie(_msg, ...) \
	die(_msg ": %s", ## __VA_ARGS__, strerror(errno))
51
/* warn: logs a printf-style message at LOG_WARNING, prefixed with
 * "libminijail: ". Unlike die(), execution continues afterwards.
 */
#define warn(_msg, ...) \
	syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
54
/* struct seccomp_filter: one per-syscall filter rule.
 *
 * Rules are kept on a circular doubly-linked list rooted at
 * minijail.filters (see minijail_add_seccomp_filter()).
 */
struct seccomp_filter {
	int nr;		/* syscall number the filter applies to */
	char *filter;	/* heap-allocated copy of the filter string */
	struct seccomp_filter *next, *prev;	/* circular list links */
};
60
/* struct binding: one bind mount requested through minijail_bind().
 *
 * Bindings form a singly-linked list from minijail.bindings_head to
 * minijail.bindings_tail, in the order they were added.
 */
struct binding {
	char *src;	/* heap-allocated source path */
	char *dest;	/* heap-allocated destination path; starts with '/' */
	int writeable;	/* zero requests a read-only mount */
	struct binding *next;	/* next binding, or NULL at the tail */
};
67
/* struct minijail: description of the restrictions to apply to a process.
 *
 * The struct is copied byte-for-byte when marshalled, so every pointer
 * member is stale on the receiving side until minijail_unmarshal() rebuilds
 * it from the data that follows the struct in the serialized stream.
 */
struct minijail {
	/* One bit per requested restriction. The fields are unsigned so that
	 * storing 1 reads back as 1; a signed one-bit field can only
	 * represent 0 and -1.
	 */
	struct {
		unsigned int uid:1;		/* switch to |uid| */
		unsigned int gid:1;		/* switch to |gid| */
		unsigned int caps:1;		/* restrict capabilities to |caps| */
		unsigned int vfs:1;		/* unshare the mount namespace */
		unsigned int pids:1;		/* run in a new pid namespace */
		unsigned int seccomp:1;		/* enter seccomp mode 1 */
		unsigned int readonly:1;	/* remount /proc read-only */
		unsigned int usergroups:1;	/* initgroups() for |user| */
		unsigned int ptrace:1;		/* set by minijail_disable_ptrace() */
		unsigned int seccomp_filter:1;	/* install |filters| */
		unsigned int chroot:1;		/* chroot into |chrootdir| */
	} flags;
	uid_t uid;
	gid_t gid;
	gid_t usergid;		/* primary gid of |user|, from the passwd entry */
	char *user;		/* heap-allocated user name, or NULL */
	uint64_t caps;		/* bitmask of capability numbers to keep */
	pid_t initpid;		/* pid of the child created by minijail_run() */
	int filter_count;	/* number of entries on |filters| */
	int binding_count;	/* number of entries on the bindings list */
	char *chrootdir;	/* heap-allocated chroot path, or NULL */
	struct seccomp_filter *filters;	/* circular list head, or NULL */
	struct binding *bindings_head;
	struct binding *bindings_tail;
};
95
96struct minijail *minijail_new(void)
97{
98	return calloc(1, sizeof(struct minijail));
99}
100
101void minijail_change_uid(struct minijail *j, uid_t uid)
102{
103	if (uid == 0)
104		die("useless change to uid 0");
105	j->uid = uid;
106	j->flags.uid = 1;
107}
108
109void minijail_change_gid(struct minijail *j, gid_t gid)
110{
111	if (gid == 0)
112		die("useless change to gid 0");
113	j->gid = gid;
114	j->flags.gid = 1;
115}
116
117int minijail_change_user(struct minijail *j, const char *user)
118{
119	char *buf = NULL;
120	struct passwd pw;
121	struct passwd *ppw = NULL;
122	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
123	if (sz == -1)
124		sz = 65536;	/* your guess is as good as mine... */
125
126	/* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
127	 * the maximum needed size of the buffer, so we don't have to search.
128	 */
129	buf = malloc(sz);
130	if (!buf)
131		return -ENOMEM;
132	getpwnam_r(user, &pw, buf, sz, &ppw);
133	free(buf);
134	if (!ppw)
135		return -errno;
136	minijail_change_uid(j, ppw->pw_uid);
137	j->user = strdup(user);
138	if (!j->user)
139		return -ENOMEM;
140	j->usergid = ppw->pw_gid;
141	return 0;
142}
143
/* minijail_change_group: looks up @group and requests a switch to its gid.
 * @j     Minijail to modify
 * @group Group name to look up in the group database
 *
 * Returns 0 on success, -ENOMEM on allocation failure, the negated error
 * number reported by getgrnam_r(), or -ENOENT when no such group exists.
 */
int minijail_change_group(struct minijail *j, const char *group)
{
	char *buf = NULL;
	struct group gr;
	struct group *pgr = NULL;
	int err;
	long sz = sysconf(_SC_GETGR_R_SIZE_MAX);
	if (sz == -1)
		sz = 65536;	/* and mine is as good as yours, really */

	/* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
	 * the maximum needed size of the buffer, so we don't have to search.
	 */
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	/* getgrnam_r() reports failure through its return value and leaves
	 * errno alone; a missing entry is a zero return with |pgr| NULL.
	 */
	err = getgrnam_r(group, &gr, buf, sz, &pgr);
	/* Only gr_gid, stored in |gr| itself, is used after this point. */
	free(buf);
	if (!pgr)
		return err ? -err : -ENOENT;
	minijail_change_gid(j, pgr->gr_gid);
	return 0;
}
166
/* minijail_use_seccomp: requests seccomp mode 1 for the jail.
 * @j Minijail to modify
 *
 * minijail_enter() acts on this flag by calling prctl(PR_SET_SECCOMP, 1).
 */
void minijail_use_seccomp(struct minijail *j)
{
	j->flags.seccomp = 1;
}
171
/* minijail_use_seccomp_filter: requests that the registered seccomp filters
 * be installed when the jail is entered.
 * @j Minijail to modify
 *
 * The flag also controls whether the filters are marshalled.
 */
void minijail_use_seccomp_filter(struct minijail *j)
{
	j->flags.seccomp_filter = 1;
}
176
177void minijail_use_caps(struct minijail *j, uint64_t capmask)
178{
179	j->caps = capmask;
180	j->flags.caps = 1;
181}
182
/* minijail_namespace_vfs: requests a private mount (VFS) namespace.
 * @j Minijail to modify
 *
 * minijail_enter() acts on this flag by calling unshare(CLONE_NEWNS).
 */
void minijail_namespace_vfs(struct minijail *j)
{
	j->flags.vfs = 1;
}
187
/* minijail_namespace_pids: requests a new pid namespace.
 * @j Minijail to modify
 *
 * Only minijail_run() can honour this (it adds CLONE_NEWPID to its clone
 * call); minijail_enter() aborts if the flag is still set.
 */
void minijail_namespace_pids(struct minijail *j)
{
	j->flags.pids = 1;
}
192
193void minijail_remount_readonly(struct minijail *j)
194{
195	j->flags.vfs = 1;
196	j->flags.readonly = 1;
197}
198
/* minijail_inherit_usergroups: requests that the jailed process take on the
 * supplementary groups of the user chosen with minijail_change_user().
 * @j Minijail to modify
 *
 * minijail_enter() aborts if this flag is set without a user name.
 */
void minijail_inherit_usergroups(struct minijail *j)
{
	j->flags.usergroups = 1;
}
203
/* minijail_disable_ptrace: sets the ptrace flag on the jail.
 * @j Minijail to modify
 *
 * NOTE(review): nothing visible in this file reads flags.ptrace; presumably
 * it is consumed elsewhere or not implemented yet -- confirm.
 */
void minijail_disable_ptrace(struct minijail *j)
{
	j->flags.ptrace = 1;
}
208
209int minijail_enter_chroot(struct minijail *j, const char *dir) {
210	if (j->chrootdir)
211		return -EINVAL;
212	j->chrootdir = strdup(dir);
213	if (!j->chrootdir)
214		return -ENOMEM;
215	j->flags.chroot = 1;
216	return 0;
217}
218
219int minijail_bind(struct minijail *j, const char *src, const char *dest,
220                  int writeable) {
221	struct binding *b;
222
223	if (*dest != '/')
224		return -EINVAL;
225	b = calloc(1, sizeof(*b));
226	if (!b)
227		return -ENOMEM;
228	b->dest = strdup(dest);
229	if (!b->dest)
230		goto error;
231	b->src = strdup(src);
232	if (!b->src)
233		goto error;
234	b->writeable = writeable;
235
236	syslog(LOG_INFO, "libminijail: bind %s -> %s", src, dest);
237
238	/* Force vfs namespacing so the bind mounts don't leak out into the
239	 * containing vfs namespace.
240	 */
241	minijail_namespace_vfs(j);
242
243	if (j->bindings_tail)
244		j->bindings_tail->next = b;
245	else
246		j->bindings_head = b;
247	j->bindings_tail = b;
248	j->binding_count++;
249
250	return 0;
251
252error:
253	free(b->src);
254	free(b->dest);
255	free(b);
256	return -ENOMEM;
257}
258
259int minijail_add_seccomp_filter(struct minijail *j, int nr, const char *filter)
260{
261	struct seccomp_filter *sf;
262	if (!filter || nr < 0)
263		return -EINVAL;
264
265	sf = malloc(sizeof(*sf));
266	if (!sf)
267		return -ENOMEM;
268	sf->nr = nr;
269	sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
270	if (!sf->filter) {
271		free(sf);
272		return -ENOMEM;
273	}
274
275	j->filter_count++;
276
277	if (!j->filters) {
278		j->filters = sf;
279		sf->next = sf;
280		sf->prev = sf;
281		return 0;
282	}
283	sf->next = j->filters;
284	sf->prev = j->filters->prev;
285	sf->prev->next = sf;
286	j->filters->prev = sf;
287	return 0;
288}
289
290int minijail_lookup_syscall(const char *name)
291{
292	const struct syscall_entry *entry = syscall_table;
293	for (; entry->name && entry->nr >= 0; ++entry)
294		if (!strcmp(entry->name, name))
295			return entry->nr;
296	return -1;
297}
298
/* strip: trims blanks from both ends of @s, and newlines from its end.
 * @s String to trim; modified in place
 *
 * Leading blanks are skipped by advancing the returned pointer; trailing
 * blanks and '\n' characters are cut off by writing a terminator. The scan
 * from the end never moves before the first kept character, so empty and
 * all-whitespace strings are handled without touching memory outside @s.
 *
 * Returns a pointer into @s at the first non-blank character.
 */
static char *strip(char *s)
{
	char *end;

	/* <ctype.h> functions need an unsigned char value. */
	while (*s && isblank((unsigned char)*s))
		s++;

	/* |end| is one past the last kept character. */
	end = s + strlen(s);
	while (end > s &&
	       (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
		end--;
	*end = '\0';
	return s;
}
310
311void minijail_parse_seccomp_filters(struct minijail *j, const char *path)
312{
313	FILE *file = fopen(path, "r");
314	char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
315	int count = 0;
316	if (!file)
317		pdie("failed to open seccomp filters file");
318
319	/* Format is simple:
320	 * syscall_name<COLON><FILTER STRING>[\n|EOF]
321	 * #...comment...
322	 * <empty line?
323	 */
324	while (fgets(line, sizeof(line), file)) {
325		char *filter = line;
326		char *name = strsep(&filter, ":");
327		char *name_end = NULL;
328		int nr = -1;
329		count++;
330
331		/* Allow comment lines */
332		if (*name == '#')
333			continue;
334
335		name = strip(name);
336
337		if (!filter) {
338			if (strlen(name))
339				die("invalid filter on line %d", count);
340			/* Allow empty lines */
341			continue;
342		}
343
344		filter = strip(filter);
345
346		/* Take direct syscall numbers */
347		nr = strtol(name, &name_end, 0);
348		/* Or fail-over to using names */
349		if (*name_end != '\0')
350			nr = minijail_lookup_syscall(name);
351		if (nr < 0)
352			die("syscall '%s' unknown", name);
353
354		if (minijail_add_seccomp_filter(j, nr, filter))
355			pdie("failed to add filter for syscall '%s'", name);
356	}
357	fclose(file);
358}
359
/* struct marshal_state: cursor used while serializing a minijail. */
struct marshal_state {
	size_t available;	/* bytes of room still left at |buf| */
	size_t total;		/* bytes requested so far, whether or not they fit */
	char *buf;		/* next position to write to */
};
365
366static void marshal_state_init(struct marshal_state *state,
367			       char *buf, size_t available)
368{
369	state->available = available;
370	state->buf = buf;
371	state->total = 0;
372}
373
374static void marshal_append(struct marshal_state *state,
375			   char *src, size_t length)
376{
377	size_t copy_len = MIN(state->available, length);
378
379	/* Up to |available| will be written. */
380	if (copy_len) {
381		memcpy(state->buf, src, copy_len);
382		state->buf += copy_len;
383		state->available -= copy_len;
384	}
385	/* |total| will contain the expected length. */
386	state->total += length;
387}
388
389static void minijail_marshal_helper(struct marshal_state *state,
390				    const struct minijail *j)
391{
392	struct binding *b = NULL;
393	marshal_append(state, (char *)j, sizeof(*j));
394	if (j->user)
395		marshal_append(state, j->user, strlen(j->user) + 1);
396	if (j->chrootdir)
397		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
398	if (j->flags.seccomp_filter && j->filters) {
399		struct seccomp_filter *f = j->filters;
400		do {
401			marshal_append(state, (char *)&f->nr, sizeof(f->nr));
402			marshal_append(state, f->filter, strlen(f->filter) + 1);
403			f = f->next;
404		} while (f != j->filters);
405	}
406	for (b = j->bindings_head; b; b = b->next) {
407		marshal_append(state, b->src, strlen(b->src) + 1);
408		marshal_append(state, b->dest, strlen(b->dest) + 1);
409		marshal_append(state, (char *)&b->writeable, sizeof(b->writeable));
410	}
411}
412
413size_t minijail_size(const struct minijail *j)
414{
415	struct marshal_state state;
416	marshal_state_init(&state, NULL, 0);
417	minijail_marshal_helper(&state, j);
418	return state.total;
419}
420
421int minijail_marshal(const struct minijail *j, char *buf, size_t available)
422{
423	struct marshal_state state;
424	marshal_state_init(&state, buf, available);
425	minijail_marshal_helper(&state, j);
426	return (state.total > available);
427}
428
/* consumebytes: takes @length bytes off the front of a buffer.
 * @length    Number of bytes to take
 * @buf       In/out cursor into the buffer; advanced on success
 * @buflength In/out count of bytes left at @buf; reduced on success
 *
 * Returns a pointer to the first taken byte, or NULL (leaving @buf and
 * @buflength untouched) when fewer than @length bytes remain.
 */
static void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
444
/* consumestr: takes a NUL-terminated string off the front of a buffer.
 * @buf       In/out cursor into the buffer; advanced past the terminator
 * @buflength In/out count of bytes left at @buf
 *
 * Returns a pointer to the start of the string, or NULL when no terminator
 * is found within the remaining bytes.
 */
static char *consumestr(char **buf, size_t *buflength)
{
	const size_t remaining = *buflength;
	const size_t len = strnlen(*buf, remaining);

	/* strnlen() reaching the limit means there's no null-terminator. */
	if (len >= remaining)
		return NULL;

	return consumebytes(len + 1, buf, buflength);
}
458
459int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
460{
461	int i;
462	int count;
463	if (length < sizeof(*j))
464		return -EINVAL;
465	memcpy((void *)j, serialized, sizeof(*j));
466	serialized += sizeof(*j);
467	length -= sizeof(*j);
468
469	if (j->user) {		/* stale pointer */
470		char *user = consumestr(&serialized, &length);
471		if (!user)
472			return -EINVAL;
473		j->user = strdup(user);
474	}
475
476	if (j->flags.seccomp_filter && j->filter_count) {
477		count = j->filter_count;
478		/* Let add_seccomp_filter recompute the value. */
479		j->filter_count = 0;
480		j->filters = NULL;	/* Don't follow the stale pointer. */
481		for (; count > 0; --count) {
482			int *nr = (int *)consumebytes(sizeof(*nr), &serialized,
483			                              &length);
484			char *filter;
485			if (!nr)
486				return -EINVAL;
487			filter = consumestr(&serialized, &length);
488			if (!filter)
489				return -EINVAL;
490			if (minijail_add_seccomp_filter(j, *nr, filter))
491				return -EINVAL;
492		}
493	}
494
495	count = j->binding_count;
496	j->bindings_head = NULL;
497	j->bindings_tail = NULL;
498	j->binding_count = 0;
499	for (i = 0; i < count; ++i) {
500		int *writeable;
501		const char *dest;
502		const char *src = consumestr(&serialized, &length);
503		if (!src)
504			return -EINVAL;
505		dest = consumestr(&serialized, &length);
506		if (!dest)
507			return -EINVAL;
508		writeable = consumebytes(sizeof(*writeable), &serialized, &length);
509		if (!writeable)
510			return -EINVAL;
511		if (minijail_bind(j, src, dest, *writeable))
512			return -EINVAL;
513	}
514
515	return 0;
516}
517
518void minijail_preenter(struct minijail *j)
519{
520	/* Strip out options which are minijail_run() only. */
521	j->flags.vfs = 0;
522	j->flags.readonly = 0;
523	j->flags.pids = 0;
524}
525
526void minijail_preexec(struct minijail *j)
527{
528	int vfs = j->flags.vfs;
529	int readonly = j->flags.readonly;
530	if (j->user)
531		free(j->user);
532	j->user = NULL;
533	memset(&j->flags, 0, sizeof(j->flags));
534	/* Now restore anything we meant to keep. */
535	j->flags.vfs = vfs;
536	j->flags.readonly = readonly;
537	/* Note, pidns will already have been used before this call. */
538}
539
540/* bind_one: Applies bindings from @b for @j, recursing as needed.
541 * @j Minijail these bindings are for
542 * @b Head of list of bindings
543 *
544 * Returns 0 for success.
545 */
546static int bind_one(const struct minijail *j, struct binding *b) {
547	int ret = 0;
548	char *dest = NULL;
549	int mflags = MS_BIND | (b->writeable ? 0 : MS_RDONLY);
550	if (ret)
551		return ret;
552	/* dest has a leading "/" */
553	if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
554		return -ENOMEM;
555	ret = mount(b->src, dest, NULL, mflags, NULL);
556	if (ret)
557		pdie("bind: %s -> %s", b->src, dest);
558	free(dest);
559	if (b->next)
560		return bind_one(j, b->next);
561	return ret;
562}
563
564static int enter_chroot(const struct minijail *j) {
565	int ret;
566	if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
567		return ret;
568
569	if (chroot(j->chrootdir))
570		return -errno;
571
572	if (chdir("/"))
573		return -errno;
574
575	return 0;
576}
577
578
579
/* remount_readonly: replaces /proc with a fresh read-only mount.
 *
 * Returns 0 on success, or -errno from umount() or mount().
 */
static int remount_readonly(void)
{
	static const char proc_path[] = "/proc";
	const unsigned int ro_flags =
		MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

	/* Right now, we're holding a reference to our parent's old mount of
	 * /proc in our namespace, which means using MS_REMOUNT here would
	 * mutate our parent's mount as well, even though we're in a VFS
	 * namespace (!). Instead, remove their mount from our namespace
	 * and make our own.
	 */
	if (umount(proc_path) != 0)
		return -errno;

	if (mount("", proc_path, "proc", ro_flags, "") != 0)
		return -errno;

	return 0;
}
596
597static void drop_caps(const struct minijail *j)
598{
599	cap_t caps = cap_get_proc();
600	cap_value_t raise_flag[1];
601	unsigned int i;
602	if (!caps)
603		die("can't get process caps");
604	if (cap_clear_flag(caps, CAP_INHERITABLE))
605		die("can't clear inheritable caps");
606	if (cap_clear_flag(caps, CAP_EFFECTIVE))
607		die("can't clear effective caps");
608	if (cap_clear_flag(caps, CAP_PERMITTED))
609		die("can't clear permitted caps");
610	for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
611		if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
612			continue;
613		raise_flag[0] = i;
614		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
615			die("can't add effective cap");
616		if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
617			die("can't add permitted cap");
618		if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
619			die("can't add inheritable cap");
620	}
621	if (cap_set_proc(caps))
622		die("can't apply cleaned capset");
623	cap_free(caps);
624	for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
625		if (j->caps & (1 << i))
626			continue;
627		if (prctl(PR_CAPBSET_DROP, i))
628			pdie("prctl(PR_CAPBSET_DROP)");
629	}
630}
631
632static int setup_seccomp_filters(const struct minijail *j)
633{
634	const struct seccomp_filter *sf = j->filters;
635	int ret = 0;
636	int broaden = 0;
637
638	/* No filters installed isn't necessarily an error. */
639	if (!sf)
640		return ret;
641
642	do {
643		errno = 0;
644		ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
645			    sf->nr, broaden ? "1" : sf->filter);
646		if (ret) {
647			switch (errno) {
648			case ENOSYS:
649				/* TODO(wad) make this a config option */
650				if (broaden)
651					die("CONFIG_SECCOMP_FILTER is not"
652					    "supported by your kernel");
653				warn("missing CONFIG_FTRACE_SYSCALLS; relaxing"
654				     "the filter for %d", sf->nr);
655				broaden = 1;
656				continue;
657			case E2BIG:
658				warn("seccomp filter too long: %d", sf->nr);
659				pdie("filter too long");
660			case ENOSPC:
661				pdie("too many seccomp filters");
662			case EPERM:
663				warn("syscall filter disallowed for %d",
664				     sf->nr);
665				pdie("failed to install seccomp filter");
666			case EINVAL:
667				warn("seccomp filter or call method is"
668				     " invalid. %d:'%s'", sf->nr, sf->filter);
669			default:
670				pdie("failed to install seccomp filter");
671			}
672		}
673		sf = sf->next;
674		broaden = 0;
675	} while (sf != j->filters);
676	return ret;
677}
678
679void minijail_enter(const struct minijail *j)
680{
681	if (j->flags.pids)
682		die("tried to enter a pid-namespaced jail;"
683		    "try minijail_run()?");
684
685	if (j->flags.seccomp_filter && setup_seccomp_filters(j))
686		pdie("failed to configure seccomp filters");
687
688	if (j->flags.usergroups && !j->user)
689		die("usergroup inheritance without username");
690
691	/* We can't recover from failures if we've dropped privileges partially,
692	 * so we don't even try. If any of our operations fail, we abort() the
693	 * entire process.
694	 */
695	if (j->flags.vfs && unshare(CLONE_NEWNS))
696		pdie("unshare");
697
698	if (j->flags.chroot && enter_chroot(j))
699		pdie("chroot");
700
701	if (j->flags.readonly && remount_readonly())
702		pdie("remount");
703
704	if (j->flags.caps) {
705		/* POSIX capabilities are a bit tricky. If we drop our
706		 * capability to change uids, our attempt to use setuid()
707		 * below will fail. Hang on to root caps across setuid(), then
708		 * lock securebits.
709		 */
710		if (prctl(PR_SET_KEEPCAPS, 1))
711			pdie("prctl(PR_SET_KEEPCAPS)");
712		if (prctl
713		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
714			pdie("prctl(PR_SET_SECUREBITS)");
715	}
716
717	if (j->flags.usergroups) {
718		if (initgroups(j->user, j->usergid))
719			pdie("initgroups");
720	} else {
721		/* Only attempt to clear supplemental groups if we are changing
722		 * users. */
723		if ((j->uid || j->gid) && setgroups(0, NULL))
724			pdie("setgroups");
725	}
726
727	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
728		pdie("setresgid");
729
730	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
731		pdie("setresuid");
732
733	if (j->flags.caps)
734		drop_caps(j);
735
736	/* seccomp has to come last since it cuts off all the other
737	 * privilege-dropping syscalls :)
738	 */
739	if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
740		pdie("prctl(PR_SET_SECCOMP, 13)");
741
742	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
743		pdie("prctl(PR_SET_SECCOMP)");
744}
745
/* Wait status of the jailed root process, recorded by init() and read by the
 * SIGTERM handler; volatile sig_atomic_t because it is shared with a signal
 * handler.
 */
static volatile sig_atomic_t init_exitstatus = 0;
747
/* init_term: SIGTERM handler installed by init(); terminates the process
 * immediately, passing the recorded init_exitstatus to _exit().
 * @sig Signal number (unused)
 */
static void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
752
753static int init(pid_t rootpid)
754{
755	pid_t pid;
756	int status;
757	/* so that we exit with the right status */
758	signal(SIGTERM, init_term);
759	/* TODO(wad) self jail with seccomp_filters here. */
760	while ((pid = wait(&status)) > 0) {
761		/* This loop will only end when either there are no processes
762		 * left inside our pid namespace or we get a signal.
763		 */
764		if (pid == rootpid)
765			init_exitstatus = status;
766	}
767	if (!WIFEXITED(init_exitstatus))
768		_exit(MINIJAIL_ERR_INIT);
769	_exit(WEXITSTATUS(init_exitstatus));
770}
771
/* read_exactly: reads exactly @count bytes from @fd into @buf, retrying on
 * short reads and EINTR. Returns 0 on success, -1 on error or early EOF.
 */
static int read_exactly(int fd, void *buf, size_t count)
{
	char *p = buf;

	while (count > 0) {
		ssize_t n = read(fd, p, count);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (n == 0)
			return -1;	/* EOF before all bytes arrived */
		p += n;
		count -= (size_t)n;
	}
	return 0;
}

/* minijail_from_fd: reads a marshalled minijail from @fd into @j.
 * @fd File descriptor to read [size][minijail] from
 * @j  Minijail to fill in
 *
 * Returns 0 on success, -EINVAL if the data cannot be read completely or is
 * malformed, -E2BIG if the announced size exceeds USHRT_MAX, -ENOMEM on
 * allocation failure.
 */
int minijail_from_fd(int fd, struct minijail *j)
{
	size_t sz = 0;
	char *buf;
	int r;

	if (read_exactly(fd, &sz, sizeof(sz)))
		return -EINVAL;
	if (sz == 0)		/* Nothing to unmarshal */
		return -EINVAL;
	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
		return -E2BIG;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	if (read_exactly(fd, buf, sz)) {
		free(buf);
		return -EINVAL;
	}
	r = minijail_unmarshal(j, buf, sz);
	free(buf);
	return r;
}
794
/* write_fully: writes exactly @count bytes from @buf to @fd, retrying on
 * short writes and EINTR. Returns 0 on success, -1 on error.
 */
static int write_fully(int fd, const void *buf, size_t count)
{
	const char *p = buf;

	while (count > 0) {
		ssize_t n = write(fd, p, count);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (n == 0)
			return -1;	/* no progress; avoid spinning */
		p += n;
		count -= (size_t)n;
	}
	return 0;
}

/* minijail_to_fd: marshals @j and writes it to @fd as [size][minijail].
 * @j  Minijail to send
 * @fd File descriptor to write to
 *
 * Returns 0 on success, -EINVAL if the jail marshals to nothing, -ENOMEM on
 * allocation failure, the non-zero result of minijail_marshal() if
 * marshalling fails, or -EFAULT if the data cannot be written.
 */
int minijail_to_fd(struct minijail *j, int fd)
{
	char *buf;
	size_t sz = minijail_size(j);
	int r;

	if (!sz)
		return -EINVAL;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	r = minijail_marshal(j, buf, sz);
	if (r)
		goto out;
	/* Sends [size][minijail]. */
	if (write_fully(fd, &sz, sizeof(sz)) || write_fully(fd, buf, sz))
		r = -EFAULT;
out:
	free(buf);
	return r;
}
824
825static int setup_preload(void)
826{
827	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
828	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
829	if (!newenv)
830		return -ENOMEM;
831
832	/* Only insert a separating space if we have something to separate... */
833	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
834		PRELOADPATH);
835
836	/* setenv() makes a copy of the string we give it */
837	setenv(kLdPreloadEnvVar, newenv, 1);
838	free(newenv);
839	return 0;
840}
841
842static int setup_pipe(int fds[2])
843{
844	int r = pipe(fds);
845	char fd_buf[11];
846	if (r)
847		return r;
848	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
849	if (r <= 0)
850		return -EINVAL;
851	setenv(kFdEnvVar, fd_buf, 1);
852	return 0;
853}
854
855int minijail_run(struct minijail *j, const char *filename, char *const argv[])
856{
857	unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
858	char *oldenv, *oldenv_copy = NULL;
859	pid_t child_pid;
860	int pipe_fds[2];
861	int ret;
862
863	oldenv = getenv(kLdPreloadEnvVar);
864	if (oldenv) {
865		oldenv_copy = strdup(oldenv);
866		if (!oldenv_copy)
867			return -ENOMEM;
868	}
869
870	if (setup_preload())
871		return -EFAULT;
872
873	/* Before we fork(2) and execve(2) the child process, we need to open
874	 * a pipe(2) to send the minijail configuration over.
875	 */
876	if (setup_pipe(pipe_fds))
877		return -EFAULT;
878
879	child_pid = syscall(SYS_clone, pidns | SIGCHLD, NULL);
880	if (child_pid < 0) {
881		free(oldenv_copy);
882		return child_pid;
883	}
884
885	if (child_pid) {
886		/* Restore parent's LD_PRELOAD. */
887		if (oldenv_copy) {
888			setenv(kLdPreloadEnvVar, oldenv_copy, 1);
889			free(oldenv_copy);
890		} else {
891			unsetenv(kLdPreloadEnvVar);
892		}
893		unsetenv(kFdEnvVar);
894		j->initpid = child_pid;
895		close(pipe_fds[0]);	/* read endpoint */
896		ret = minijail_to_fd(j, pipe_fds[1]);
897		close(pipe_fds[1]);	/* write endpoint */
898		if (ret) {
899			kill(j->initpid, SIGKILL);
900			die("failed to send marshalled minijail");
901		}
902		return 0;
903	}
904	free(oldenv_copy);
905
906	/* Drop everything that cannot be inherited across execve. */
907	minijail_preexec(j);
908	/* Jail this process and its descendants... */
909	minijail_enter(j);
910
911	if (pidns) {
912		/* pid namespace: this process will become init inside the new
913		 * namespace, so fork off a child to actually run the program
914		 * (we don't want all programs we might exec to have to know
915		 * how to be init).
916		 */
917		child_pid = fork();
918		if (child_pid < 0)
919			_exit(child_pid);
920		else if (child_pid > 0)
921			init(child_pid);	/* never returns */
922	}
923
924	/* If we aren't pid-namespaced:
925	 *   calling process
926	 *   -> execve()-ing process
927	 * If we are:
928	 *   calling process
929	 *   -> init()-ing process
930	 *      -> execve()-ing process
931	 */
932	_exit(execve(filename, argv, environ));
933}
934
935int minijail_kill(struct minijail *j)
936{
937	int st;
938	if (kill(j->initpid, SIGTERM))
939		return -errno;
940	if (waitpid(j->initpid, &st, 0) < 0)
941		return -errno;
942	return st;
943}
944
945int minijail_wait(struct minijail *j)
946{
947	int st;
948	if (waitpid(j->initpid, &st, 0) < 0)
949		return -errno;
950	if (!WIFEXITED(st))
951		return MINIJAIL_ERR_JAIL;
952	return WEXITSTATUS(st);
953}
954
955void minijail_destroy(struct minijail *j)
956{
957	struct seccomp_filter *f = j->filters;
958	/* Unlink the tail and head */
959	if (f)
960		f->prev->next = NULL;
961	while (f) {
962		struct seccomp_filter *next = f->next;
963		free(f->filter);
964		free(f);
965		f = next;
966	}
967	while (j->bindings_head) {
968		struct binding *b = j->bindings_head;
969		j->bindings_head = j->bindings_head->next;
970		free(b->dest);
971		free(b->src);
972		free(b);
973	}
974	j->bindings_tail = NULL;
975	if (j->user)
976		free(j->user);
977	free(j);
978}
979