libminijail.c revision 1d697933d1f5c07c0cbad6a79118e67e6e043881
1/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _GNU_SOURCE 8#include <ctype.h> 9#include <errno.h> 10#include <grp.h> 11#include <inttypes.h> 12#include <limits.h> 13#include <linux/capability.h> 14#include <linux/securebits.h> 15#include <pwd.h> 16#include <sched.h> 17#include <signal.h> 18#include <stdarg.h> 19#include <stdio.h> 20#include <stdlib.h> 21#include <string.h> 22#include <syscall.h> 23#include <sys/capability.h> 24#include <sys/mount.h> 25#include <sys/param.h> 26#include <sys/prctl.h> 27#include <sys/wait.h> 28#include <syslog.h> 29#include <unistd.h> 30 31#include "libminijail.h" 32#include "libsyscalls.h" 33#include "libminijail-private.h" 34 35/* Until these are reliably available in linux/prctl.h */ 36#ifndef PR_SET_SECCOMP_FILTER 37# define PR_SECCOMP_FILTER_SYSCALL 0 38# define PR_SECCOMP_FILTER_EVENT 1 39# define PR_GET_SECCOMP_FILTER 35 40# define PR_SET_SECCOMP_FILTER 36 41# define PR_CLEAR_SECCOMP_FILTER 37 42#endif 43 44#define die(_msg, ...) do { \ 45 syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \ 46 abort(); \ 47} while (0) 48 49#define pdie(_msg, ...) \ 50 die(_msg ": %s", ## __VA_ARGS__, strerror(errno)) 51 52#define warn(_msg, ...) \ 53 syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__) 54 55struct seccomp_filter { 56 int nr; 57 char *filter; 58 struct seccomp_filter *next, *prev; 59}; 60 61struct binding { 62 char *src; 63 char *dest; 64 int writeable; 65 struct binding *next; 66}; 67 68struct minijail { 69 struct { 70 int uid:1; 71 int gid:1; 72 int caps:1; 73 int vfs:1; 74 int pids:1; 75 int seccomp:1; 76 int readonly:1; 77 int usergroups:1; 78 int ptrace:1; 79 int seccomp_filter:1; 80 int chroot:1; 81 } flags; 82 uid_t uid; 83 gid_t gid; 84 gid_t usergid; 85 char *user; 86 uint64_t caps; 87 pid_t initpid; 88 int filter_count; 89 int binding_count; 90 char *chrootdir; 91 struct seccomp_filter *filters; 92 struct binding *bindings_head; 93 struct binding *bindings_tail; 94}; 95 96struct minijail *minijail_new(void) 97{ 98 return calloc(1, sizeof(struct minijail)); 99} 100 101void minijail_change_uid(struct minijail *j, uid_t uid) 102{ 103 if (uid == 0) 104 die("useless change to uid 0"); 105 j->uid = uid; 106 j->flags.uid = 1; 107} 108 109void minijail_change_gid(struct minijail *j, gid_t gid) 110{ 111 if (gid == 0) 112 die("useless change to gid 0"); 113 j->gid = gid; 114 j->flags.gid = 1; 115} 116 117int minijail_change_user(struct minijail *j, const char *user) 118{ 119 char *buf = NULL; 120 struct passwd pw; 121 struct passwd *ppw = NULL; 122 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 123 if (sz == -1) 124 sz = 65536; /* your guess is as good as mine... */ 125 126 /* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 127 * the maximum needed size of the buffer, so we don't have to search. 128 */ 129 buf = malloc(sz); 130 if (!buf) 131 return -ENOMEM; 132 getpwnam_r(user, &pw, buf, sz, &ppw); 133 free(buf); 134 if (!ppw) 135 return -errno; 136 minijail_change_uid(j, ppw->pw_uid); 137 j->user = strdup(user); 138 if (!j->user) 139 return -ENOMEM; 140 j->usergid = ppw->pw_gid; 141 return 0; 142} 143 144int minijail_change_group(struct minijail *j, const char *group) 145{ 146 char *buf = NULL; 147 struct group gr; 148 struct group *pgr = NULL; 149 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 150 if (sz == -1) 151 sz = 65536; /* and mine is as good as yours, really */ 152 153 /* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 154 * the maximum needed size of the buffer, so we don't have to search. 155 */ 156 buf = malloc(sz); 157 if (!buf) 158 return -ENOMEM; 159 getgrnam_r(group, &gr, buf, sz, &pgr); 160 free(buf); 161 if (!pgr) 162 return -errno; 163 minijail_change_gid(j, pgr->gr_gid); 164 return 0; 165} 166 167void minijail_use_seccomp(struct minijail *j) 168{ 169 j->flags.seccomp = 1; 170} 171 172void minijail_use_seccomp_filter(struct minijail *j) 173{ 174 j->flags.seccomp_filter = 1; 175} 176 177void minijail_use_caps(struct minijail *j, uint64_t capmask) 178{ 179 j->caps = capmask; 180 j->flags.caps = 1; 181} 182 183void minijail_namespace_vfs(struct minijail *j) 184{ 185 j->flags.vfs = 1; 186} 187 188void minijail_namespace_pids(struct minijail *j) 189{ 190 j->flags.pids = 1; 191} 192 193void minijail_remount_readonly(struct minijail *j) 194{ 195 j->flags.vfs = 1; 196 j->flags.readonly = 1; 197} 198 199void minijail_inherit_usergroups(struct minijail *j) 200{ 201 j->flags.usergroups = 1; 202} 203 204void minijail_disable_ptrace(struct minijail *j) 205{ 206 j->flags.ptrace = 1; 207} 208 209int minijail_enter_chroot(struct minijail *j, const char *dir) { 210 if (j->chrootdir) 211 return -EINVAL; 212 j->chrootdir = strdup(dir); 213 if (!j->chrootdir) 214 return -ENOMEM; 215 j->flags.chroot = 1; 216 return 0; 217} 218 219int minijail_bind(struct minijail *j, const char *src, const char *dest, 220 int writeable) { 221 struct binding *b; 222 223 if (*dest != '/') 224 return -EINVAL; 225 b = calloc(1, sizeof(*b)); 226 if (!b) 227 return -ENOMEM; 228 b->dest = strdup(dest); 229 if (!b->dest) 230 goto error; 231 b->src = strdup(src); 232 if (!b->src) 233 goto error; 234 b->writeable = writeable; 235 236 syslog(LOG_INFO, "libminijail: bind %s -> %s", src, dest); 237 238 /* Force vfs namespacing so the bind mounts don't leak out into the 239 * containing vfs namespace. 240 */ 241 minijail_namespace_vfs(j); 242 243 if (j->bindings_tail) 244 j->bindings_tail->next = b; 245 else 246 j->bindings_head = b; 247 j->bindings_tail = b; 248 j->binding_count++; 249 250 return 0; 251 252error: 253 free(b->src); 254 free(b->dest); 255 free(b); 256 return -ENOMEM; 257} 258 259int minijail_add_seccomp_filter(struct minijail *j, int nr, const char *filter) 260{ 261 struct seccomp_filter *sf; 262 if (!filter || nr < 0) 263 return -EINVAL; 264 265 sf = malloc(sizeof(*sf)); 266 if (!sf) 267 return -ENOMEM; 268 sf->nr = nr; 269 sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE); 270 if (!sf->filter) { 271 free(sf); 272 return -ENOMEM; 273 } 274 275 j->filter_count++; 276 277 if (!j->filters) { 278 j->filters = sf; 279 sf->next = sf; 280 sf->prev = sf; 281 return 0; 282 } 283 sf->next = j->filters; 284 sf->prev = j->filters->prev; 285 sf->prev->next = sf; 286 j->filters->prev = sf; 287 return 0; 288} 289 290int minijail_lookup_syscall(const char *name) 291{ 292 const struct syscall_entry *entry = syscall_table; 293 for (; entry->name && entry->nr >= 0; ++entry) 294 if (!strcmp(entry->name, name)) 295 return entry->nr; 296 return -1; 297} 298 299static char *strip(char *s) 300{ 301 char *end; 302 while (*s && isblank(*s)) 303 s++; 304 end = s + strlen(s) - 1; 305 while (*end && (isblank(*end) || *end == '\n')) 306 end--; 307 *(end + 1) = '\0'; 308 return s; 309} 310 311void minijail_parse_seccomp_filters(struct minijail *j, const char *path) 312{ 313 FILE *file = fopen(path, "r"); 314 char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE]; 315 int count = 0; 316 if (!file) 317 pdie("failed to open seccomp filters file"); 318 319 /* Format is simple: 320 * syscall_name<COLON><FILTER STRING>[\n|EOF] 321 * #...comment... 322 * <empty line? 323 */ 324 while (fgets(line, sizeof(line), file)) { 325 char *filter = line; 326 char *name = strsep(&filter, ":"); 327 char *name_end = NULL; 328 int nr = -1; 329 count++; 330 331 /* Allow comment lines */ 332 if (*name == '#') 333 continue; 334 335 name = strip(name); 336 337 if (!filter) { 338 if (strlen(name)) 339 die("invalid filter on line %d", count); 340 /* Allow empty lines */ 341 continue; 342 } 343 344 filter = strip(filter); 345 346 /* Take direct syscall numbers */ 347 nr = strtol(name, &name_end, 0); 348 /* Or fail-over to using names */ 349 if (*name_end != '\0') 350 nr = minijail_lookup_syscall(name); 351 if (nr < 0) 352 die("syscall '%s' unknown", name); 353 354 if (minijail_add_seccomp_filter(j, nr, filter)) 355 pdie("failed to add filter for syscall '%s'", name); 356 } 357 fclose(file); 358} 359 360struct marshal_state { 361 size_t available; 362 size_t total; 363 char *buf; 364}; 365 366static void marshal_state_init(struct marshal_state *state, 367 char *buf, size_t available) 368{ 369 state->available = available; 370 state->buf = buf; 371 state->total = 0; 372} 373 374static void marshal_append(struct marshal_state *state, 375 char *src, size_t length) 376{ 377 size_t copy_len = MIN(state->available, length); 378 379 /* Up to |available| will be written. */ 380 if (copy_len) { 381 memcpy(state->buf, src, copy_len); 382 state->buf += copy_len; 383 state->available -= copy_len; 384 } 385 /* |total| will contain the expected length. */ 386 state->total += length; 387} 388 389static void minijail_marshal_helper(struct marshal_state *state, 390 const struct minijail *j) 391{ 392 struct binding *b = NULL; 393 marshal_append(state, (char *)j, sizeof(*j)); 394 if (j->user) 395 marshal_append(state, j->user, strlen(j->user) + 1); 396 if (j->chrootdir) 397 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 398 if (j->flags.seccomp_filter && j->filters) { 399 struct seccomp_filter *f = j->filters; 400 do { 401 marshal_append(state, (char *)&f->nr, sizeof(f->nr)); 402 marshal_append(state, f->filter, strlen(f->filter) + 1); 403 f = f->next; 404 } while (f != j->filters); 405 } 406 for (b = j->bindings_head; b; b = b->next) { 407 marshal_append(state, b->src, strlen(b->src) + 1); 408 marshal_append(state, b->dest, strlen(b->dest) + 1); 409 marshal_append(state, (char *)&b->writeable, sizeof(b->writeable)); 410 } 411} 412 413size_t minijail_size(const struct minijail *j) 414{ 415 struct marshal_state state; 416 marshal_state_init(&state, NULL, 0); 417 minijail_marshal_helper(&state, j); 418 return state.total; 419} 420 421int minijail_marshal(const struct minijail *j, char *buf, size_t available) 422{ 423 struct marshal_state state; 424 marshal_state_init(&state, buf, available); 425 minijail_marshal_helper(&state, j); 426 return (state.total > available); 427} 428 429/* consumebytes: consumes @length bytes from a buffer @buf of length @buflength 430 * @length Number of bytes to consume 431 * @buf Buffer to consume from 432 * @buflength Size of @buf 433 * 434 * Returns a pointer to the base of the bytes, or NULL for errors. 435 */ 436static void *consumebytes(size_t length, char **buf, size_t *buflength) { 437 char *p = *buf; 438 if (length > *buflength) 439 return NULL; 440 *buf += length; 441 *buflength -= length; 442 return p; 443} 444 445/* consumestr: consumes a C string from a buffer @buf of length @length 446 * @buf Buffer to consume 447 * @length Length of buffer 448 * 449 * Returns a pointer to the base of the string, or NULL for errors. 450 */ 451static char *consumestr(char **buf, size_t *buflength) { 452 size_t len = strnlen(*buf, *buflength); 453 if (len == *buflength) 454 /* There's no null-terminator */ 455 return NULL; 456 return consumebytes(len + 1, buf, buflength); 457} 458 459int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 460{ 461 int i; 462 int count; 463 if (length < sizeof(*j)) 464 return -EINVAL; 465 memcpy((void *)j, serialized, sizeof(*j)); 466 serialized += sizeof(*j); 467 length -= sizeof(*j); 468 469 if (j->user) { /* stale pointer */ 470 char *user = consumestr(&serialized, &length); 471 if (!user) 472 return -EINVAL; 473 j->user = strdup(user); 474 } 475 476 if (j->flags.seccomp_filter && j->filter_count) { 477 count = j->filter_count; 478 /* Let add_seccomp_filter recompute the value. */ 479 j->filter_count = 0; 480 j->filters = NULL; /* Don't follow the stale pointer. */ 481 for (; count > 0; --count) { 482 int *nr = (int *)consumebytes(sizeof(*nr), &serialized, 483 &length); 484 char *filter; 485 if (!nr) 486 return -EINVAL; 487 filter = consumestr(&serialized, &length); 488 if (!filter) 489 return -EINVAL; 490 if (minijail_add_seccomp_filter(j, *nr, filter)) 491 return -EINVAL; 492 } 493 } 494 495 count = j->binding_count; 496 j->bindings_head = NULL; 497 j->bindings_tail = NULL; 498 j->binding_count = 0; 499 for (i = 0; i < count; ++i) { 500 int *writeable; 501 const char *dest; 502 const char *src = consumestr(&serialized, &length); 503 if (!src) 504 return -EINVAL; 505 dest = consumestr(&serialized, &length); 506 if (!dest) 507 return -EINVAL; 508 writeable = consumebytes(sizeof(*writeable), &serialized, &length); 509 if (!writeable) 510 return -EINVAL; 511 if (minijail_bind(j, src, dest, *writeable)) 512 return -EINVAL; 513 } 514 515 return 0; 516} 517 518void minijail_preenter(struct minijail *j) 519{ 520 /* Strip out options which are minijail_run() only. */ 521 j->flags.vfs = 0; 522 j->flags.readonly = 0; 523 j->flags.pids = 0; 524} 525 526void minijail_preexec(struct minijail *j) 527{ 528 int vfs = j->flags.vfs; 529 int readonly = j->flags.readonly; 530 if (j->user) 531 free(j->user); 532 j->user = NULL; 533 memset(&j->flags, 0, sizeof(j->flags)); 534 /* Now restore anything we meant to keep. */ 535 j->flags.vfs = vfs; 536 j->flags.readonly = readonly; 537 /* Note, pidns will already have been used before this call. */ 538} 539 540/* bind_one: Applies bindings from @b for @j, recursing as needed. 541 * @j Minijail these bindings are for 542 * @b Head of list of bindings 543 * 544 * Returns 0 for success. 545 */ 546static int bind_one(const struct minijail *j, struct binding *b) { 547 int ret = 0; 548 char *dest = NULL; 549 int mflags = MS_BIND | (b->writeable ? 0 : MS_RDONLY); 550 if (ret) 551 return ret; 552 /* dest has a leading "/" */ 553 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0) 554 return -ENOMEM; 555 ret = mount(b->src, dest, NULL, mflags, NULL); 556 if (ret) 557 pdie("bind: %s -> %s", b->src, dest); 558 free(dest); 559 if (b->next) 560 return bind_one(j, b->next); 561 return ret; 562} 563 564static int enter_chroot(const struct minijail *j) { 565 int ret; 566 if (j->bindings_head && (ret = bind_one(j, j->bindings_head))) 567 return ret; 568 569 if (chroot(j->chrootdir)) 570 return -errno; 571 572 if (chdir("/")) 573 return -errno; 574 575 return 0; 576} 577 578 579 580static int remount_readonly(void) 581{ 582 const char *kProcPath = "/proc"; 583 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 584 /* Right now, we're holding a reference to our parent's old mount of 585 * /proc in our namespace, which means using MS_REMOUNT here would 586 * mutate our parent's mount as well, even though we're in a VFS 587 * namespace (!). Instead, remove their mount from our namespace 588 * and make our own. 589 */ 590 if (umount(kProcPath)) 591 return -errno; 592 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 593 return -errno; 594 return 0; 595} 596 597static void drop_caps(const struct minijail *j) 598{ 599 cap_t caps = cap_get_proc(); 600 cap_value_t raise_flag[1]; 601 unsigned int i; 602 if (!caps) 603 die("can't get process caps"); 604 if (cap_clear_flag(caps, CAP_INHERITABLE)) 605 die("can't clear inheritable caps"); 606 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 607 die("can't clear effective caps"); 608 if (cap_clear_flag(caps, CAP_PERMITTED)) 609 die("can't clear permitted caps"); 610 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 611 if (i != CAP_SETPCAP && !(j->caps & (1 << i))) 612 continue; 613 raise_flag[0] = i; 614 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET)) 615 die("can't add effective cap"); 616 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET)) 617 die("can't add permitted cap"); 618 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET)) 619 die("can't add inheritable cap"); 620 } 621 if (cap_set_proc(caps)) 622 die("can't apply cleaned capset"); 623 cap_free(caps); 624 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 625 if (j->caps & (1 << i)) 626 continue; 627 if (prctl(PR_CAPBSET_DROP, i)) 628 pdie("prctl(PR_CAPBSET_DROP)"); 629 } 630} 631 632static int setup_seccomp_filters(const struct minijail *j) 633{ 634 const struct seccomp_filter *sf = j->filters; 635 int ret = 0; 636 int broaden = 0; 637 638 /* No filters installed isn't necessarily an error. */ 639 if (!sf) 640 return ret; 641 642 do { 643 errno = 0; 644 ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL, 645 sf->nr, broaden ? "1" : sf->filter); 646 if (ret) { 647 switch (errno) { 648 case ENOSYS: 649 /* TODO(wad) make this a config option */ 650 if (broaden) 651 die("CONFIG_SECCOMP_FILTER is not" 652 "supported by your kernel"); 653 warn("missing CONFIG_FTRACE_SYSCALLS; relaxing" 654 "the filter for %d", sf->nr); 655 broaden = 1; 656 continue; 657 case E2BIG: 658 warn("seccomp filter too long: %d", sf->nr); 659 pdie("filter too long"); 660 case ENOSPC: 661 pdie("too many seccomp filters"); 662 case EPERM: 663 warn("syscall filter disallowed for %d", 664 sf->nr); 665 pdie("failed to install seccomp filter"); 666 case EINVAL: 667 warn("seccomp filter or call method is" 668 " invalid. %d:'%s'", sf->nr, sf->filter); 669 default: 670 pdie("failed to install seccomp filter"); 671 } 672 } 673 sf = sf->next; 674 broaden = 0; 675 } while (sf != j->filters); 676 return ret; 677} 678 679void minijail_enter(const struct minijail *j) 680{ 681 if (j->flags.pids) 682 die("tried to enter a pid-namespaced jail;" 683 "try minijail_run()?"); 684 685 if (j->flags.seccomp_filter && setup_seccomp_filters(j)) 686 pdie("failed to configure seccomp filters"); 687 688 if (j->flags.usergroups && !j->user) 689 die("usergroup inheritance without username"); 690 691 /* We can't recover from failures if we've dropped privileges partially, 692 * so we don't even try. If any of our operations fail, we abort() the 693 * entire process. 694 */ 695 if (j->flags.vfs && unshare(CLONE_NEWNS)) 696 pdie("unshare"); 697 698 if (j->flags.chroot && enter_chroot(j)) 699 pdie("chroot"); 700 701 if (j->flags.readonly && remount_readonly()) 702 pdie("remount"); 703 704 if (j->flags.caps) { 705 /* POSIX capabilities are a bit tricky. If we drop our 706 * capability to change uids, our attempt to use setuid() 707 * below will fail. Hang on to root caps across setuid(), then 708 * lock securebits. 709 */ 710 if (prctl(PR_SET_KEEPCAPS, 1)) 711 pdie("prctl(PR_SET_KEEPCAPS)"); 712 if (prctl 713 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 714 pdie("prctl(PR_SET_SECUREBITS)"); 715 } 716 717 if (j->flags.usergroups) { 718 if (initgroups(j->user, j->usergid)) 719 pdie("initgroups"); 720 } else { 721 /* Only attempt to clear supplemental groups if we are changing 722 * users. */ 723 if ((j->uid || j->gid) && setgroups(0, NULL)) 724 pdie("setgroups"); 725 } 726 727 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 728 pdie("setresgid"); 729 730 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 731 pdie("setresuid"); 732 733 if (j->flags.caps) 734 drop_caps(j); 735 736 /* seccomp has to come last since it cuts off all the other 737 * privilege-dropping syscalls :) 738 */ 739 if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13)) 740 pdie("prctl(PR_SET_SECCOMP, 13)"); 741 742 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) 743 pdie("prctl(PR_SET_SECCOMP)"); 744} 745 746static int init_exitstatus = 0; 747 748static void init_term(int __attribute__ ((unused)) sig) 749{ 750 _exit(init_exitstatus); 751} 752 753static int init(pid_t rootpid) 754{ 755 pid_t pid; 756 int status; 757 /* so that we exit with the right status */ 758 signal(SIGTERM, init_term); 759 /* TODO(wad) self jail with seccomp_filters here. */ 760 while ((pid = wait(&status)) > 0) { 761 /* This loop will only end when either there are no processes 762 * left inside our pid namespace or we get a signal. 763 */ 764 if (pid == rootpid) 765 init_exitstatus = status; 766 } 767 if (!WIFEXITED(init_exitstatus)) 768 _exit(MINIJAIL_ERR_INIT); 769 _exit(WEXITSTATUS(init_exitstatus)); 770} 771 772int minijail_from_fd(int fd, struct minijail *j) 773{ 774 size_t sz = 0; 775 size_t bytes = read(fd, &sz, sizeof(sz)); 776 char *buf; 777 int r; 778 if (sizeof(sz) != bytes) 779 return -EINVAL; 780 if (sz > USHRT_MAX) /* Arbitrary sanity check */ 781 return -E2BIG; 782 buf = malloc(sz); 783 if (!buf) 784 return -ENOMEM; 785 bytes = read(fd, buf, sz); 786 if (bytes != sz) { 787 free(buf); 788 return -EINVAL; 789 } 790 r = minijail_unmarshal(j, buf, sz); 791 free(buf); 792 return r; 793} 794 795int minijail_to_fd(struct minijail *j, int fd) 796{ 797 char *buf; 798 size_t sz = minijail_size(j); 799 ssize_t written; 800 int r; 801 802 if (!sz) 803 return -EINVAL; 804 buf = malloc(sz); 805 r = minijail_marshal(j, buf, sz); 806 if (r) { 807 free(buf); 808 return r; 809 } 810 /* Sends [size][minijail]. */ 811 written = write(fd, &sz, sizeof(sz)); 812 if (written != sizeof(sz)) { 813 free(buf); 814 return -EFAULT; 815 } 816 written = write(fd, buf, sz); 817 if (written < 0 || (size_t) written != sz) { 818 free(buf); 819 return -EFAULT; 820 } 821 free(buf); 822 return 0; 823} 824 825static int setup_preload(void) 826{ 827 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 828 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 829 if (!newenv) 830 return -ENOMEM; 831 832 /* Only insert a separating space if we have something to separate... */ 833 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 834 PRELOADPATH); 835 836 /* setenv() makes a copy of the string we give it */ 837 setenv(kLdPreloadEnvVar, newenv, 1); 838 free(newenv); 839 return 0; 840} 841 842static int setup_pipe(int fds[2]) 843{ 844 int r = pipe(fds); 845 char fd_buf[11]; 846 if (r) 847 return r; 848 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 849 if (r <= 0) 850 return -EINVAL; 851 setenv(kFdEnvVar, fd_buf, 1); 852 return 0; 853} 854 855int minijail_run(struct minijail *j, const char *filename, char *const argv[]) 856{ 857 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0; 858 char *oldenv, *oldenv_copy = NULL; 859 pid_t child_pid; 860 int pipe_fds[2]; 861 int ret; 862 863 oldenv = getenv(kLdPreloadEnvVar); 864 if (oldenv) { 865 oldenv_copy = strdup(oldenv); 866 if (!oldenv_copy) 867 return -ENOMEM; 868 } 869 870 if (setup_preload()) 871 return -EFAULT; 872 873 /* Before we fork(2) and execve(2) the child process, we need to open 874 * a pipe(2) to send the minijail configuration over. 875 */ 876 if (setup_pipe(pipe_fds)) 877 return -EFAULT; 878 879 child_pid = syscall(SYS_clone, pidns | SIGCHLD, NULL); 880 if (child_pid < 0) { 881 free(oldenv_copy); 882 return child_pid; 883 } 884 885 if (child_pid) { 886 /* Restore parent's LD_PRELOAD. */ 887 if (oldenv_copy) { 888 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 889 free(oldenv_copy); 890 } else { 891 unsetenv(kLdPreloadEnvVar); 892 } 893 unsetenv(kFdEnvVar); 894 j->initpid = child_pid; 895 close(pipe_fds[0]); /* read endpoint */ 896 ret = minijail_to_fd(j, pipe_fds[1]); 897 close(pipe_fds[1]); /* write endpoint */ 898 if (ret) { 899 kill(j->initpid, SIGKILL); 900 die("failed to send marshalled minijail"); 901 } 902 return 0; 903 } 904 free(oldenv_copy); 905 906 /* Drop everything that cannot be inherited across execve. */ 907 minijail_preexec(j); 908 /* Jail this process and its descendants... */ 909 minijail_enter(j); 910 911 if (pidns) { 912 /* pid namespace: this process will become init inside the new 913 * namespace, so fork off a child to actually run the program 914 * (we don't want all programs we might exec to have to know 915 * how to be init). 916 */ 917 child_pid = fork(); 918 if (child_pid < 0) 919 _exit(child_pid); 920 else if (child_pid > 0) 921 init(child_pid); /* never returns */ 922 } 923 924 /* If we aren't pid-namespaced: 925 * calling process 926 * -> execve()-ing process 927 * If we are: 928 * calling process 929 * -> init()-ing process 930 * -> execve()-ing process 931 */ 932 _exit(execve(filename, argv, environ)); 933} 934 935int minijail_kill(struct minijail *j) 936{ 937 int st; 938 if (kill(j->initpid, SIGTERM)) 939 return -errno; 940 if (waitpid(j->initpid, &st, 0) < 0) 941 return -errno; 942 return st; 943} 944 945int minijail_wait(struct minijail *j) 946{ 947 int st; 948 if (waitpid(j->initpid, &st, 0) < 0) 949 return -errno; 950 if (!WIFEXITED(st)) 951 return MINIJAIL_ERR_JAIL; 952 return WEXITSTATUS(st); 953} 954 955void minijail_destroy(struct minijail *j) 956{ 957 struct seccomp_filter *f = j->filters; 958 /* Unlink the tail and head */ 959 if (f) 960 f->prev->next = NULL; 961 while (f) { 962 struct seccomp_filter *next = f->next; 963 free(f->filter); 964 free(f); 965 f = next; 966 } 967 while (j->bindings_head) { 968 struct binding *b = j->bindings_head; 969 j->bindings_head = j->bindings_head->next; 970 free(b->dest); 971 free(b->src); 972 free(b); 973 } 974 j->bindings_tail = NULL; 975 if (j->user) 976 free(j->user); 977 free(j); 978} 979