libminijail.c revision 2860c4693ea5f40b44e4b2eb2f0b6970ffcd7f27
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _GNU_SOURCE 8 9#include <asm/unistd.h> 10#include <ctype.h> 11#include <errno.h> 12#include <fcntl.h> 13#include <grp.h> 14#include <inttypes.h> 15#include <limits.h> 16#include <linux/capability.h> 17#include <pwd.h> 18#include <sched.h> 19#include <signal.h> 20#include <stdarg.h> 21#include <stdbool.h> 22#include <stddef.h> 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include <syscall.h> 27#include <sys/capability.h> 28#include <sys/mount.h> 29#include <sys/param.h> 30#include <sys/prctl.h> 31#include <sys/stat.h> 32#include <sys/types.h> 33#include <sys/user.h> 34#include <sys/wait.h> 35#include <unistd.h> 36 37#include "libminijail.h" 38#include "libminijail-private.h" 39 40#include "signal_handler.h" 41#include "syscall_filter.h" 42#include "util.h" 43 44#ifdef HAVE_SECUREBITS_H 45#include <linux/securebits.h> 46#else 47#define SECURE_ALL_BITS 0x15 48#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 49#endif 50 51/* Until these are reliably available in linux/prctl.h */ 52#ifndef PR_SET_SECCOMP 53# define PR_SET_SECCOMP 22 54#endif 55 56#ifndef PR_ALT_SYSCALL 57# define PR_ALT_SYSCALL 0x43724f53 58#endif 59 60/* For seccomp_filter using BPF. */ 61#ifndef PR_SET_NO_NEW_PRIVS 62# define PR_SET_NO_NEW_PRIVS 38 63#endif 64#ifndef SECCOMP_MODE_FILTER 65# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 66#endif 67 68#ifdef USE_SECCOMP_SOFTFAIL 69# define SECCOMP_SOFTFAIL 1 70#else 71# define SECCOMP_SOFTFAIL 0 72#endif 73 74struct mountpoint { 75 char *src; 76 char *dest; 77 char *type; 78 unsigned long flags; 79 struct mountpoint *next; 80}; 81 82struct minijail { 83 /* 84 * WARNING: if you add a flag here you need to make sure it's 85 * accounted for in minijail_pre{enter|exec}() below. 86 */ 87 struct { 88 int uid:1; 89 int gid:1; 90 int usergroups:1; 91 int suppl_gids:1; 92 int caps:1; 93 int vfs:1; 94 int enter_vfs:1; 95 int pids:1; 96 int ipc:1; 97 int net:1; 98 int enter_net:1; 99 int userns:1; 100 int seccomp:1; 101 int remount_proc_ro:1; 102 int no_new_privs:1; 103 int seccomp_filter:1; 104 int log_seccomp_filter:1; 105 int chroot:1; 106 int pivot_root:1; 107 int mount_tmp:1; 108 int do_init:1; 109 int pid_file:1; 110 int alt_syscall:1; 111 int reset_signal_mask:1; 112 } flags; 113 uid_t uid; 114 gid_t gid; 115 gid_t usergid; 116 char *user; 117 size_t suppl_gid_count; 118 gid_t *suppl_gid_list; 119 uint64_t caps; 120 pid_t initpid; 121 int mountns_fd; 122 int netns_fd; 123 char *chrootdir; 124 char *pid_file_path; 125 char *uidmap; 126 char *gidmap; 127 size_t filter_len; 128 struct sock_fprog *filter_prog; 129 char *alt_syscall_table; 130 struct mountpoint *mounts_head; 131 struct mountpoint *mounts_tail; 132 size_t mounts_count; 133}; 134 135/* 136 * Strip out flags meant for the parent. 137 * We keep things that are not inherited across execve(2) (e.g. capabilities), 138 * or are easier to set after execve(2) (e.g. seccomp filters). 139 */ 140void minijail_preenter(struct minijail *j) 141{ 142 j->flags.vfs = 0; 143 j->flags.enter_vfs = 0; 144 j->flags.remount_proc_ro = 0; 145 j->flags.pids = 0; 146 j->flags.do_init = 0; 147 j->flags.pid_file = 0; 148} 149 150/* 151 * Strip out flags meant for the child. 152 * We keep things that are inherited across execve(2). 153 */ 154void minijail_preexec(struct minijail *j) 155{ 156 int vfs = j->flags.vfs; 157 int enter_vfs = j->flags.enter_vfs; 158 int remount_proc_ro = j->flags.remount_proc_ro; 159 int userns = j->flags.userns; 160 if (j->user) 161 free(j->user); 162 j->user = NULL; 163 if (j->suppl_gid_list) 164 free(j->suppl_gid_list); 165 j->suppl_gid_list = NULL; 166 memset(&j->flags, 0, sizeof(j->flags)); 167 /* Now restore anything we meant to keep. */ 168 j->flags.vfs = vfs; 169 j->flags.enter_vfs = enter_vfs; 170 j->flags.remount_proc_ro = remount_proc_ro; 171 j->flags.userns = userns; 172 /* Note, |pids| will already have been used before this call. */ 173} 174 175/* Minijail API. */ 176 177struct minijail API *minijail_new(void) 178{ 179 return calloc(1, sizeof(struct minijail)); 180} 181 182void API minijail_change_uid(struct minijail *j, uid_t uid) 183{ 184 if (uid == 0) 185 die("useless change to uid 0"); 186 j->uid = uid; 187 j->flags.uid = 1; 188} 189 190void API minijail_change_gid(struct minijail *j, gid_t gid) 191{ 192 if (gid == 0) 193 die("useless change to gid 0"); 194 j->gid = gid; 195 j->flags.gid = 1; 196} 197 198int API minijail_set_supplementary_gids(struct minijail *j, size_t size, 199 const gid_t *list) 200{ 201 size_t i; 202 203 if (j->flags.usergroups) 204 die("cannot inherit *and* set supplementary groups"); 205 206 if (size == 0) 207 return -EINVAL; 208 209 /* Copy the gid_t array. */ 210 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 211 if (!j->suppl_gid_list) { 212 return -ENOMEM; 213 } 214 for (i = 0; i < size; i++) { 215 j->suppl_gid_list[i] = list[i]; 216 } 217 j->suppl_gid_count = size; 218 j->flags.suppl_gids = 1; 219 return 0; 220} 221 222int API minijail_change_user(struct minijail *j, const char *user) 223{ 224 char *buf = NULL; 225 struct passwd pw; 226 struct passwd *ppw = NULL; 227 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 228 if (sz == -1) 229 sz = 65536; /* your guess is as good as mine... */ 230 231 /* 232 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 233 * the maximum needed size of the buffer, so we don't have to search. 234 */ 235 buf = malloc(sz); 236 if (!buf) 237 return -ENOMEM; 238 getpwnam_r(user, &pw, buf, sz, &ppw); 239 /* 240 * We're safe to free the buffer here. The strings inside pw point 241 * inside buf, but we don't use any of them; this leaves the pointers 242 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded. 243 */ 244 free(buf); 245 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 246 if (!ppw) 247 return -1; 248 minijail_change_uid(j, ppw->pw_uid); 249 j->user = strdup(user); 250 if (!j->user) 251 return -ENOMEM; 252 j->usergid = ppw->pw_gid; 253 return 0; 254} 255 256int API minijail_change_group(struct minijail *j, const char *group) 257{ 258 char *buf = NULL; 259 struct group gr; 260 struct group *pgr = NULL; 261 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 262 if (sz == -1) 263 sz = 65536; /* and mine is as good as yours, really */ 264 265 /* 266 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 267 * the maximum needed size of the buffer, so we don't have to search. 268 */ 269 buf = malloc(sz); 270 if (!buf) 271 return -ENOMEM; 272 getgrnam_r(group, &gr, buf, sz, &pgr); 273 /* 274 * We're safe to free the buffer here. The strings inside gr point 275 * inside buf, but we don't use any of them; this leaves the pointers 276 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 277 */ 278 free(buf); 279 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 280 if (!pgr) 281 return -1; 282 minijail_change_gid(j, pgr->gr_gid); 283 return 0; 284} 285 286void API minijail_use_seccomp(struct minijail *j) 287{ 288 j->flags.seccomp = 1; 289} 290 291void API minijail_no_new_privs(struct minijail *j) 292{ 293 j->flags.no_new_privs = 1; 294} 295 296void API minijail_use_seccomp_filter(struct minijail *j) 297{ 298 j->flags.seccomp_filter = 1; 299} 300 301void API minijail_log_seccomp_filter_failures(struct minijail *j) 302{ 303 j->flags.log_seccomp_filter = 1; 304} 305 306void API minijail_use_caps(struct minijail *j, uint64_t capmask) 307{ 308 j->caps = capmask; 309 j->flags.caps = 1; 310} 311 312void API minijail_reset_signal_mask(struct minijail* j) { 313 j->flags.reset_signal_mask = 1; 314} 315 316void API minijail_namespace_vfs(struct minijail *j) 317{ 318 j->flags.vfs = 1; 319} 320 321void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 322{ 323 int ns_fd = open(ns_path, O_RDONLY); 324 if (ns_fd < 0) { 325 pdie("failed to open namespace '%s'", ns_path); 326 } 327 j->mountns_fd = ns_fd; 328 j->flags.enter_vfs = 1; 329} 330 331void API minijail_namespace_pids(struct minijail *j) 332{ 333 j->flags.vfs = 1; 334 j->flags.remount_proc_ro = 1; 335 j->flags.pids = 1; 336 j->flags.do_init = 1; 337} 338 339void API minijail_namespace_ipc(struct minijail *j) 340{ 341 j->flags.ipc = 1; 342} 343 344void API minijail_namespace_net(struct minijail *j) 345{ 346 j->flags.net = 1; 347} 348 349void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 350{ 351 int ns_fd = open(ns_path, O_RDONLY); 352 if (ns_fd < 0) { 353 pdie("failed to open namespace '%s'", ns_path); 354 } 355 j->netns_fd = ns_fd; 356 j->flags.enter_net = 1; 357} 358 359void API minijail_remount_proc_readonly(struct minijail *j) 360{ 361 j->flags.vfs = 1; 362 j->flags.remount_proc_ro = 1; 363} 364 365void API minijail_namespace_user(struct minijail *j) 366{ 367 j->flags.userns = 1; 368} 369 370int API minijail_uidmap(struct minijail *j, const char *uidmap) 371{ 372 j->uidmap = strdup(uidmap); 373 if (!j->uidmap) 374 return -ENOMEM; 375 char *ch; 376 for (ch = j->uidmap; *ch; ch++) { 377 if (*ch == ',') 378 *ch = '\n'; 379 } 380 return 0; 381} 382 383int API minijail_gidmap(struct minijail *j, const char *gidmap) 384{ 385 j->gidmap = strdup(gidmap); 386 if (!j->gidmap) 387 return -ENOMEM; 388 char *ch; 389 for (ch = j->gidmap; *ch; ch++) { 390 if (*ch == ',') 391 *ch = '\n'; 392 } 393 return 0; 394} 395 396void API minijail_inherit_usergroups(struct minijail *j) 397{ 398 j->flags.usergroups = 1; 399} 400 401void API minijail_run_as_init(struct minijail *j) 402{ 403 /* 404 * Since the jailed program will become 'init' in the new PID namespace, 405 * Minijail does not need to fork an 'init' process. 406 */ 407 j->flags.do_init = 0; 408} 409 410int API minijail_enter_chroot(struct minijail *j, const char *dir) 411{ 412 if (j->chrootdir) 413 return -EINVAL; 414 j->chrootdir = strdup(dir); 415 if (!j->chrootdir) 416 return -ENOMEM; 417 j->flags.chroot = 1; 418 return 0; 419} 420 421int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 422{ 423 if (j->chrootdir) 424 return -EINVAL; 425 j->chrootdir = strdup(dir); 426 if (!j->chrootdir) 427 return -ENOMEM; 428 j->flags.pivot_root = 1; 429 return 0; 430} 431 432static char *append_external_path(const char *external_path, 433 const char *path_inside_chroot) 434{ 435 char *path; 436 size_t pathlen; 437 438 /* One extra char for '/' and one for '\0', hence + 2. */ 439 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 440 path = malloc(pathlen); 441 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 442 443 return path; 444} 445 446char API *minijail_get_original_path(struct minijail *j, 447 const char *path_inside_chroot) 448{ 449 struct mountpoint *b; 450 451 b = j->mounts_head; 452 while (b) { 453 /* 454 * If |path_inside_chroot| is the exact destination of a 455 * mount, then the original path is exactly the source of 456 * the mount. 457 * for example: "-b /some/path/exe,/chroot/path/exe" 458 * mount source = /some/path/exe, mount dest = 459 * /chroot/path/exe Then when getting the original path of 460 * "/chroot/path/exe", the source of that mount, 461 * "/some/path/exe" is what should be returned. 462 */ 463 if (!strcmp(b->dest, path_inside_chroot)) 464 return strdup(b->src); 465 466 /* 467 * If |path_inside_chroot| is within the destination path of a 468 * mount, take the suffix of the chroot path relative to the 469 * mount destination path, and append it to the mount source 470 * path. 471 */ 472 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 473 const char *relative_path = 474 path_inside_chroot + strlen(b->dest); 475 return append_external_path(b->src, relative_path); 476 } 477 b = b->next; 478 } 479 480 /* If there is a chroot path, append |path_inside_chroot| to that. */ 481 if (j->chrootdir) 482 return append_external_path(j->chrootdir, path_inside_chroot); 483 484 /* No chroot, so the path outside is the same as it is inside. */ 485 return strdup(path_inside_chroot); 486} 487 488void API minijail_mount_tmp(struct minijail *j) 489{ 490 j->flags.mount_tmp = 1; 491} 492 493int API minijail_write_pid_file(struct minijail *j, const char *path) 494{ 495 j->pid_file_path = strdup(path); 496 if (!j->pid_file_path) 497 return -ENOMEM; 498 j->flags.pid_file = 1; 499 return 0; 500} 501 502int API minijail_mount(struct minijail *j, const char *src, const char *dest, 503 const char *type, unsigned long flags) 504{ 505 struct mountpoint *m; 506 507 if (*dest != '/') 508 return -EINVAL; 509 m = calloc(1, sizeof(*m)); 510 if (!m) 511 return -ENOMEM; 512 m->dest = strdup(dest); 513 if (!m->dest) 514 goto error; 515 m->src = strdup(src); 516 if (!m->src) 517 goto error; 518 m->type = strdup(type); 519 if (!m->type) 520 goto error; 521 m->flags = flags; 522 523 info("mount %s -> %s type %s", src, dest, type); 524 525 /* 526 * Force vfs namespacing so the mounts don't leak out into the 527 * containing vfs namespace. 528 */ 529 minijail_namespace_vfs(j); 530 531 if (j->mounts_tail) 532 j->mounts_tail->next = m; 533 else 534 j->mounts_head = m; 535 j->mounts_tail = m; 536 j->mounts_count++; 537 538 return 0; 539 540error: 541 free(m->src); 542 free(m->dest); 543 free(m); 544 return -ENOMEM; 545} 546 547int API minijail_bind(struct minijail *j, const char *src, const char *dest, 548 int writeable) 549{ 550 unsigned long flags = MS_BIND; 551 552 if (!writeable) 553 flags |= MS_RDONLY; 554 555 return minijail_mount(j, src, dest, "", flags); 556} 557 558void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 559{ 560 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 561 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 562 warn("not loading seccomp filter," 563 " seccomp not supported"); 564 return; 565 } 566 } 567 FILE *file = fopen(path, "r"); 568 if (!file) { 569 pdie("failed to open seccomp filter file '%s'", path); 570 } 571 572 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 573 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 574 die("failed to compile seccomp filter BPF program in '%s'", 575 path); 576 } 577 578 j->filter_len = fprog->len; 579 j->filter_prog = fprog; 580 581 fclose(file); 582} 583 584int API minijail_use_alt_syscall(struct minijail *j, const char *table) 585{ 586 j->alt_syscall_table = strdup(table); 587 if (!j->alt_syscall_table) 588 return -ENOMEM; 589 j->flags.alt_syscall = 1; 590 return 0; 591} 592 593struct marshal_state { 594 size_t available; 595 size_t total; 596 char *buf; 597}; 598 599void marshal_state_init(struct marshal_state *state, 600 char *buf, size_t available) 601{ 602 state->available = available; 603 state->buf = buf; 604 state->total = 0; 605} 606 607void marshal_append(struct marshal_state *state, 608 void *src, size_t length) 609{ 610 size_t copy_len = MIN(state->available, length); 611 612 /* Up to |available| will be written. */ 613 if (copy_len) { 614 memcpy(state->buf, src, copy_len); 615 state->buf += copy_len; 616 state->available -= copy_len; 617 } 618 /* |total| will contain the expected length. */ 619 state->total += length; 620} 621 622void minijail_marshal_helper(struct marshal_state *state, 623 const struct minijail *j) 624{ 625 struct mountpoint *m = NULL; 626 marshal_append(state, (char *)j, sizeof(*j)); 627 if (j->user) 628 marshal_append(state, j->user, strlen(j->user) + 1); 629 if (j->suppl_gid_list) { 630 marshal_append(state, j->suppl_gid_list, 631 j->suppl_gid_count * sizeof(gid_t)); 632 } 633 if (j->chrootdir) 634 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 635 if (j->alt_syscall_table) { 636 marshal_append(state, j->alt_syscall_table, 637 strlen(j->alt_syscall_table) + 1); 638 } 639 if (j->flags.seccomp_filter && j->filter_prog) { 640 struct sock_fprog *fp = j->filter_prog; 641 marshal_append(state, (char *)fp->filter, 642 fp->len * sizeof(struct sock_filter)); 643 } 644 for (m = j->mounts_head; m; m = m->next) { 645 marshal_append(state, m->src, strlen(m->src) + 1); 646 marshal_append(state, m->dest, strlen(m->dest) + 1); 647 marshal_append(state, m->type, strlen(m->type) + 1); 648 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 649 } 650} 651 652size_t API minijail_size(const struct minijail *j) 653{ 654 struct marshal_state state; 655 marshal_state_init(&state, NULL, 0); 656 minijail_marshal_helper(&state, j); 657 return state.total; 658} 659 660int minijail_marshal(const struct minijail *j, char *buf, size_t available) 661{ 662 struct marshal_state state; 663 marshal_state_init(&state, buf, available); 664 minijail_marshal_helper(&state, j); 665 return (state.total > available); 666} 667 668/* 669 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength 670 * @length Number of bytes to consume 671 * @buf Buffer to consume from 672 * @buflength Size of @buf 673 * 674 * Returns a pointer to the base of the bytes, or NULL for errors. 675 */ 676void *consumebytes(size_t length, char **buf, size_t *buflength) 677{ 678 char *p = *buf; 679 if (length > *buflength) 680 return NULL; 681 *buf += length; 682 *buflength -= length; 683 return p; 684} 685 686/* 687 * consumestr: consumes a C string from a buffer @buf of length @length 688 * @buf Buffer to consume 689 * @length Length of buffer 690 * 691 * Returns a pointer to the base of the string, or NULL for errors. 692 */ 693char *consumestr(char **buf, size_t *buflength) 694{ 695 size_t len = strnlen(*buf, *buflength); 696 if (len == *buflength) 697 /* There's no null-terminator. */ 698 return NULL; 699 return consumebytes(len + 1, buf, buflength); 700} 701 702int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 703{ 704 size_t i; 705 size_t count; 706 int ret = -EINVAL; 707 708 if (length < sizeof(*j)) 709 goto out; 710 memcpy((void *)j, serialized, sizeof(*j)); 711 serialized += sizeof(*j); 712 length -= sizeof(*j); 713 714 /* Potentially stale pointers not used as signals. */ 715 j->mounts_head = NULL; 716 j->mounts_tail = NULL; 717 j->filter_prog = NULL; 718 719 if (j->user) { /* stale pointer */ 720 char *user = consumestr(&serialized, &length); 721 if (!user) 722 goto clear_pointers; 723 j->user = strdup(user); 724 if (!j->user) 725 goto clear_pointers; 726 } 727 728 if (j->suppl_gid_list) { /* stale pointer */ 729 if (j->suppl_gid_count > NGROUPS_MAX) { 730 goto bad_gid_list; 731 } 732 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 733 void *gid_list_bytes = 734 consumebytes(gid_list_size, &serialized, &length); 735 if (!gid_list_bytes) 736 goto bad_gid_list; 737 738 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 739 if (!j->suppl_gid_list) 740 goto bad_gid_list; 741 742 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 743 } 744 745 if (j->chrootdir) { /* stale pointer */ 746 char *chrootdir = consumestr(&serialized, &length); 747 if (!chrootdir) 748 goto bad_chrootdir; 749 j->chrootdir = strdup(chrootdir); 750 if (!j->chrootdir) 751 goto bad_chrootdir; 752 } 753 754 if (j->alt_syscall_table) { /* stale pointer */ 755 char *alt_syscall_table = consumestr(&serialized, &length); 756 if (!alt_syscall_table) 757 goto bad_syscall_table; 758 j->alt_syscall_table = strdup(alt_syscall_table); 759 if (!j->alt_syscall_table) 760 goto bad_syscall_table; 761 } 762 763 if (j->flags.seccomp_filter && j->filter_len > 0) { 764 size_t ninstrs = j->filter_len; 765 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 766 ninstrs > USHRT_MAX) 767 goto bad_filters; 768 769 size_t program_len = ninstrs * sizeof(struct sock_filter); 770 void *program = consumebytes(program_len, &serialized, &length); 771 if (!program) 772 goto bad_filters; 773 774 j->filter_prog = malloc(sizeof(struct sock_fprog)); 775 if (!j->filter_prog) 776 goto bad_filters; 777 778 j->filter_prog->len = ninstrs; 779 j->filter_prog->filter = malloc(program_len); 780 if (!j->filter_prog->filter) 781 goto bad_filter_prog_instrs; 782 783 memcpy(j->filter_prog->filter, program, program_len); 784 } 785 786 count = j->mounts_count; 787 j->mounts_count = 0; 788 for (i = 0; i < count; ++i) { 789 unsigned long *flags; 790 const char *dest; 791 const char *type; 792 const char *src = consumestr(&serialized, &length); 793 if (!src) 794 goto bad_mounts; 795 dest = consumestr(&serialized, &length); 796 if (!dest) 797 goto bad_mounts; 798 type = consumestr(&serialized, &length); 799 if (!type) 800 goto bad_mounts; 801 flags = consumebytes(sizeof(*flags), &serialized, &length); 802 if (!flags) 803 goto bad_mounts; 804 if (minijail_mount(j, src, dest, type, *flags)) 805 goto bad_mounts; 806 } 807 808 return 0; 809 810bad_mounts: 811 if (j->flags.seccomp_filter && j->filter_len > 0) { 812 free(j->filter_prog->filter); 813 free(j->filter_prog); 814 } 815bad_filter_prog_instrs: 816 if (j->filter_prog) 817 free(j->filter_prog); 818bad_filters: 819 if (j->alt_syscall_table) 820 free(j->alt_syscall_table); 821bad_syscall_table: 822 if (j->chrootdir) 823 free(j->chrootdir); 824bad_chrootdir: 825 if (j->suppl_gid_list) 826 free(j->suppl_gid_list); 827bad_gid_list: 828 if (j->user) 829 free(j->user); 830clear_pointers: 831 j->user = NULL; 832 j->suppl_gid_list = NULL; 833 j->chrootdir = NULL; 834 j->alt_syscall_table = NULL; 835out: 836 return ret; 837} 838 839static void write_ugid_mappings(const struct minijail *j, int *pipe_fds) 840{ 841 int fd, ret, len; 842 size_t sz; 843 char fname[32]; 844 close(pipe_fds[0]); 845 846 sz = sizeof(fname); 847 if (j->uidmap) { 848 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 849 if (ret < 0 || (size_t)ret >= sz) 850 die("failed to write file name of uid_map"); 851 fd = open(fname, O_WRONLY); 852 if (fd < 0) 853 pdie("failed to open '%s'", fname); 854 len = strlen(j->uidmap); 855 if (write(fd, j->uidmap, len) < len) 856 die("failed to set uid_map"); 857 close(fd); 858 } 859 if (j->gidmap) { 860 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 861 if (ret < 0 || (size_t)ret >= sz) 862 die("failed to write file name of gid_map"); 863 fd = open(fname, O_WRONLY); 864 if (fd < 0) 865 pdie("failed to open '%s'", fname); 866 len = strlen(j->gidmap); 867 if (write(fd, j->gidmap, len) < len) 868 die("failed to set gid_map"); 869 close(fd); 870 } 871 872 close(pipe_fds[1]); 873} 874 875static void enter_user_namespace(const struct minijail *j, int *pipe_fds) 876{ 877 char buf; 878 879 close(pipe_fds[1]); 880 881 /* Wait for parent to set up uid/gid mappings. */ 882 if (read(pipe_fds[0], &buf, 1) != 0) 883 die("failed to sync with parent"); 884 close(pipe_fds[0]); 885 886 if (j->uidmap && setresuid(0, 0, 0)) 887 pdie("setresuid"); 888 if (j->gidmap && setresgid(0, 0, 0)) 889 pdie("setresgid"); 890} 891 892/* 893 * mount_one: Applies mounts from @m for @j, recursing as needed. 894 * @j Minijail these mounts are for 895 * @m Head of list of mounts 896 * 897 * Returns 0 for success. 898 */ 899static int mount_one(const struct minijail *j, struct mountpoint *m) 900{ 901 int ret; 902 char *dest; 903 int remount_ro = 0; 904 905 /* dest has a leading "/" */ 906 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 907 return -ENOMEM; 908 909 /* 910 * R/O bind mounts have to be remounted since bind and ro can't both be 911 * specified in the original bind mount. Remount R/O after the initial 912 * mount. 913 */ 914 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 915 remount_ro = 1; 916 m->flags &= ~MS_RDONLY; 917 } 918 919 ret = mount(m->src, dest, m->type, m->flags, NULL); 920 if (ret) 921 pdie("mount: %s -> %s", m->src, dest); 922 923 if (remount_ro) { 924 m->flags |= MS_RDONLY; 925 ret = mount(m->src, dest, NULL, 926 m->flags | MS_REMOUNT, NULL); 927 if (ret) 928 pdie("bind ro: %s -> %s", m->src, dest); 929 } 930 931 free(dest); 932 if (m->next) 933 return mount_one(j, m->next); 934 return ret; 935} 936 937int enter_chroot(const struct minijail *j) 938{ 939 int ret; 940 941 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 942 return ret; 943 944 if (chroot(j->chrootdir)) 945 return -errno; 946 947 if (chdir("/")) 948 return -errno; 949 950 return 0; 951} 952 953int enter_pivot_root(const struct minijail *j) 954{ 955 int ret, oldroot, newroot; 956 957 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 958 return ret; 959 960 /* 961 * Keep the fd for both old and new root. 962 * It will be used in fchdir later. 963 */ 964 oldroot = open("/", O_DIRECTORY | O_RDONLY); 965 if (oldroot < 0) 966 pdie("failed to open / for fchdir"); 967 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY); 968 if (newroot < 0) 969 pdie("failed to open %s for fchdir", j->chrootdir); 970 971 /* 972 * To ensure chrootdir is the root of a file system, 973 * do a self bind mount. 974 */ 975 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 976 pdie("failed to bind mount '%s'", j->chrootdir); 977 if (chdir(j->chrootdir)) 978 return -errno; 979 if (syscall(SYS_pivot_root, ".", ".")) 980 pdie("pivot_root"); 981 982 /* 983 * Now the old root is mounted on top of the new root. Use fchdir to 984 * change to the old root and unmount it. 985 */ 986 if (fchdir(oldroot)) 987 pdie("failed to fchdir to old /"); 988 /* The old root might be busy, so use lazy unmount. */ 989 if (umount2(".", MNT_DETACH)) 990 pdie("umount(/)"); 991 /* Change back to the new root. */ 992 if (fchdir(newroot)) 993 return -errno; 994 if (chroot("/")) 995 return -errno; 996 /* Set correct CWD for getcwd(3). */ 997 if (chdir("/")) 998 return -errno; 999 1000 return 0; 1001} 1002 1003int mount_tmp(void) 1004{ 1005 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1006} 1007 1008int remount_proc_readonly(const struct minijail *j) 1009{ 1010 const char *kProcPath = "/proc"; 1011 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1012 /* 1013 * Right now, we're holding a reference to our parent's old mount of 1014 * /proc in our namespace, which means using MS_REMOUNT here would 1015 * mutate our parent's mount as well, even though we're in a VFS 1016 * namespace (!). Instead, remove their mount from our namespace 1017 * and make our own. However, if we are in a new user namespace, /proc 1018 * is not seen as mounted, so don't return error if umount() fails. 1019 */ 1020 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns) 1021 return -errno; 1022 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1023 return -errno; 1024 return 0; 1025} 1026 1027static void write_pid_file(const struct minijail *j) 1028{ 1029 FILE *fp = fopen(j->pid_file_path, "w"); 1030 1031 if (!fp) 1032 pdie("failed to open '%s'", j->pid_file_path); 1033 if (fprintf(fp, "%d\n", (int)j->initpid) < 0) 1034 pdie("fprintf(%s)", j->pid_file_path); 1035 if (fclose(fp)) 1036 pdie("fclose(%s)", j->pid_file_path); 1037} 1038 1039void drop_ugid(const struct minijail *j) 1040{ 1041 if (j->flags.usergroups && j->flags.suppl_gids) { 1042 die("tried to inherit *and* set supplementary groups;" 1043 " can only do one"); 1044 } 1045 1046 if (j->flags.usergroups) { 1047 if (initgroups(j->user, j->usergid)) 1048 pdie("initgroups"); 1049 } else if (j->flags.suppl_gids) { 1050 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1051 pdie("setgroups"); 1052 } 1053 } else { 1054 /* 1055 * Only attempt to clear supplementary groups if we are changing 1056 * users. 1057 */ 1058 if ((j->uid || j->gid) && setgroups(0, NULL)) 1059 pdie("setgroups"); 1060 } 1061 1062 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1063 pdie("setresgid"); 1064 1065 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1066 pdie("setresuid"); 1067} 1068 1069/* 1070 * We specifically do not use cap_valid() as that only tells us the last 1071 * valid cap we were *compiled* against (i.e. what the version of kernel 1072 * headers says). If we run on a different kernel version, then it's not 1073 * uncommon for that to be less (if an older kernel) or more (if a newer 1074 * kernel). So suck up the answer via /proc. 1075 */ 1076static unsigned int get_last_valid_cap() 1077{ 1078 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1079 FILE *fp = fopen(cap_file, "re"); 1080 unsigned int last_valid_cap; 1081 1082 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1083 pdie("fscanf(%s)", cap_file); 1084 fclose(fp); 1085 1086 return last_valid_cap; 1087} 1088 1089void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1090{ 1091 cap_t caps = cap_get_proc(); 1092 cap_value_t flag[1]; 1093 const uint64_t one = 1; 1094 unsigned int i; 1095 if (!caps) 1096 die("can't get process caps"); 1097 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1098 die("can't clear inheritable caps"); 1099 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1100 die("can't clear effective caps"); 1101 if (cap_clear_flag(caps, CAP_PERMITTED)) 1102 die("can't clear permitted caps"); 1103 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1104 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1105 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1106 continue; 1107 flag[0] = i; 1108 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1109 die("can't add effective cap"); 1110 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1111 die("can't add permitted cap"); 1112 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1113 die("can't add inheritable cap"); 1114 } 1115 if (cap_set_proc(caps)) 1116 die("can't apply initial cleaned capset"); 1117 1118 /* 1119 * Instead of dropping bounding set first, do it here in case 1120 * the caller had a more permissive bounding set which could 1121 * have been used above to raise a capability that wasn't already 1122 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1123 */ 1124 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1125 if (j->caps & (one << i)) 1126 continue; 1127 if (prctl(PR_CAPBSET_DROP, i)) 1128 pdie("prctl(PR_CAPBSET_DROP)"); 1129 } 1130 1131 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1132 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1133 flag[0] = CAP_SETPCAP; 1134 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1135 die("can't clear effective cap"); 1136 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1137 die("can't clear permitted cap"); 1138 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1139 die("can't clear inheritable cap"); 1140 } 1141 1142 if (cap_set_proc(caps)) 1143 die("can't apply final cleaned capset"); 1144 1145 cap_free(caps); 1146} 1147 1148void set_seccomp_filter(const struct minijail *j) 1149{ 1150 /* 1151 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1152 * in the kernel source tree for an explanation of the parameters. 1153 */ 1154 if (j->flags.no_new_privs) { 1155 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1156 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1157 } 1158 1159 /* 1160 * If we're logging seccomp filter failures, 1161 * install the SIGSYS handler first. 1162 */ 1163 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1164 if (install_sigsys_handler()) 1165 pdie("install SIGSYS handler"); 1166 warn("logging seccomp filter failures"); 1167 } 1168 1169 /* 1170 * Install the syscall filter. 1171 */ 1172 if (j->flags.seccomp_filter) { 1173 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1174 j->filter_prog)) { 1175 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 1176 warn("seccomp not supported"); 1177 return; 1178 } 1179 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1180 } 1181 } 1182} 1183 1184void API minijail_enter(const struct minijail *j) 1185{ 1186 /* 1187 * If we're dropping caps, get the last valid cap from /proc now, 1188 * since /proc can be unmounted before drop_caps() is called. 1189 */ 1190 unsigned int last_valid_cap = 0; 1191 if (j->flags.caps) 1192 last_valid_cap = get_last_valid_cap(); 1193 1194 if (j->flags.pids) 1195 die("tried to enter a pid-namespaced jail;" 1196 " try minijail_run()?"); 1197 1198 if (j->flags.usergroups && !j->user) 1199 die("usergroup inheritance without username"); 1200 1201 /* 1202 * We can't recover from failures if we've dropped privileges partially, 1203 * so we don't even try. If any of our operations fail, we abort() the 1204 * entire process. 1205 */ 1206 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1207 pdie("setns(CLONE_NEWNS)"); 1208 1209 if (j->flags.vfs) { 1210 if (unshare(CLONE_NEWNS)) 1211 pdie("unshare(vfs)"); 1212 /* 1213 * Remount all filesystems as private. If they are shared 1214 * new bind mounts will creep out of our namespace. 1215 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1216 */ 1217 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1218 pdie("mount(/, private)"); 1219 } 1220 1221 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1222 pdie("unshare(ipc)"); 1223 } 1224 1225 if (j->flags.enter_net) { 1226 if (setns(j->netns_fd, CLONE_NEWNET)) 1227 pdie("setns(CLONE_NEWNET)"); 1228 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1229 pdie("unshare(net)"); 1230 } 1231 1232 if (j->flags.chroot && enter_chroot(j)) 1233 pdie("chroot"); 1234 1235 if (j->flags.pivot_root && enter_pivot_root(j)) 1236 pdie("pivot_root"); 1237 1238 if (j->flags.mount_tmp && mount_tmp()) 1239 pdie("mount_tmp"); 1240 1241 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1242 pdie("remount"); 1243 1244 if (j->flags.caps) { 1245 /* 1246 * POSIX capabilities are a bit tricky. If we drop our 1247 * capability to change uids, our attempt to use setuid() 1248 * below will fail. Hang on to root caps across setuid(), then 1249 * lock securebits. 1250 */ 1251 if (prctl(PR_SET_KEEPCAPS, 1)) 1252 pdie("prctl(PR_SET_KEEPCAPS)"); 1253 if (prctl 1254 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 1255 pdie("prctl(PR_SET_SECUREBITS)"); 1256 } 1257 1258 /* 1259 * If we're setting no_new_privs, we can drop privileges 1260 * before setting seccomp filter. This way filter policies 1261 * don't need to allow privilege-dropping syscalls. 1262 */ 1263 if (j->flags.no_new_privs) { 1264 drop_ugid(j); 1265 if (j->flags.caps) 1266 drop_caps(j, last_valid_cap); 1267 1268 set_seccomp_filter(j); 1269 } else { 1270 /* 1271 * If we're not setting no_new_privs, 1272 * we need to set seccomp filter *before* dropping privileges. 1273 * WARNING: this means that filter policies *must* allow 1274 * setgroups()/setresgid()/setresuid() for dropping root and 1275 * capget()/capset()/prctl() for dropping caps. 1276 */ 1277 set_seccomp_filter(j); 1278 1279 drop_ugid(j); 1280 if (j->flags.caps) 1281 drop_caps(j, last_valid_cap); 1282 } 1283 1284 /* 1285 * Select the specified alternate syscall table. The table must not 1286 * block prctl(2) if we're using seccomp as well. 1287 */ 1288 if (j->flags.alt_syscall) { 1289 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1290 pdie("prctl(PR_ALT_SYSCALL)"); 1291 } 1292 1293 /* 1294 * seccomp has to come last since it cuts off all the other 1295 * privilege-dropping syscalls :) 1296 */ 1297 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1298 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 1299 warn("seccomp not supported"); 1300 return; 1301 } 1302 pdie("prctl(PR_SET_SECCOMP)"); 1303 } 1304} 1305 1306/* TODO(wad) will visibility affect this variable? */ 1307static int init_exitstatus = 0; 1308 1309void init_term(int __attribute__ ((unused)) sig) 1310{ 1311 _exit(init_exitstatus); 1312} 1313 1314int init(pid_t rootpid) 1315{ 1316 pid_t pid; 1317 int status; 1318 /* so that we exit with the right status */ 1319 signal(SIGTERM, init_term); 1320 /* TODO(wad) self jail with seccomp_filters here. */ 1321 while ((pid = wait(&status)) > 0) { 1322 /* 1323 * This loop will only end when either there are no processes 1324 * left inside our pid namespace or we get a signal. 1325 */ 1326 if (pid == rootpid) 1327 init_exitstatus = status; 1328 } 1329 if (!WIFEXITED(init_exitstatus)) 1330 _exit(MINIJAIL_ERR_INIT); 1331 _exit(WEXITSTATUS(init_exitstatus)); 1332} 1333 1334int API minijail_from_fd(int fd, struct minijail *j) 1335{ 1336 size_t sz = 0; 1337 size_t bytes = read(fd, &sz, sizeof(sz)); 1338 char *buf; 1339 int r; 1340 if (sizeof(sz) != bytes) 1341 return -EINVAL; 1342 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1343 return -E2BIG; 1344 buf = malloc(sz); 1345 if (!buf) 1346 return -ENOMEM; 1347 bytes = read(fd, buf, sz); 1348 if (bytes != sz) { 1349 free(buf); 1350 return -EINVAL; 1351 } 1352 r = minijail_unmarshal(j, buf, sz); 1353 free(buf); 1354 return r; 1355} 1356 1357int API minijail_to_fd(struct minijail *j, int fd) 1358{ 1359 char *buf; 1360 size_t sz = minijail_size(j); 1361 ssize_t written; 1362 int r; 1363 1364 if (!sz) 1365 return -EINVAL; 1366 buf = malloc(sz); 1367 r = minijail_marshal(j, buf, sz); 1368 if (r) { 1369 free(buf); 1370 return r; 1371 } 1372 /* Sends [size][minijail]. */ 1373 written = write(fd, &sz, sizeof(sz)); 1374 if (written != sizeof(sz)) { 1375 free(buf); 1376 return -EFAULT; 1377 } 1378 written = write(fd, buf, sz); 1379 if (written < 0 || (size_t) written != sz) { 1380 free(buf); 1381 return -EFAULT; 1382 } 1383 free(buf); 1384 return 0; 1385} 1386 1387int setup_preload(void) 1388{ 1389#if defined(__ANDROID__) 1390 /* Don't use LDPRELOAD on Brillo. */ 1391 return 0; 1392#else 1393 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1394 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1395 if (!newenv) 1396 return -ENOMEM; 1397 1398 /* Only insert a separating space if we have something to separate... */ 1399 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1400 PRELOADPATH); 1401 1402 /* setenv() makes a copy of the string we give it. */ 1403 setenv(kLdPreloadEnvVar, newenv, 1); 1404 free(newenv); 1405 return 0; 1406#endif 1407} 1408 1409int setup_pipe(int fds[2]) 1410{ 1411 int r = pipe(fds); 1412 char fd_buf[11]; 1413 if (r) 1414 return r; 1415 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1416 if (r <= 0) 1417 return -EINVAL; 1418 setenv(kFdEnvVar, fd_buf, 1); 1419 return 0; 1420} 1421 1422int setup_pipe_end(int fds[2], size_t index) 1423{ 1424 if (index > 1) 1425 return -1; 1426 1427 close(fds[1 - index]); 1428 return fds[index]; 1429} 1430 1431int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1432{ 1433 if (index > 1) 1434 return -1; 1435 1436 close(fds[1 - index]); 1437 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1438 return dup2(fds[index], fd); 1439} 1440 1441int minijail_run_internal(struct minijail *j, const char *filename, 1442 char *const argv[], pid_t *pchild_pid, 1443 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1444 int use_preload); 1445 1446int API minijail_run(struct minijail *j, const char *filename, 1447 char *const argv[]) 1448{ 1449 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1450 true); 1451} 1452 1453int API minijail_run_pid(struct minijail *j, const char *filename, 1454 char *const argv[], pid_t *pchild_pid) 1455{ 1456 return minijail_run_internal(j, filename, argv, pchild_pid, 1457 NULL, NULL, NULL, true); 1458} 1459 1460int API minijail_run_pipe(struct minijail *j, const char *filename, 1461 char *const argv[], int *pstdin_fd) 1462{ 1463 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1464 NULL, NULL, true); 1465} 1466 1467int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1468 char *const argv[], pid_t *pchild_pid, 1469 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1470{ 1471 return minijail_run_internal(j, filename, argv, pchild_pid, 1472 pstdin_fd, pstdout_fd, pstderr_fd, true); 1473} 1474 1475int API minijail_run_no_preload(struct minijail *j, const char *filename, 1476 char *const argv[]) 1477{ 1478 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1479 false); 1480} 1481 1482int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1483 const char *filename, 1484 char *const argv[], 1485 pid_t *pchild_pid, 1486 int *pstdin_fd, int *pstdout_fd, 1487 int *pstderr_fd) { 1488 return minijail_run_internal(j, filename, argv, pchild_pid, 1489 pstdin_fd, pstdout_fd, pstderr_fd, false); 1490} 1491 1492int minijail_run_internal(struct minijail *j, const char *filename, 1493 char *const argv[], pid_t *pchild_pid, 1494 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1495 int use_preload) 1496{ 1497 char *oldenv, *oldenv_copy = NULL; 1498 pid_t child_pid; 1499 int pipe_fds[2]; 1500 int stdin_fds[2]; 1501 int stdout_fds[2]; 1502 int stderr_fds[2]; 1503 int userns_pipe_fds[2]; 1504 int ret; 1505 /* We need to remember this across the minijail_preexec() call. */ 1506 int pid_namespace = j->flags.pids; 1507 int do_init = j->flags.do_init; 1508 1509 if (use_preload) { 1510 oldenv = getenv(kLdPreloadEnvVar); 1511 if (oldenv) { 1512 oldenv_copy = strdup(oldenv); 1513 if (!oldenv_copy) 1514 return -ENOMEM; 1515 } 1516 1517 if (setup_preload()) 1518 return -EFAULT; 1519 } 1520 1521 if (!use_preload) { 1522 if (j->flags.caps) 1523 die("Capabilities are not supported without " 1524 "LD_PRELOAD"); 1525 } 1526 1527 /* 1528 * Make the process group ID of this process equal to its PID, so that 1529 * both the Minijail process and the jailed process can be killed 1530 * together. 1531 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1532 * the process is already a process group leader. 1533 */ 1534 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1535 if (errno != EPERM) { 1536 pdie("setpgid(0, 0)"); 1537 } 1538 } 1539 1540 if (use_preload) { 1541 /* 1542 * Before we fork(2) and execve(2) the child process, we need 1543 * to open a pipe(2) to send the minijail configuration over. 1544 */ 1545 if (setup_pipe(pipe_fds)) 1546 return -EFAULT; 1547 } 1548 1549 /* 1550 * If we want to write to the child process' standard input, 1551 * create the pipe(2) now. 1552 */ 1553 if (pstdin_fd) { 1554 if (pipe(stdin_fds)) 1555 return -EFAULT; 1556 } 1557 1558 /* 1559 * If we want to read from the child process' standard output, 1560 * create the pipe(2) now. 1561 */ 1562 if (pstdout_fd) { 1563 if (pipe(stdout_fds)) 1564 return -EFAULT; 1565 } 1566 1567 /* 1568 * If we want to read from the child process' standard error, 1569 * create the pipe(2) now. 1570 */ 1571 if (pstderr_fd) { 1572 if (pipe(stderr_fds)) 1573 return -EFAULT; 1574 } 1575 1576 /* 1577 * If we want to set up a new uid/gid mapping in the user namespace, 1578 * create the pipe(2) to sync between parent and child. 1579 */ 1580 if (j->flags.userns) { 1581 if (pipe(userns_pipe_fds)) 1582 return -EFAULT; 1583 } 1584 1585 /* 1586 * Use sys_clone() if and only if we're creating a pid namespace. 1587 * 1588 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1589 * 1590 * In multithreaded programs, there are a bunch of locks inside libc, 1591 * some of which may be held by other threads at the time that we call 1592 * minijail_run_pid(). If we call fork(), glibc does its level best to 1593 * ensure that we hold all of these locks before it calls clone() 1594 * internally and drop them after clone() returns, but when we call 1595 * sys_clone(2) directly, all that gets bypassed and we end up with a 1596 * child address space where some of libc's important locks are held by 1597 * other threads (which did not get cloned, and hence will never release 1598 * those locks). This is okay so long as we call exec() immediately 1599 * after, but a bunch of seemingly-innocent libc functions like setenv() 1600 * take locks. 1601 * 1602 * Hence, only call sys_clone() if we need to, in order to get at pid 1603 * namespacing. If we follow this path, the child's address space might 1604 * have broken locks; you may only call functions that do not acquire 1605 * any locks. 1606 * 1607 * Unfortunately, fork() acquires every lock it can get its hands on, as 1608 * previously detailed, so this function is highly likely to deadlock 1609 * later on (see "deadlock here") if we're multithreaded. 1610 * 1611 * We might hack around this by having the clone()d child (init of the 1612 * pid namespace) return directly, rather than leaving the clone()d 1613 * process hanging around to be init for the new namespace (and having 1614 * its fork()ed child return in turn), but that process would be crippled 1615 * with its libc locks potentially broken. We might try fork()ing in the 1616 * parent before we clone() to ensure that we own all the locks, but 1617 * then we have to have the forked child hanging around consuming 1618 * resources (and possibly having file descriptors / shared memory 1619 * regions / etc attached). We'd need to keep the child around to avoid 1620 * having its children get reparented to init. 1621 * 1622 * TODO(ellyjones): figure out if the "forked child hanging around" 1623 * problem is fixable or not. It would be nice if we worked in this 1624 * case. 1625 */ 1626 if (pid_namespace) { 1627 int clone_flags = CLONE_NEWPID | SIGCHLD; 1628 if (j->flags.userns) 1629 clone_flags |= CLONE_NEWUSER; 1630 child_pid = syscall(SYS_clone, clone_flags, NULL); 1631 } else { 1632 child_pid = fork(); 1633 } 1634 1635 if (child_pid < 0) { 1636 if (use_preload) { 1637 free(oldenv_copy); 1638 } 1639 die("failed to fork child"); 1640 } 1641 1642 if (child_pid) { 1643 if (use_preload) { 1644 /* Restore parent's LD_PRELOAD. */ 1645 if (oldenv_copy) { 1646 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1647 free(oldenv_copy); 1648 } else { 1649 unsetenv(kLdPreloadEnvVar); 1650 } 1651 unsetenv(kFdEnvVar); 1652 } 1653 1654 j->initpid = child_pid; 1655 1656 if (j->flags.pid_file) 1657 write_pid_file(j); 1658 1659 if (j->flags.userns) 1660 write_ugid_mappings(j, userns_pipe_fds); 1661 1662 if (use_preload) { 1663 /* Send marshalled minijail. */ 1664 close(pipe_fds[0]); /* read endpoint */ 1665 ret = minijail_to_fd(j, pipe_fds[1]); 1666 close(pipe_fds[1]); /* write endpoint */ 1667 if (ret) { 1668 kill(j->initpid, SIGKILL); 1669 die("failed to send marshalled minijail"); 1670 } 1671 } 1672 1673 if (pchild_pid) 1674 *pchild_pid = child_pid; 1675 1676 /* 1677 * If we want to write to the child process' standard input, 1678 * set up the write end of the pipe. 1679 */ 1680 if (pstdin_fd) 1681 *pstdin_fd = setup_pipe_end(stdin_fds, 1682 1 /* write end */); 1683 1684 /* 1685 * If we want to read from the child process' standard output, 1686 * set up the read end of the pipe. 1687 */ 1688 if (pstdout_fd) 1689 *pstdout_fd = setup_pipe_end(stdout_fds, 1690 0 /* read end */); 1691 1692 /* 1693 * If we want to read from the child process' standard error, 1694 * set up the read end of the pipe. 1695 */ 1696 if (pstderr_fd) 1697 *pstderr_fd = setup_pipe_end(stderr_fds, 1698 0 /* read end */); 1699 1700 return 0; 1701 } 1702 free(oldenv_copy); 1703 1704 if (j->flags.reset_signal_mask) { 1705 sigset_t signal_mask; 1706 if (sigemptyset(&signal_mask) != 0) 1707 pdie("sigemptyset failed"); 1708 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1709 pdie("sigprocmask failed"); 1710 } 1711 1712 if (j->flags.userns) 1713 enter_user_namespace(j, userns_pipe_fds); 1714 1715 /* 1716 * If we want to write to the jailed process' standard input, 1717 * set up the read end of the pipe. 1718 */ 1719 if (pstdin_fd) { 1720 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1721 STDIN_FILENO) < 0) 1722 die("failed to set up stdin pipe"); 1723 } 1724 1725 /* 1726 * If we want to read from the jailed process' standard output, 1727 * set up the write end of the pipe. 1728 */ 1729 if (pstdout_fd) { 1730 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1731 STDOUT_FILENO) < 0) 1732 die("failed to set up stdout pipe"); 1733 } 1734 1735 /* 1736 * If we want to read from the jailed process' standard error, 1737 * set up the write end of the pipe. 1738 */ 1739 if (pstderr_fd) { 1740 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1741 STDERR_FILENO) < 0) 1742 die("failed to set up stderr pipe"); 1743 } 1744 1745 /* If running an init program, let it decide when/how to mount /proc. */ 1746 if (pid_namespace && !do_init) 1747 j->flags.remount_proc_ro = 0; 1748 1749 if (use_preload) { 1750 /* Strip out flags that cannot be inherited across execve(2). */ 1751 minijail_preexec(j); 1752 } else { 1753 j->flags.pids = 0; 1754 } 1755 /* Jail this process, then execve() the target. */ 1756 minijail_enter(j); 1757 1758 if (pid_namespace && do_init) { 1759 /* 1760 * pid namespace: this process will become init inside the new 1761 * namespace. We don't want all programs we might exec to have 1762 * to know how to be init. Normally (do_init == 1) we fork off 1763 * a child to actually run the program. If |do_init == 0|, we 1764 * let the program keep pid 1 and be init. 1765 * 1766 * If we're multithreaded, we'll probably deadlock here. See 1767 * WARNING above. 1768 */ 1769 child_pid = fork(); 1770 if (child_pid < 0) 1771 _exit(child_pid); 1772 else if (child_pid > 0) 1773 init(child_pid); /* never returns */ 1774 } 1775 1776 /* 1777 * If we aren't pid-namespaced, or the jailed program asked to be init: 1778 * calling process 1779 * -> execve()-ing process 1780 * If we are: 1781 * calling process 1782 * -> init()-ing process 1783 * -> execve()-ing process 1784 */ 1785 _exit(execve(filename, argv, environ)); 1786} 1787 1788int API minijail_kill(struct minijail *j) 1789{ 1790 int st; 1791 if (kill(j->initpid, SIGTERM)) 1792 return -errno; 1793 if (waitpid(j->initpid, &st, 0) < 0) 1794 return -errno; 1795 return st; 1796} 1797 1798int API minijail_wait(struct minijail *j) 1799{ 1800 int st; 1801 if (waitpid(j->initpid, &st, 0) < 0) 1802 return -errno; 1803 1804 if (!WIFEXITED(st)) { 1805 int error_status = st; 1806 if (WIFSIGNALED(st)) { 1807 int signum = WTERMSIG(st); 1808 warn("child process %d received signal %d", 1809 j->initpid, signum); 1810 /* 1811 * We return MINIJAIL_ERR_JAIL if the process received 1812 * SIGSYS, which happens when a syscall is blocked by 1813 * seccomp filters. 1814 * If not, we do what bash(1) does: 1815 * $? = 128 + signum 1816 */ 1817 if (signum == SIGSYS) { 1818 error_status = MINIJAIL_ERR_JAIL; 1819 } else { 1820 error_status = 128 + signum; 1821 } 1822 } 1823 return error_status; 1824 } 1825 1826 int exit_status = WEXITSTATUS(st); 1827 if (exit_status != 0) 1828 info("child process %d exited with status %d", 1829 j->initpid, exit_status); 1830 1831 return exit_status; 1832} 1833 1834void API minijail_destroy(struct minijail *j) 1835{ 1836 if (j->flags.seccomp_filter && j->filter_prog) { 1837 free(j->filter_prog->filter); 1838 free(j->filter_prog); 1839 } 1840 while (j->mounts_head) { 1841 struct mountpoint *m = j->mounts_head; 1842 j->mounts_head = j->mounts_head->next; 1843 free(m->type); 1844 free(m->dest); 1845 free(m->src); 1846 free(m); 1847 } 1848 j->mounts_tail = NULL; 1849 if (j->user) 1850 free(j->user); 1851 if (j->suppl_gid_list) 1852 free(j->suppl_gid_list); 1853 if (j->chrootdir) 1854 free(j->chrootdir); 1855 if (j->alt_syscall_table) 1856 free(j->alt_syscall_table); 1857 free(j); 1858} 1859