libminijail.c revision e81a52f36e9d283ba162180136eb5ac81f37440c
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _GNU_SOURCE 8 9#include <asm/unistd.h> 10#include <ctype.h> 11#include <errno.h> 12#include <fcntl.h> 13#include <grp.h> 14#include <inttypes.h> 15#include <limits.h> 16#include <linux/capability.h> 17#include <pwd.h> 18#include <sched.h> 19#include <signal.h> 20#include <stdarg.h> 21#include <stdbool.h> 22#include <stddef.h> 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include <syscall.h> 27#include <sys/capability.h> 28#include <sys/mount.h> 29#include <sys/param.h> 30#include <sys/prctl.h> 31#include <sys/stat.h> 32#include <sys/types.h> 33#include <sys/user.h> 34#include <sys/wait.h> 35#include <unistd.h> 36 37#include "libminijail.h" 38#include "libminijail-private.h" 39 40#include "signal_handler.h" 41#include "syscall_filter.h" 42#include "util.h" 43 44#ifdef HAVE_SECUREBITS_H 45#include <linux/securebits.h> 46#else 47#define SECURE_ALL_BITS 0x15 48#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 49#endif 50 51/* Until these are reliably available in linux/prctl.h */ 52#ifndef PR_SET_SECCOMP 53# define PR_SET_SECCOMP 22 54#endif 55 56#ifndef PR_ALT_SYSCALL 57# define PR_ALT_SYSCALL 0x43724f53 58#endif 59 60/* For seccomp_filter using BPF. */ 61#ifndef PR_SET_NO_NEW_PRIVS 62# define PR_SET_NO_NEW_PRIVS 38 63#endif 64#ifndef SECCOMP_MODE_FILTER 65# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 66#endif 67 68#ifdef USE_SECCOMP_SOFTFAIL 69# define SECCOMP_SOFTFAIL 1 70#else 71# define SECCOMP_SOFTFAIL 0 72#endif 73 74struct mountpoint { 75 char *src; 76 char *dest; 77 char *type; 78 unsigned long flags; 79 struct mountpoint *next; 80}; 81 82struct minijail { 83 /* 84 * WARNING: if you add a flag here you need to make sure it's 85 * accounted for in minijail_pre{enter|exec}() below. 
86 */ 87 struct { 88 int uid:1; 89 int gid:1; 90 int usergroups:1; 91 int suppl_gids:1; 92 int caps:1; 93 int vfs:1; 94 int enter_vfs:1; 95 int pids:1; 96 int ipc:1; 97 int net:1; 98 int enter_net:1; 99 int userns:1; 100 int seccomp:1; 101 int remount_proc_ro:1; 102 int no_new_privs:1; 103 int seccomp_filter:1; 104 int log_seccomp_filter:1; 105 int chroot:1; 106 int pivot_root:1; 107 int mount_tmp:1; 108 int do_init:1; 109 int pid_file:1; 110 int alt_syscall:1; 111 } flags; 112 uid_t uid; 113 gid_t gid; 114 gid_t usergid; 115 char *user; 116 size_t suppl_gid_count; 117 gid_t *suppl_gid_list; 118 uint64_t caps; 119 pid_t initpid; 120 int mountns_fd; 121 int netns_fd; 122 char *chrootdir; 123 char *pid_file_path; 124 char *uidmap; 125 char *gidmap; 126 size_t filter_len; 127 struct sock_fprog *filter_prog; 128 char *alt_syscall_table; 129 struct mountpoint *mounts_head; 130 struct mountpoint *mounts_tail; 131 size_t mounts_count; 132}; 133 134/* 135 * Strip out flags meant for the parent. 136 * We keep things that are not inherited across execve(2) (e.g. capabilities), 137 * or are easier to set after execve(2) (e.g. seccomp filters). 138 */ 139void minijail_preenter(struct minijail *j) 140{ 141 j->flags.vfs = 0; 142 j->flags.enter_vfs = 0; 143 j->flags.remount_proc_ro = 0; 144 j->flags.pids = 0; 145 j->flags.do_init = 0; 146 j->flags.pid_file = 0; 147} 148 149/* 150 * Strip out flags meant for the child. 151 * We keep things that are inherited across execve(2). 152 */ 153void minijail_preexec(struct minijail *j) 154{ 155 int vfs = j->flags.vfs; 156 int enter_vfs = j->flags.enter_vfs; 157 int remount_proc_ro = j->flags.remount_proc_ro; 158 int userns = j->flags.userns; 159 if (j->user) 160 free(j->user); 161 j->user = NULL; 162 if (j->suppl_gid_list) 163 free(j->suppl_gid_list); 164 j->suppl_gid_list = NULL; 165 memset(&j->flags, 0, sizeof(j->flags)); 166 /* Now restore anything we meant to keep. 
*/ 167 j->flags.vfs = vfs; 168 j->flags.enter_vfs = enter_vfs; 169 j->flags.remount_proc_ro = remount_proc_ro; 170 j->flags.userns = userns; 171 /* Note, |pids| will already have been used before this call. */ 172} 173 174/* Minijail API. */ 175 176struct minijail API *minijail_new(void) 177{ 178 return calloc(1, sizeof(struct minijail)); 179} 180 181void API minijail_change_uid(struct minijail *j, uid_t uid) 182{ 183 if (uid == 0) 184 die("useless change to uid 0"); 185 j->uid = uid; 186 j->flags.uid = 1; 187} 188 189void API minijail_change_gid(struct minijail *j, gid_t gid) 190{ 191 if (gid == 0) 192 die("useless change to gid 0"); 193 j->gid = gid; 194 j->flags.gid = 1; 195} 196 197int API minijail_set_supplementary_gids(struct minijail *j, size_t size, 198 const gid_t *list) 199{ 200 if (j->flags.usergroups) 201 die("cannot inherit *and* set supplementary groups"); 202 203 if (size == 0) 204 return -EINVAL; 205 206 /* Copy the gid_t array. */ 207 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 208 if (!j->suppl_gid_list) { 209 return -ENOMEM; 210 } 211 for (size_t i = 0; i < size; i++) { 212 j->suppl_gid_list[i] = list[i]; 213 } 214 j->suppl_gid_count = size; 215 j->flags.suppl_gids = 1; 216 return 0; 217} 218 219int API minijail_change_user(struct minijail *j, const char *user) 220{ 221 char *buf = NULL; 222 struct passwd pw; 223 struct passwd *ppw = NULL; 224 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 225 if (sz == -1) 226 sz = 65536; /* your guess is as good as mine... */ 227 228 /* 229 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 230 * the maximum needed size of the buffer, so we don't have to search. 231 */ 232 buf = malloc(sz); 233 if (!buf) 234 return -ENOMEM; 235 getpwnam_r(user, &pw, buf, sz, &ppw); 236 /* 237 * We're safe to free the buffer here. The strings inside pw point 238 * inside buf, but we don't use any of them; this leaves the pointers 239 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded. 
240 */ 241 free(buf); 242 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 243 if (!ppw) 244 return -1; 245 minijail_change_uid(j, ppw->pw_uid); 246 j->user = strdup(user); 247 if (!j->user) 248 return -ENOMEM; 249 j->usergid = ppw->pw_gid; 250 return 0; 251} 252 253int API minijail_change_group(struct minijail *j, const char *group) 254{ 255 char *buf = NULL; 256 struct group gr; 257 struct group *pgr = NULL; 258 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 259 if (sz == -1) 260 sz = 65536; /* and mine is as good as yours, really */ 261 262 /* 263 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 264 * the maximum needed size of the buffer, so we don't have to search. 265 */ 266 buf = malloc(sz); 267 if (!buf) 268 return -ENOMEM; 269 getgrnam_r(group, &gr, buf, sz, &pgr); 270 /* 271 * We're safe to free the buffer here. The strings inside gr point 272 * inside buf, but we don't use any of them; this leaves the pointers 273 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 274 */ 275 free(buf); 276 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. 
*/ 277 if (!pgr) 278 return -1; 279 minijail_change_gid(j, pgr->gr_gid); 280 return 0; 281} 282 283void API minijail_use_seccomp(struct minijail *j) 284{ 285 j->flags.seccomp = 1; 286} 287 288void API minijail_no_new_privs(struct minijail *j) 289{ 290 j->flags.no_new_privs = 1; 291} 292 293void API minijail_use_seccomp_filter(struct minijail *j) 294{ 295 j->flags.seccomp_filter = 1; 296} 297 298void API minijail_log_seccomp_filter_failures(struct minijail *j) 299{ 300 j->flags.log_seccomp_filter = 1; 301} 302 303void API minijail_use_caps(struct minijail *j, uint64_t capmask) 304{ 305 j->caps = capmask; 306 j->flags.caps = 1; 307} 308 309void API minijail_namespace_vfs(struct minijail *j) 310{ 311 j->flags.vfs = 1; 312} 313 314void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 315{ 316 int ns_fd = open(ns_path, O_RDONLY); 317 if (ns_fd < 0) { 318 pdie("failed to open namespace '%s'", ns_path); 319 } 320 j->mountns_fd = ns_fd; 321 j->flags.enter_vfs = 1; 322} 323 324void API minijail_namespace_pids(struct minijail *j) 325{ 326 j->flags.vfs = 1; 327 j->flags.remount_proc_ro = 1; 328 j->flags.pids = 1; 329 j->flags.do_init = 1; 330} 331 332void API minijail_namespace_ipc(struct minijail *j) 333{ 334 j->flags.ipc = 1; 335} 336 337void API minijail_namespace_net(struct minijail *j) 338{ 339 j->flags.net = 1; 340} 341 342void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 343{ 344 int ns_fd = open(ns_path, O_RDONLY); 345 if (ns_fd < 0) { 346 pdie("failed to open namespace '%s'", ns_path); 347 } 348 j->netns_fd = ns_fd; 349 j->flags.enter_net = 1; 350} 351 352void API minijail_remount_proc_readonly(struct minijail *j) 353{ 354 j->flags.vfs = 1; 355 j->flags.remount_proc_ro = 1; 356} 357 358void API minijail_namespace_user(struct minijail *j) 359{ 360 j->flags.userns = 1; 361} 362 363int API minijail_uidmap(struct minijail *j, const char *uidmap) 364{ 365 j->uidmap = strdup(uidmap); 366 if (!j->uidmap) 367 return 
-ENOMEM; 368 char *ch; 369 for (ch = j->uidmap; *ch; ch++) { 370 if (*ch == ',') 371 *ch = '\n'; 372 } 373 return 0; 374} 375 376int API minijail_gidmap(struct minijail *j, const char *gidmap) 377{ 378 j->gidmap = strdup(gidmap); 379 if (!j->gidmap) 380 return -ENOMEM; 381 char *ch; 382 for (ch = j->gidmap; *ch; ch++) { 383 if (*ch == ',') 384 *ch = '\n'; 385 } 386 return 0; 387} 388 389void API minijail_inherit_usergroups(struct minijail *j) 390{ 391 j->flags.usergroups = 1; 392} 393 394void API minijail_run_as_init(struct minijail *j) 395{ 396 /* 397 * Since the jailed program will become 'init' in the new PID namespace, 398 * Minijail does not need to fork an 'init' process. 399 */ 400 j->flags.do_init = 0; 401} 402 403int API minijail_enter_chroot(struct minijail *j, const char *dir) 404{ 405 if (j->chrootdir) 406 return -EINVAL; 407 j->chrootdir = strdup(dir); 408 if (!j->chrootdir) 409 return -ENOMEM; 410 j->flags.chroot = 1; 411 return 0; 412} 413 414int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 415{ 416 if (j->chrootdir) 417 return -EINVAL; 418 j->chrootdir = strdup(dir); 419 if (!j->chrootdir) 420 return -ENOMEM; 421 j->flags.pivot_root = 1; 422 return 0; 423} 424 425static char *append_external_path(const char *external_path, 426 const char *path_inside_chroot) 427{ 428 char *path; 429 size_t pathlen; 430 431 /* One extra char for '/' and one for '\0', hence + 2. */ 432 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 433 path = malloc(pathlen); 434 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 435 436 return path; 437} 438 439char API *minijail_get_original_path(struct minijail *j, 440 const char *path_inside_chroot) 441{ 442 struct mountpoint *b; 443 444 b = j->mounts_head; 445 while (b) { 446 /* 447 * If |path_inside_chroot| is the exact destination of a 448 * mount, then the original path is exactly the source of 449 * the mount. 
450 * for example: "-b /some/path/exe,/chroot/path/exe" 451 * mount source = /some/path/exe, mount dest = 452 * /chroot/path/exe Then when getting the original path of 453 * "/chroot/path/exe", the source of that mount, 454 * "/some/path/exe" is what should be returned. 455 */ 456 if (!strcmp(b->dest, path_inside_chroot)) 457 return strdup(b->src); 458 459 /* 460 * If |path_inside_chroot| is within the destination path of a 461 * mount, take the suffix of the chroot path relative to the 462 * mount destination path, and append it to the mount source 463 * path. 464 */ 465 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 466 const char *relative_path = 467 path_inside_chroot + strlen(b->dest); 468 return append_external_path(b->src, relative_path); 469 } 470 b = b->next; 471 } 472 473 /* If there is a chroot path, append |path_inside_chroot| to that. */ 474 if (j->chrootdir) 475 return append_external_path(j->chrootdir, path_inside_chroot); 476 477 /* No chroot, so the path outside is the same as it is inside. 
*/ 478 return strdup(path_inside_chroot); 479} 480 481void API minijail_mount_tmp(struct minijail *j) 482{ 483 j->flags.mount_tmp = 1; 484} 485 486int API minijail_write_pid_file(struct minijail *j, const char *path) 487{ 488 j->pid_file_path = strdup(path); 489 if (!j->pid_file_path) 490 return -ENOMEM; 491 j->flags.pid_file = 1; 492 return 0; 493} 494 495int API minijail_mount(struct minijail *j, const char *src, const char *dest, 496 const char *type, unsigned long flags) 497{ 498 struct mountpoint *m; 499 500 if (*dest != '/') 501 return -EINVAL; 502 m = calloc(1, sizeof(*m)); 503 if (!m) 504 return -ENOMEM; 505 m->dest = strdup(dest); 506 if (!m->dest) 507 goto error; 508 m->src = strdup(src); 509 if (!m->src) 510 goto error; 511 m->type = strdup(type); 512 if (!m->type) 513 goto error; 514 m->flags = flags; 515 516 info("mount %s -> %s type %s", src, dest, type); 517 518 /* 519 * Force vfs namespacing so the mounts don't leak out into the 520 * containing vfs namespace. 521 */ 522 minijail_namespace_vfs(j); 523 524 if (j->mounts_tail) 525 j->mounts_tail->next = m; 526 else 527 j->mounts_head = m; 528 j->mounts_tail = m; 529 j->mounts_count++; 530 531 return 0; 532 533error: 534 free(m->src); 535 free(m->dest); 536 free(m); 537 return -ENOMEM; 538} 539 540int API minijail_bind(struct minijail *j, const char *src, const char *dest, 541 int writeable) 542{ 543 unsigned long flags = MS_BIND; 544 545 if (!writeable) 546 flags |= MS_RDONLY; 547 548 return minijail_mount(j, src, dest, "", flags); 549} 550 551void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 552{ 553 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 554 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 555 warn("not loading seccomp filter, seccomp not supported"); 556 return; 557 } 558 } 559 FILE *file = fopen(path, "r"); 560 if (!file) { 561 pdie("failed to open seccomp filter file '%s'", path); 562 } 563 564 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 
565 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 566 die("failed to compile seccomp filter BPF program in '%s'", 567 path); 568 } 569 570 j->filter_len = fprog->len; 571 j->filter_prog = fprog; 572 573 fclose(file); 574} 575 576int API minijail_use_alt_syscall(struct minijail *j, const char *table) 577{ 578 j->alt_syscall_table = strdup(table); 579 if (!j->alt_syscall_table) 580 return -ENOMEM; 581 j->flags.alt_syscall = 1; 582 return 0; 583} 584 585struct marshal_state { 586 size_t available; 587 size_t total; 588 char *buf; 589}; 590 591void marshal_state_init(struct marshal_state *state, 592 char *buf, size_t available) 593{ 594 state->available = available; 595 state->buf = buf; 596 state->total = 0; 597} 598 599void marshal_append(struct marshal_state *state, 600 char *src, size_t length) 601{ 602 size_t copy_len = MIN(state->available, length); 603 604 /* Up to |available| will be written. */ 605 if (copy_len) { 606 memcpy(state->buf, src, copy_len); 607 state->buf += copy_len; 608 state->available -= copy_len; 609 } 610 /* |total| will contain the expected length. 
*/ 611 state->total += length; 612} 613 614void minijail_marshal_helper(struct marshal_state *state, 615 const struct minijail *j) 616{ 617 struct mountpoint *m = NULL; 618 marshal_append(state, (char *)j, sizeof(*j)); 619 if (j->user) 620 marshal_append(state, j->user, strlen(j->user) + 1); 621 if (j->chrootdir) 622 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 623 if (j->alt_syscall_table) { 624 marshal_append(state, j->alt_syscall_table, 625 strlen(j->alt_syscall_table) + 1); 626 } 627 if (j->flags.seccomp_filter && j->filter_prog) { 628 struct sock_fprog *fp = j->filter_prog; 629 marshal_append(state, (char *)fp->filter, 630 fp->len * sizeof(struct sock_filter)); 631 } 632 for (m = j->mounts_head; m; m = m->next) { 633 marshal_append(state, m->src, strlen(m->src) + 1); 634 marshal_append(state, m->dest, strlen(m->dest) + 1); 635 marshal_append(state, m->type, strlen(m->type) + 1); 636 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 637 } 638} 639 640size_t API minijail_size(const struct minijail *j) 641{ 642 struct marshal_state state; 643 marshal_state_init(&state, NULL, 0); 644 minijail_marshal_helper(&state, j); 645 return state.total; 646} 647 648int minijail_marshal(const struct minijail *j, char *buf, size_t available) 649{ 650 struct marshal_state state; 651 marshal_state_init(&state, buf, available); 652 minijail_marshal_helper(&state, j); 653 return (state.total > available); 654} 655 656/* 657 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength 658 * @length Number of bytes to consume 659 * @buf Buffer to consume from 660 * @buflength Size of @buf 661 * 662 * Returns a pointer to the base of the bytes, or NULL for errors. 
663 */ 664void *consumebytes(size_t length, char **buf, size_t *buflength) 665{ 666 char *p = *buf; 667 if (length > *buflength) 668 return NULL; 669 *buf += length; 670 *buflength -= length; 671 return p; 672} 673 674/* 675 * consumestr: consumes a C string from a buffer @buf of length @length 676 * @buf Buffer to consume 677 * @length Length of buffer 678 * 679 * Returns a pointer to the base of the string, or NULL for errors. 680 */ 681char *consumestr(char **buf, size_t *buflength) 682{ 683 size_t len = strnlen(*buf, *buflength); 684 if (len == *buflength) 685 /* There's no null-terminator. */ 686 return NULL; 687 return consumebytes(len + 1, buf, buflength); 688} 689 690int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 691{ 692 size_t i; 693 size_t count; 694 int ret = -EINVAL; 695 696 if (length < sizeof(*j)) 697 goto out; 698 memcpy((void *)j, serialized, sizeof(*j)); 699 serialized += sizeof(*j); 700 length -= sizeof(*j); 701 702 /* Potentially stale pointers not used as signals. 
*/ 703 j->mounts_head = NULL; 704 j->mounts_tail = NULL; 705 j->filter_prog = NULL; 706 707 if (j->user) { /* stale pointer */ 708 char *user = consumestr(&serialized, &length); 709 if (!user) 710 goto clear_pointers; 711 j->user = strdup(user); 712 if (!j->user) 713 goto clear_pointers; 714 } 715 716 if (j->chrootdir) { /* stale pointer */ 717 char *chrootdir = consumestr(&serialized, &length); 718 if (!chrootdir) 719 goto bad_chrootdir; 720 j->chrootdir = strdup(chrootdir); 721 if (!j->chrootdir) 722 goto bad_chrootdir; 723 } 724 725 if (j->alt_syscall_table) { /* stale pointer */ 726 char *alt_syscall_table = consumestr(&serialized, &length); 727 if (!alt_syscall_table) 728 goto bad_syscall_table; 729 j->alt_syscall_table = strdup(alt_syscall_table); 730 if (!j->alt_syscall_table) 731 goto bad_syscall_table; 732 } 733 734 if (j->flags.seccomp_filter && j->filter_len > 0) { 735 size_t ninstrs = j->filter_len; 736 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 737 ninstrs > USHRT_MAX) 738 goto bad_filters; 739 740 size_t program_len = ninstrs * sizeof(struct sock_filter); 741 void *program = consumebytes(program_len, &serialized, &length); 742 if (!program) 743 goto bad_filters; 744 745 j->filter_prog = malloc(sizeof(struct sock_fprog)); 746 j->filter_prog->len = ninstrs; 747 j->filter_prog->filter = malloc(program_len); 748 memcpy(j->filter_prog->filter, program, program_len); 749 } 750 751 count = j->mounts_count; 752 j->mounts_count = 0; 753 for (i = 0; i < count; ++i) { 754 unsigned long *flags; 755 const char *dest; 756 const char *type; 757 const char *src = consumestr(&serialized, &length); 758 if (!src) 759 goto bad_mounts; 760 dest = consumestr(&serialized, &length); 761 if (!dest) 762 goto bad_mounts; 763 type = consumestr(&serialized, &length); 764 if (!type) 765 goto bad_mounts; 766 flags = consumebytes(sizeof(*flags), &serialized, &length); 767 if (!flags) 768 goto bad_mounts; 769 if (minijail_mount(j, src, dest, type, *flags)) 770 goto 
bad_mounts; 771 } 772 773 return 0; 774 775bad_mounts: 776 if (j->flags.seccomp_filter && j->filter_len > 0) { 777 free(j->filter_prog->filter); 778 free(j->filter_prog); 779 } 780bad_filters: 781 if (j->alt_syscall_table) 782 free(j->alt_syscall_table); 783bad_syscall_table: 784 if (j->chrootdir) 785 free(j->chrootdir); 786bad_chrootdir: 787 if (j->user) 788 free(j->user); 789clear_pointers: 790 j->user = NULL; 791 j->chrootdir = NULL; 792 j->alt_syscall_table = NULL; 793out: 794 return ret; 795} 796 797static void write_ugid_mappings(const struct minijail *j, int *pipe_fds) 798{ 799 int fd, ret, len; 800 size_t sz; 801 char fname[32]; 802 close(pipe_fds[0]); 803 804 sz = sizeof(fname); 805 if (j->uidmap) { 806 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 807 if (ret < 0 || (size_t)ret >= sz) 808 die("failed to write file name of uid_map"); 809 fd = open(fname, O_WRONLY); 810 if (fd < 0) 811 pdie("failed to open '%s'", fname); 812 len = strlen(j->uidmap); 813 if (write(fd, j->uidmap, len) < len) 814 die("failed to set uid_map"); 815 close(fd); 816 } 817 if (j->gidmap) { 818 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 819 if (ret < 0 || (size_t)ret >= sz) 820 die("failed to write file name of gid_map"); 821 fd = open(fname, O_WRONLY); 822 if (fd < 0) 823 pdie("failed to open '%s'", fname); 824 len = strlen(j->gidmap); 825 if (write(fd, j->gidmap, len) < len) 826 die("failed to set gid_map"); 827 close(fd); 828 } 829 830 close(pipe_fds[1]); 831} 832 833static void enter_user_namespace(const struct minijail *j, int *pipe_fds) 834{ 835 char buf; 836 837 close(pipe_fds[1]); 838 839 /* Wait for parent to set up uid/gid mappings. 
*/ 840 if (read(pipe_fds[0], &buf, 1) != 0) 841 die("failed to sync with parent"); 842 close(pipe_fds[0]); 843 844 if (j->uidmap && setresuid(0, 0, 0)) 845 pdie("setresuid"); 846 if (j->gidmap && setresgid(0, 0, 0)) 847 pdie("setresgid"); 848} 849 850/* 851 * mount_one: Applies mounts from @m for @j, recursing as needed. 852 * @j Minijail these mounts are for 853 * @m Head of list of mounts 854 * 855 * Returns 0 for success. 856 */ 857static int mount_one(const struct minijail *j, struct mountpoint *m) 858{ 859 int ret; 860 char *dest; 861 int remount_ro = 0; 862 863 /* dest has a leading "/" */ 864 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 865 return -ENOMEM; 866 867 /* 868 * R/O bind mounts have to be remounted since bind and ro can't both be 869 * specified in the original bind mount. Remount R/O after the initial 870 * mount. 871 */ 872 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 873 remount_ro = 1; 874 m->flags &= ~MS_RDONLY; 875 } 876 877 ret = mount(m->src, dest, m->type, m->flags, NULL); 878 if (ret) 879 pdie("mount: %s -> %s", m->src, dest); 880 881 if (remount_ro) { 882 m->flags |= MS_RDONLY; 883 ret = mount(m->src, dest, NULL, 884 m->flags | MS_REMOUNT, NULL); 885 if (ret) 886 pdie("bind ro: %s -> %s", m->src, dest); 887 } 888 889 free(dest); 890 if (m->next) 891 return mount_one(j, m->next); 892 return ret; 893} 894 895int enter_chroot(const struct minijail *j) 896{ 897 int ret; 898 899 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 900 return ret; 901 902 if (chroot(j->chrootdir)) 903 return -errno; 904 905 if (chdir("/")) 906 return -errno; 907 908 return 0; 909} 910 911int enter_pivot_root(const struct minijail *j) 912{ 913 int ret, oldroot, newroot; 914 915 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 916 return ret; 917 918 /* Keep the fd for both old and new root. It will be used in fchdir later. 
*/ 919 oldroot = open("/", O_DIRECTORY | O_RDONLY); 920 if (oldroot < 0) 921 pdie("failed to open / for fchdir"); 922 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY); 923 if (newroot < 0) 924 pdie("failed to open %s for fchdir", j->chrootdir); 925 926 /* To ensure chrootdir is the root of a file system, do a self bind mount. */ 927 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 928 pdie("failed to bind mount '%s'", j->chrootdir); 929 if (chdir(j->chrootdir)) 930 return -errno; 931 if (syscall(SYS_pivot_root, ".", ".")) 932 pdie("pivot_root"); 933 934 /* 935 * Now the old root is mounted on top of the new root. Use fchdir to 936 * change to the old root and unmount it. 937 */ 938 if (fchdir(oldroot)) 939 pdie("failed to fchdir to old /"); 940 /* The old root might be busy, so use lazy unmount. */ 941 if (umount2(".", MNT_DETACH)) 942 pdie("umount(/)"); 943 /* Change back to the new root. */ 944 if (fchdir(newroot)) 945 return -errno; 946 if (chroot("/")) 947 return -errno; 948 /* Set correct CWD for getcwd(3). */ 949 if (chdir("/")) 950 return -errno; 951 952 return 0; 953} 954 955int mount_tmp(void) 956{ 957 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 958} 959 960int remount_proc_readonly(const struct minijail *j) 961{ 962 const char *kProcPath = "/proc"; 963 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 964 /* 965 * Right now, we're holding a reference to our parent's old mount of 966 * /proc in our namespace, which means using MS_REMOUNT here would 967 * mutate our parent's mount as well, even though we're in a VFS 968 * namespace (!). Instead, remove their mount from our namespace 969 * and make our own. However, if we are in a new user namespace, /proc 970 * is not seen as mounted, so don't return error if umount() fails. 
971 */ 972 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns) 973 return -errno; 974 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 975 return -errno; 976 return 0; 977} 978 979static void write_pid_file(const struct minijail *j) 980{ 981 FILE *fp = fopen(j->pid_file_path, "w"); 982 983 if (!fp) 984 pdie("failed to open '%s'", j->pid_file_path); 985 if (fprintf(fp, "%d\n", (int)j->initpid) < 0) 986 pdie("fprintf(%s)", j->pid_file_path); 987 if (fclose(fp)) 988 pdie("fclose(%s)", j->pid_file_path); 989} 990 991void drop_ugid(const struct minijail *j) 992{ 993 if (j->flags.usergroups && j->flags.suppl_gids) { 994 die("tried to inherit *and* set supplementary groups;" 995 " can only do one"); 996 } 997 998 if (j->flags.usergroups) { 999 if (initgroups(j->user, j->usergid)) 1000 pdie("initgroups"); 1001 } else if (j->flags.suppl_gids) { 1002 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1003 pdie("setgroups"); 1004 } 1005 } else { 1006 /* 1007 * Only attempt to clear supplementary groups if we are changing 1008 * users. 1009 */ 1010 if ((j->uid || j->gid) && setgroups(0, NULL)) 1011 pdie("setgroups"); 1012 } 1013 1014 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1015 pdie("setresgid"); 1016 1017 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1018 pdie("setresuid"); 1019} 1020 1021/* 1022 * We specifically do not use cap_valid() as that only tells us the last 1023 * valid cap we were *compiled* against (i.e. what the version of kernel 1024 * headers says). If we run on a different kernel version, then it's not 1025 * uncommon for that to be less (if an older kernel) or more (if a newer 1026 * kernel). So suck up the answer via /proc. 
1027 */ 1028static unsigned int get_last_valid_cap() 1029{ 1030 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1031 FILE *fp = fopen(cap_file, "re"); 1032 unsigned int last_valid_cap; 1033 1034 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1035 pdie("fscanf(%s)", cap_file); 1036 fclose(fp); 1037 1038 return last_valid_cap; 1039} 1040 1041void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1042{ 1043 cap_t caps = cap_get_proc(); 1044 cap_value_t flag[1]; 1045 const uint64_t one = 1; 1046 unsigned int i; 1047 if (!caps) 1048 die("can't get process caps"); 1049 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1050 die("can't clear inheritable caps"); 1051 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1052 die("can't clear effective caps"); 1053 if (cap_clear_flag(caps, CAP_PERMITTED)) 1054 die("can't clear permitted caps"); 1055 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1056 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1057 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1058 continue; 1059 flag[0] = i; 1060 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1061 die("can't add effective cap"); 1062 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1063 die("can't add permitted cap"); 1064 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1065 die("can't add inheritable cap"); 1066 } 1067 if (cap_set_proc(caps)) 1068 die("can't apply initial cleaned capset"); 1069 1070 /* 1071 * Instead of dropping bounding set first, do it here in case 1072 * the caller had a more permissive bounding set which could 1073 * have been used above to raise a capability that wasn't already 1074 * present. This requires CAP_SETPCAP, so we raised/kept it above. 
1075 */ 1076 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1077 if (j->caps & (one << i)) 1078 continue; 1079 if (prctl(PR_CAPBSET_DROP, i)) 1080 pdie("prctl(PR_CAPBSET_DROP)"); 1081 } 1082 1083 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1084 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1085 flag[0] = CAP_SETPCAP; 1086 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1087 die("can't clear effective cap"); 1088 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1089 die("can't clear permitted cap"); 1090 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1091 die("can't clear inheritable cap"); 1092 } 1093 1094 if (cap_set_proc(caps)) 1095 die("can't apply final cleaned capset"); 1096 1097 cap_free(caps); 1098} 1099

/*
 * set_seccomp_filter: applies the seccomp-BPF related settings of |j| to the
 * calling process, in this order: no_new_privs, the optional SIGSYS logging
 * handler, then the filter program itself.
 *
 * Failures are reported through pdie(). The one exception is ENOSYS from the
 * filter installation when SECCOMP_SOFTFAIL is enabled, which only warns.
 */
void set_seccomp_filter(const struct minijail *j)
{
	/*
	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
	 * in the kernel source tree for an explanation of the parameters.
	 */
	if (j->flags.no_new_privs &&
	    prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		pdie("prctl(PR_SET_NO_NEW_PRIVS)");

	/* Everything below only applies when a filter was requested. */
	if (!j->flags.seccomp_filter)
		return;

	/*
	 * If we're logging seccomp filter failures,
	 * install the SIGSYS handler first.
	 */
	if (j->flags.log_seccomp_filter) {
		if (install_sigsys_handler())
			pdie("install SIGSYS handler");
		warn("logging seccomp filter failures");
	}

	/* Install the syscall filter. */
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog) == 0)
		return;

	if (errno == ENOSYS && SECCOMP_SOFTFAIL) {
		warn("seccomp not supported");
		return;
	}
	pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}

/*
 * minijail_enter: applies the restrictions described by |j| to the calling
 * process: namespaces, chroot/pivot_root, mounts, uid/gid and capability
 * dropping, seccomp filter, alternate syscall table and finally strict
 * seccomp.
 *
 * The order of the steps below is significant; do not reorder them.
 */
void API minijail_enter(const struct minijail *j)
{
	/*
	 * Get the last valid cap from /proc, since /proc can be unmounted
	 * before drop_caps().
	 */
	unsigned int max_cap = get_last_valid_cap();

	if (j->flags.pids)
		die("tried to enter a pid-namespaced jail; try minijail_run()?");

	if (j->flags.usergroups && !j->user)
		die("usergroup inheritance without username");

	/*
	 * We can't recover from failures if we've dropped privileges partially,
	 * so we don't even try. If any of our operations fail, we abort() the
	 * entire process.
	 */
	if (j->flags.enter_vfs) {
		if (setns(j->mountns_fd, CLONE_NEWNS))
			pdie("setns(CLONE_NEWNS)");
	}

	if (j->flags.vfs) {
		if (unshare(CLONE_NEWNS))
			pdie("unshare(vfs)");
		/*
		 * Remount all filesystems as private. If they are shared
		 * new bind mounts will creep out of our namespace.
		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
		 */
		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
			pdie("mount(/, private)");
	}

	if (j->flags.ipc) {
		if (unshare(CLONE_NEWIPC))
			pdie("unshare(ipc)");
	}

	/* Joining an existing net namespace takes priority over a new one. */
	if (j->flags.enter_net) {
		if (setns(j->netns_fd, CLONE_NEWNET))
			pdie("setns(CLONE_NEWNET)");
	} else if (j->flags.net) {
		if (unshare(CLONE_NEWNET))
			pdie("unshare(net)");
	}

	if (j->flags.chroot) {
		if (enter_chroot(j))
			pdie("chroot");
	}

	if (j->flags.pivot_root) {
		if (enter_pivot_root(j))
			pdie("pivot_root");
	}

	if (j->flags.mount_tmp) {
		if (mount_tmp())
			pdie("mount_tmp");
	}

	if (j->flags.remount_proc_ro) {
		if (remount_proc_readonly(j))
			pdie("remount");
	}

	if (j->flags.caps) {
		/*
		 * POSIX capabilities are a bit tricky. If we drop our
		 * capability to change uids, our attempt to use setuid()
		 * below will fail. Hang on to root caps across setuid(), then
		 * lock securebits.
		 */
		if (prctl(PR_SET_KEEPCAPS, 1))
			pdie("prctl(PR_SET_KEEPCAPS)");
		if (prctl(PR_SET_SECUREBITS,
			  SECURE_ALL_BITS | SECURE_ALL_LOCKS))
			pdie("prctl(PR_SET_SECUREBITS)");
	}

	/*
	 * If we're not setting no_new_privs, we need to set the seccomp
	 * filter *before* dropping privileges.
	 * WARNING: this means that filter policies *must* allow
	 * setgroups()/setresgid()/setresuid() for dropping root and
	 * capget()/capset()/prctl() for dropping caps.
	 */
	if (!j->flags.no_new_privs)
		set_seccomp_filter(j);

	drop_ugid(j);
	if (j->flags.caps)
		drop_caps(j, max_cap);

	/*
	 * If we're setting no_new_privs, we can drop privileges
	 * before setting the seccomp filter. This way filter policies
	 * don't need to allow privilege-dropping syscalls.
	 */
	if (j->flags.no_new_privs)
		set_seccomp_filter(j);

	/*
	 * Select the specified alternate syscall table. The table must not
	 * block prctl(2) if we're using seccomp as well.
	 */
	if (j->flags.alt_syscall) {
		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
			pdie("prctl(PR_ALT_SYSCALL)");
	}

	/*
	 * seccomp has to come last since it cuts off all the other
	 * privilege-dropping syscalls :)
	 */
	if (j->flags.seccomp) {
		if (prctl(PR_SET_SECCOMP, 1)) {
			if (errno == ENOSYS && SECCOMP_SOFTFAIL) {
				warn("seccomp not supported");
				return;
			}
			pdie("prctl(PR_SET_SECCOMP)");
		}
	}
}

/* TODO(wad) will visibility affect this variable?
*/ 1256static int init_exitstatus = 0; 1257 1258void init_term(int __attribute__ ((unused)) sig) 1259{ 1260 _exit(init_exitstatus); 1261} 1262 1263int init(pid_t rootpid) 1264{ 1265 pid_t pid; 1266 int status; 1267 /* so that we exit with the right status */ 1268 signal(SIGTERM, init_term); 1269 /* TODO(wad) self jail with seccomp_filters here. */ 1270 while ((pid = wait(&status)) > 0) { 1271 /* 1272 * This loop will only end when either there are no processes 1273 * left inside our pid namespace or we get a signal. 1274 */ 1275 if (pid == rootpid) 1276 init_exitstatus = status; 1277 } 1278 if (!WIFEXITED(init_exitstatus)) 1279 _exit(MINIJAIL_ERR_INIT); 1280 _exit(WEXITSTATUS(init_exitstatus)); 1281} 1282 1283int API minijail_from_fd(int fd, struct minijail *j) 1284{ 1285 size_t sz = 0; 1286 size_t bytes = read(fd, &sz, sizeof(sz)); 1287 char *buf; 1288 int r; 1289 if (sizeof(sz) != bytes) 1290 return -EINVAL; 1291 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1292 return -E2BIG; 1293 buf = malloc(sz); 1294 if (!buf) 1295 return -ENOMEM; 1296 bytes = read(fd, buf, sz); 1297 if (bytes != sz) { 1298 free(buf); 1299 return -EINVAL; 1300 } 1301 r = minijail_unmarshal(j, buf, sz); 1302 free(buf); 1303 return r; 1304} 1305 1306int API minijail_to_fd(struct minijail *j, int fd) 1307{ 1308 char *buf; 1309 size_t sz = minijail_size(j); 1310 ssize_t written; 1311 int r; 1312 1313 if (!sz) 1314 return -EINVAL; 1315 buf = malloc(sz); 1316 r = minijail_marshal(j, buf, sz); 1317 if (r) { 1318 free(buf); 1319 return r; 1320 } 1321 /* Sends [size][minijail]. */ 1322 written = write(fd, &sz, sizeof(sz)); 1323 if (written != sizeof(sz)) { 1324 free(buf); 1325 return -EFAULT; 1326 } 1327 written = write(fd, buf, sz); 1328 if (written < 0 || (size_t) written != sz) { 1329 free(buf); 1330 return -EFAULT; 1331 } 1332 free(buf); 1333 return 0; 1334} 1335 1336int setup_preload(void) 1337{ 1338#if defined(__ANDROID__) 1339 /* Don't use LDPRELOAD on Brillo. 
*/ 1340 return 0; 1341#else 1342 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1343 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1344 if (!newenv) 1345 return -ENOMEM; 1346 1347 /* Only insert a separating space if we have something to separate... */ 1348 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1349 PRELOADPATH); 1350 1351 /* setenv() makes a copy of the string we give it. */ 1352 setenv(kLdPreloadEnvVar, newenv, 1); 1353 free(newenv); 1354 return 0; 1355#endif 1356} 1357 1358int setup_pipe(int fds[2]) 1359{ 1360 int r = pipe(fds); 1361 char fd_buf[11]; 1362 if (r) 1363 return r; 1364 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1365 if (r <= 0) 1366 return -EINVAL; 1367 setenv(kFdEnvVar, fd_buf, 1); 1368 return 0; 1369} 1370 1371int setup_pipe_end(int fds[2], size_t index) 1372{ 1373 if (index > 1) 1374 return -1; 1375 1376 close(fds[1 - index]); 1377 return fds[index]; 1378} 1379 1380int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1381{ 1382 if (index > 1) 1383 return -1; 1384 1385 close(fds[1 - index]); 1386 /* dup2(2) the corresponding end of the pipe into |fd|. 
*/ 1387 return dup2(fds[index], fd); 1388} 1389 1390int minijail_run_internal(struct minijail *j, const char *filename, 1391 char *const argv[], pid_t *pchild_pid, 1392 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1393 int use_preload); 1394 1395int API minijail_run(struct minijail *j, const char *filename, 1396 char *const argv[]) 1397{ 1398 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1399 true); 1400} 1401 1402int API minijail_run_pid(struct minijail *j, const char *filename, 1403 char *const argv[], pid_t *pchild_pid) 1404{ 1405 return minijail_run_internal(j, filename, argv, pchild_pid, 1406 NULL, NULL, NULL, true); 1407} 1408 1409int API minijail_run_pipe(struct minijail *j, const char *filename, 1410 char *const argv[], int *pstdin_fd) 1411{ 1412 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1413 NULL, NULL, true); 1414} 1415 1416int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1417 char *const argv[], pid_t *pchild_pid, 1418 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1419{ 1420 return minijail_run_internal(j, filename, argv, pchild_pid, 1421 pstdin_fd, pstdout_fd, pstderr_fd, true); 1422} 1423 1424int API minijail_run_no_preload(struct minijail *j, const char *filename, 1425 char *const argv[]) 1426{ 1427 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1428 false); 1429} 1430 1431int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1432 const char *filename, char *const argv[], 1433 pid_t *pchild_pid, 1434 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) { 1435 return minijail_run_internal(j, filename, argv, pchild_pid, 1436 pstdin_fd, pstdout_fd, pstderr_fd, false); 1437} 1438 1439int minijail_run_internal(struct minijail *j, const char *filename, 1440 char *const argv[], pid_t *pchild_pid, 1441 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1442 int use_preload) 1443{ 1444 char *oldenv, *oldenv_copy = NULL; 1445 pid_t child_pid; 1446 int 
pipe_fds[2]; 1447 int stdin_fds[2]; 1448 int stdout_fds[2]; 1449 int stderr_fds[2]; 1450 int userns_pipe_fds[2]; 1451 int ret; 1452 /* We need to remember this across the minijail_preexec() call. */ 1453 int pid_namespace = j->flags.pids; 1454 int do_init = j->flags.do_init; 1455 1456 if (use_preload) { 1457 oldenv = getenv(kLdPreloadEnvVar); 1458 if (oldenv) { 1459 oldenv_copy = strdup(oldenv); 1460 if (!oldenv_copy) 1461 return -ENOMEM; 1462 } 1463 1464 if (setup_preload()) 1465 return -EFAULT; 1466 } 1467 1468 if (!use_preload) { 1469 if (j->flags.caps) 1470 die("Capabilities are not supported without " 1471 "LD_PRELOAD"); 1472 } 1473 1474 /* 1475 * Make the process group ID of this process equal to its PID, so that 1476 * both the Minijail process and the jailed process can be killed 1477 * together. 1478 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1479 * the process is already a process group leader. 1480 */ 1481 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1482 if (errno != EPERM) { 1483 pdie("setpgid(0, 0)"); 1484 } 1485 } 1486 1487 if (use_preload) { 1488 /* 1489 * Before we fork(2) and execve(2) the child process, we need 1490 * to open a pipe(2) to send the minijail configuration over. 1491 */ 1492 if (setup_pipe(pipe_fds)) 1493 return -EFAULT; 1494 } 1495 1496 /* 1497 * If we want to write to the child process' standard input, 1498 * create the pipe(2) now. 1499 */ 1500 if (pstdin_fd) { 1501 if (pipe(stdin_fds)) 1502 return -EFAULT; 1503 } 1504 1505 /* 1506 * If we want to read from the child process' standard output, 1507 * create the pipe(2) now. 1508 */ 1509 if (pstdout_fd) { 1510 if (pipe(stdout_fds)) 1511 return -EFAULT; 1512 } 1513 1514 /* 1515 * If we want to read from the child process' standard error, 1516 * create the pipe(2) now. 
1517 */ 1518 if (pstderr_fd) { 1519 if (pipe(stderr_fds)) 1520 return -EFAULT; 1521 } 1522 1523 /* 1524 * If we want to set up a new uid/gid mapping in the user namespace, 1525 * create the pipe(2) to sync between parent and child. 1526 */ 1527 if (j->flags.userns) { 1528 if (pipe(userns_pipe_fds)) 1529 return -EFAULT; 1530 } 1531 1532 /* 1533 * Use sys_clone() if and only if we're creating a pid namespace. 1534 * 1535 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1536 * 1537 * In multithreaded programs, there are a bunch of locks inside libc, 1538 * some of which may be held by other threads at the time that we call 1539 * minijail_run_pid(). If we call fork(), glibc does its level best to 1540 * ensure that we hold all of these locks before it calls clone() 1541 * internally and drop them after clone() returns, but when we call 1542 * sys_clone(2) directly, all that gets bypassed and we end up with a 1543 * child address space where some of libc's important locks are held by 1544 * other threads (which did not get cloned, and hence will never release 1545 * those locks). This is okay so long as we call exec() immediately 1546 * after, but a bunch of seemingly-innocent libc functions like setenv() 1547 * take locks. 1548 * 1549 * Hence, only call sys_clone() if we need to, in order to get at pid 1550 * namespacing. If we follow this path, the child's address space might 1551 * have broken locks; you may only call functions that do not acquire 1552 * any locks. 1553 * 1554 * Unfortunately, fork() acquires every lock it can get its hands on, as 1555 * previously detailed, so this function is highly likely to deadlock 1556 * later on (see "deadlock here") if we're multithreaded. 
1557 * 1558 * We might hack around this by having the clone()d child (init of the 1559 * pid namespace) return directly, rather than leaving the clone()d 1560 * process hanging around to be init for the new namespace (and having 1561 * its fork()ed child return in turn), but that process would be crippled 1562 * with its libc locks potentially broken. We might try fork()ing in the 1563 * parent before we clone() to ensure that we own all the locks, but 1564 * then we have to have the forked child hanging around consuming 1565 * resources (and possibly having file descriptors / shared memory 1566 * regions / etc attached). We'd need to keep the child around to avoid 1567 * having its children get reparented to init. 1568 * 1569 * TODO(ellyjones): figure out if the "forked child hanging around" 1570 * problem is fixable or not. It would be nice if we worked in this 1571 * case. 1572 */ 1573 if (pid_namespace) { 1574 int clone_flags = CLONE_NEWPID | SIGCHLD; 1575 if (j->flags.userns) 1576 clone_flags |= CLONE_NEWUSER; 1577 child_pid = syscall(SYS_clone, clone_flags, NULL); 1578 } else { 1579 child_pid = fork(); 1580 } 1581 1582 if (child_pid < 0) { 1583 if (use_preload) { 1584 free(oldenv_copy); 1585 } 1586 die("failed to fork child"); 1587 } 1588 1589 if (child_pid) { 1590 if (use_preload) { 1591 /* Restore parent's LD_PRELOAD. */ 1592 if (oldenv_copy) { 1593 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1594 free(oldenv_copy); 1595 } else { 1596 unsetenv(kLdPreloadEnvVar); 1597 } 1598 unsetenv(kFdEnvVar); 1599 } 1600 1601 j->initpid = child_pid; 1602 1603 if (j->flags.pid_file) 1604 write_pid_file(j); 1605 1606 if (j->flags.userns) 1607 write_ugid_mappings(j, userns_pipe_fds); 1608 1609 if (use_preload) { 1610 /* Send marshalled minijail. 
*/ 1611 close(pipe_fds[0]); /* read endpoint */ 1612 ret = minijail_to_fd(j, pipe_fds[1]); 1613 close(pipe_fds[1]); /* write endpoint */ 1614 if (ret) { 1615 kill(j->initpid, SIGKILL); 1616 die("failed to send marshalled minijail"); 1617 } 1618 } 1619 1620 if (pchild_pid) 1621 *pchild_pid = child_pid; 1622 1623 /* 1624 * If we want to write to the child process' standard input, 1625 * set up the write end of the pipe. 1626 */ 1627 if (pstdin_fd) 1628 *pstdin_fd = setup_pipe_end(stdin_fds, 1629 1 /* write end */); 1630 1631 /* 1632 * If we want to read from the child process' standard output, 1633 * set up the read end of the pipe. 1634 */ 1635 if (pstdout_fd) 1636 *pstdout_fd = setup_pipe_end(stdout_fds, 1637 0 /* read end */); 1638 1639 /* 1640 * If we want to read from the child process' standard error, 1641 * set up the read end of the pipe. 1642 */ 1643 if (pstderr_fd) 1644 *pstderr_fd = setup_pipe_end(stderr_fds, 1645 0 /* read end */); 1646 1647 return 0; 1648 } 1649 free(oldenv_copy); 1650 1651 if (j->flags.userns) 1652 enter_user_namespace(j, userns_pipe_fds); 1653 1654 /* 1655 * If we want to write to the jailed process' standard input, 1656 * set up the read end of the pipe. 1657 */ 1658 if (pstdin_fd) { 1659 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1660 STDIN_FILENO) < 0) 1661 die("failed to set up stdin pipe"); 1662 } 1663 1664 /* 1665 * If we want to read from the jailed process' standard output, 1666 * set up the write end of the pipe. 1667 */ 1668 if (pstdout_fd) { 1669 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1670 STDOUT_FILENO) < 0) 1671 die("failed to set up stdout pipe"); 1672 } 1673 1674 /* 1675 * If we want to read from the jailed process' standard error, 1676 * set up the write end of the pipe. 
1677 */ 1678 if (pstderr_fd) { 1679 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1680 STDERR_FILENO) < 0) 1681 die("failed to set up stderr pipe"); 1682 } 1683 1684 /* If running an init program, let it decide when/how to mount /proc. */ 1685 if (pid_namespace && !do_init) 1686 j->flags.remount_proc_ro = 0; 1687 1688 if (use_preload) { 1689 /* Strip out flags that cannot be inherited across execve(2). */ 1690 minijail_preexec(j); 1691 } else { 1692 j->flags.pids = 0; 1693 } 1694 /* Jail this process, then execve() the target. */ 1695 minijail_enter(j); 1696 1697 if (pid_namespace && do_init) { 1698 /* 1699 * pid namespace: this process will become init inside the new 1700 * namespace. We don't want all programs we might exec to have 1701 * to know how to be init. Normally (do_init == 1) we fork off 1702 * a child to actually run the program. If |do_init == 0|, we 1703 * let the program keep pid 1 and be init. 1704 * 1705 * If we're multithreaded, we'll probably deadlock here. See 1706 * WARNING above. 
1707 */ 1708 child_pid = fork(); 1709 if (child_pid < 0) 1710 _exit(child_pid); 1711 else if (child_pid > 0) 1712 init(child_pid); /* never returns */ 1713 } 1714 1715 /* 1716 * If we aren't pid-namespaced, or the jailed program asked to be init: 1717 * calling process 1718 * -> execve()-ing process 1719 * If we are: 1720 * calling process 1721 * -> init()-ing process 1722 * -> execve()-ing process 1723 */ 1724 _exit(execve(filename, argv, environ)); 1725} 1726 1727int API minijail_kill(struct minijail *j) 1728{ 1729 int st; 1730 if (kill(j->initpid, SIGTERM)) 1731 return -errno; 1732 if (waitpid(j->initpid, &st, 0) < 0) 1733 return -errno; 1734 return st; 1735} 1736 1737int API minijail_wait(struct minijail *j) 1738{ 1739 int st; 1740 if (waitpid(j->initpid, &st, 0) < 0) 1741 return -errno; 1742 1743 if (!WIFEXITED(st)) { 1744 int error_status = st; 1745 if (WIFSIGNALED(st)) { 1746 int signum = WTERMSIG(st); 1747 warn("child process %d received signal %d", 1748 j->initpid, signum); 1749 /* 1750 * We return MINIJAIL_ERR_JAIL if the process received 1751 * SIGSYS, which happens when a syscall is blocked by 1752 * seccomp filters. 1753 * If not, we do what bash(1) does: 1754 * $? 
= 128 + signum 1755 */ 1756 if (signum == SIGSYS) { 1757 error_status = MINIJAIL_ERR_JAIL; 1758 } else { 1759 error_status = 128 + signum; 1760 } 1761 } 1762 return error_status; 1763 } 1764 1765 int exit_status = WEXITSTATUS(st); 1766 if (exit_status != 0) 1767 info("child process %d exited with status %d", 1768 j->initpid, exit_status); 1769 1770 return exit_status; 1771} 1772 1773void API minijail_destroy(struct minijail *j) 1774{ 1775 if (j->flags.seccomp_filter && j->filter_prog) { 1776 free(j->filter_prog->filter); 1777 free(j->filter_prog); 1778 } 1779 while (j->mounts_head) { 1780 struct mountpoint *m = j->mounts_head; 1781 j->mounts_head = j->mounts_head->next; 1782 free(m->type); 1783 free(m->dest); 1784 free(m->src); 1785 free(m); 1786 } 1787 j->mounts_tail = NULL; 1788 if (j->user) 1789 free(j->user); 1790 if (j->suppl_gid_list) 1791 free(j->suppl_gid_list); 1792 if (j->chrootdir) 1793 free(j->chrootdir); 1794 if (j->alt_syscall_table) 1795 free(j->alt_syscall_table); 1796 free(j); 1797} 1798