libminijail.c revision 4b276a6c643cee568b9b623b1ce00fd41db9e8b9
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _GNU_SOURCE 8 9#include <asm/unistd.h> 10#include <ctype.h> 11#include <errno.h> 12#include <fcntl.h> 13#include <grp.h> 14#include <inttypes.h> 15#include <limits.h> 16#include <linux/capability.h> 17#include <pwd.h> 18#include <sched.h> 19#include <signal.h> 20#include <stdarg.h> 21#include <stdbool.h> 22#include <stddef.h> 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include <syscall.h> 27#include <sys/capability.h> 28#include <sys/mount.h> 29#include <sys/param.h> 30#include <sys/prctl.h> 31#include <sys/stat.h> 32#include <sys/types.h> 33#include <sys/user.h> 34#include <sys/wait.h> 35#include <unistd.h> 36 37#include "libminijail.h" 38#include "libminijail-private.h" 39 40#include "signal_handler.h" 41#include "syscall_filter.h" 42#include "util.h" 43 44#ifdef HAVE_SECUREBITS_H 45#include <linux/securebits.h> 46#else 47#define SECURE_ALL_BITS 0x15 48#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 49#endif 50 51/* Until these are reliably available in linux/prctl.h */ 52#ifndef PR_SET_SECCOMP 53# define PR_SET_SECCOMP 22 54#endif 55 56#ifndef PR_ALT_SYSCALL 57# define PR_ALT_SYSCALL 0x43724f53 58#endif 59 60/* For seccomp_filter using BPF. */ 61#ifndef PR_SET_NO_NEW_PRIVS 62# define PR_SET_NO_NEW_PRIVS 38 63#endif 64#ifndef SECCOMP_MODE_FILTER 65# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 66#endif 67 68#ifdef USE_SECCOMP_SOFTFAIL 69# define SECCOMP_SOFTFAIL 1 70#else 71# define SECCOMP_SOFTFAIL 0 72#endif 73 74struct mountpoint { 75 char *src; 76 char *dest; 77 char *type; 78 unsigned long flags; 79 struct mountpoint *next; 80}; 81 82struct minijail { 83 /* 84 * WARNING: if you add a flag here you need to make sure it's 85 * accounted for in minijail_pre{enter|exec}() below. 86 */ 87 struct { 88 int uid:1; 89 int gid:1; 90 int usergroups:1; 91 int suppl_gids:1; 92 int caps:1; 93 int vfs:1; 94 int enter_vfs:1; 95 int pids:1; 96 int ipc:1; 97 int net:1; 98 int enter_net:1; 99 int userns:1; 100 int seccomp:1; 101 int remount_proc_ro:1; 102 int no_new_privs:1; 103 int seccomp_filter:1; 104 int log_seccomp_filter:1; 105 int chroot:1; 106 int pivot_root:1; 107 int mount_tmp:1; 108 int do_init:1; 109 int pid_file:1; 110 int alt_syscall:1; 111 int reset_signal_mask:1; 112 } flags; 113 uid_t uid; 114 gid_t gid; 115 gid_t usergid; 116 char *user; 117 size_t suppl_gid_count; 118 gid_t *suppl_gid_list; 119 uint64_t caps; 120 pid_t initpid; 121 int mountns_fd; 122 int netns_fd; 123 char *chrootdir; 124 char *pid_file_path; 125 char *uidmap; 126 char *gidmap; 127 size_t filter_len; 128 struct sock_fprog *filter_prog; 129 char *alt_syscall_table; 130 struct mountpoint *mounts_head; 131 struct mountpoint *mounts_tail; 132 size_t mounts_count; 133}; 134 135/* 136 * Strip out flags meant for the parent. 137 * We keep things that are not inherited across execve(2) (e.g. capabilities), 138 * or are easier to set after execve(2) (e.g. seccomp filters). 139 */ 140void minijail_preenter(struct minijail *j) 141{ 142 j->flags.vfs = 0; 143 j->flags.enter_vfs = 0; 144 j->flags.remount_proc_ro = 0; 145 j->flags.pids = 0; 146 j->flags.do_init = 0; 147 j->flags.pid_file = 0; 148} 149 150/* 151 * Strip out flags meant for the child. 152 * We keep things that are inherited across execve(2). 153 */ 154void minijail_preexec(struct minijail *j) 155{ 156 int vfs = j->flags.vfs; 157 int enter_vfs = j->flags.enter_vfs; 158 int remount_proc_ro = j->flags.remount_proc_ro; 159 int userns = j->flags.userns; 160 if (j->user) 161 free(j->user); 162 j->user = NULL; 163 if (j->suppl_gid_list) 164 free(j->suppl_gid_list); 165 j->suppl_gid_list = NULL; 166 memset(&j->flags, 0, sizeof(j->flags)); 167 /* Now restore anything we meant to keep. */ 168 j->flags.vfs = vfs; 169 j->flags.enter_vfs = enter_vfs; 170 j->flags.remount_proc_ro = remount_proc_ro; 171 j->flags.userns = userns; 172 /* Note, |pids| will already have been used before this call. */ 173} 174 175/* Minijail API. */ 176 177struct minijail API *minijail_new(void) 178{ 179 return calloc(1, sizeof(struct minijail)); 180} 181 182void API minijail_change_uid(struct minijail *j, uid_t uid) 183{ 184 if (uid == 0) 185 die("useless change to uid 0"); 186 j->uid = uid; 187 j->flags.uid = 1; 188} 189 190void API minijail_change_gid(struct minijail *j, gid_t gid) 191{ 192 if (gid == 0) 193 die("useless change to gid 0"); 194 j->gid = gid; 195 j->flags.gid = 1; 196} 197 198int API minijail_set_supplementary_gids(struct minijail *j, size_t size, 199 const gid_t *list) 200{ 201 size_t i; 202 203 if (j->flags.usergroups) 204 die("cannot inherit *and* set supplementary groups"); 205 206 if (size == 0) 207 return -EINVAL; 208 209 /* Copy the gid_t array. */ 210 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 211 if (!j->suppl_gid_list) { 212 return -ENOMEM; 213 } 214 for (i = 0; i < size; i++) { 215 j->suppl_gid_list[i] = list[i]; 216 } 217 j->suppl_gid_count = size; 218 j->flags.suppl_gids = 1; 219 return 0; 220} 221 222int API minijail_change_user(struct minijail *j, const char *user) 223{ 224 char *buf = NULL; 225 struct passwd pw; 226 struct passwd *ppw = NULL; 227 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 228 if (sz == -1) 229 sz = 65536; /* your guess is as good as mine... */ 230 231 /* 232 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 233 * the maximum needed size of the buffer, so we don't have to search. 234 */ 235 buf = malloc(sz); 236 if (!buf) 237 return -ENOMEM; 238 getpwnam_r(user, &pw, buf, sz, &ppw); 239 /* 240 * We're safe to free the buffer here. The strings inside pw point 241 * inside buf, but we don't use any of them; this leaves the pointers 242 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded. 243 */ 244 free(buf); 245 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 246 if (!ppw) 247 return -1; 248 minijail_change_uid(j, ppw->pw_uid); 249 j->user = strdup(user); 250 if (!j->user) 251 return -ENOMEM; 252 j->usergid = ppw->pw_gid; 253 return 0; 254} 255 256int API minijail_change_group(struct minijail *j, const char *group) 257{ 258 char *buf = NULL; 259 struct group gr; 260 struct group *pgr = NULL; 261 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 262 if (sz == -1) 263 sz = 65536; /* and mine is as good as yours, really */ 264 265 /* 266 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 267 * the maximum needed size of the buffer, so we don't have to search. 268 */ 269 buf = malloc(sz); 270 if (!buf) 271 return -ENOMEM; 272 getgrnam_r(group, &gr, buf, sz, &pgr); 273 /* 274 * We're safe to free the buffer here. The strings inside gr point 275 * inside buf, but we don't use any of them; this leaves the pointers 276 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 277 */ 278 free(buf); 279 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 280 if (!pgr) 281 return -1; 282 minijail_change_gid(j, pgr->gr_gid); 283 return 0; 284} 285 286void API minijail_use_seccomp(struct minijail *j) 287{ 288 j->flags.seccomp = 1; 289} 290 291void API minijail_no_new_privs(struct minijail *j) 292{ 293 j->flags.no_new_privs = 1; 294} 295 296void API minijail_use_seccomp_filter(struct minijail *j) 297{ 298 j->flags.seccomp_filter = 1; 299} 300 301void API minijail_log_seccomp_filter_failures(struct minijail *j) 302{ 303 j->flags.log_seccomp_filter = 1; 304} 305 306void API minijail_use_caps(struct minijail *j, uint64_t capmask) 307{ 308 j->caps = capmask; 309 j->flags.caps = 1; 310} 311 312void API minijail_reset_signal_mask(struct minijail* j) { 313 j->flags.reset_signal_mask = 1; 314} 315 316void API minijail_namespace_vfs(struct minijail *j) 317{ 318 j->flags.vfs = 1; 319} 320 321void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 322{ 323 int ns_fd = open(ns_path, O_RDONLY); 324 if (ns_fd < 0) { 325 pdie("failed to open namespace '%s'", ns_path); 326 } 327 j->mountns_fd = ns_fd; 328 j->flags.enter_vfs = 1; 329} 330 331void API minijail_namespace_pids(struct minijail *j) 332{ 333 j->flags.vfs = 1; 334 j->flags.remount_proc_ro = 1; 335 j->flags.pids = 1; 336 j->flags.do_init = 1; 337} 338 339void API minijail_namespace_ipc(struct minijail *j) 340{ 341 j->flags.ipc = 1; 342} 343 344void API minijail_namespace_net(struct minijail *j) 345{ 346 j->flags.net = 1; 347} 348 349void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 350{ 351 int ns_fd = open(ns_path, O_RDONLY); 352 if (ns_fd < 0) { 353 pdie("failed to open namespace '%s'", ns_path); 354 } 355 j->netns_fd = ns_fd; 356 j->flags.enter_net = 1; 357} 358 359void API minijail_remount_proc_readonly(struct minijail *j) 360{ 361 j->flags.vfs = 1; 362 j->flags.remount_proc_ro = 1; 363} 364 365void API minijail_namespace_user(struct minijail *j) 366{ 367 j->flags.userns = 1; 368} 369 370int API minijail_uidmap(struct minijail *j, const char *uidmap) 371{ 372 j->uidmap = strdup(uidmap); 373 if (!j->uidmap) 374 return -ENOMEM; 375 char *ch; 376 for (ch = j->uidmap; *ch; ch++) { 377 if (*ch == ',') 378 *ch = '\n'; 379 } 380 return 0; 381} 382 383int API minijail_gidmap(struct minijail *j, const char *gidmap) 384{ 385 j->gidmap = strdup(gidmap); 386 if (!j->gidmap) 387 return -ENOMEM; 388 char *ch; 389 for (ch = j->gidmap; *ch; ch++) { 390 if (*ch == ',') 391 *ch = '\n'; 392 } 393 return 0; 394} 395 396void API minijail_inherit_usergroups(struct minijail *j) 397{ 398 j->flags.usergroups = 1; 399} 400 401void API minijail_run_as_init(struct minijail *j) 402{ 403 /* 404 * Since the jailed program will become 'init' in the new PID namespace, 405 * Minijail does not need to fork an 'init' process. 406 */ 407 j->flags.do_init = 0; 408} 409 410int API minijail_enter_chroot(struct minijail *j, const char *dir) 411{ 412 if (j->chrootdir) 413 return -EINVAL; 414 j->chrootdir = strdup(dir); 415 if (!j->chrootdir) 416 return -ENOMEM; 417 j->flags.chroot = 1; 418 return 0; 419} 420 421int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 422{ 423 if (j->chrootdir) 424 return -EINVAL; 425 j->chrootdir = strdup(dir); 426 if (!j->chrootdir) 427 return -ENOMEM; 428 j->flags.pivot_root = 1; 429 return 0; 430} 431 432static char *append_external_path(const char *external_path, 433 const char *path_inside_chroot) 434{ 435 char *path; 436 size_t pathlen; 437 438 /* One extra char for '/' and one for '\0', hence + 2. */ 439 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 440 path = malloc(pathlen); 441 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 442 443 return path; 444} 445 446char API *minijail_get_original_path(struct minijail *j, 447 const char *path_inside_chroot) 448{ 449 struct mountpoint *b; 450 451 b = j->mounts_head; 452 while (b) { 453 /* 454 * If |path_inside_chroot| is the exact destination of a 455 * mount, then the original path is exactly the source of 456 * the mount. 457 * for example: "-b /some/path/exe,/chroot/path/exe" 458 * mount source = /some/path/exe, mount dest = 459 * /chroot/path/exe Then when getting the original path of 460 * "/chroot/path/exe", the source of that mount, 461 * "/some/path/exe" is what should be returned. 462 */ 463 if (!strcmp(b->dest, path_inside_chroot)) 464 return strdup(b->src); 465 466 /* 467 * If |path_inside_chroot| is within the destination path of a 468 * mount, take the suffix of the chroot path relative to the 469 * mount destination path, and append it to the mount source 470 * path. 471 */ 472 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 473 const char *relative_path = 474 path_inside_chroot + strlen(b->dest); 475 return append_external_path(b->src, relative_path); 476 } 477 b = b->next; 478 } 479 480 /* If there is a chroot path, append |path_inside_chroot| to that. */ 481 if (j->chrootdir) 482 return append_external_path(j->chrootdir, path_inside_chroot); 483 484 /* No chroot, so the path outside is the same as it is inside. */ 485 return strdup(path_inside_chroot); 486} 487 488void API minijail_mount_tmp(struct minijail *j) 489{ 490 j->flags.mount_tmp = 1; 491} 492 493int API minijail_write_pid_file(struct minijail *j, const char *path) 494{ 495 j->pid_file_path = strdup(path); 496 if (!j->pid_file_path) 497 return -ENOMEM; 498 j->flags.pid_file = 1; 499 return 0; 500} 501 502int API minijail_mount(struct minijail *j, const char *src, const char *dest, 503 const char *type, unsigned long flags) 504{ 505 struct mountpoint *m; 506 507 if (*dest != '/') 508 return -EINVAL; 509 m = calloc(1, sizeof(*m)); 510 if (!m) 511 return -ENOMEM; 512 m->dest = strdup(dest); 513 if (!m->dest) 514 goto error; 515 m->src = strdup(src); 516 if (!m->src) 517 goto error; 518 m->type = strdup(type); 519 if (!m->type) 520 goto error; 521 m->flags = flags; 522 523 info("mount %s -> %s type %s", src, dest, type); 524 525 /* 526 * Force vfs namespacing so the mounts don't leak out into the 527 * containing vfs namespace. 528 */ 529 minijail_namespace_vfs(j); 530 531 if (j->mounts_tail) 532 j->mounts_tail->next = m; 533 else 534 j->mounts_head = m; 535 j->mounts_tail = m; 536 j->mounts_count++; 537 538 return 0; 539 540error: 541 free(m->src); 542 free(m->dest); 543 free(m); 544 return -ENOMEM; 545} 546 547int API minijail_bind(struct minijail *j, const char *src, const char *dest, 548 int writeable) 549{ 550 unsigned long flags = MS_BIND; 551 552 if (!writeable) 553 flags |= MS_RDONLY; 554 555 return minijail_mount(j, src, dest, "", flags); 556} 557 558void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 559{ 560 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 561 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 562 warn("not loading seccomp filter," 563 " seccomp not supported"); 564 return; 565 } 566 } 567 FILE *file = fopen(path, "r"); 568 if (!file) { 569 pdie("failed to open seccomp filter file '%s'", path); 570 } 571 572 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 573 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 574 die("failed to compile seccomp filter BPF program in '%s'", 575 path); 576 } 577 578 j->filter_len = fprog->len; 579 j->filter_prog = fprog; 580 581 fclose(file); 582} 583 584int API minijail_use_alt_syscall(struct minijail *j, const char *table) 585{ 586 j->alt_syscall_table = strdup(table); 587 if (!j->alt_syscall_table) 588 return -ENOMEM; 589 j->flags.alt_syscall = 1; 590 return 0; 591} 592 593struct marshal_state { 594 size_t available; 595 size_t total; 596 char *buf; 597}; 598 599void marshal_state_init(struct marshal_state *state, 600 char *buf, size_t available) 601{ 602 state->available = available; 603 state->buf = buf; 604 state->total = 0; 605} 606 607void marshal_append(struct marshal_state *state, 608 void *src, size_t length) 609{ 610 size_t copy_len = MIN(state->available, length); 611 612 /* Up to |available| will be written. */ 613 if (copy_len) { 614 memcpy(state->buf, src, copy_len); 615 state->buf += copy_len; 616 state->available -= copy_len; 617 } 618 /* |total| will contain the expected length. */ 619 state->total += length; 620} 621 622void minijail_marshal_helper(struct marshal_state *state, 623 const struct minijail *j) 624{ 625 struct mountpoint *m = NULL; 626 marshal_append(state, (char *)j, sizeof(*j)); 627 if (j->user) 628 marshal_append(state, j->user, strlen(j->user) + 1); 629 if (j->suppl_gid_list) { 630 marshal_append(state, j->suppl_gid_list, 631 j->suppl_gid_count * sizeof(gid_t)); 632 } 633 if (j->chrootdir) 634 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 635 if (j->alt_syscall_table) { 636 marshal_append(state, j->alt_syscall_table, 637 strlen(j->alt_syscall_table) + 1); 638 } 639 if (j->flags.seccomp_filter && j->filter_prog) { 640 struct sock_fprog *fp = j->filter_prog; 641 marshal_append(state, (char *)fp->filter, 642 fp->len * sizeof(struct sock_filter)); 643 } 644 for (m = j->mounts_head; m; m = m->next) { 645 marshal_append(state, m->src, strlen(m->src) + 1); 646 marshal_append(state, m->dest, strlen(m->dest) + 1); 647 marshal_append(state, m->type, strlen(m->type) + 1); 648 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 649 } 650} 651 652size_t API minijail_size(const struct minijail *j) 653{ 654 struct marshal_state state; 655 marshal_state_init(&state, NULL, 0); 656 minijail_marshal_helper(&state, j); 657 return state.total; 658} 659 660int minijail_marshal(const struct minijail *j, char *buf, size_t available) 661{ 662 struct marshal_state state; 663 marshal_state_init(&state, buf, available); 664 minijail_marshal_helper(&state, j); 665 return (state.total > available); 666} 667 668/* 669 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength 670 * @length Number of bytes to consume 671 * @buf Buffer to consume from 672 * @buflength Size of @buf 673 * 674 * Returns a pointer to the base of the bytes, or NULL for errors. 675 */ 676void *consumebytes(size_t length, char **buf, size_t *buflength) 677{ 678 char *p = *buf; 679 if (length > *buflength) 680 return NULL; 681 *buf += length; 682 *buflength -= length; 683 return p; 684} 685 686/* 687 * consumestr: consumes a C string from a buffer @buf of length @length 688 * @buf Buffer to consume 689 * @length Length of buffer 690 * 691 * Returns a pointer to the base of the string, or NULL for errors. 692 */ 693char *consumestr(char **buf, size_t *buflength) 694{ 695 size_t len = strnlen(*buf, *buflength); 696 if (len == *buflength) 697 /* There's no null-terminator. */ 698 return NULL; 699 return consumebytes(len + 1, buf, buflength); 700} 701 702int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 703{ 704 size_t i; 705 size_t count; 706 int ret = -EINVAL; 707 708 if (length < sizeof(*j)) 709 goto out; 710 memcpy((void *)j, serialized, sizeof(*j)); 711 serialized += sizeof(*j); 712 length -= sizeof(*j); 713 714 /* Potentially stale pointers not used as signals. */ 715 j->mounts_head = NULL; 716 j->mounts_tail = NULL; 717 j->filter_prog = NULL; 718 719 if (j->user) { /* stale pointer */ 720 char *user = consumestr(&serialized, &length); 721 if (!user) 722 goto clear_pointers; 723 j->user = strdup(user); 724 if (!j->user) 725 goto clear_pointers; 726 } 727 728 if (j->suppl_gid_list) { /* stale pointer */ 729 if (j->suppl_gid_count > NGROUPS_MAX) { 730 goto bad_gid_list; 731 } 732 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 733 void *gid_list_bytes = 734 consumebytes(gid_list_size, &serialized, &length); 735 if (!gid_list_bytes) 736 goto bad_gid_list; 737 738 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 739 if (!j->suppl_gid_list) 740 goto bad_gid_list; 741 742 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 743 } 744 745 if (j->chrootdir) { /* stale pointer */ 746 char *chrootdir = consumestr(&serialized, &length); 747 if (!chrootdir) 748 goto bad_chrootdir; 749 j->chrootdir = strdup(chrootdir); 750 if (!j->chrootdir) 751 goto bad_chrootdir; 752 } 753 754 if (j->alt_syscall_table) { /* stale pointer */ 755 char *alt_syscall_table = consumestr(&serialized, &length); 756 if (!alt_syscall_table) 757 goto bad_syscall_table; 758 j->alt_syscall_table = strdup(alt_syscall_table); 759 if (!j->alt_syscall_table) 760 goto bad_syscall_table; 761 } 762 763 if (j->flags.seccomp_filter && j->filter_len > 0) { 764 size_t ninstrs = j->filter_len; 765 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 766 ninstrs > USHRT_MAX) 767 goto bad_filters; 768 769 size_t program_len = ninstrs * sizeof(struct sock_filter); 770 void *program = consumebytes(program_len, &serialized, &length); 771 if (!program) 772 goto bad_filters; 773 774 j->filter_prog = malloc(sizeof(struct sock_fprog)); 775 if (!j->filter_prog) 776 goto bad_filters; 777 778 j->filter_prog->len = ninstrs; 779 j->filter_prog->filter = malloc(program_len); 780 if (!j->filter_prog->filter) 781 goto bad_filter_prog_instrs; 782 783 memcpy(j->filter_prog->filter, program, program_len); 784 } 785 786 count = j->mounts_count; 787 j->mounts_count = 0; 788 for (i = 0; i < count; ++i) { 789 unsigned long *flags; 790 const char *dest; 791 const char *type; 792 const char *src = consumestr(&serialized, &length); 793 if (!src) 794 goto bad_mounts; 795 dest = consumestr(&serialized, &length); 796 if (!dest) 797 goto bad_mounts; 798 type = consumestr(&serialized, &length); 799 if (!type) 800 goto bad_mounts; 801 flags = consumebytes(sizeof(*flags), &serialized, &length); 802 if (!flags) 803 goto bad_mounts; 804 if (minijail_mount(j, src, dest, type, *flags)) 805 goto bad_mounts; 806 } 807 808 return 0; 809 810bad_mounts: 811 if (j->flags.seccomp_filter && j->filter_len > 0) { 812 free(j->filter_prog->filter); 813 free(j->filter_prog); 814 } 815bad_filter_prog_instrs: 816 if (j->filter_prog) 817 free(j->filter_prog); 818bad_filters: 819 if (j->alt_syscall_table) 820 free(j->alt_syscall_table); 821bad_syscall_table: 822 if (j->chrootdir) 823 free(j->chrootdir); 824bad_chrootdir: 825 if (j->suppl_gid_list) 826 free(j->suppl_gid_list); 827bad_gid_list: 828 if (j->user) 829 free(j->user); 830clear_pointers: 831 j->user = NULL; 832 j->suppl_gid_list = NULL; 833 j->chrootdir = NULL; 834 j->alt_syscall_table = NULL; 835out: 836 return ret; 837} 838 839static void write_ugid_mappings(const struct minijail *j, int *pipe_fds) 840{ 841 int fd, ret, len; 842 size_t sz; 843 char fname[32]; 844 close(pipe_fds[0]); 845 846 sz = sizeof(fname); 847 if (j->uidmap) { 848 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 849 if (ret < 0 || (size_t)ret >= sz) 850 die("failed to write file name of uid_map"); 851 fd = open(fname, O_WRONLY); 852 if (fd < 0) 853 pdie("failed to open '%s'", fname); 854 len = strlen(j->uidmap); 855 if (write(fd, j->uidmap, len) < len) 856 die("failed to set uid_map"); 857 close(fd); 858 } 859 if (j->gidmap) { 860 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 861 if (ret < 0 || (size_t)ret >= sz) 862 die("failed to write file name of gid_map"); 863 fd = open(fname, O_WRONLY); 864 if (fd < 0) 865 pdie("failed to open '%s'", fname); 866 len = strlen(j->gidmap); 867 if (write(fd, j->gidmap, len) < len) 868 die("failed to set gid_map"); 869 close(fd); 870 } 871 872 close(pipe_fds[1]); 873} 874 875static void enter_user_namespace(const struct minijail *j, int *pipe_fds) 876{ 877 char buf; 878 879 close(pipe_fds[1]); 880 881 /* Wait for parent to set up uid/gid mappings. */ 882 if (read(pipe_fds[0], &buf, 1) != 0) 883 die("failed to sync with parent"); 884 close(pipe_fds[0]); 885 886 if (j->uidmap && setresuid(0, 0, 0)) 887 pdie("setresuid"); 888 if (j->gidmap && setresgid(0, 0, 0)) 889 pdie("setresgid"); 890} 891 892/* 893 * mount_one: Applies mounts from @m for @j, recursing as needed. 894 * @j Minijail these mounts are for 895 * @m Head of list of mounts 896 * 897 * Returns 0 for success. 898 */ 899static int mount_one(const struct minijail *j, struct mountpoint *m) 900{ 901 int ret; 902 char *dest; 903 int remount_ro = 0; 904 905 /* dest has a leading "/" */ 906 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 907 return -ENOMEM; 908 909 /* 910 * R/O bind mounts have to be remounted since bind and ro can't both be 911 * specified in the original bind mount. Remount R/O after the initial 912 * mount. 913 */ 914 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 915 remount_ro = 1; 916 m->flags &= ~MS_RDONLY; 917 } 918 919 ret = mount(m->src, dest, m->type, m->flags, NULL); 920 if (ret) 921 pdie("mount: %s -> %s", m->src, dest); 922 923 if (remount_ro) { 924 m->flags |= MS_RDONLY; 925 ret = mount(m->src, dest, NULL, 926 m->flags | MS_REMOUNT, NULL); 927 if (ret) 928 pdie("bind ro: %s -> %s", m->src, dest); 929 } 930 931 free(dest); 932 if (m->next) 933 return mount_one(j, m->next); 934 return ret; 935} 936 937int enter_chroot(const struct minijail *j) 938{ 939 int ret; 940 941 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 942 return ret; 943 944 if (chroot(j->chrootdir)) 945 return -errno; 946 947 if (chdir("/")) 948 return -errno; 949 950 return 0; 951} 952 953int enter_pivot_root(const struct minijail *j) 954{ 955 int ret, oldroot, newroot; 956 957 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 958 return ret; 959 960 /* 961 * Keep the fd for both old and new root. 962 * It will be used in fchdir later. 963 */ 964 oldroot = open("/", O_DIRECTORY | O_RDONLY); 965 if (oldroot < 0) 966 pdie("failed to open / for fchdir"); 967 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY); 968 if (newroot < 0) 969 pdie("failed to open %s for fchdir", j->chrootdir); 970 971 /* 972 * To ensure chrootdir is the root of a file system, 973 * do a self bind mount. 974 */ 975 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 976 pdie("failed to bind mount '%s'", j->chrootdir); 977 if (chdir(j->chrootdir)) 978 return -errno; 979 if (syscall(SYS_pivot_root, ".", ".")) 980 pdie("pivot_root"); 981 982 /* 983 * Now the old root is mounted on top of the new root. Use fchdir to 984 * change to the old root and unmount it. 985 */ 986 if (fchdir(oldroot)) 987 pdie("failed to fchdir to old /"); 988 /* The old root might be busy, so use lazy unmount. */ 989 if (umount2(".", MNT_DETACH)) 990 pdie("umount(/)"); 991 /* Change back to the new root. */ 992 if (fchdir(newroot)) 993 return -errno; 994 if (chroot("/")) 995 return -errno; 996 /* Set correct CWD for getcwd(3). */ 997 if (chdir("/")) 998 return -errno; 999 1000 return 0; 1001} 1002 1003int mount_tmp(void) 1004{ 1005 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1006} 1007 1008int remount_proc_readonly(const struct minijail *j) 1009{ 1010 const char *kProcPath = "/proc"; 1011 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1012 /* 1013 * Right now, we're holding a reference to our parent's old mount of 1014 * /proc in our namespace, which means using MS_REMOUNT here would 1015 * mutate our parent's mount as well, even though we're in a VFS 1016 * namespace (!). Instead, remove their mount from our namespace 1017 * and make our own. However, if we are in a new user namespace, /proc 1018 * is not seen as mounted, so don't return error if umount() fails. 1019 */ 1020 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns) 1021 return -errno; 1022 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1023 return -errno; 1024 return 0; 1025} 1026 1027static void write_pid_file(const struct minijail *j) 1028{ 1029 FILE *fp = fopen(j->pid_file_path, "w"); 1030 1031 if (!fp) 1032 pdie("failed to open '%s'", j->pid_file_path); 1033 if (fprintf(fp, "%d\n", (int)j->initpid) < 0) 1034 pdie("fprintf(%s)", j->pid_file_path); 1035 if (fclose(fp)) 1036 pdie("fclose(%s)", j->pid_file_path); 1037} 1038 1039void drop_ugid(const struct minijail *j) 1040{ 1041 if (j->flags.usergroups && j->flags.suppl_gids) { 1042 die("tried to inherit *and* set supplementary groups;" 1043 " can only do one"); 1044 } 1045 1046 if (j->flags.usergroups) { 1047 if (initgroups(j->user, j->usergid)) 1048 pdie("initgroups"); 1049 } else if (j->flags.suppl_gids) { 1050 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1051 pdie("setgroups"); 1052 } 1053 } else { 1054 /* 1055 * Only attempt to clear supplementary groups if we are changing 1056 * users. 1057 */ 1058 if ((j->uid || j->gid) && setgroups(0, NULL)) 1059 pdie("setgroups"); 1060 } 1061 1062 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1063 pdie("setresgid"); 1064 1065 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1066 pdie("setresuid"); 1067} 1068 1069/* 1070 * We specifically do not use cap_valid() as that only tells us the last 1071 * valid cap we were *compiled* against (i.e. what the version of kernel 1072 * headers says). If we run on a different kernel version, then it's not 1073 * uncommon for that to be less (if an older kernel) or more (if a newer 1074 * kernel). 1075 * Normally, we suck up the answer via /proc. On Android, not all processes are 1076 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1077 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1078 */ 1079static unsigned int get_last_valid_cap() 1080{ 1081 unsigned int last_valid_cap = 0; 1082 if (is_android()) { 1083 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1084 ++last_valid_cap); 1085 1086 /* |last_valid_cap| will be the first failing value. */ 1087 if (last_valid_cap > 0) { 1088 last_valid_cap--; 1089 } 1090 } else { 1091 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1092 FILE *fp = fopen(cap_file, "re"); 1093 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1094 pdie("fscanf(%s)", cap_file); 1095 fclose(fp); 1096 } 1097 return last_valid_cap; 1098} 1099 1100void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1101{ 1102 cap_t caps = cap_get_proc(); 1103 cap_value_t flag[1]; 1104 const uint64_t one = 1; 1105 unsigned int i; 1106 if (!caps) 1107 die("can't get process caps"); 1108 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1109 die("can't clear inheritable caps"); 1110 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1111 die("can't clear effective caps"); 1112 if (cap_clear_flag(caps, CAP_PERMITTED)) 1113 die("can't clear permitted caps"); 1114 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1115 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1116 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1117 continue; 1118 flag[0] = i; 1119 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1120 die("can't add effective cap"); 1121 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1122 die("can't add permitted cap"); 1123 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1124 die("can't add inheritable cap"); 1125 } 1126 if (cap_set_proc(caps)) 1127 die("can't apply initial cleaned capset"); 1128 1129 /* 1130 * Instead of dropping bounding set first, do it here in case 1131 * the caller had a more permissive bounding set which could 1132 * have been used above to raise a capability that wasn't already 1133 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1134 */ 1135 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1136 if (j->caps & (one << i)) 1137 continue; 1138 if (prctl(PR_CAPBSET_DROP, i)) 1139 pdie("prctl(PR_CAPBSET_DROP)"); 1140 } 1141 1142 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1143 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1144 flag[0] = CAP_SETPCAP; 1145 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1146 die("can't clear effective cap"); 1147 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1148 die("can't clear permitted cap"); 1149 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1150 die("can't clear inheritable cap"); 1151 } 1152 1153 if (cap_set_proc(caps)) 1154 die("can't apply final cleaned capset"); 1155 1156 cap_free(caps); 1157} 1158 1159void set_seccomp_filter(const struct minijail *j) 1160{ 1161 /* 1162 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1163 * in the kernel source tree for an explanation of the parameters. 1164 */ 1165 if (j->flags.no_new_privs) { 1166 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1167 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1168 } 1169 1170 /* 1171 * If we're logging seccomp filter failures, 1172 * install the SIGSYS handler first. 1173 */ 1174 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1175 if (install_sigsys_handler()) 1176 pdie("install SIGSYS handler"); 1177 warn("logging seccomp filter failures"); 1178 } 1179 1180 /* 1181 * Install the syscall filter. 1182 */ 1183 if (j->flags.seccomp_filter) { 1184 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1185 j->filter_prog)) { 1186 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 1187 warn("seccomp not supported"); 1188 return; 1189 } 1190 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1191 } 1192 } 1193} 1194 1195void API minijail_enter(const struct minijail *j) 1196{ 1197 /* 1198 * If we're dropping caps, get the last valid cap from /proc now, 1199 * since /proc can be unmounted before drop_caps() is called. 1200 */ 1201 unsigned int last_valid_cap = 0; 1202 if (j->flags.caps) 1203 last_valid_cap = get_last_valid_cap(); 1204 1205 if (j->flags.pids) 1206 die("tried to enter a pid-namespaced jail;" 1207 " try minijail_run()?"); 1208 1209 if (j->flags.usergroups && !j->user) 1210 die("usergroup inheritance without username"); 1211 1212 /* 1213 * We can't recover from failures if we've dropped privileges partially, 1214 * so we don't even try. If any of our operations fail, we abort() the 1215 * entire process. 1216 */ 1217 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1218 pdie("setns(CLONE_NEWNS)"); 1219 1220 if (j->flags.vfs) { 1221 if (unshare(CLONE_NEWNS)) 1222 pdie("unshare(vfs)"); 1223 /* 1224 * Remount all filesystems as private. If they are shared 1225 * new bind mounts will creep out of our namespace. 1226 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1227 */ 1228 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1229 pdie("mount(/, private)"); 1230 } 1231 1232 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1233 pdie("unshare(ipc)"); 1234 } 1235 1236 if (j->flags.enter_net) { 1237 if (setns(j->netns_fd, CLONE_NEWNET)) 1238 pdie("setns(CLONE_NEWNET)"); 1239 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1240 pdie("unshare(net)"); 1241 } 1242 1243 if (j->flags.chroot && enter_chroot(j)) 1244 pdie("chroot"); 1245 1246 if (j->flags.pivot_root && enter_pivot_root(j)) 1247 pdie("pivot_root"); 1248 1249 if (j->flags.mount_tmp && mount_tmp()) 1250 pdie("mount_tmp"); 1251 1252 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1253 pdie("remount"); 1254 1255 if (j->flags.caps) { 1256 /* 1257 * POSIX capabilities are a bit tricky. If we drop our 1258 * capability to change uids, our attempt to use setuid() 1259 * below will fail. Hang on to root caps across setuid(), then 1260 * lock securebits. 1261 */ 1262 if (prctl(PR_SET_KEEPCAPS, 1)) 1263 pdie("prctl(PR_SET_KEEPCAPS)"); 1264 if (prctl 1265 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 1266 pdie("prctl(PR_SET_SECUREBITS)"); 1267 } 1268 1269 /* 1270 * If we're setting no_new_privs, we can drop privileges 1271 * before setting seccomp filter. This way filter policies 1272 * don't need to allow privilege-dropping syscalls. 1273 */ 1274 if (j->flags.no_new_privs) { 1275 drop_ugid(j); 1276 if (j->flags.caps) 1277 drop_caps(j, last_valid_cap); 1278 1279 set_seccomp_filter(j); 1280 } else { 1281 /* 1282 * If we're not setting no_new_privs, 1283 * we need to set seccomp filter *before* dropping privileges. 1284 * WARNING: this means that filter policies *must* allow 1285 * setgroups()/setresgid()/setresuid() for dropping root and 1286 * capget()/capset()/prctl() for dropping caps. 1287 */ 1288 set_seccomp_filter(j); 1289 1290 drop_ugid(j); 1291 if (j->flags.caps) 1292 drop_caps(j, last_valid_cap); 1293 } 1294 1295 /* 1296 * Select the specified alternate syscall table. The table must not 1297 * block prctl(2) if we're using seccomp as well. 1298 */ 1299 if (j->flags.alt_syscall) { 1300 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1301 pdie("prctl(PR_ALT_SYSCALL)"); 1302 } 1303 1304 /* 1305 * seccomp has to come last since it cuts off all the other 1306 * privilege-dropping syscalls :) 1307 */ 1308 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1309 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) { 1310 warn("seccomp not supported"); 1311 return; 1312 } 1313 pdie("prctl(PR_SET_SECCOMP)"); 1314 } 1315} 1316 1317/* TODO(wad) will visibility affect this variable? */ 1318static int init_exitstatus = 0; 1319 1320void init_term(int __attribute__ ((unused)) sig) 1321{ 1322 _exit(init_exitstatus); 1323} 1324 1325int init(pid_t rootpid) 1326{ 1327 pid_t pid; 1328 int status; 1329 /* so that we exit with the right status */ 1330 signal(SIGTERM, init_term); 1331 /* TODO(wad) self jail with seccomp_filters here. */ 1332 while ((pid = wait(&status)) > 0) { 1333 /* 1334 * This loop will only end when either there are no processes 1335 * left inside our pid namespace or we get a signal. 1336 */ 1337 if (pid == rootpid) 1338 init_exitstatus = status; 1339 } 1340 if (!WIFEXITED(init_exitstatus)) 1341 _exit(MINIJAIL_ERR_INIT); 1342 _exit(WEXITSTATUS(init_exitstatus)); 1343} 1344 1345int API minijail_from_fd(int fd, struct minijail *j) 1346{ 1347 size_t sz = 0; 1348 size_t bytes = read(fd, &sz, sizeof(sz)); 1349 char *buf; 1350 int r; 1351 if (sizeof(sz) != bytes) 1352 return -EINVAL; 1353 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1354 return -E2BIG; 1355 buf = malloc(sz); 1356 if (!buf) 1357 return -ENOMEM; 1358 bytes = read(fd, buf, sz); 1359 if (bytes != sz) { 1360 free(buf); 1361 return -EINVAL; 1362 } 1363 r = minijail_unmarshal(j, buf, sz); 1364 free(buf); 1365 return r; 1366} 1367 1368int API minijail_to_fd(struct minijail *j, int fd) 1369{ 1370 char *buf; 1371 size_t sz = minijail_size(j); 1372 ssize_t written; 1373 int r; 1374 1375 if (!sz) 1376 return -EINVAL; 1377 buf = malloc(sz); 1378 r = minijail_marshal(j, buf, sz); 1379 if (r) { 1380 free(buf); 1381 return r; 1382 } 1383 /* Sends [size][minijail]. */ 1384 written = write(fd, &sz, sizeof(sz)); 1385 if (written != sizeof(sz)) { 1386 free(buf); 1387 return -EFAULT; 1388 } 1389 written = write(fd, buf, sz); 1390 if (written < 0 || (size_t) written != sz) { 1391 free(buf); 1392 return -EFAULT; 1393 } 1394 free(buf); 1395 return 0; 1396} 1397 1398int setup_preload(void) 1399{ 1400#if defined(__ANDROID__) 1401 /* Don't use LDPRELOAD on Brillo. */ 1402 return 0; 1403#else 1404 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1405 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1406 if (!newenv) 1407 return -ENOMEM; 1408 1409 /* Only insert a separating space if we have something to separate... */ 1410 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1411 PRELOADPATH); 1412 1413 /* setenv() makes a copy of the string we give it. */ 1414 setenv(kLdPreloadEnvVar, newenv, 1); 1415 free(newenv); 1416 return 0; 1417#endif 1418} 1419 1420int setup_pipe(int fds[2]) 1421{ 1422 int r = pipe(fds); 1423 char fd_buf[11]; 1424 if (r) 1425 return r; 1426 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1427 if (r <= 0) 1428 return -EINVAL; 1429 setenv(kFdEnvVar, fd_buf, 1); 1430 return 0; 1431} 1432 1433int setup_pipe_end(int fds[2], size_t index) 1434{ 1435 if (index > 1) 1436 return -1; 1437 1438 close(fds[1 - index]); 1439 return fds[index]; 1440} 1441 1442int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1443{ 1444 if (index > 1) 1445 return -1; 1446 1447 close(fds[1 - index]); 1448 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1449 return dup2(fds[index], fd); 1450} 1451 1452int minijail_run_internal(struct minijail *j, const char *filename, 1453 char *const argv[], pid_t *pchild_pid, 1454 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1455 int use_preload); 1456 1457int API minijail_run(struct minijail *j, const char *filename, 1458 char *const argv[]) 1459{ 1460 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1461 true); 1462} 1463 1464int API minijail_run_pid(struct minijail *j, const char *filename, 1465 char *const argv[], pid_t *pchild_pid) 1466{ 1467 return minijail_run_internal(j, filename, argv, pchild_pid, 1468 NULL, NULL, NULL, true); 1469} 1470 1471int API minijail_run_pipe(struct minijail *j, const char *filename, 1472 char *const argv[], int *pstdin_fd) 1473{ 1474 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1475 NULL, NULL, true); 1476} 1477 1478int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1479 char *const argv[], pid_t *pchild_pid, 1480 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1481{ 1482 return minijail_run_internal(j, filename, argv, pchild_pid, 1483 pstdin_fd, pstdout_fd, pstderr_fd, true); 1484} 1485 1486int API minijail_run_no_preload(struct minijail *j, const char *filename, 1487 char *const argv[]) 1488{ 1489 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1490 false); 1491} 1492 1493int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1494 const char *filename, 1495 char *const argv[], 1496 pid_t *pchild_pid, 1497 int *pstdin_fd, int *pstdout_fd, 1498 int *pstderr_fd) { 1499 return minijail_run_internal(j, filename, argv, pchild_pid, 1500 pstdin_fd, pstdout_fd, pstderr_fd, false); 1501} 1502 1503int minijail_run_internal(struct minijail *j, const char *filename, 1504 char *const argv[], pid_t *pchild_pid, 1505 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1506 int use_preload) 1507{ 1508 char *oldenv, *oldenv_copy = NULL; 1509 pid_t child_pid; 1510 int pipe_fds[2]; 1511 int stdin_fds[2]; 1512 int stdout_fds[2]; 1513 int stderr_fds[2]; 1514 int userns_pipe_fds[2]; 1515 int ret; 1516 /* We need to remember this across the minijail_preexec() call. */ 1517 int pid_namespace = j->flags.pids; 1518 int do_init = j->flags.do_init; 1519 1520 if (use_preload) { 1521 oldenv = getenv(kLdPreloadEnvVar); 1522 if (oldenv) { 1523 oldenv_copy = strdup(oldenv); 1524 if (!oldenv_copy) 1525 return -ENOMEM; 1526 } 1527 1528 if (setup_preload()) 1529 return -EFAULT; 1530 } 1531 1532 if (!use_preload) { 1533 if (j->flags.caps) 1534 die("Capabilities are not supported without " 1535 "LD_PRELOAD"); 1536 } 1537 1538 /* 1539 * Make the process group ID of this process equal to its PID, so that 1540 * both the Minijail process and the jailed process can be killed 1541 * together. 1542 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1543 * the process is already a process group leader. 1544 */ 1545 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1546 if (errno != EPERM) { 1547 pdie("setpgid(0, 0)"); 1548 } 1549 } 1550 1551 if (use_preload) { 1552 /* 1553 * Before we fork(2) and execve(2) the child process, we need 1554 * to open a pipe(2) to send the minijail configuration over. 1555 */ 1556 if (setup_pipe(pipe_fds)) 1557 return -EFAULT; 1558 } 1559 1560 /* 1561 * If we want to write to the child process' standard input, 1562 * create the pipe(2) now. 1563 */ 1564 if (pstdin_fd) { 1565 if (pipe(stdin_fds)) 1566 return -EFAULT; 1567 } 1568 1569 /* 1570 * If we want to read from the child process' standard output, 1571 * create the pipe(2) now. 1572 */ 1573 if (pstdout_fd) { 1574 if (pipe(stdout_fds)) 1575 return -EFAULT; 1576 } 1577 1578 /* 1579 * If we want to read from the child process' standard error, 1580 * create the pipe(2) now. 1581 */ 1582 if (pstderr_fd) { 1583 if (pipe(stderr_fds)) 1584 return -EFAULT; 1585 } 1586 1587 /* 1588 * If we want to set up a new uid/gid mapping in the user namespace, 1589 * create the pipe(2) to sync between parent and child. 1590 */ 1591 if (j->flags.userns) { 1592 if (pipe(userns_pipe_fds)) 1593 return -EFAULT; 1594 } 1595 1596 /* 1597 * Use sys_clone() if and only if we're creating a pid namespace. 1598 * 1599 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1600 * 1601 * In multithreaded programs, there are a bunch of locks inside libc, 1602 * some of which may be held by other threads at the time that we call 1603 * minijail_run_pid(). If we call fork(), glibc does its level best to 1604 * ensure that we hold all of these locks before it calls clone() 1605 * internally and drop them after clone() returns, but when we call 1606 * sys_clone(2) directly, all that gets bypassed and we end up with a 1607 * child address space where some of libc's important locks are held by 1608 * other threads (which did not get cloned, and hence will never release 1609 * those locks). This is okay so long as we call exec() immediately 1610 * after, but a bunch of seemingly-innocent libc functions like setenv() 1611 * take locks. 1612 * 1613 * Hence, only call sys_clone() if we need to, in order to get at pid 1614 * namespacing. If we follow this path, the child's address space might 1615 * have broken locks; you may only call functions that do not acquire 1616 * any locks. 1617 * 1618 * Unfortunately, fork() acquires every lock it can get its hands on, as 1619 * previously detailed, so this function is highly likely to deadlock 1620 * later on (see "deadlock here") if we're multithreaded. 1621 * 1622 * We might hack around this by having the clone()d child (init of the 1623 * pid namespace) return directly, rather than leaving the clone()d 1624 * process hanging around to be init for the new namespace (and having 1625 * its fork()ed child return in turn), but that process would be crippled 1626 * with its libc locks potentially broken. We might try fork()ing in the 1627 * parent before we clone() to ensure that we own all the locks, but 1628 * then we have to have the forked child hanging around consuming 1629 * resources (and possibly having file descriptors / shared memory 1630 * regions / etc attached). We'd need to keep the child around to avoid 1631 * having its children get reparented to init. 1632 * 1633 * TODO(ellyjones): figure out if the "forked child hanging around" 1634 * problem is fixable or not. It would be nice if we worked in this 1635 * case. 1636 */ 1637 if (pid_namespace) { 1638 int clone_flags = CLONE_NEWPID | SIGCHLD; 1639 if (j->flags.userns) 1640 clone_flags |= CLONE_NEWUSER; 1641 child_pid = syscall(SYS_clone, clone_flags, NULL); 1642 } else { 1643 child_pid = fork(); 1644 } 1645 1646 if (child_pid < 0) { 1647 if (use_preload) { 1648 free(oldenv_copy); 1649 } 1650 die("failed to fork child"); 1651 } 1652 1653 if (child_pid) { 1654 if (use_preload) { 1655 /* Restore parent's LD_PRELOAD. */ 1656 if (oldenv_copy) { 1657 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1658 free(oldenv_copy); 1659 } else { 1660 unsetenv(kLdPreloadEnvVar); 1661 } 1662 unsetenv(kFdEnvVar); 1663 } 1664 1665 j->initpid = child_pid; 1666 1667 if (j->flags.pid_file) 1668 write_pid_file(j); 1669 1670 if (j->flags.userns) 1671 write_ugid_mappings(j, userns_pipe_fds); 1672 1673 if (use_preload) { 1674 /* Send marshalled minijail. */ 1675 close(pipe_fds[0]); /* read endpoint */ 1676 ret = minijail_to_fd(j, pipe_fds[1]); 1677 close(pipe_fds[1]); /* write endpoint */ 1678 if (ret) { 1679 kill(j->initpid, SIGKILL); 1680 die("failed to send marshalled minijail"); 1681 } 1682 } 1683 1684 if (pchild_pid) 1685 *pchild_pid = child_pid; 1686 1687 /* 1688 * If we want to write to the child process' standard input, 1689 * set up the write end of the pipe. 1690 */ 1691 if (pstdin_fd) 1692 *pstdin_fd = setup_pipe_end(stdin_fds, 1693 1 /* write end */); 1694 1695 /* 1696 * If we want to read from the child process' standard output, 1697 * set up the read end of the pipe. 1698 */ 1699 if (pstdout_fd) 1700 *pstdout_fd = setup_pipe_end(stdout_fds, 1701 0 /* read end */); 1702 1703 /* 1704 * If we want to read from the child process' standard error, 1705 * set up the read end of the pipe. 1706 */ 1707 if (pstderr_fd) 1708 *pstderr_fd = setup_pipe_end(stderr_fds, 1709 0 /* read end */); 1710 1711 return 0; 1712 } 1713 free(oldenv_copy); 1714 1715 if (j->flags.reset_signal_mask) { 1716 sigset_t signal_mask; 1717 if (sigemptyset(&signal_mask) != 0) 1718 pdie("sigemptyset failed"); 1719 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1720 pdie("sigprocmask failed"); 1721 } 1722 1723 if (j->flags.userns) 1724 enter_user_namespace(j, userns_pipe_fds); 1725 1726 /* 1727 * If we want to write to the jailed process' standard input, 1728 * set up the read end of the pipe. 1729 */ 1730 if (pstdin_fd) { 1731 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1732 STDIN_FILENO) < 0) 1733 die("failed to set up stdin pipe"); 1734 } 1735 1736 /* 1737 * If we want to read from the jailed process' standard output, 1738 * set up the write end of the pipe. 1739 */ 1740 if (pstdout_fd) { 1741 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1742 STDOUT_FILENO) < 0) 1743 die("failed to set up stdout pipe"); 1744 } 1745 1746 /* 1747 * If we want to read from the jailed process' standard error, 1748 * set up the write end of the pipe. 1749 */ 1750 if (pstderr_fd) { 1751 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1752 STDERR_FILENO) < 0) 1753 die("failed to set up stderr pipe"); 1754 } 1755 1756 /* If running an init program, let it decide when/how to mount /proc. */ 1757 if (pid_namespace && !do_init) 1758 j->flags.remount_proc_ro = 0; 1759 1760 if (use_preload) { 1761 /* Strip out flags that cannot be inherited across execve(2). */ 1762 minijail_preexec(j); 1763 } else { 1764 j->flags.pids = 0; 1765 } 1766 /* Jail this process, then execve() the target. */ 1767 minijail_enter(j); 1768 1769 if (pid_namespace && do_init) { 1770 /* 1771 * pid namespace: this process will become init inside the new 1772 * namespace. We don't want all programs we might exec to have 1773 * to know how to be init. Normally (do_init == 1) we fork off 1774 * a child to actually run the program. If |do_init == 0|, we 1775 * let the program keep pid 1 and be init. 1776 * 1777 * If we're multithreaded, we'll probably deadlock here. See 1778 * WARNING above. 1779 */ 1780 child_pid = fork(); 1781 if (child_pid < 0) 1782 _exit(child_pid); 1783 else if (child_pid > 0) 1784 init(child_pid); /* never returns */ 1785 } 1786 1787 /* 1788 * If we aren't pid-namespaced, or the jailed program asked to be init: 1789 * calling process 1790 * -> execve()-ing process 1791 * If we are: 1792 * calling process 1793 * -> init()-ing process 1794 * -> execve()-ing process 1795 */ 1796 _exit(execve(filename, argv, environ)); 1797} 1798 1799int API minijail_kill(struct minijail *j) 1800{ 1801 int st; 1802 if (kill(j->initpid, SIGTERM)) 1803 return -errno; 1804 if (waitpid(j->initpid, &st, 0) < 0) 1805 return -errno; 1806 return st; 1807} 1808 1809int API minijail_wait(struct minijail *j) 1810{ 1811 int st; 1812 if (waitpid(j->initpid, &st, 0) < 0) 1813 return -errno; 1814 1815 if (!WIFEXITED(st)) { 1816 int error_status = st; 1817 if (WIFSIGNALED(st)) { 1818 int signum = WTERMSIG(st); 1819 warn("child process %d received signal %d", 1820 j->initpid, signum); 1821 /* 1822 * We return MINIJAIL_ERR_JAIL if the process received 1823 * SIGSYS, which happens when a syscall is blocked by 1824 * seccomp filters. 1825 * If not, we do what bash(1) does: 1826 * $? = 128 + signum 1827 */ 1828 if (signum == SIGSYS) { 1829 error_status = MINIJAIL_ERR_JAIL; 1830 } else { 1831 error_status = 128 + signum; 1832 } 1833 } 1834 return error_status; 1835 } 1836 1837 int exit_status = WEXITSTATUS(st); 1838 if (exit_status != 0) 1839 info("child process %d exited with status %d", 1840 j->initpid, exit_status); 1841 1842 return exit_status; 1843} 1844 1845void API minijail_destroy(struct minijail *j) 1846{ 1847 if (j->flags.seccomp_filter && j->filter_prog) { 1848 free(j->filter_prog->filter); 1849 free(j->filter_prog); 1850 } 1851 while (j->mounts_head) { 1852 struct mountpoint *m = j->mounts_head; 1853 j->mounts_head = j->mounts_head->next; 1854 free(m->type); 1855 free(m->dest); 1856 free(m->src); 1857 free(m); 1858 } 1859 j->mounts_tail = NULL; 1860 if (j->user) 1861 free(j->user); 1862 if (j->suppl_gid_list) 1863 free(j->suppl_gid_list); 1864 if (j->chrootdir) 1865 free(j->chrootdir); 1866 if (j->alt_syscall_table) 1867 free(j->alt_syscall_table); 1868 free(j); 1869} 1870