libminijail.c revision 457a5e333407ea2a0f90d8c6ea85ccf08a3c8083
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _DEFAULT_SOURCE 8#define _GNU_SOURCE 9 10#include <asm/unistd.h> 11#include <ctype.h> 12#include <dirent.h> 13#include <errno.h> 14#include <fcntl.h> 15#include <grp.h> 16#include <inttypes.h> 17#include <limits.h> 18#include <linux/capability.h> 19#include <net/if.h> 20#include <pwd.h> 21#include <sched.h> 22#include <signal.h> 23#include <stdarg.h> 24#include <stdbool.h> 25#include <stddef.h> 26#include <stdio.h> 27#include <stdlib.h> 28#include <string.h> 29#include <sys/capability.h> 30#include <sys/mount.h> 31#include <sys/param.h> 32#include <sys/prctl.h> 33#include <sys/socket.h> 34#include <sys/stat.h> 35#include <sys/types.h> 36#include <sys/user.h> 37#include <sys/wait.h> 38#include <syscall.h> 39#include <unistd.h> 40 41#include "libminijail.h" 42#include "libminijail-private.h" 43 44#include "signal_handler.h" 45#include "syscall_filter.h" 46#include "syscall_wrapper.h" 47#include "util.h" 48 49#ifdef HAVE_SECUREBITS_H 50# include <linux/securebits.h> 51#else 52# define SECURE_ALL_BITS 0x55 53# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 54#endif 55/* For kernels < 4.3. */ 56#define OLD_SECURE_ALL_BITS 0x15 57#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1) 58 59/* 60 * Assert the value of SECURE_ALL_BITS at compile-time. 61 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3 62 * added a new securebit. 63 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM 64 * when used on older kernels. The compile-time assert will catch this situation 65 * at compile time. 66 */ 67#ifdef __BRILLO__ 68_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55."); 69#endif 70 71/* Until these are reliably available in linux/prctl.h. 
*/ 72#ifndef PR_SET_SECCOMP 73# define PR_SET_SECCOMP 22 74#endif 75 76#ifndef PR_ALT_SYSCALL 77# define PR_ALT_SYSCALL 0x43724f53 78#endif 79 80/* Seccomp filter related flags. */ 81#ifndef PR_SET_NO_NEW_PRIVS 82# define PR_SET_NO_NEW_PRIVS 38 83#endif 84 85#ifndef SECCOMP_MODE_FILTER 86# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 87#endif 88 89#ifndef SECCOMP_SET_MODE_STRICT 90# define SECCOMP_SET_MODE_STRICT 0 91#endif 92#ifndef SECCOMP_SET_MODE_FILTER 93# define SECCOMP_SET_MODE_FILTER 1 94#endif 95 96#ifndef SECCOMP_FILTER_FLAG_TSYNC 97# define SECCOMP_FILTER_FLAG_TSYNC 1 98#endif 99/* End seccomp filter related flags. */ 100 101/* New cgroup namespace might not be in linux-headers yet. */ 102#ifndef CLONE_NEWCGROUP 103# define CLONE_NEWCGROUP 0x02000000 104#endif 105 106#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 107 108struct mountpoint { 109 char *src; 110 char *dest; 111 char *type; 112 char *data; 113 int has_data; 114 unsigned long flags; 115 struct mountpoint *next; 116}; 117 118struct minijail { 119 /* 120 * WARNING: if you add a flag here you need to make sure it's 121 * accounted for in minijail_pre{enter|exec}() below. 
122 */ 123 struct { 124 int uid : 1; 125 int gid : 1; 126 int usergroups : 1; 127 int suppl_gids : 1; 128 int use_caps : 1; 129 int capbset_drop : 1; 130 int vfs : 1; 131 int enter_vfs : 1; 132 int skip_remount_private : 1; 133 int pids : 1; 134 int ipc : 1; 135 int net : 1; 136 int enter_net : 1; 137 int ns_cgroups : 1; 138 int userns : 1; 139 int disable_setgroups : 1; 140 int seccomp : 1; 141 int remount_proc_ro : 1; 142 int no_new_privs : 1; 143 int seccomp_filter : 1; 144 int seccomp_filter_tsync : 1; 145 int seccomp_filter_logging : 1; 146 int chroot : 1; 147 int pivot_root : 1; 148 int mount_tmp : 1; 149 int do_init : 1; 150 int pid_file : 1; 151 int cgroups : 1; 152 int alt_syscall : 1; 153 int reset_signal_mask : 1; 154 int close_open_fds : 1; 155 } flags; 156 uid_t uid; 157 gid_t gid; 158 gid_t usergid; 159 char *user; 160 size_t suppl_gid_count; 161 gid_t *suppl_gid_list; 162 uint64_t caps; 163 uint64_t cap_bset; 164 pid_t initpid; 165 int mountns_fd; 166 int netns_fd; 167 char *chrootdir; 168 char *pid_file_path; 169 char *uidmap; 170 char *gidmap; 171 size_t filter_len; 172 struct sock_fprog *filter_prog; 173 char *alt_syscall_table; 174 struct mountpoint *mounts_head; 175 struct mountpoint *mounts_tail; 176 size_t mounts_count; 177 char *cgroups[MAX_CGROUPS]; 178 size_t cgroup_count; 179}; 180 181/* 182 * Strip out flags meant for the parent. 183 * We keep things that are not inherited across execve(2) (e.g. capabilities), 184 * or are easier to set after execve(2) (e.g. seccomp filters). 185 */ 186void minijail_preenter(struct minijail *j) 187{ 188 j->flags.vfs = 0; 189 j->flags.enter_vfs = 0; 190 j->flags.skip_remount_private = 0; 191 j->flags.remount_proc_ro = 0; 192 j->flags.pids = 0; 193 j->flags.do_init = 0; 194 j->flags.pid_file = 0; 195 j->flags.cgroups = 0; 196} 197 198/* 199 * Strip out flags meant for the child. 200 * We keep things that are inherited across execve(2). 
201 */ 202void minijail_preexec(struct minijail *j) 203{ 204 int vfs = j->flags.vfs; 205 int enter_vfs = j->flags.enter_vfs; 206 int skip_remount_private = j->flags.skip_remount_private; 207 int remount_proc_ro = j->flags.remount_proc_ro; 208 int userns = j->flags.userns; 209 if (j->user) 210 free(j->user); 211 j->user = NULL; 212 if (j->suppl_gid_list) 213 free(j->suppl_gid_list); 214 j->suppl_gid_list = NULL; 215 memset(&j->flags, 0, sizeof(j->flags)); 216 /* Now restore anything we meant to keep. */ 217 j->flags.vfs = vfs; 218 j->flags.enter_vfs = enter_vfs; 219 j->flags.skip_remount_private = skip_remount_private; 220 j->flags.remount_proc_ro = remount_proc_ro; 221 j->flags.userns = userns; 222 /* Note, |pids| will already have been used before this call. */ 223} 224 225/* Minijail API. */ 226 227struct minijail API *minijail_new(void) 228{ 229 return calloc(1, sizeof(struct minijail)); 230} 231 232void API minijail_change_uid(struct minijail *j, uid_t uid) 233{ 234 if (uid == 0) 235 die("useless change to uid 0"); 236 j->uid = uid; 237 j->flags.uid = 1; 238} 239 240void API minijail_change_gid(struct minijail *j, gid_t gid) 241{ 242 if (gid == 0) 243 die("useless change to gid 0"); 244 j->gid = gid; 245 j->flags.gid = 1; 246} 247 248void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 249 const gid_t *list) 250{ 251 size_t i; 252 253 if (j->flags.usergroups) 254 die("cannot inherit *and* set supplementary groups"); 255 256 if (size == 0) { 257 /* Clear supplementary groups. */ 258 j->suppl_gid_list = NULL; 259 j->suppl_gid_count = 0; 260 j->flags.suppl_gids = 1; 261 return; 262 } 263 264 /* Copy the gid_t array. 
*/ 265 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 266 if (!j->suppl_gid_list) { 267 die("failed to allocate internal supplementary group array"); 268 } 269 for (i = 0; i < size; i++) { 270 j->suppl_gid_list[i] = list[i]; 271 } 272 j->suppl_gid_count = size; 273 j->flags.suppl_gids = 1; 274} 275 276int API minijail_change_user(struct minijail *j, const char *user) 277{ 278 char *buf = NULL; 279 struct passwd pw; 280 struct passwd *ppw = NULL; 281 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 282 if (sz == -1) 283 sz = 65536; /* your guess is as good as mine... */ 284 285 /* 286 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 287 * the maximum needed size of the buffer, so we don't have to search. 288 */ 289 buf = malloc(sz); 290 if (!buf) 291 return -ENOMEM; 292 getpwnam_r(user, &pw, buf, sz, &ppw); 293 /* 294 * We're safe to free the buffer here. The strings inside |pw| point 295 * inside |buf|, but we don't use any of them; this leaves the pointers 296 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) 297 * succeeded. 298 */ 299 free(buf); 300 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 301 if (!ppw) 302 return -1; 303 minijail_change_uid(j, ppw->pw_uid); 304 j->user = strdup(user); 305 if (!j->user) 306 return -ENOMEM; 307 j->usergid = ppw->pw_gid; 308 return 0; 309} 310 311int API minijail_change_group(struct minijail *j, const char *group) 312{ 313 char *buf = NULL; 314 struct group gr; 315 struct group *pgr = NULL; 316 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 317 if (sz == -1) 318 sz = 65536; /* and mine is as good as yours, really */ 319 320 /* 321 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 322 * the maximum needed size of the buffer, so we don't have to search. 323 */ 324 buf = malloc(sz); 325 if (!buf) 326 return -ENOMEM; 327 getgrnam_r(group, &gr, buf, sz, &pgr); 328 /* 329 * We're safe to free the buffer here. 
The strings inside gr point 330 * inside buf, but we don't use any of them; this leaves the pointers 331 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 332 */ 333 free(buf); 334 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 335 if (!pgr) 336 return -1; 337 minijail_change_gid(j, pgr->gr_gid); 338 return 0; 339} 340 341void API minijail_use_seccomp(struct minijail *j) 342{ 343 j->flags.seccomp = 1; 344} 345 346void API minijail_no_new_privs(struct minijail *j) 347{ 348 j->flags.no_new_privs = 1; 349} 350 351void API minijail_use_seccomp_filter(struct minijail *j) 352{ 353 j->flags.seccomp_filter = 1; 354} 355 356void API minijail_set_seccomp_filter_tsync(struct minijail *j) 357{ 358 if (j->filter_len > 0 && j->filter_prog != NULL) { 359 die("minijail_set_seccomp_filter_tsync() must be called " 360 "before minijail_parse_seccomp_filters()"); 361 } 362 j->flags.seccomp_filter_tsync = 1; 363} 364 365void API minijail_log_seccomp_filter_failures(struct minijail *j) 366{ 367 if (j->filter_len > 0 && j->filter_prog != NULL) { 368 die("minijail_log_seccomp_filter_failures() must be called " 369 "before minijail_parse_seccomp_filters()"); 370 } 371 j->flags.seccomp_filter_logging = 1; 372} 373 374void API minijail_use_caps(struct minijail *j, uint64_t capmask) 375{ 376 /* 377 * 'minijail_use_caps' configures a runtime-capabilities-only 378 * environment, including a bounding set matching the thread's runtime 379 * (permitted|inheritable|effective) sets. 380 * Therefore, it will override any existing bounding set configurations 381 * since the latter would allow gaining extra runtime capabilities from 382 * file capabilities. 
383 */ 384 if (j->flags.capbset_drop) { 385 warn("overriding bounding set configuration"); 386 j->cap_bset = 0; 387 j->flags.capbset_drop = 0; 388 } 389 j->caps = capmask; 390 j->flags.use_caps = 1; 391} 392 393void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 394{ 395 if (j->flags.use_caps) { 396 /* 397 * 'minijail_use_caps' will have already configured a capability 398 * bounding set matching the (permitted|inheritable|effective) 399 * sets. Abort if the user tries to configure a separate 400 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 401 * are mutually exclusive. 402 */ 403 die("runtime capabilities already configured, can't drop " 404 "bounding set separately"); 405 } 406 j->cap_bset = capmask; 407 j->flags.capbset_drop = 1; 408} 409 410void API minijail_reset_signal_mask(struct minijail *j) 411{ 412 j->flags.reset_signal_mask = 1; 413} 414 415void API minijail_namespace_vfs(struct minijail *j) 416{ 417 j->flags.vfs = 1; 418} 419 420void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 421{ 422 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 423 if (ns_fd < 0) { 424 pdie("failed to open namespace '%s'", ns_path); 425 } 426 j->mountns_fd = ns_fd; 427 j->flags.enter_vfs = 1; 428} 429 430void API minijail_skip_remount_private(struct minijail *j) 431{ 432 j->flags.skip_remount_private = 1; 433} 434 435void API minijail_namespace_pids(struct minijail *j) 436{ 437 j->flags.vfs = 1; 438 j->flags.remount_proc_ro = 1; 439 j->flags.pids = 1; 440 j->flags.do_init = 1; 441} 442 443void API minijail_namespace_ipc(struct minijail *j) 444{ 445 j->flags.ipc = 1; 446} 447 448void API minijail_namespace_net(struct minijail *j) 449{ 450 j->flags.net = 1; 451} 452 453void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 454{ 455 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 456 if (ns_fd < 0) { 457 pdie("failed to open namespace '%s'", ns_path); 458 } 459 j->netns_fd = ns_fd; 460 
j->flags.enter_net = 1; 461} 462 463void API minijail_namespace_cgroups(struct minijail *j) 464{ 465 j->flags.ns_cgroups = 1; 466} 467 468void API minijail_close_open_fds(struct minijail *j) 469{ 470 j->flags.close_open_fds = 1; 471} 472 473void API minijail_remount_proc_readonly(struct minijail *j) 474{ 475 j->flags.vfs = 1; 476 j->flags.remount_proc_ro = 1; 477} 478 479void API minijail_namespace_user(struct minijail *j) 480{ 481 j->flags.userns = 1; 482} 483 484void API minijail_namespace_user_disable_setgroups(struct minijail *j) 485{ 486 j->flags.disable_setgroups = 1; 487} 488 489int API minijail_uidmap(struct minijail *j, const char *uidmap) 490{ 491 j->uidmap = strdup(uidmap); 492 if (!j->uidmap) 493 return -ENOMEM; 494 char *ch; 495 for (ch = j->uidmap; *ch; ch++) { 496 if (*ch == ',') 497 *ch = '\n'; 498 } 499 return 0; 500} 501 502int API minijail_gidmap(struct minijail *j, const char *gidmap) 503{ 504 j->gidmap = strdup(gidmap); 505 if (!j->gidmap) 506 return -ENOMEM; 507 char *ch; 508 for (ch = j->gidmap; *ch; ch++) { 509 if (*ch == ',') 510 *ch = '\n'; 511 } 512 return 0; 513} 514 515void API minijail_inherit_usergroups(struct minijail *j) 516{ 517 j->flags.usergroups = 1; 518} 519 520void API minijail_run_as_init(struct minijail *j) 521{ 522 /* 523 * Since the jailed program will become 'init' in the new PID namespace, 524 * Minijail does not need to fork an 'init' process. 
525 */ 526 j->flags.do_init = 0; 527} 528 529int API minijail_enter_chroot(struct minijail *j, const char *dir) 530{ 531 if (j->chrootdir) 532 return -EINVAL; 533 j->chrootdir = strdup(dir); 534 if (!j->chrootdir) 535 return -ENOMEM; 536 j->flags.chroot = 1; 537 return 0; 538} 539 540int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 541{ 542 if (j->chrootdir) 543 return -EINVAL; 544 j->chrootdir = strdup(dir); 545 if (!j->chrootdir) 546 return -ENOMEM; 547 j->flags.pivot_root = 1; 548 return 0; 549} 550 551char API *minijail_get_original_path(struct minijail *j, 552 const char *path_inside_chroot) 553{ 554 struct mountpoint *b; 555 556 b = j->mounts_head; 557 while (b) { 558 /* 559 * If |path_inside_chroot| is the exact destination of a 560 * mount, then the original path is exactly the source of 561 * the mount. 562 * for example: "-b /some/path/exe,/chroot/path/exe" 563 * mount source = /some/path/exe, mount dest = 564 * /chroot/path/exe Then when getting the original path of 565 * "/chroot/path/exe", the source of that mount, 566 * "/some/path/exe" is what should be returned. 567 */ 568 if (!strcmp(b->dest, path_inside_chroot)) 569 return strdup(b->src); 570 571 /* 572 * If |path_inside_chroot| is within the destination path of a 573 * mount, take the suffix of the chroot path relative to the 574 * mount destination path, and append it to the mount source 575 * path. 576 */ 577 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 578 const char *relative_path = 579 path_inside_chroot + strlen(b->dest); 580 return path_join(b->src, relative_path); 581 } 582 b = b->next; 583 } 584 585 /* If there is a chroot path, append |path_inside_chroot| to that. */ 586 if (j->chrootdir) 587 return path_join(j->chrootdir, path_inside_chroot); 588 589 /* No chroot, so the path outside is the same as it is inside. 
*/ 590 return strdup(path_inside_chroot); 591} 592 593void API minijail_mount_tmp(struct minijail *j) 594{ 595 j->flags.mount_tmp = 1; 596} 597 598int API minijail_write_pid_file(struct minijail *j, const char *path) 599{ 600 j->pid_file_path = strdup(path); 601 if (!j->pid_file_path) 602 return -ENOMEM; 603 j->flags.pid_file = 1; 604 return 0; 605} 606 607int API minijail_add_to_cgroup(struct minijail *j, const char *path) 608{ 609 if (j->cgroup_count >= MAX_CGROUPS) 610 return -ENOMEM; 611 j->cgroups[j->cgroup_count] = strdup(path); 612 if (!j->cgroups[j->cgroup_count]) 613 return -ENOMEM; 614 j->cgroup_count++; 615 j->flags.cgroups = 1; 616 return 0; 617} 618 619int API minijail_mount_with_data(struct minijail *j, const char *src, 620 const char *dest, const char *type, 621 unsigned long flags, const char *data) 622{ 623 struct mountpoint *m; 624 625 if (*dest != '/') 626 return -EINVAL; 627 m = calloc(1, sizeof(*m)); 628 if (!m) 629 return -ENOMEM; 630 m->dest = strdup(dest); 631 if (!m->dest) 632 goto error; 633 m->src = strdup(src); 634 if (!m->src) 635 goto error; 636 m->type = strdup(type); 637 if (!m->type) 638 goto error; 639 if (data) { 640 m->data = strdup(data); 641 if (!m->data) 642 goto error; 643 m->has_data = 1; 644 } 645 m->flags = flags; 646 647 info("mount %s -> %s type '%s'", src, dest, type); 648 649 /* 650 * Force vfs namespacing so the mounts don't leak out into the 651 * containing vfs namespace. 
652 */ 653 minijail_namespace_vfs(j); 654 655 if (j->mounts_tail) 656 j->mounts_tail->next = m; 657 else 658 j->mounts_head = m; 659 j->mounts_tail = m; 660 j->mounts_count++; 661 662 return 0; 663 664error: 665 free(m->type); 666 free(m->src); 667 free(m->dest); 668 free(m); 669 return -ENOMEM; 670} 671 672int API minijail_mount(struct minijail *j, const char *src, const char *dest, 673 const char *type, unsigned long flags) 674{ 675 return minijail_mount_with_data(j, src, dest, type, flags, NULL); 676} 677 678int API minijail_bind(struct minijail *j, const char *src, const char *dest, 679 int writeable) 680{ 681 unsigned long flags = MS_BIND; 682 683 if (!writeable) 684 flags |= MS_RDONLY; 685 686 return minijail_mount(j, src, dest, "", flags); 687} 688 689static void clear_seccomp_options(struct minijail *j) 690{ 691 j->flags.seccomp_filter = 0; 692 j->flags.seccomp_filter_tsync = 0; 693 j->flags.seccomp_filter_logging = 0; 694 j->filter_len = 0; 695 j->filter_prog = NULL; 696 j->flags.no_new_privs = 0; 697} 698 699static int seccomp_should_parse_filters(struct minijail *j) 700{ 701 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) { 702 /* 703 * |errno| will be set to EINVAL when seccomp has not been 704 * compiled into the kernel. On certain platforms and kernel 705 * versions this is not a fatal failure. In that case, and only 706 * in that case, disable seccomp and skip loading the filters. 707 */ 708 if ((errno == EINVAL) && seccomp_can_softfail()) { 709 warn("not loading seccomp filters, seccomp filter not " 710 "supported"); 711 clear_seccomp_options(j); 712 return 0; 713 } 714 /* 715 * If |errno| != EINVAL or seccomp_can_softfail() is false, 716 * we can proceed. Worst case scenario minijail_enter() will 717 * abort() if seccomp fails. 718 */ 719 } 720 if (j->flags.seccomp_filter_tsync) { 721 /* Are the seccomp(2) syscall and the TSYNC option supported? 
*/ 722 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 723 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) { 724 int saved_errno = errno; 725 if (saved_errno == ENOSYS && seccomp_can_softfail()) { 726 warn("seccomp(2) syscall not supported"); 727 clear_seccomp_options(j); 728 return 0; 729 } else if (saved_errno == EINVAL && 730 seccomp_can_softfail()) { 731 warn( 732 "seccomp filter thread sync not supported"); 733 clear_seccomp_options(j); 734 return 0; 735 } 736 /* 737 * Similar logic here. If seccomp_can_softfail() is 738 * false, or |errno| != ENOSYS, or |errno| != EINVAL, 739 * we can proceed. Worst case scenario minijail_enter() 740 * will abort() if seccomp or TSYNC fail. 741 */ 742 } 743 } 744 return 1; 745} 746 747static int parse_seccomp_filters(struct minijail *j, FILE *policy_file) 748{ 749 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 750 int use_ret_trap = 751 j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging; 752 int allow_logging = j->flags.seccomp_filter_logging; 753 754 if (compile_filter(policy_file, fprog, use_ret_trap, allow_logging)) { 755 free(fprog); 756 return -1; 757 } 758 759 j->filter_len = fprog->len; 760 j->filter_prog = fprog; 761 return 0; 762} 763 764void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 765{ 766 if (!seccomp_should_parse_filters(j)) 767 return; 768 769 FILE *file = fopen(path, "r"); 770 if (!file) { 771 pdie("failed to open seccomp filter file '%s'", path); 772 } 773 774 if (parse_seccomp_filters(j, file) != 0) { 775 die("failed to compile seccomp filter BPF program in '%s'", 776 path); 777 } 778 fclose(file); 779} 780 781void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd) 782{ 783 if (!seccomp_should_parse_filters(j)) 784 return; 785 786 FILE *file = fdopen(fd, "r"); 787 if (!file) { 788 pdie("failed to associate stream with fd %d", fd); 789 } 790 791 if (parse_seccomp_filters(j, file) != 0) { 792 die("failed to compile seccomp filter BPF program from 
fd %d", 793 fd); 794 } 795 fclose(file); 796} 797 798int API minijail_use_alt_syscall(struct minijail *j, const char *table) 799{ 800 j->alt_syscall_table = strdup(table); 801 if (!j->alt_syscall_table) 802 return -ENOMEM; 803 j->flags.alt_syscall = 1; 804 return 0; 805} 806 807struct marshal_state { 808 size_t available; 809 size_t total; 810 char *buf; 811}; 812 813void marshal_state_init(struct marshal_state *state, char *buf, 814 size_t available) 815{ 816 state->available = available; 817 state->buf = buf; 818 state->total = 0; 819} 820 821void marshal_append(struct marshal_state *state, void *src, size_t length) 822{ 823 size_t copy_len = MIN(state->available, length); 824 825 /* Up to |available| will be written. */ 826 if (copy_len) { 827 memcpy(state->buf, src, copy_len); 828 state->buf += copy_len; 829 state->available -= copy_len; 830 } 831 /* |total| will contain the expected length. */ 832 state->total += length; 833} 834 835void marshal_mount(struct marshal_state *state, const struct mountpoint *m) 836{ 837 marshal_append(state, m->src, strlen(m->src) + 1); 838 marshal_append(state, m->dest, strlen(m->dest) + 1); 839 marshal_append(state, m->type, strlen(m->type) + 1); 840 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data)); 841 if (m->has_data) 842 marshal_append(state, m->data, strlen(m->data) + 1); 843 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 844} 845 846void minijail_marshal_helper(struct marshal_state *state, 847 const struct minijail *j) 848{ 849 struct mountpoint *m = NULL; 850 size_t i; 851 852 marshal_append(state, (char *)j, sizeof(*j)); 853 if (j->user) 854 marshal_append(state, j->user, strlen(j->user) + 1); 855 if (j->suppl_gid_list) { 856 marshal_append(state, j->suppl_gid_list, 857 j->suppl_gid_count * sizeof(gid_t)); 858 } 859 if (j->chrootdir) 860 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 861 if (j->alt_syscall_table) { 862 marshal_append(state, j->alt_syscall_table, 863 
strlen(j->alt_syscall_table) + 1); 864 } 865 if (j->flags.seccomp_filter && j->filter_prog) { 866 struct sock_fprog *fp = j->filter_prog; 867 marshal_append(state, (char *)fp->filter, 868 fp->len * sizeof(struct sock_filter)); 869 } 870 for (m = j->mounts_head; m; m = m->next) { 871 marshal_mount(state, m); 872 } 873 for (i = 0; i < j->cgroup_count; ++i) 874 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1); 875} 876 877size_t API minijail_size(const struct minijail *j) 878{ 879 struct marshal_state state; 880 marshal_state_init(&state, NULL, 0); 881 minijail_marshal_helper(&state, j); 882 return state.total; 883} 884 885int minijail_marshal(const struct minijail *j, char *buf, size_t available) 886{ 887 struct marshal_state state; 888 marshal_state_init(&state, buf, available); 889 minijail_marshal_helper(&state, j); 890 return (state.total > available); 891} 892 893int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 894{ 895 size_t i; 896 size_t count; 897 int ret = -EINVAL; 898 899 if (length < sizeof(*j)) 900 goto out; 901 memcpy((void *)j, serialized, sizeof(*j)); 902 serialized += sizeof(*j); 903 length -= sizeof(*j); 904 905 /* Potentially stale pointers not used as signals. 
*/ 906 j->pid_file_path = NULL; 907 j->uidmap = NULL; 908 j->gidmap = NULL; 909 j->mounts_head = NULL; 910 j->mounts_tail = NULL; 911 j->filter_prog = NULL; 912 913 if (j->user) { /* stale pointer */ 914 char *user = consumestr(&serialized, &length); 915 if (!user) 916 goto clear_pointers; 917 j->user = strdup(user); 918 if (!j->user) 919 goto clear_pointers; 920 } 921 922 if (j->suppl_gid_list) { /* stale pointer */ 923 if (j->suppl_gid_count > NGROUPS_MAX) { 924 goto bad_gid_list; 925 } 926 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 927 void *gid_list_bytes = 928 consumebytes(gid_list_size, &serialized, &length); 929 if (!gid_list_bytes) 930 goto bad_gid_list; 931 932 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 933 if (!j->suppl_gid_list) 934 goto bad_gid_list; 935 936 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 937 } 938 939 if (j->chrootdir) { /* stale pointer */ 940 char *chrootdir = consumestr(&serialized, &length); 941 if (!chrootdir) 942 goto bad_chrootdir; 943 j->chrootdir = strdup(chrootdir); 944 if (!j->chrootdir) 945 goto bad_chrootdir; 946 } 947 948 if (j->alt_syscall_table) { /* stale pointer */ 949 char *alt_syscall_table = consumestr(&serialized, &length); 950 if (!alt_syscall_table) 951 goto bad_syscall_table; 952 j->alt_syscall_table = strdup(alt_syscall_table); 953 if (!j->alt_syscall_table) 954 goto bad_syscall_table; 955 } 956 957 if (j->flags.seccomp_filter && j->filter_len > 0) { 958 size_t ninstrs = j->filter_len; 959 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 960 ninstrs > USHRT_MAX) 961 goto bad_filters; 962 963 size_t program_len = ninstrs * sizeof(struct sock_filter); 964 void *program = consumebytes(program_len, &serialized, &length); 965 if (!program) 966 goto bad_filters; 967 968 j->filter_prog = malloc(sizeof(struct sock_fprog)); 969 if (!j->filter_prog) 970 goto bad_filters; 971 972 j->filter_prog->len = ninstrs; 973 j->filter_prog->filter = malloc(program_len); 974 if 
(!j->filter_prog->filter) 975 goto bad_filter_prog_instrs; 976 977 memcpy(j->filter_prog->filter, program, program_len); 978 } 979 980 count = j->mounts_count; 981 j->mounts_count = 0; 982 for (i = 0; i < count; ++i) { 983 unsigned long *flags; 984 int *has_data; 985 const char *dest; 986 const char *type; 987 const char *data = NULL; 988 const char *src = consumestr(&serialized, &length); 989 if (!src) 990 goto bad_mounts; 991 dest = consumestr(&serialized, &length); 992 if (!dest) 993 goto bad_mounts; 994 type = consumestr(&serialized, &length); 995 if (!type) 996 goto bad_mounts; 997 has_data = consumebytes(sizeof(*has_data), &serialized, 998 &length); 999 if (!has_data) 1000 goto bad_mounts; 1001 if (*has_data) { 1002 data = consumestr(&serialized, &length); 1003 if (!data) 1004 goto bad_mounts; 1005 } 1006 flags = consumebytes(sizeof(*flags), &serialized, &length); 1007 if (!flags) 1008 goto bad_mounts; 1009 if (minijail_mount_with_data(j, src, dest, type, *flags, data)) 1010 goto bad_mounts; 1011 } 1012 1013 count = j->cgroup_count; 1014 j->cgroup_count = 0; 1015 for (i = 0; i < count; ++i) { 1016 char *cgroup = consumestr(&serialized, &length); 1017 if (!cgroup) 1018 goto bad_cgroups; 1019 j->cgroups[i] = strdup(cgroup); 1020 if (!j->cgroups[i]) 1021 goto bad_cgroups; 1022 ++j->cgroup_count; 1023 } 1024 1025 return 0; 1026 1027bad_cgroups: 1028 while (j->mounts_head) { 1029 struct mountpoint *m = j->mounts_head; 1030 j->mounts_head = j->mounts_head->next; 1031 free(m->data); 1032 free(m->type); 1033 free(m->dest); 1034 free(m->src); 1035 free(m); 1036 } 1037 for (i = 0; i < j->cgroup_count; ++i) 1038 free(j->cgroups[i]); 1039bad_mounts: 1040 if (j->flags.seccomp_filter && j->filter_len > 0) { 1041 free(j->filter_prog->filter); 1042 free(j->filter_prog); 1043 } 1044bad_filter_prog_instrs: 1045 if (j->filter_prog) 1046 free(j->filter_prog); 1047bad_filters: 1048 if (j->alt_syscall_table) 1049 free(j->alt_syscall_table); 1050bad_syscall_table: 1051 if 
(j->chrootdir) 1052 free(j->chrootdir); 1053bad_chrootdir: 1054 if (j->suppl_gid_list) 1055 free(j->suppl_gid_list); 1056bad_gid_list: 1057 if (j->user) 1058 free(j->user); 1059clear_pointers: 1060 j->user = NULL; 1061 j->suppl_gid_list = NULL; 1062 j->chrootdir = NULL; 1063 j->alt_syscall_table = NULL; 1064 j->cgroup_count = 0; 1065out: 1066 return ret; 1067} 1068 1069/* 1070 * setup_mount_destination: Ensures the mount target exists. 1071 * Creates it if needed and possible. 1072 */ 1073static int setup_mount_destination(const char *source, const char *dest, 1074 uid_t uid, uid_t gid) 1075{ 1076 int rc; 1077 struct stat st_buf; 1078 1079 rc = stat(dest, &st_buf); 1080 if (rc == 0) /* destination exists */ 1081 return 0; 1082 1083 /* 1084 * Try to create the destination. 1085 * Either make a directory or touch a file depending on the source type. 1086 * If the source doesn't exist, assume it is a filesystem type such as 1087 * "tmpfs" and create a directory to mount it on. 1088 */ 1089 rc = stat(source, &st_buf); 1090 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode)) { 1091 if (mkdir(dest, 0700)) 1092 return -errno; 1093 } else { 1094 int fd = open(dest, O_RDWR | O_CREAT, 0700); 1095 if (fd < 0) 1096 return -errno; 1097 close(fd); 1098 } 1099 return chown(dest, uid, gid); 1100} 1101 1102/* 1103 * mount_one: Applies mounts from @m for @j, recursing as needed. 1104 * @j Minijail these mounts are for 1105 * @m Head of list of mounts 1106 * 1107 * Returns 0 for success. 1108 */ 1109static int mount_one(const struct minijail *j, struct mountpoint *m) 1110{ 1111 int ret; 1112 char *dest; 1113 int remount_ro = 0; 1114 1115 /* |dest| has a leading "/". 
*/ 1116 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 1117 return -ENOMEM; 1118 1119 if (setup_mount_destination(m->src, dest, j->uid, j->gid)) 1120 pdie("creating mount target '%s' failed", dest); 1121 1122 /* 1123 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1124 * can't both be specified in the original bind mount. 1125 * Remount R/O after the initial mount. 1126 */ 1127 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1128 remount_ro = 1; 1129 m->flags &= ~MS_RDONLY; 1130 } 1131 1132 ret = mount(m->src, dest, m->type, m->flags, m->data); 1133 if (ret) 1134 pdie("mount: %s -> %s", m->src, dest); 1135 1136 if (remount_ro) { 1137 m->flags |= MS_RDONLY; 1138 ret = mount(m->src, dest, NULL, 1139 m->flags | MS_REMOUNT, m->data); 1140 if (ret) 1141 pdie("bind ro: %s -> %s", m->src, dest); 1142 } 1143 1144 free(dest); 1145 if (m->next) 1146 return mount_one(j, m->next); 1147 return ret; 1148} 1149 1150static int enter_chroot(const struct minijail *j) 1151{ 1152 int ret; 1153 1154 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1155 return ret; 1156 1157 if (chroot(j->chrootdir)) 1158 return -errno; 1159 1160 if (chdir("/")) 1161 return -errno; 1162 1163 return 0; 1164} 1165 1166static int enter_pivot_root(const struct minijail *j) 1167{ 1168 int ret, oldroot, newroot; 1169 1170 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1171 return ret; 1172 1173 /* 1174 * Keep the fd for both old and new root. 1175 * It will be used in fchdir(2) later. 1176 */ 1177 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1178 if (oldroot < 0) 1179 pdie("failed to open / for fchdir"); 1180 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1181 if (newroot < 0) 1182 pdie("failed to open %s for fchdir", j->chrootdir); 1183 1184 /* 1185 * To ensure j->chrootdir is the root of a filesystem, 1186 * do a self bind mount. 
1187 */ 1188 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1189 pdie("failed to bind mount '%s'", j->chrootdir); 1190 if (chdir(j->chrootdir)) 1191 return -errno; 1192 if (syscall(SYS_pivot_root, ".", ".")) 1193 pdie("pivot_root"); 1194 1195 /* 1196 * Now the old root is mounted on top of the new root. Use fchdir(2) to 1197 * change to the old root and unmount it. 1198 */ 1199 if (fchdir(oldroot)) 1200 pdie("failed to fchdir to old /"); 1201 1202 /* 1203 * If j->flags.skip_remount_private was enabled for minijail_enter(), 1204 * there could be a shared mount point under |oldroot|. In that case, 1205 * mounts under this shared mount point will be unmounted below, and 1206 * this unmounting will propagate to the original mount namespace 1207 * (because the mount point is shared). To prevent this unexpected 1208 * unmounting, remove these mounts from their peer groups by recursively 1209 * remounting them as MS_PRIVATE. 1210 */ 1211 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL)) 1212 pdie("failed to mount(/, private) before umount(/)"); 1213 /* The old root might be busy, so use lazy unmount. */ 1214 if (umount2(".", MNT_DETACH)) 1215 pdie("umount(/)"); 1216 /* Change back to the new root. */ 1217 if (fchdir(newroot)) 1218 return -errno; 1219 if (close(oldroot)) 1220 return -errno; 1221 if (close(newroot)) 1222 return -errno; 1223 if (chroot("/")) 1224 return -errno; 1225 /* Set correct CWD for getcwd(3). 
*/ 1226 if (chdir("/")) 1227 return -errno; 1228 1229 return 0; 1230} 1231 1232static int mount_tmp(void) 1233{ 1234 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1235} 1236 1237static int remount_proc_readonly(const struct minijail *j) 1238{ 1239 const char *kProcPath = "/proc"; 1240 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1241 /* 1242 * Right now, we're holding a reference to our parent's old mount of 1243 * /proc in our namespace, which means using MS_REMOUNT here would 1244 * mutate our parent's mount as well, even though we're in a VFS 1245 * namespace (!). Instead, remove their mount from our namespace lazily 1246 * (MNT_DETACH) and make our own. 1247 */ 1248 if (umount2(kProcPath, MNT_DETACH)) { 1249 /* 1250 * If we are in a new user namespace, umount(2) will fail. 1251 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html 1252 */ 1253 if (j->flags.userns) { 1254 info("umount(/proc, MNT_DETACH) failed, " 1255 "this is expected when using user namespaces"); 1256 } else { 1257 return -errno; 1258 } 1259 } 1260 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1261 return -errno; 1262 return 0; 1263} 1264 1265static void kill_child_and_die(const struct minijail *j, const char *msg) 1266{ 1267 kill(j->initpid, SIGKILL); 1268 die("%s", msg); 1269} 1270 1271static void write_pid_file_or_die(const struct minijail *j) 1272{ 1273 if (write_pid_to_path(j->initpid, j->pid_file_path)) 1274 kill_child_and_die(j, "failed to write pid file"); 1275} 1276 1277static void add_to_cgroups_or_die(const struct minijail *j) 1278{ 1279 size_t i; 1280 1281 for (i = 0; i < j->cgroup_count; ++i) { 1282 if (write_pid_to_path(j->initpid, j->cgroups[i])) 1283 kill_child_and_die(j, "failed to add to cgroups"); 1284 } 1285} 1286 1287static void write_ugid_maps_or_die(const struct minijail *j) 1288{ 1289 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0) 1290 kill_child_and_die(j, "failed to write 
/*
 * wait_for_parent_setup: child-side half of the parent/child sync pipe.
 * Blocks until the parent has finished its part of the setup, which the
 * parent signals by closing the pipe without writing anything.
 *
 * pipe_fds: the pipe pair, [0] = read end, [1] = write end. Both ends are
 * closed by the time this function returns.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char byte;
	ssize_t nread;

	/* Drop our own copy of the write end, otherwise EOF never arrives. */
	close(pipe_fds[1]);

	/*
	 * A clean end-of-file (0 bytes) is the only acceptable outcome; data
	 * on the pipe or a read error both mean the handshake went wrong.
	 */
	nread = read(pipe_fds[0], &byte, 1);
	if (nread != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
/*
 * get_last_valid_cap: returns the number of the highest capability known to
 * the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we read the answer from /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failure to open or parse the /proc file is reported through pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;

	if (is_android()) {
		/* Probe upwards until the kernel rejects the capability. */
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap)
			;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0)
			last_valid_cap--;
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");

		/*
		 * fopen() returns NULL when /proc is not mounted or the file
		 * is not readable; passing that to fscanf()/fclose() would
		 * dereference a null stream.
		 */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
*/ 1420 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1421 continue; 1422 flag[0] = i; 1423 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1424 die("can't add effective cap"); 1425 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1426 die("can't add permitted cap"); 1427 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1428 die("can't add inheritable cap"); 1429 } 1430 if (cap_set_proc(caps)) 1431 die("can't apply initial cleaned capset"); 1432 1433 /* 1434 * Instead of dropping bounding set first, do it here in case 1435 * the caller had a more permissive bounding set which could 1436 * have been used above to raise a capability that wasn't already 1437 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1438 */ 1439 drop_capbset(j->caps, last_valid_cap); 1440 1441 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1442 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1443 flag[0] = CAP_SETPCAP; 1444 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1445 die("can't clear effective cap"); 1446 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1447 die("can't clear permitted cap"); 1448 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1449 die("can't clear inheritable cap"); 1450 } 1451 1452 if (cap_set_proc(caps)) 1453 die("can't apply final cleaned capset"); 1454 1455 cap_free(caps); 1456} 1457 1458static void set_seccomp_filter(const struct minijail *j) 1459{ 1460 /* 1461 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1462 * in the kernel source tree for an explanation of the parameters. 1463 */ 1464 if (j->flags.no_new_privs) { 1465 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1466 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1467 } 1468 1469 /* 1470 * Code running with ASan 1471 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1472 * will make system calls not included in the syscall filter policy, 1473 * which will likely crash the program. 
Skip setting seccomp filter in 1474 * that case. 1475 * 'running_with_asan()' has no inputs and is completely defined at 1476 * build time, so this cannot be used by an attacker to skip setting 1477 * seccomp filter. 1478 */ 1479 if (j->flags.seccomp_filter && running_with_asan()) { 1480 warn("running with ASan, not setting seccomp filter"); 1481 return; 1482 } 1483 1484 if (j->flags.seccomp_filter) { 1485 if (j->flags.seccomp_filter_logging) { 1486 /* 1487 * If logging seccomp filter failures, 1488 * install the SIGSYS handler first. 1489 */ 1490 if (install_sigsys_handler()) 1491 pdie("failed to install SIGSYS handler"); 1492 warn("logging seccomp filter failures"); 1493 } else if (j->flags.seccomp_filter_tsync) { 1494 /* 1495 * If setting thread sync, 1496 * reset the SIGSYS signal handler so that 1497 * the entire thread group is killed. 1498 */ 1499 if (signal(SIGSYS, SIG_DFL) == SIG_ERR) 1500 pdie("failed to reset SIGSYS disposition"); 1501 info("reset SIGSYS disposition"); 1502 } 1503 } 1504 1505 /* 1506 * Install the syscall filter. 1507 */ 1508 if (j->flags.seccomp_filter) { 1509 if (j->flags.seccomp_filter_tsync) { 1510 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 1511 SECCOMP_FILTER_FLAG_TSYNC, 1512 j->filter_prog)) { 1513 pdie("seccomp(tsync) failed"); 1514 } 1515 } else { 1516 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1517 j->filter_prog)) { 1518 pdie("prctl(seccomp_filter) failed"); 1519 } 1520 } 1521 } 1522} 1523 1524static void config_net_loopback(void) 1525{ 1526 static const char ifname[] = "lo"; 1527 int sock; 1528 struct ifreq ifr; 1529 1530 /* Make sure people don't try to add really long names. */ 1531 _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long"); 1532 1533 sock = socket(AF_LOCAL, SOCK_DGRAM|SOCK_CLOEXEC, 0); 1534 if (sock < 0) 1535 pdie("socket(AF_LOCAL) failed"); 1536 1537 /* 1538 * Do the equiv of `ip link set up lo`. The kernel will assign 1539 * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically! 
1540 */ 1541 strcpy(ifr.ifr_name, ifname); 1542 if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) 1543 pdie("ioctl(SIOCGIFFLAGS) failed"); 1544 1545 /* The kernel preserves ifr.ifr_name for use. */ 1546 ifr.ifr_flags |= IFF_UP | IFF_RUNNING; 1547 if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) 1548 pdie("ioctl(SIOCSIFFLAGS) failed"); 1549 1550 close(sock); 1551} 1552 1553void API minijail_enter(const struct minijail *j) 1554{ 1555 /* 1556 * If we're dropping caps, get the last valid cap from /proc now, 1557 * since /proc can be unmounted before drop_caps() is called. 1558 */ 1559 unsigned int last_valid_cap = 0; 1560 if (j->flags.capbset_drop || j->flags.use_caps) 1561 last_valid_cap = get_last_valid_cap(); 1562 1563 if (j->flags.pids) 1564 die("tried to enter a pid-namespaced jail;" 1565 " try minijail_run()?"); 1566 1567 if (j->flags.usergroups && !j->user) 1568 die("usergroup inheritance without username"); 1569 1570 /* 1571 * We can't recover from failures if we've dropped privileges partially, 1572 * so we don't even try. If any of our operations fail, we abort() the 1573 * entire process. 1574 */ 1575 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1576 pdie("setns(CLONE_NEWNS) failed"); 1577 1578 if (j->flags.vfs) { 1579 if (unshare(CLONE_NEWNS)) 1580 pdie("unshare(CLONE_NEWNS) failed"); 1581 /* 1582 * Unless asked not to, remount all filesystems as private. 1583 * If they are shared, new bind mounts will creep out of our 1584 * namespace. 
1585 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1586 */ 1587 if (!j->flags.skip_remount_private) { 1588 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1589 pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE," 1590 " NULL) failed"); 1591 } 1592 } 1593 1594 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1595 pdie("unshare(CLONE_NEWIPC) failed"); 1596 } 1597 1598 if (j->flags.enter_net) { 1599 if (setns(j->netns_fd, CLONE_NEWNET)) 1600 pdie("setns(CLONE_NEWNET) failed"); 1601 } else if (j->flags.net) { 1602 if (unshare(CLONE_NEWNET)) 1603 pdie("unshare(CLONE_NEWNET) failed"); 1604 config_net_loopback(); 1605 } 1606 1607 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP)) 1608 pdie("unshare(CLONE_NEWCGROUP) failed"); 1609 1610 if (j->flags.chroot && enter_chroot(j)) 1611 pdie("chroot"); 1612 1613 if (j->flags.pivot_root && enter_pivot_root(j)) 1614 pdie("pivot_root"); 1615 1616 if (j->flags.mount_tmp && mount_tmp()) 1617 pdie("mount_tmp"); 1618 1619 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1620 pdie("remount"); 1621 1622 /* 1623 * If we're only dropping capabilities from the bounding set, but not 1624 * from the thread's (permitted|inheritable|effective) sets, do it now. 1625 */ 1626 if (j->flags.capbset_drop) { 1627 drop_capbset(j->cap_bset, last_valid_cap); 1628 } 1629 1630 if (j->flags.use_caps) { 1631 /* 1632 * POSIX capabilities are a bit tricky. If we drop our 1633 * capability to change uids, our attempt to use setuid() 1634 * below will fail. Hang on to root caps across setuid(), then 1635 * lock securebits. 1636 */ 1637 if (prctl(PR_SET_KEEPCAPS, 1)) 1638 pdie("prctl(PR_SET_KEEPCAPS) failed"); 1639 1640 /* 1641 * Kernels 4.3+ define a new securebit 1642 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS 1643 * and SECURE_ALL_LOCKS masks from newer kernel headers will 1644 * return EPERM on older kernels. Detect this, and retry with 1645 * the right mask for older (2.6.26-4.2) kernels. 
1646 */ 1647 int securebits_ret = prctl(PR_SET_SECUREBITS, 1648 SECURE_ALL_BITS | SECURE_ALL_LOCKS); 1649 if (securebits_ret < 0) { 1650 if (errno == EPERM) { 1651 /* Possibly running on kernel < 4.3. */ 1652 securebits_ret = prctl( 1653 PR_SET_SECUREBITS, 1654 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS); 1655 } 1656 } 1657 if (securebits_ret < 0) 1658 pdie("prctl(PR_SET_SECUREBITS) failed"); 1659 } 1660 1661 if (j->flags.no_new_privs) { 1662 /* 1663 * If we're setting no_new_privs, we can drop privileges 1664 * before setting seccomp filter. This way filter policies 1665 * don't need to allow privilege-dropping syscalls. 1666 */ 1667 drop_ugid(j); 1668 drop_caps(j, last_valid_cap); 1669 set_seccomp_filter(j); 1670 } else { 1671 /* 1672 * If we're not setting no_new_privs, 1673 * we need to set seccomp filter *before* dropping privileges. 1674 * WARNING: this means that filter policies *must* allow 1675 * setgroups()/setresgid()/setresuid() for dropping root and 1676 * capget()/capset()/prctl() for dropping caps. 1677 */ 1678 set_seccomp_filter(j); 1679 drop_ugid(j); 1680 drop_caps(j, last_valid_cap); 1681 } 1682 1683 /* 1684 * Select the specified alternate syscall table. The table must not 1685 * block prctl(2) if we're using seccomp as well. 1686 */ 1687 if (j->flags.alt_syscall) { 1688 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1689 pdie("prctl(PR_ALT_SYSCALL) failed"); 1690 } 1691 1692 /* 1693 * seccomp has to come last since it cuts off all the other 1694 * privilege-dropping syscalls :) 1695 */ 1696 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1697 if ((errno == EINVAL) && seccomp_can_softfail()) { 1698 warn("seccomp not supported"); 1699 return; 1700 } 1701 pdie("prctl(PR_SET_SECCOMP) failed"); 1702 } 1703} 1704 1705/* TODO(wad): will visibility affect this variable? 
*/ 1706static int init_exitstatus = 0; 1707 1708void init_term(int __attribute__ ((unused)) sig) 1709{ 1710 _exit(init_exitstatus); 1711} 1712 1713void init(pid_t rootpid) 1714{ 1715 pid_t pid; 1716 int status; 1717 /* So that we exit with the right status. */ 1718 signal(SIGTERM, init_term); 1719 /* TODO(wad): self jail with seccomp filters here. */ 1720 while ((pid = wait(&status)) > 0) { 1721 /* 1722 * This loop will only end when either there are no processes 1723 * left inside our pid namespace or we get a signal. 1724 */ 1725 if (pid == rootpid) 1726 init_exitstatus = status; 1727 } 1728 if (!WIFEXITED(init_exitstatus)) 1729 _exit(MINIJAIL_ERR_INIT); 1730 _exit(WEXITSTATUS(init_exitstatus)); 1731} 1732 1733int API minijail_from_fd(int fd, struct minijail *j) 1734{ 1735 size_t sz = 0; 1736 size_t bytes = read(fd, &sz, sizeof(sz)); 1737 char *buf; 1738 int r; 1739 if (sizeof(sz) != bytes) 1740 return -EINVAL; 1741 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1742 return -E2BIG; 1743 buf = malloc(sz); 1744 if (!buf) 1745 return -ENOMEM; 1746 bytes = read(fd, buf, sz); 1747 if (bytes != sz) { 1748 free(buf); 1749 return -EINVAL; 1750 } 1751 r = minijail_unmarshal(j, buf, sz); 1752 free(buf); 1753 return r; 1754} 1755 1756int API minijail_to_fd(struct minijail *j, int fd) 1757{ 1758 char *buf; 1759 size_t sz = minijail_size(j); 1760 ssize_t written; 1761 int r; 1762 1763 if (!sz) 1764 return -EINVAL; 1765 buf = malloc(sz); 1766 r = minijail_marshal(j, buf, sz); 1767 if (r) { 1768 free(buf); 1769 return r; 1770 } 1771 /* Sends [size][minijail]. */ 1772 written = write(fd, &sz, sizeof(sz)); 1773 if (written != sizeof(sz)) { 1774 free(buf); 1775 return -EFAULT; 1776 } 1777 written = write(fd, buf, sz); 1778 if (written < 0 || (size_t) written != sz) { 1779 free(buf); 1780 return -EFAULT; 1781 } 1782 free(buf); 1783 return 0; 1784} 1785 1786int setup_preload(void) 1787{ 1788#if defined(__ANDROID__) 1789 /* Don't use LDPRELOAD on Brillo. 
/*
 * setup_and_dupe_pipe_end: keeps one end of a pipe and duplicates it onto a
 * fixed descriptor number.
 *
 * fds:   pipe pair, [0] = read end, [1] = write end.
 * index: which end to keep (0 or 1); the opposite end is closed.
 * fd:    descriptor number the kept end is dup2(2)'ed onto.
 *
 * Returns the result of dup2(2) (|fd| on success, -1 on failure), or -1
 * without touching any descriptor when |index| is not 0 or 1. The original
 * descriptor fds[index] is left open.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other_end;

	if (index != 0 && index != 1)
		return -1;

	other_end = (index == 0) ? 1 : 0;
	close(fds[other_end]);

	/* Mirror the end we keep onto the requested descriptor. */
	return dup2(fds[index], fd);
}
*/ 1870 if (should_close && fd != dir_fd) 1871 close(fd); 1872 } 1873 closedir(d); 1874 return 0; 1875} 1876 1877int minijail_run_internal(struct minijail *j, const char *filename, 1878 char *const argv[], pid_t *pchild_pid, 1879 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1880 int use_preload); 1881 1882int API minijail_run(struct minijail *j, const char *filename, 1883 char *const argv[]) 1884{ 1885 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1886 true); 1887} 1888 1889int API minijail_run_pid(struct minijail *j, const char *filename, 1890 char *const argv[], pid_t *pchild_pid) 1891{ 1892 return minijail_run_internal(j, filename, argv, pchild_pid, 1893 NULL, NULL, NULL, true); 1894} 1895 1896int API minijail_run_pipe(struct minijail *j, const char *filename, 1897 char *const argv[], int *pstdin_fd) 1898{ 1899 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1900 NULL, NULL, true); 1901} 1902 1903int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1904 char *const argv[], pid_t *pchild_pid, 1905 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1906{ 1907 return minijail_run_internal(j, filename, argv, pchild_pid, 1908 pstdin_fd, pstdout_fd, pstderr_fd, true); 1909} 1910 1911int API minijail_run_no_preload(struct minijail *j, const char *filename, 1912 char *const argv[]) 1913{ 1914 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1915 false); 1916} 1917 1918int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1919 const char *filename, 1920 char *const argv[], 1921 pid_t *pchild_pid, 1922 int *pstdin_fd, int *pstdout_fd, 1923 int *pstderr_fd) 1924{ 1925 return minijail_run_internal(j, filename, argv, pchild_pid, 1926 pstdin_fd, pstdout_fd, pstderr_fd, false); 1927} 1928 1929int minijail_run_internal(struct minijail *j, const char *filename, 1930 char *const argv[], pid_t *pchild_pid, 1931 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1932 int use_preload) 
1933{ 1934 char *oldenv, *oldenv_copy = NULL; 1935 pid_t child_pid; 1936 int pipe_fds[2]; 1937 int stdin_fds[2]; 1938 int stdout_fds[2]; 1939 int stderr_fds[2]; 1940 int child_sync_pipe_fds[2]; 1941 int sync_child = 0; 1942 int ret; 1943 /* We need to remember this across the minijail_preexec() call. */ 1944 int pid_namespace = j->flags.pids; 1945 int do_init = j->flags.do_init; 1946 1947 if (use_preload) { 1948 oldenv = getenv(kLdPreloadEnvVar); 1949 if (oldenv) { 1950 oldenv_copy = strdup(oldenv); 1951 if (!oldenv_copy) 1952 return -ENOMEM; 1953 } 1954 1955 if (setup_preload()) 1956 return -EFAULT; 1957 } 1958 1959 if (!use_preload) { 1960 if (j->flags.use_caps && j->caps != 0) 1961 die("non-empty capabilities are not supported without " 1962 "LD_PRELOAD"); 1963 } 1964 1965 /* 1966 * Make the process group ID of this process equal to its PID. 1967 * In the non-interactive case (e.g. when the parent process is started 1968 * from init) this ensures the parent process and the jailed process 1969 * can be killed together. 1970 * When the parent process is started from the console this ensures 1971 * the call to setsid(2) in the jailed process succeeds. 1972 * 1973 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1974 * the process is already a process group leader. 1975 */ 1976 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1977 if (errno != EPERM) { 1978 pdie("setpgid(0, 0) failed"); 1979 } 1980 } 1981 1982 if (use_preload) { 1983 /* 1984 * Before we fork(2) and execve(2) the child process, we need 1985 * to open a pipe(2) to send the minijail configuration over. 1986 */ 1987 if (setup_pipe(pipe_fds)) 1988 return -EFAULT; 1989 } 1990 1991 /* 1992 * If we want to write to the child process' standard input, 1993 * create the pipe(2) now. 1994 */ 1995 if (pstdin_fd) { 1996 if (pipe(stdin_fds)) 1997 return -EFAULT; 1998 } 1999 2000 /* 2001 * If we want to read from the child process' standard output, 2002 * create the pipe(2) now. 
2003 */ 2004 if (pstdout_fd) { 2005 if (pipe(stdout_fds)) 2006 return -EFAULT; 2007 } 2008 2009 /* 2010 * If we want to read from the child process' standard error, 2011 * create the pipe(2) now. 2012 */ 2013 if (pstderr_fd) { 2014 if (pipe(stderr_fds)) 2015 return -EFAULT; 2016 } 2017 2018 /* 2019 * If we want to set up a new uid/gid map in the user namespace, 2020 * or if we need to add the child process to cgroups, create the pipe(2) 2021 * to sync between parent and child. 2022 */ 2023 if (j->flags.userns || j->flags.cgroups) { 2024 sync_child = 1; 2025 if (pipe(child_sync_pipe_fds)) 2026 return -EFAULT; 2027 } 2028 2029 /* 2030 * Use sys_clone() if and only if we're creating a pid namespace. 2031 * 2032 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 2033 * 2034 * In multithreaded programs, there are a bunch of locks inside libc, 2035 * some of which may be held by other threads at the time that we call 2036 * minijail_run_pid(). If we call fork(), glibc does its level best to 2037 * ensure that we hold all of these locks before it calls clone() 2038 * internally and drop them after clone() returns, but when we call 2039 * sys_clone(2) directly, all that gets bypassed and we end up with a 2040 * child address space where some of libc's important locks are held by 2041 * other threads (which did not get cloned, and hence will never release 2042 * those locks). This is okay so long as we call exec() immediately 2043 * after, but a bunch of seemingly-innocent libc functions like setenv() 2044 * take locks. 2045 * 2046 * Hence, only call sys_clone() if we need to, in order to get at pid 2047 * namespacing. If we follow this path, the child's address space might 2048 * have broken locks; you may only call functions that do not acquire 2049 * any locks. 
2050 * 2051 * Unfortunately, fork() acquires every lock it can get its hands on, as 2052 * previously detailed, so this function is highly likely to deadlock 2053 * later on (see "deadlock here") if we're multithreaded. 2054 * 2055 * We might hack around this by having the clone()d child (init of the 2056 * pid namespace) return directly, rather than leaving the clone()d 2057 * process hanging around to be init for the new namespace (and having 2058 * its fork()ed child return in turn), but that process would be 2059 * crippled with its libc locks potentially broken. We might try 2060 * fork()ing in the parent before we clone() to ensure that we own all 2061 * the locks, but then we have to have the forked child hanging around 2062 * consuming resources (and possibly having file descriptors / shared 2063 * memory regions / etc attached). We'd need to keep the child around to 2064 * avoid having its children get reparented to init. 2065 * 2066 * TODO(ellyjones): figure out if the "forked child hanging around" 2067 * problem is fixable or not. It would be nice if we worked in this 2068 * case. 2069 */ 2070 if (pid_namespace) { 2071 int clone_flags = CLONE_NEWPID | SIGCHLD; 2072 if (j->flags.userns) 2073 clone_flags |= CLONE_NEWUSER; 2074 child_pid = syscall(SYS_clone, clone_flags, NULL); 2075 } else { 2076 child_pid = fork(); 2077 } 2078 2079 if (child_pid < 0) { 2080 if (use_preload) { 2081 free(oldenv_copy); 2082 } 2083 die("failed to fork child"); 2084 } 2085 2086 if (child_pid) { 2087 if (use_preload) { 2088 /* Restore parent's LD_PRELOAD. 
*/ 2089 if (oldenv_copy) { 2090 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 2091 free(oldenv_copy); 2092 } else { 2093 unsetenv(kLdPreloadEnvVar); 2094 } 2095 unsetenv(kFdEnvVar); 2096 } 2097 2098 j->initpid = child_pid; 2099 2100 if (j->flags.pid_file) 2101 write_pid_file_or_die(j); 2102 2103 if (j->flags.cgroups) 2104 add_to_cgroups_or_die(j); 2105 2106 if (j->flags.userns) 2107 write_ugid_maps_or_die(j); 2108 2109 if (sync_child) 2110 parent_setup_complete(child_sync_pipe_fds); 2111 2112 if (use_preload) { 2113 /* Send marshalled minijail. */ 2114 close(pipe_fds[0]); /* read endpoint */ 2115 ret = minijail_to_fd(j, pipe_fds[1]); 2116 close(pipe_fds[1]); /* write endpoint */ 2117 if (ret) { 2118 kill(j->initpid, SIGKILL); 2119 die("failed to send marshalled minijail"); 2120 } 2121 } 2122 2123 if (pchild_pid) 2124 *pchild_pid = child_pid; 2125 2126 /* 2127 * If we want to write to the child process' standard input, 2128 * set up the write end of the pipe. 2129 */ 2130 if (pstdin_fd) 2131 *pstdin_fd = setup_pipe_end(stdin_fds, 2132 1 /* write end */); 2133 2134 /* 2135 * If we want to read from the child process' standard output, 2136 * set up the read end of the pipe. 2137 */ 2138 if (pstdout_fd) 2139 *pstdout_fd = setup_pipe_end(stdout_fds, 2140 0 /* read end */); 2141 2142 /* 2143 * If we want to read from the child process' standard error, 2144 * set up the read end of the pipe. 2145 */ 2146 if (pstderr_fd) 2147 *pstderr_fd = setup_pipe_end(stderr_fds, 2148 0 /* read end */); 2149 2150 return 0; 2151 } 2152 /* Child process. 
*/ 2153 free(oldenv_copy); 2154 2155 if (j->flags.reset_signal_mask) { 2156 sigset_t signal_mask; 2157 if (sigemptyset(&signal_mask) != 0) 2158 pdie("sigemptyset failed"); 2159 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 2160 pdie("sigprocmask failed"); 2161 } 2162 2163 if (j->flags.close_open_fds) { 2164 const size_t kMaxInheritableFdsSize = 10; 2165 int inheritable_fds[kMaxInheritableFdsSize]; 2166 size_t size = 0; 2167 if (use_preload) { 2168 inheritable_fds[size++] = pipe_fds[0]; 2169 inheritable_fds[size++] = pipe_fds[1]; 2170 } 2171 if (sync_child) { 2172 inheritable_fds[size++] = child_sync_pipe_fds[0]; 2173 inheritable_fds[size++] = child_sync_pipe_fds[1]; 2174 } 2175 if (pstdin_fd) { 2176 inheritable_fds[size++] = stdin_fds[0]; 2177 inheritable_fds[size++] = stdin_fds[1]; 2178 } 2179 if (pstdout_fd) { 2180 inheritable_fds[size++] = stdout_fds[0]; 2181 inheritable_fds[size++] = stdout_fds[1]; 2182 } 2183 if (pstderr_fd) { 2184 inheritable_fds[size++] = stderr_fds[0]; 2185 inheritable_fds[size++] = stderr_fds[1]; 2186 } 2187 2188 if (close_open_fds(inheritable_fds, size) < 0) 2189 die("failed to close open file descriptors"); 2190 } 2191 2192 if (sync_child) 2193 wait_for_parent_setup(child_sync_pipe_fds); 2194 2195 if (j->flags.userns) 2196 enter_user_namespace(j); 2197 2198 /* 2199 * If we want to write to the jailed process' standard input, 2200 * set up the read end of the pipe. 2201 */ 2202 if (pstdin_fd) { 2203 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 2204 STDIN_FILENO) < 0) 2205 die("failed to set up stdin pipe"); 2206 } 2207 2208 /* 2209 * If we want to read from the jailed process' standard output, 2210 * set up the write end of the pipe. 2211 */ 2212 if (pstdout_fd) { 2213 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 2214 STDOUT_FILENO) < 0) 2215 die("failed to set up stdout pipe"); 2216 } 2217 2218 /* 2219 * If we want to read from the jailed process' standard error, 2220 * set up the write end of the pipe. 
2221 */ 2222 if (pstderr_fd) { 2223 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 2224 STDERR_FILENO) < 0) 2225 die("failed to set up stderr pipe"); 2226 } 2227 2228 /* 2229 * If any of stdin, stdout, or stderr are TTYs, create a new session. 2230 * This prevents the jailed process from using the TIOCSTI ioctl 2231 * to push characters into the parent process terminal's input buffer, 2232 * therefore escaping the jail. 2233 */ 2234 if (isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) || 2235 isatty(STDERR_FILENO)) { 2236 if (setsid() < 0) { 2237 pdie("setsid() failed"); 2238 } 2239 } 2240 2241 /* If running an init program, let it decide when/how to mount /proc. */ 2242 if (pid_namespace && !do_init) 2243 j->flags.remount_proc_ro = 0; 2244 2245 if (use_preload) { 2246 /* Strip out flags that cannot be inherited across execve(2). */ 2247 minijail_preexec(j); 2248 } else { 2249 /* 2250 * If not using LD_PRELOAD, do all jailing before execve(2). 2251 * Note that PID namespaces can only be entered on fork(2), 2252 * so that flag is still cleared. 2253 */ 2254 j->flags.pids = 0; 2255 } 2256 /* Jail this process, then execve(2) the target. */ 2257 minijail_enter(j); 2258 2259 if (pid_namespace && do_init) { 2260 /* 2261 * pid namespace: this process will become init inside the new 2262 * namespace. We don't want all programs we might exec to have 2263 * to know how to be init. Normally (do_init == 1) we fork off 2264 * a child to actually run the program. If |do_init == 0|, we 2265 * let the program keep pid 1 and be init. 2266 * 2267 * If we're multithreaded, we'll probably deadlock here. See 2268 * WARNING above. 2269 */ 2270 child_pid = fork(); 2271 if (child_pid < 0) { 2272 _exit(child_pid); 2273 } else if (child_pid > 0) { 2274 /* 2275 * Best effort. Don't bother checking the return value. 2276 */ 2277 prctl(PR_SET_NAME, "minijail-init"); 2278 init(child_pid); /* Never returns. 
*/ 2279 } 2280 } 2281 2282 /* 2283 * If we aren't pid-namespaced, or the jailed program asked to be init: 2284 * calling process 2285 * -> execve()-ing process 2286 * If we are: 2287 * calling process 2288 * -> init()-ing process 2289 * -> execve()-ing process 2290 */ 2291 ret = execve(filename, argv, environ); 2292 if (ret == -1) { 2293 pwarn("execve(%s) failed", filename); 2294 } 2295 _exit(ret); 2296} 2297 2298int API minijail_kill(struct minijail *j) 2299{ 2300 int st; 2301 if (kill(j->initpid, SIGTERM)) 2302 return -errno; 2303 if (waitpid(j->initpid, &st, 0) < 0) 2304 return -errno; 2305 return st; 2306} 2307 2308int API minijail_wait(struct minijail *j) 2309{ 2310 int st; 2311 if (waitpid(j->initpid, &st, 0) < 0) 2312 return -errno; 2313 2314 if (!WIFEXITED(st)) { 2315 int error_status = st; 2316 if (WIFSIGNALED(st)) { 2317 int signum = WTERMSIG(st); 2318 warn("child process %d received signal %d", 2319 j->initpid, signum); 2320 /* 2321 * We return MINIJAIL_ERR_JAIL if the process received 2322 * SIGSYS, which happens when a syscall is blocked by 2323 * seccomp filters. 2324 * If not, we do what bash(1) does: 2325 * $? 
= 128 + signum 2326 */ 2327 if (signum == SIGSYS) { 2328 error_status = MINIJAIL_ERR_JAIL; 2329 } else { 2330 error_status = 128 + signum; 2331 } 2332 } 2333 return error_status; 2334 } 2335 2336 int exit_status = WEXITSTATUS(st); 2337 if (exit_status != 0) 2338 info("child process %d exited with status %d", 2339 j->initpid, exit_status); 2340 2341 return exit_status; 2342} 2343 2344void API minijail_destroy(struct minijail *j) 2345{ 2346 size_t i; 2347 2348 if (j->flags.seccomp_filter && j->filter_prog) { 2349 free(j->filter_prog->filter); 2350 free(j->filter_prog); 2351 } 2352 while (j->mounts_head) { 2353 struct mountpoint *m = j->mounts_head; 2354 j->mounts_head = j->mounts_head->next; 2355 free(m->data); 2356 free(m->type); 2357 free(m->dest); 2358 free(m->src); 2359 free(m); 2360 } 2361 j->mounts_tail = NULL; 2362 if (j->user) 2363 free(j->user); 2364 if (j->suppl_gid_list) 2365 free(j->suppl_gid_list); 2366 if (j->chrootdir) 2367 free(j->chrootdir); 2368 if (j->pid_file_path) 2369 free(j->pid_file_path); 2370 if (j->uidmap) 2371 free(j->uidmap); 2372 if (j->gidmap) 2373 free(j->gidmap); 2374 if (j->alt_syscall_table) 2375 free(j->alt_syscall_table); 2376 for (i = 0; i < j->cgroup_count; ++i) 2377 free(j->cgroups[i]); 2378 free(j); 2379} 2380