libminijail.c revision f9fcdbe67360c30a41b70c2f1271c0767eb073c9
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _GNU_SOURCE 8 9#include <asm/unistd.h> 10#include <ctype.h> 11#include <errno.h> 12#include <fcntl.h> 13#include <grp.h> 14#include <inttypes.h> 15#include <limits.h> 16#include <linux/capability.h> 17#include <pwd.h> 18#include <sched.h> 19#include <signal.h> 20#include <stdarg.h> 21#include <stdbool.h> 22#include <stddef.h> 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include <syscall.h> 27#include <sys/capability.h> 28#include <sys/mount.h> 29#include <sys/param.h> 30#include <sys/prctl.h> 31#include <sys/stat.h> 32#include <sys/types.h> 33#include <sys/user.h> 34#include <sys/utsname.h> 35#include <sys/wait.h> 36#include <unistd.h> 37 38#include "libminijail.h" 39#include "libminijail-private.h" 40 41#include "signal_handler.h" 42#include "syscall_filter.h" 43#include "util.h" 44 45#ifdef HAVE_SECUREBITS_H 46#include <linux/securebits.h> 47#else 48#define SECURE_ALL_BITS 0x15 49#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 50#endif 51 52/* Until these are reliably available in linux/prctl.h */ 53#ifndef PR_SET_SECCOMP 54# define PR_SET_SECCOMP 22 55#endif 56 57#ifndef PR_ALT_SYSCALL 58# define PR_ALT_SYSCALL 0x43724f53 59#endif 60 61/* For seccomp_filter using BPF. */ 62#ifndef PR_SET_NO_NEW_PRIVS 63# define PR_SET_NO_NEW_PRIVS 38 64#endif 65#ifndef SECCOMP_MODE_FILTER 66# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 67#endif 68 69#ifdef USE_SECCOMP_SOFTFAIL 70# define SECCOMP_SOFTFAIL 1 71#else 72# define SECCOMP_SOFTFAIL 0 73#endif 74 75#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 76 77struct mountpoint { 78 char *src; 79 char *dest; 80 char *type; 81 unsigned long flags; 82 struct mountpoint *next; 83}; 84 85struct minijail { 86 /* 87 * WARNING: if you add a flag here you need to make sure it's 88 * accounted for in minijail_pre{enter|exec}() below. 89 */ 90 struct { 91 int uid:1; 92 int gid:1; 93 int usergroups:1; 94 int suppl_gids:1; 95 int use_caps:1; 96 int capbset_drop:1; 97 int vfs:1; 98 int enter_vfs:1; 99 int pids:1; 100 int ipc:1; 101 int net:1; 102 int enter_net:1; 103 int userns:1; 104 int seccomp:1; 105 int remount_proc_ro:1; 106 int no_new_privs:1; 107 int seccomp_filter:1; 108 int log_seccomp_filter:1; 109 int chroot:1; 110 int pivot_root:1; 111 int mount_tmp:1; 112 int do_init:1; 113 int pid_file:1; 114 int cgroups:1; 115 int alt_syscall:1; 116 int reset_signal_mask:1; 117 } flags; 118 uid_t uid; 119 gid_t gid; 120 gid_t usergid; 121 char *user; 122 size_t suppl_gid_count; 123 gid_t *suppl_gid_list; 124 uint64_t caps; 125 uint64_t cap_bset; 126 pid_t initpid; 127 int mountns_fd; 128 int netns_fd; 129 char *chrootdir; 130 char *pid_file_path; 131 char *uidmap; 132 char *gidmap; 133 size_t filter_len; 134 struct sock_fprog *filter_prog; 135 char *alt_syscall_table; 136 struct mountpoint *mounts_head; 137 struct mountpoint *mounts_tail; 138 size_t mounts_count; 139 char *cgroups[MAX_CGROUPS]; 140 size_t cgroup_count; 141}; 142 143/* 144 * Strip out flags meant for the parent. 145 * We keep things that are not inherited across execve(2) (e.g. capabilities), 146 * or are easier to set after execve(2) (e.g. seccomp filters). 147 */ 148void minijail_preenter(struct minijail *j) 149{ 150 j->flags.vfs = 0; 151 j->flags.enter_vfs = 0; 152 j->flags.remount_proc_ro = 0; 153 j->flags.pids = 0; 154 j->flags.do_init = 0; 155 j->flags.pid_file = 0; 156 j->flags.cgroups = 0; 157} 158 159/* 160 * Strip out flags meant for the child. 161 * We keep things that are inherited across execve(2). 162 */ 163void minijail_preexec(struct minijail *j) 164{ 165 int vfs = j->flags.vfs; 166 int enter_vfs = j->flags.enter_vfs; 167 int remount_proc_ro = j->flags.remount_proc_ro; 168 int userns = j->flags.userns; 169 if (j->user) 170 free(j->user); 171 j->user = NULL; 172 if (j->suppl_gid_list) 173 free(j->suppl_gid_list); 174 j->suppl_gid_list = NULL; 175 memset(&j->flags, 0, sizeof(j->flags)); 176 /* Now restore anything we meant to keep. */ 177 j->flags.vfs = vfs; 178 j->flags.enter_vfs = enter_vfs; 179 j->flags.remount_proc_ro = remount_proc_ro; 180 j->flags.userns = userns; 181 /* Note, |pids| will already have been used before this call. */ 182} 183 184/* Returns true if the kernel version is less than 3.8. */ 185int seccomp_kernel_support_not_required() 186{ 187 int major, minor; 188 struct utsname uts; 189 return (uname(&uts) != -1 && 190 sscanf(uts.release, "%d.%d", &major, &minor) == 2 && 191 ((major < 3) || ((major == 3) && (minor < 8)))); 192} 193 194/* Allow seccomp soft-fail on Android devices with kernel version < 3.8. */ 195int can_softfail() 196{ 197#if SECCOMP_SOFTFAIL 198 if (is_android()) { 199 if (seccomp_kernel_support_not_required()) 200 return 1; 201 else 202 return 0; 203 } else { 204 return 1; 205 } 206#endif 207 return 0; 208} 209 210/* Minijail API. */ 211 212struct minijail API *minijail_new(void) 213{ 214 return calloc(1, sizeof(struct minijail)); 215} 216 217void API minijail_change_uid(struct minijail *j, uid_t uid) 218{ 219 if (uid == 0) 220 die("useless change to uid 0"); 221 j->uid = uid; 222 j->flags.uid = 1; 223} 224 225void API minijail_change_gid(struct minijail *j, gid_t gid) 226{ 227 if (gid == 0) 228 die("useless change to gid 0"); 229 j->gid = gid; 230 j->flags.gid = 1; 231} 232 233void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 234 const gid_t *list) 235{ 236 size_t i; 237 238 if (j->flags.usergroups) 239 die("cannot inherit *and* set supplementary groups"); 240 241 if (size == 0) { 242 /* Clear supplementary groups. */ 243 j->suppl_gid_list = NULL; 244 j->suppl_gid_count = 0; 245 j->flags.suppl_gids = 1; 246 return; 247 } 248 249 /* Copy the gid_t array. */ 250 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 251 if (!j->suppl_gid_list) { 252 die("failed to allocate internal supplementary group array"); 253 } 254 for (i = 0; i < size; i++) { 255 j->suppl_gid_list[i] = list[i]; 256 } 257 j->suppl_gid_count = size; 258 j->flags.suppl_gids = 1; 259} 260 261int API minijail_change_user(struct minijail *j, const char *user) 262{ 263 char *buf = NULL; 264 struct passwd pw; 265 struct passwd *ppw = NULL; 266 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 267 if (sz == -1) 268 sz = 65536; /* your guess is as good as mine... */ 269 270 /* 271 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 272 * the maximum needed size of the buffer, so we don't have to search. 273 */ 274 buf = malloc(sz); 275 if (!buf) 276 return -ENOMEM; 277 getpwnam_r(user, &pw, buf, sz, &ppw); 278 /* 279 * We're safe to free the buffer here. The strings inside |pw| point 280 * inside |buf|, but we don't use any of them; this leaves the pointers 281 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) succeeded. 282 */ 283 free(buf); 284 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 285 if (!ppw) 286 return -1; 287 minijail_change_uid(j, ppw->pw_uid); 288 j->user = strdup(user); 289 if (!j->user) 290 return -ENOMEM; 291 j->usergid = ppw->pw_gid; 292 return 0; 293} 294 295int API minijail_change_group(struct minijail *j, const char *group) 296{ 297 char *buf = NULL; 298 struct group gr; 299 struct group *pgr = NULL; 300 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 301 if (sz == -1) 302 sz = 65536; /* and mine is as good as yours, really */ 303 304 /* 305 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 306 * the maximum needed size of the buffer, so we don't have to search. 307 */ 308 buf = malloc(sz); 309 if (!buf) 310 return -ENOMEM; 311 getgrnam_r(group, &gr, buf, sz, &pgr); 312 /* 313 * We're safe to free the buffer here. The strings inside gr point 314 * inside buf, but we don't use any of them; this leaves the pointers 315 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 316 */ 317 free(buf); 318 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 319 if (!pgr) 320 return -1; 321 minijail_change_gid(j, pgr->gr_gid); 322 return 0; 323} 324 325void API minijail_use_seccomp(struct minijail *j) 326{ 327 j->flags.seccomp = 1; 328} 329 330void API minijail_no_new_privs(struct minijail *j) 331{ 332 j->flags.no_new_privs = 1; 333} 334 335void API minijail_use_seccomp_filter(struct minijail *j) 336{ 337 j->flags.seccomp_filter = 1; 338} 339 340void API minijail_log_seccomp_filter_failures(struct minijail *j) 341{ 342 j->flags.log_seccomp_filter = 1; 343} 344 345void API minijail_use_caps(struct minijail *j, uint64_t capmask) 346{ 347 /* 348 * 'minijail_use_caps' configures a runtime-capabilities-only 349 * environment, including a bounding set matching the thread's runtime 350 * (permitted|inheritable|effective) sets. 351 * Therefore, it will override any existing bounding set configurations 352 * since the latter would allow gaining extra runtime capabilities from 353 * file capabilities. 354 */ 355 if (j->flags.capbset_drop) { 356 warn("overriding bounding set configuration"); 357 j->cap_bset = 0; 358 j->flags.capbset_drop = 0; 359 } 360 j->caps = capmask; 361 j->flags.use_caps = 1; 362} 363 364void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 365{ 366 if (j->flags.use_caps) { 367 /* 368 * 'minijail_use_caps' will have already configured a capability 369 * bounding set matching the (permitted|inheritable|effective) 370 * sets. Abort if the user tries to configure a separate 371 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 372 * are mutually exclusive. 373 */ 374 die("runtime capabilities already configured, can't drop " 375 "bounding set separately"); 376 } 377 j->cap_bset = capmask; 378 j->flags.capbset_drop = 1; 379} 380 381void API minijail_reset_signal_mask(struct minijail *j) 382{ 383 j->flags.reset_signal_mask = 1; 384} 385 386void API minijail_namespace_vfs(struct minijail *j) 387{ 388 j->flags.vfs = 1; 389} 390 391void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 392{ 393 int ns_fd = open(ns_path, O_RDONLY); 394 if (ns_fd < 0) { 395 pdie("failed to open namespace '%s'", ns_path); 396 } 397 j->mountns_fd = ns_fd; 398 j->flags.enter_vfs = 1; 399} 400 401void API minijail_namespace_pids(struct minijail *j) 402{ 403 j->flags.vfs = 1; 404 j->flags.remount_proc_ro = 1; 405 j->flags.pids = 1; 406 j->flags.do_init = 1; 407} 408 409void API minijail_namespace_ipc(struct minijail *j) 410{ 411 j->flags.ipc = 1; 412} 413 414void API minijail_namespace_net(struct minijail *j) 415{ 416 j->flags.net = 1; 417} 418 419void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 420{ 421 int ns_fd = open(ns_path, O_RDONLY); 422 if (ns_fd < 0) { 423 pdie("failed to open namespace '%s'", ns_path); 424 } 425 j->netns_fd = ns_fd; 426 j->flags.enter_net = 1; 427} 428 429void API minijail_remount_proc_readonly(struct minijail *j) 430{ 431 j->flags.vfs = 1; 432 j->flags.remount_proc_ro = 1; 433} 434 435void API minijail_namespace_user(struct minijail *j) 436{ 437 j->flags.userns = 1; 438} 439 440int API minijail_uidmap(struct minijail *j, const char *uidmap) 441{ 442 j->uidmap = strdup(uidmap); 443 if (!j->uidmap) 444 return -ENOMEM; 445 char *ch; 446 for (ch = j->uidmap; *ch; ch++) { 447 if (*ch == ',') 448 *ch = '\n'; 449 } 450 return 0; 451} 452 453int API minijail_gidmap(struct minijail *j, const char *gidmap) 454{ 455 j->gidmap = strdup(gidmap); 456 if (!j->gidmap) 457 return -ENOMEM; 458 char *ch; 459 for (ch = j->gidmap; *ch; ch++) { 460 if (*ch == ',') 461 *ch = '\n'; 462 } 463 return 0; 464} 465 466void API minijail_inherit_usergroups(struct minijail *j) 467{ 468 j->flags.usergroups = 1; 469} 470 471void API minijail_run_as_init(struct minijail *j) 472{ 473 /* 474 * Since the jailed program will become 'init' in the new PID namespace, 475 * Minijail does not need to fork an 'init' process. 476 */ 477 j->flags.do_init = 0; 478} 479 480int API minijail_enter_chroot(struct minijail *j, const char *dir) 481{ 482 if (j->chrootdir) 483 return -EINVAL; 484 j->chrootdir = strdup(dir); 485 if (!j->chrootdir) 486 return -ENOMEM; 487 j->flags.chroot = 1; 488 return 0; 489} 490 491int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 492{ 493 if (j->chrootdir) 494 return -EINVAL; 495 j->chrootdir = strdup(dir); 496 if (!j->chrootdir) 497 return -ENOMEM; 498 j->flags.pivot_root = 1; 499 return 0; 500} 501 502static char *append_external_path(const char *external_path, 503 const char *path_inside_chroot) 504{ 505 char *path; 506 size_t pathlen; 507 508 /* One extra char for '/' and one for '\0', hence + 2. */ 509 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 510 path = malloc(pathlen); 511 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 512 513 return path; 514} 515 516char API *minijail_get_original_path(struct minijail *j, 517 const char *path_inside_chroot) 518{ 519 struct mountpoint *b; 520 521 b = j->mounts_head; 522 while (b) { 523 /* 524 * If |path_inside_chroot| is the exact destination of a 525 * mount, then the original path is exactly the source of 526 * the mount. 527 * for example: "-b /some/path/exe,/chroot/path/exe" 528 * mount source = /some/path/exe, mount dest = 529 * /chroot/path/exe Then when getting the original path of 530 * "/chroot/path/exe", the source of that mount, 531 * "/some/path/exe" is what should be returned. 532 */ 533 if (!strcmp(b->dest, path_inside_chroot)) 534 return strdup(b->src); 535 536 /* 537 * If |path_inside_chroot| is within the destination path of a 538 * mount, take the suffix of the chroot path relative to the 539 * mount destination path, and append it to the mount source 540 * path. 541 */ 542 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 543 const char *relative_path = 544 path_inside_chroot + strlen(b->dest); 545 return append_external_path(b->src, relative_path); 546 } 547 b = b->next; 548 } 549 550 /* If there is a chroot path, append |path_inside_chroot| to that. */ 551 if (j->chrootdir) 552 return append_external_path(j->chrootdir, path_inside_chroot); 553 554 /* No chroot, so the path outside is the same as it is inside. */ 555 return strdup(path_inside_chroot); 556} 557 558void API minijail_mount_tmp(struct minijail *j) 559{ 560 j->flags.mount_tmp = 1; 561} 562 563int API minijail_write_pid_file(struct minijail *j, const char *path) 564{ 565 j->pid_file_path = strdup(path); 566 if (!j->pid_file_path) 567 return -ENOMEM; 568 j->flags.pid_file = 1; 569 return 0; 570} 571 572int API minijail_add_to_cgroup(struct minijail *j, const char *path) 573{ 574 if (j->cgroup_count >= MAX_CGROUPS) 575 return -ENOMEM; 576 j->cgroups[j->cgroup_count] = strdup(path); 577 if (!j->cgroups[j->cgroup_count]) 578 return -ENOMEM; 579 j->cgroup_count++; 580 j->flags.cgroups = 1; 581 return 0; 582} 583 584int API minijail_mount(struct minijail *j, const char *src, const char *dest, 585 const char *type, unsigned long flags) 586{ 587 struct mountpoint *m; 588 589 if (*dest != '/') 590 return -EINVAL; 591 m = calloc(1, sizeof(*m)); 592 if (!m) 593 return -ENOMEM; 594 m->dest = strdup(dest); 595 if (!m->dest) 596 goto error; 597 m->src = strdup(src); 598 if (!m->src) 599 goto error; 600 m->type = strdup(type); 601 if (!m->type) 602 goto error; 603 m->flags = flags; 604 605 info("mount %s -> %s type '%s'", src, dest, type); 606 607 /* 608 * Force vfs namespacing so the mounts don't leak out into the 609 * containing vfs namespace. 610 */ 611 minijail_namespace_vfs(j); 612 613 if (j->mounts_tail) 614 j->mounts_tail->next = m; 615 else 616 j->mounts_head = m; 617 j->mounts_tail = m; 618 j->mounts_count++; 619 620 return 0; 621 622error: 623 free(m->src); 624 free(m->dest); 625 free(m); 626 return -ENOMEM; 627} 628 629int API minijail_bind(struct minijail *j, const char *src, const char *dest, 630 int writeable) 631{ 632 unsigned long flags = MS_BIND; 633 634 if (!writeable) 635 flags |= MS_RDONLY; 636 637 return minijail_mount(j, src, dest, "", flags); 638} 639 640void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 641{ 642 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 643 if ((errno == EINVAL) && can_softfail()) { 644 warn("not loading seccomp filter," 645 " seccomp not supported"); 646 j->flags.seccomp_filter = 0; 647 j->flags.log_seccomp_filter = 0; 648 j->filter_len = 0; 649 j->filter_prog = NULL; 650 j->flags.no_new_privs = 0; 651 } 652 } 653 FILE *file = fopen(path, "r"); 654 if (!file) { 655 pdie("failed to open seccomp filter file '%s'", path); 656 } 657 658 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 659 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 660 die("failed to compile seccomp filter BPF program in '%s'", 661 path); 662 } 663 664 j->filter_len = fprog->len; 665 j->filter_prog = fprog; 666 667 fclose(file); 668} 669 670int API minijail_use_alt_syscall(struct minijail *j, const char *table) 671{ 672 j->alt_syscall_table = strdup(table); 673 if (!j->alt_syscall_table) 674 return -ENOMEM; 675 j->flags.alt_syscall = 1; 676 return 0; 677} 678 679struct marshal_state { 680 size_t available; 681 size_t total; 682 char *buf; 683}; 684 685void marshal_state_init(struct marshal_state *state, 686 char *buf, size_t available) 687{ 688 state->available = available; 689 state->buf = buf; 690 state->total = 0; 691} 692 693void marshal_append(struct marshal_state *state, 694 void *src, size_t length) 695{ 696 size_t copy_len = MIN(state->available, length); 697 698 /* Up to |available| will be written. */ 699 if (copy_len) { 700 memcpy(state->buf, src, copy_len); 701 state->buf += copy_len; 702 state->available -= copy_len; 703 } 704 /* |total| will contain the expected length. */ 705 state->total += length; 706} 707 708void minijail_marshal_helper(struct marshal_state *state, 709 const struct minijail *j) 710{ 711 struct mountpoint *m = NULL; 712 size_t i; 713 714 marshal_append(state, (char *)j, sizeof(*j)); 715 if (j->user) 716 marshal_append(state, j->user, strlen(j->user) + 1); 717 if (j->suppl_gid_list) { 718 marshal_append(state, j->suppl_gid_list, 719 j->suppl_gid_count * sizeof(gid_t)); 720 } 721 if (j->chrootdir) 722 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 723 if (j->alt_syscall_table) { 724 marshal_append(state, j->alt_syscall_table, 725 strlen(j->alt_syscall_table) + 1); 726 } 727 if (j->flags.seccomp_filter && j->filter_prog) { 728 struct sock_fprog *fp = j->filter_prog; 729 marshal_append(state, (char *)fp->filter, 730 fp->len * sizeof(struct sock_filter)); 731 } 732 for (m = j->mounts_head; m; m = m->next) { 733 marshal_append(state, m->src, strlen(m->src) + 1); 734 marshal_append(state, m->dest, strlen(m->dest) + 1); 735 marshal_append(state, m->type, strlen(m->type) + 1); 736 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 737 } 738 for (i = 0; i < j->cgroup_count; ++i) 739 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1); 740} 741 742size_t API minijail_size(const struct minijail *j) 743{ 744 struct marshal_state state; 745 marshal_state_init(&state, NULL, 0); 746 minijail_marshal_helper(&state, j); 747 return state.total; 748} 749 750int minijail_marshal(const struct minijail *j, char *buf, size_t available) 751{ 752 struct marshal_state state; 753 marshal_state_init(&state, buf, available); 754 minijail_marshal_helper(&state, j); 755 return (state.total > available); 756} 757 758/* 759 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength 760 * @length Number of bytes to consume 761 * @buf Buffer to consume from 762 * @buflength Size of @buf 763 * 764 * Returns a pointer to the base of the bytes, or NULL for errors. 765 */ 766void *consumebytes(size_t length, char **buf, size_t *buflength) 767{ 768 char *p = *buf; 769 if (length > *buflength) 770 return NULL; 771 *buf += length; 772 *buflength -= length; 773 return p; 774} 775 776/* 777 * consumestr: consumes a C string from a buffer @buf of length @length 778 * @buf Buffer to consume 779 * @length Length of buffer 780 * 781 * Returns a pointer to the base of the string, or NULL for errors. 782 */ 783char *consumestr(char **buf, size_t *buflength) 784{ 785 size_t len = strnlen(*buf, *buflength); 786 if (len == *buflength) 787 /* There's no null-terminator. */ 788 return NULL; 789 return consumebytes(len + 1, buf, buflength); 790} 791 792int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 793{ 794 size_t i; 795 size_t count; 796 int ret = -EINVAL; 797 798 if (length < sizeof(*j)) 799 goto out; 800 memcpy((void *)j, serialized, sizeof(*j)); 801 serialized += sizeof(*j); 802 length -= sizeof(*j); 803 804 /* Potentially stale pointers not used as signals. */ 805 j->mounts_head = NULL; 806 j->mounts_tail = NULL; 807 j->filter_prog = NULL; 808 809 if (j->user) { /* stale pointer */ 810 char *user = consumestr(&serialized, &length); 811 if (!user) 812 goto clear_pointers; 813 j->user = strdup(user); 814 if (!j->user) 815 goto clear_pointers; 816 } 817 818 if (j->suppl_gid_list) { /* stale pointer */ 819 if (j->suppl_gid_count > NGROUPS_MAX) { 820 goto bad_gid_list; 821 } 822 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 823 void *gid_list_bytes = 824 consumebytes(gid_list_size, &serialized, &length); 825 if (!gid_list_bytes) 826 goto bad_gid_list; 827 828 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 829 if (!j->suppl_gid_list) 830 goto bad_gid_list; 831 832 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 833 } 834 835 if (j->chrootdir) { /* stale pointer */ 836 char *chrootdir = consumestr(&serialized, &length); 837 if (!chrootdir) 838 goto bad_chrootdir; 839 j->chrootdir = strdup(chrootdir); 840 if (!j->chrootdir) 841 goto bad_chrootdir; 842 } 843 844 if (j->alt_syscall_table) { /* stale pointer */ 845 char *alt_syscall_table = consumestr(&serialized, &length); 846 if (!alt_syscall_table) 847 goto bad_syscall_table; 848 j->alt_syscall_table = strdup(alt_syscall_table); 849 if (!j->alt_syscall_table) 850 goto bad_syscall_table; 851 } 852 853 if (j->flags.seccomp_filter && j->filter_len > 0) { 854 size_t ninstrs = j->filter_len; 855 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 856 ninstrs > USHRT_MAX) 857 goto bad_filters; 858 859 size_t program_len = ninstrs * sizeof(struct sock_filter); 860 void *program = consumebytes(program_len, &serialized, &length); 861 if (!program) 862 goto bad_filters; 863 864 j->filter_prog = malloc(sizeof(struct sock_fprog)); 865 if (!j->filter_prog) 866 goto bad_filters; 867 868 j->filter_prog->len = ninstrs; 869 j->filter_prog->filter = malloc(program_len); 870 if (!j->filter_prog->filter) 871 goto bad_filter_prog_instrs; 872 873 memcpy(j->filter_prog->filter, program, program_len); 874 } 875 876 count = j->mounts_count; 877 j->mounts_count = 0; 878 for (i = 0; i < count; ++i) { 879 unsigned long *flags; 880 const char *dest; 881 const char *type; 882 const char *src = consumestr(&serialized, &length); 883 if (!src) 884 goto bad_mounts; 885 dest = consumestr(&serialized, &length); 886 if (!dest) 887 goto bad_mounts; 888 type = consumestr(&serialized, &length); 889 if (!type) 890 goto bad_mounts; 891 flags = consumebytes(sizeof(*flags), &serialized, &length); 892 if (!flags) 893 goto bad_mounts; 894 if (minijail_mount(j, src, dest, type, *flags)) 895 goto bad_mounts; 896 } 897 898 count = j->cgroup_count; 899 j->cgroup_count = 0; 900 for (i = 0; i < count; ++i) { 901 char *cgroup = consumestr(&serialized, &length); 902 if (!cgroup) 903 goto bad_cgroups; 904 j->cgroups[i] = strdup(cgroup); 905 if (!j->cgroups[i]) 906 goto bad_cgroups; 907 ++j->cgroup_count; 908 } 909 910 return 0; 911 912bad_cgroups: 913 while (j->mounts_head) { 914 struct mountpoint *m = j->mounts_head; 915 j->mounts_head = j->mounts_head->next; 916 free(m->type); 917 free(m->dest); 918 free(m->src); 919 free(m); 920 } 921 for (i = 0; i < j->cgroup_count; ++i) 922 free(j->cgroups[i]); 923bad_mounts: 924 if (j->flags.seccomp_filter && j->filter_len > 0) { 925 free(j->filter_prog->filter); 926 free(j->filter_prog); 927 } 928bad_filter_prog_instrs: 929 if (j->filter_prog) 930 free(j->filter_prog); 931bad_filters: 932 if (j->alt_syscall_table) 933 free(j->alt_syscall_table); 934bad_syscall_table: 935 if (j->chrootdir) 936 free(j->chrootdir); 937bad_chrootdir: 938 if (j->suppl_gid_list) 939 free(j->suppl_gid_list); 940bad_gid_list: 941 if (j->user) 942 free(j->user); 943clear_pointers: 944 j->user = NULL; 945 j->suppl_gid_list = NULL; 946 j->chrootdir = NULL; 947 j->alt_syscall_table = NULL; 948 j->cgroup_count = 0; 949out: 950 return ret; 951} 952 953static void write_ugid_mappings(const struct minijail *j) 954{ 955 int fd, ret, len; 956 size_t sz; 957 char fname[32]; 958 959 sz = sizeof(fname); 960 if (j->uidmap) { 961 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 962 if (ret < 0 || (size_t)ret >= sz) 963 die("failed to write file name of uid_map"); 964 fd = open(fname, O_WRONLY); 965 if (fd < 0) 966 pdie("failed to open '%s'", fname); 967 len = strlen(j->uidmap); 968 if (write(fd, j->uidmap, len) < len) 969 die("failed to set uid_map"); 970 close(fd); 971 } 972 if (j->gidmap) { 973 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 974 if (ret < 0 || (size_t)ret >= sz) 975 die("failed to write file name of gid_map"); 976 fd = open(fname, O_WRONLY); 977 if (fd < 0) 978 pdie("failed to open '%s'", fname); 979 len = strlen(j->gidmap); 980 if (write(fd, j->gidmap, len) < len) 981 die("failed to set gid_map"); 982 close(fd); 983 } 984} 985 986static void parent_setup_complete(int *pipe_fds) 987{ 988 close(pipe_fds[0]); 989 close(pipe_fds[1]); 990} 991 992/* 993 * wait_for_parent_setup: Called by the child process to wait for any 994 * further parent-side setup to complete before continuing. 995 */ 996static void wait_for_parent_setup(int *pipe_fds) 997{ 998 char buf; 999 1000 close(pipe_fds[1]); 1001 1002 /* Wait for parent to complete setup and close the pipe. */ 1003 if (read(pipe_fds[0], &buf, 1) != 0) 1004 die("failed to sync with parent"); 1005 close(pipe_fds[0]); 1006} 1007 1008static void enter_user_namespace(const struct minijail *j) 1009{ 1010 if (j->uidmap && setresuid(0, 0, 0)) 1011 pdie("setresuid"); 1012 if (j->gidmap && setresgid(0, 0, 0)) 1013 pdie("setresgid"); 1014} 1015 1016/* 1017 * mount_one: Applies mounts from @m for @j, recursing as needed. 1018 * @j Minijail these mounts are for 1019 * @m Head of list of mounts 1020 * 1021 * Returns 0 for success. 1022 */ 1023static int mount_one(const struct minijail *j, struct mountpoint *m) 1024{ 1025 int ret; 1026 char *dest; 1027 int remount_ro = 0; 1028 1029 /* |dest| has a leading "/". */ 1030 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 1031 return -ENOMEM; 1032 1033 /* 1034 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1035 * can't both be specified in the original bind mount. 1036 * Remount R/O after the initial mount. 1037 */ 1038 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1039 remount_ro = 1; 1040 m->flags &= ~MS_RDONLY; 1041 } 1042 1043 ret = mount(m->src, dest, m->type, m->flags, NULL); 1044 if (ret) 1045 pdie("mount: %s -> %s", m->src, dest); 1046 1047 if (remount_ro) { 1048 m->flags |= MS_RDONLY; 1049 ret = mount(m->src, dest, NULL, 1050 m->flags | MS_REMOUNT, NULL); 1051 if (ret) 1052 pdie("bind ro: %s -> %s", m->src, dest); 1053 } 1054 1055 free(dest); 1056 if (m->next) 1057 return mount_one(j, m->next); 1058 return ret; 1059} 1060 1061int enter_chroot(const struct minijail *j) 1062{ 1063 int ret; 1064 1065 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1066 return ret; 1067 1068 if (chroot(j->chrootdir)) 1069 return -errno; 1070 1071 if (chdir("/")) 1072 return -errno; 1073 1074 return 0; 1075} 1076 1077int enter_pivot_root(const struct minijail *j) 1078{ 1079 int ret, oldroot, newroot; 1080 1081 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1082 return ret; 1083 1084 /* 1085 * Keep the fd for both old and new root. 1086 * It will be used in fchdir later. 1087 */ 1088 oldroot = open("/", O_DIRECTORY | O_RDONLY); 1089 if (oldroot < 0) 1090 pdie("failed to open / for fchdir"); 1091 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY); 1092 if (newroot < 0) 1093 pdie("failed to open %s for fchdir", j->chrootdir); 1094 1095 /* 1096 * To ensure chrootdir is the root of a file system, 1097 * do a self bind mount. 1098 */ 1099 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1100 pdie("failed to bind mount '%s'", j->chrootdir); 1101 if (chdir(j->chrootdir)) 1102 return -errno; 1103 if (syscall(SYS_pivot_root, ".", ".")) 1104 pdie("pivot_root"); 1105 1106 /* 1107 * Now the old root is mounted on top of the new root. Use fchdir to 1108 * change to the old root and unmount it. 1109 */ 1110 if (fchdir(oldroot)) 1111 pdie("failed to fchdir to old /"); 1112 /* The old root might be busy, so use lazy unmount. */ 1113 if (umount2(".", MNT_DETACH)) 1114 pdie("umount(/)"); 1115 /* Change back to the new root. */ 1116 if (fchdir(newroot)) 1117 return -errno; 1118 if (chroot("/")) 1119 return -errno; 1120 /* Set correct CWD for getcwd(3). */ 1121 if (chdir("/")) 1122 return -errno; 1123 1124 return 0; 1125} 1126 1127int mount_tmp(void) 1128{ 1129 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1130} 1131 1132int remount_proc_readonly(const struct minijail *j) 1133{ 1134 const char *kProcPath = "/proc"; 1135 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1136 /* 1137 * Right now, we're holding a reference to our parent's old mount of 1138 * /proc in our namespace, which means using MS_REMOUNT here would 1139 * mutate our parent's mount as well, even though we're in a VFS 1140 * namespace (!). Instead, remove their mount from our namespace 1141 * and make our own. However, if we are in a new user namespace, /proc 1142 * is not seen as mounted, so don't return error if umount() fails. 1143 */ 1144 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns) 1145 return -errno; 1146 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1147 return -errno; 1148 return 0; 1149} 1150 1151static void write_pid_to_path(pid_t pid, const char *path) 1152{ 1153 FILE *fp = fopen(path, "w"); 1154 1155 if (!fp) 1156 pdie("failed to open '%s'", path); 1157 if (fprintf(fp, "%d\n", (int)pid) < 0) 1158 pdie("fprintf(%s)", path); 1159 if (fclose(fp)) 1160 pdie("fclose(%s)", path); 1161} 1162 1163static void write_pid_file(const struct minijail *j) 1164{ 1165 write_pid_to_path(j->initpid, j->pid_file_path); 1166} 1167 1168static void add_to_cgroups(const struct minijail *j) 1169{ 1170 size_t i; 1171 1172 for (i = 0; i < j->cgroup_count; ++i) 1173 write_pid_to_path(j->initpid, j->cgroups[i]); 1174} 1175 1176void drop_ugid(const struct minijail *j) 1177{ 1178 if (j->flags.usergroups && j->flags.suppl_gids) { 1179 die("tried to inherit *and* set supplementary groups;" 1180 " can only do one"); 1181 } 1182 1183 if (j->flags.usergroups) { 1184 if (initgroups(j->user, j->usergid)) 1185 pdie("initgroups"); 1186 } else if (j->flags.suppl_gids) { 1187 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1188 pdie("setgroups"); 1189 } 1190 } else { 1191 /* 1192 * Only attempt to clear supplementary groups if we are changing 1193 * users. 1194 */ 1195 if ((j->uid || j->gid) && setgroups(0, NULL)) 1196 pdie("setgroups"); 1197 } 1198 1199 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1200 pdie("setresgid"); 1201 1202 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1203 pdie("setresuid"); 1204} 1205 1206/* 1207 * We specifically do not use cap_valid() as that only tells us the last 1208 * valid cap we were *compiled* against (i.e. what the version of kernel 1209 * headers says). If we run on a different kernel version, then it's not 1210 * uncommon for that to be less (if an older kernel) or more (if a newer 1211 * kernel). 1212 * Normally, we suck up the answer via /proc. On Android, not all processes are 1213 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1214 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1215 */ 1216static unsigned int get_last_valid_cap() 1217{ 1218 unsigned int last_valid_cap = 0; 1219 if (is_android()) { 1220 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1221 ++last_valid_cap); 1222 1223 /* |last_valid_cap| will be the first failing value. */ 1224 if (last_valid_cap > 0) { 1225 last_valid_cap--; 1226 } 1227 } else { 1228 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1229 FILE *fp = fopen(cap_file, "re"); 1230 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1231 pdie("fscanf(%s)", cap_file); 1232 fclose(fp); 1233 } 1234 return last_valid_cap; 1235} 1236 1237static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1238{ 1239 const uint64_t one = 1; 1240 unsigned int i; 1241 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1242 if (keep_mask & (one << i)) 1243 continue; 1244 if (prctl(PR_CAPBSET_DROP, i)) 1245 pdie("could not drop capability from bounding set"); 1246 } 1247} 1248 1249void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1250{ 1251 cap_t caps = cap_get_proc(); 1252 cap_value_t flag[1]; 1253 const uint64_t one = 1; 1254 unsigned int i; 1255 if (!caps) 1256 die("can't get process caps"); 1257 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1258 die("can't clear inheritable caps"); 1259 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1260 die("can't clear effective caps"); 1261 if (cap_clear_flag(caps, CAP_PERMITTED)) 1262 die("can't clear permitted caps"); 1263 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1264 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1265 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1266 continue; 1267 flag[0] = i; 1268 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1269 die("can't add effective cap"); 1270 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1271 die("can't add permitted cap"); 1272 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1273 die("can't add inheritable cap"); 1274 } 1275 if (cap_set_proc(caps)) 1276 die("can't apply initial cleaned capset"); 1277 1278 /* 1279 * Instead of dropping bounding set first, do it here in case 1280 * the caller had a more permissive bounding set which could 1281 * have been used above to raise a capability that wasn't already 1282 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1283 */ 1284 drop_capbset(j->caps, last_valid_cap); 1285 1286 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1287 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1288 flag[0] = CAP_SETPCAP; 1289 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1290 die("can't clear effective cap"); 1291 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1292 die("can't clear permitted cap"); 1293 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1294 die("can't clear inheritable cap"); 1295 } 1296 1297 if (cap_set_proc(caps)) 1298 die("can't apply final cleaned capset"); 1299 1300 cap_free(caps); 1301} 1302 1303void set_seccomp_filter(const struct minijail *j) 1304{ 1305 /* 1306 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1307 * in the kernel source tree for an explanation of the parameters. 1308 */ 1309 if (j->flags.no_new_privs) { 1310 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1311 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1312 } 1313 1314 /* 1315 * If we're logging seccomp filter failures, 1316 * install the SIGSYS handler first. 1317 */ 1318 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1319 if (install_sigsys_handler()) 1320 pdie("install SIGSYS handler"); 1321 warn("logging seccomp filter failures"); 1322 } 1323 1324 /* 1325 * Install the syscall filter. 1326 */ 1327 if (j->flags.seccomp_filter) { 1328 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1329 j->filter_prog)) { 1330 if ((errno == EINVAL) && can_softfail()) { 1331 warn("seccomp not supported"); 1332 return; 1333 } 1334 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1335 } 1336 } 1337} 1338 1339void API minijail_enter(const struct minijail *j) 1340{ 1341 /* 1342 * If we're dropping caps, get the last valid cap from /proc now, 1343 * since /proc can be unmounted before drop_caps() is called. 1344 */ 1345 unsigned int last_valid_cap = 0; 1346 if (j->flags.capbset_drop || j->flags.use_caps) 1347 last_valid_cap = get_last_valid_cap(); 1348 1349 if (j->flags.pids) 1350 die("tried to enter a pid-namespaced jail;" 1351 " try minijail_run()?"); 1352 1353 if (j->flags.usergroups && !j->user) 1354 die("usergroup inheritance without username"); 1355 1356 /* 1357 * We can't recover from failures if we've dropped privileges partially, 1358 * so we don't even try. If any of our operations fail, we abort() the 1359 * entire process. 1360 */ 1361 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1362 pdie("setns(CLONE_NEWNS)"); 1363 1364 if (j->flags.vfs) { 1365 if (unshare(CLONE_NEWNS)) 1366 pdie("unshare(vfs)"); 1367 /* 1368 * Remount all filesystems as private. If they are shared 1369 * new bind mounts will creep out of our namespace. 1370 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1371 */ 1372 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1373 pdie("mount(/, private)"); 1374 } 1375 1376 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1377 pdie("unshare(ipc)"); 1378 } 1379 1380 if (j->flags.enter_net) { 1381 if (setns(j->netns_fd, CLONE_NEWNET)) 1382 pdie("setns(CLONE_NEWNET)"); 1383 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1384 pdie("unshare(net)"); 1385 } 1386 1387 if (j->flags.chroot && enter_chroot(j)) 1388 pdie("chroot"); 1389 1390 if (j->flags.pivot_root && enter_pivot_root(j)) 1391 pdie("pivot_root"); 1392 1393 if (j->flags.mount_tmp && mount_tmp()) 1394 pdie("mount_tmp"); 1395 1396 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1397 pdie("remount"); 1398 1399 /* 1400 * If we're only dropping capabilities from the bounding set, but not 1401 * from the thread's (permitted|inheritable|effective) sets, do it now. 1402 */ 1403 if (j->flags.capbset_drop) { 1404 drop_capbset(j->cap_bset, last_valid_cap); 1405 } 1406 1407 if (j->flags.use_caps) { 1408 /* 1409 * POSIX capabilities are a bit tricky. If we drop our 1410 * capability to change uids, our attempt to use setuid() 1411 * below will fail. Hang on to root caps across setuid(), then 1412 * lock securebits. 1413 */ 1414 if (prctl(PR_SET_KEEPCAPS, 1)) 1415 pdie("prctl(PR_SET_KEEPCAPS)"); 1416 if (prctl 1417 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 1418 pdie("prctl(PR_SET_SECUREBITS)"); 1419 } 1420 1421 /* 1422 * If we're setting no_new_privs, we can drop privileges 1423 * before setting seccomp filter. This way filter policies 1424 * don't need to allow privilege-dropping syscalls. 1425 */ 1426 if (j->flags.no_new_privs) { 1427 drop_ugid(j); 1428 if (j->flags.use_caps) 1429 drop_caps(j, last_valid_cap); 1430 1431 set_seccomp_filter(j); 1432 } else { 1433 /* 1434 * If we're not setting no_new_privs, 1435 * we need to set seccomp filter *before* dropping privileges. 1436 * WARNING: this means that filter policies *must* allow 1437 * setgroups()/setresgid()/setresuid() for dropping root and 1438 * capget()/capset()/prctl() for dropping caps. 1439 */ 1440 set_seccomp_filter(j); 1441 1442 drop_ugid(j); 1443 if (j->flags.use_caps) 1444 drop_caps(j, last_valid_cap); 1445 } 1446 1447 /* 1448 * Select the specified alternate syscall table. The table must not 1449 * block prctl(2) if we're using seccomp as well. 1450 */ 1451 if (j->flags.alt_syscall) { 1452 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1453 pdie("prctl(PR_ALT_SYSCALL)"); 1454 } 1455 1456 /* 1457 * seccomp has to come last since it cuts off all the other 1458 * privilege-dropping syscalls :) 1459 */ 1460 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1461 if ((errno == EINVAL) && can_softfail()) { 1462 warn("seccomp not supported"); 1463 return; 1464 } 1465 pdie("prctl(PR_SET_SECCOMP)"); 1466 } 1467} 1468 1469/* TODO(wad) will visibility affect this variable? */ 1470static int init_exitstatus = 0; 1471 1472void init_term(int __attribute__ ((unused)) sig) 1473{ 1474 _exit(init_exitstatus); 1475} 1476 1477int init(pid_t rootpid) 1478{ 1479 pid_t pid; 1480 int status; 1481 /* so that we exit with the right status */ 1482 signal(SIGTERM, init_term); 1483 /* TODO(wad) self jail with seccomp_filters here. */ 1484 while ((pid = wait(&status)) > 0) { 1485 /* 1486 * This loop will only end when either there are no processes 1487 * left inside our pid namespace or we get a signal. 1488 */ 1489 if (pid == rootpid) 1490 init_exitstatus = status; 1491 } 1492 if (!WIFEXITED(init_exitstatus)) 1493 _exit(MINIJAIL_ERR_INIT); 1494 _exit(WEXITSTATUS(init_exitstatus)); 1495} 1496 1497int API minijail_from_fd(int fd, struct minijail *j) 1498{ 1499 size_t sz = 0; 1500 size_t bytes = read(fd, &sz, sizeof(sz)); 1501 char *buf; 1502 int r; 1503 if (sizeof(sz) != bytes) 1504 return -EINVAL; 1505 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1506 return -E2BIG; 1507 buf = malloc(sz); 1508 if (!buf) 1509 return -ENOMEM; 1510 bytes = read(fd, buf, sz); 1511 if (bytes != sz) { 1512 free(buf); 1513 return -EINVAL; 1514 } 1515 r = minijail_unmarshal(j, buf, sz); 1516 free(buf); 1517 return r; 1518} 1519 1520int API minijail_to_fd(struct minijail *j, int fd) 1521{ 1522 char *buf; 1523 size_t sz = minijail_size(j); 1524 ssize_t written; 1525 int r; 1526 1527 if (!sz) 1528 return -EINVAL; 1529 buf = malloc(sz); 1530 r = minijail_marshal(j, buf, sz); 1531 if (r) { 1532 free(buf); 1533 return r; 1534 } 1535 /* Sends [size][minijail]. */ 1536 written = write(fd, &sz, sizeof(sz)); 1537 if (written != sizeof(sz)) { 1538 free(buf); 1539 return -EFAULT; 1540 } 1541 written = write(fd, buf, sz); 1542 if (written < 0 || (size_t) written != sz) { 1543 free(buf); 1544 return -EFAULT; 1545 } 1546 free(buf); 1547 return 0; 1548} 1549 1550int setup_preload(void) 1551{ 1552#if defined(__ANDROID__) 1553 /* Don't use LDPRELOAD on Brillo. */ 1554 return 0; 1555#else 1556 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1557 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1558 if (!newenv) 1559 return -ENOMEM; 1560 1561 /* Only insert a separating space if we have something to separate... */ 1562 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1563 PRELOADPATH); 1564 1565 /* setenv() makes a copy of the string we give it. */ 1566 setenv(kLdPreloadEnvVar, newenv, 1); 1567 free(newenv); 1568 return 0; 1569#endif 1570} 1571 1572int setup_pipe(int fds[2]) 1573{ 1574 int r = pipe(fds); 1575 char fd_buf[11]; 1576 if (r) 1577 return r; 1578 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1579 if (r <= 0) 1580 return -EINVAL; 1581 setenv(kFdEnvVar, fd_buf, 1); 1582 return 0; 1583} 1584 1585int setup_pipe_end(int fds[2], size_t index) 1586{ 1587 if (index > 1) 1588 return -1; 1589 1590 close(fds[1 - index]); 1591 return fds[index]; 1592} 1593 1594int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1595{ 1596 if (index > 1) 1597 return -1; 1598 1599 close(fds[1 - index]); 1600 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1601 return dup2(fds[index], fd); 1602} 1603 1604int minijail_run_internal(struct minijail *j, const char *filename, 1605 char *const argv[], pid_t *pchild_pid, 1606 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1607 int use_preload); 1608 1609int API minijail_run(struct minijail *j, const char *filename, 1610 char *const argv[]) 1611{ 1612 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1613 true); 1614} 1615 1616int API minijail_run_pid(struct minijail *j, const char *filename, 1617 char *const argv[], pid_t *pchild_pid) 1618{ 1619 return minijail_run_internal(j, filename, argv, pchild_pid, 1620 NULL, NULL, NULL, true); 1621} 1622 1623int API minijail_run_pipe(struct minijail *j, const char *filename, 1624 char *const argv[], int *pstdin_fd) 1625{ 1626 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1627 NULL, NULL, true); 1628} 1629 1630int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1631 char *const argv[], pid_t *pchild_pid, 1632 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1633{ 1634 return minijail_run_internal(j, filename, argv, pchild_pid, 1635 pstdin_fd, pstdout_fd, pstderr_fd, true); 1636} 1637 1638int API minijail_run_no_preload(struct minijail *j, const char *filename, 1639 char *const argv[]) 1640{ 1641 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1642 false); 1643} 1644 1645int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1646 const char *filename, 1647 char *const argv[], 1648 pid_t *pchild_pid, 1649 int *pstdin_fd, int *pstdout_fd, 1650 int *pstderr_fd) { 1651 return minijail_run_internal(j, filename, argv, pchild_pid, 1652 pstdin_fd, pstdout_fd, pstderr_fd, false); 1653} 1654 1655int minijail_run_internal(struct minijail *j, const char *filename, 1656 char *const argv[], pid_t *pchild_pid, 1657 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1658 int use_preload) 1659{ 1660 char *oldenv, *oldenv_copy = NULL; 1661 pid_t child_pid; 1662 int pipe_fds[2]; 1663 int stdin_fds[2]; 1664 int stdout_fds[2]; 1665 int stderr_fds[2]; 1666 int child_sync_pipe_fds[2]; 1667 int sync_child = 0; 1668 int ret; 1669 /* We need to remember this across the minijail_preexec() call. */ 1670 int pid_namespace = j->flags.pids; 1671 int do_init = j->flags.do_init; 1672 1673 if (use_preload) { 1674 oldenv = getenv(kLdPreloadEnvVar); 1675 if (oldenv) { 1676 oldenv_copy = strdup(oldenv); 1677 if (!oldenv_copy) 1678 return -ENOMEM; 1679 } 1680 1681 if (setup_preload()) 1682 return -EFAULT; 1683 } 1684 1685 if (!use_preload) { 1686 if (j->flags.use_caps) 1687 die("capabilities are not supported without " 1688 "LD_PRELOAD"); 1689 } 1690 1691 /* 1692 * Make the process group ID of this process equal to its PID, so that 1693 * both the Minijail process and the jailed process can be killed 1694 * together. 1695 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1696 * the process is already a process group leader. 1697 */ 1698 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1699 if (errno != EPERM) { 1700 pdie("setpgid(0, 0)"); 1701 } 1702 } 1703 1704 if (use_preload) { 1705 /* 1706 * Before we fork(2) and execve(2) the child process, we need 1707 * to open a pipe(2) to send the minijail configuration over. 1708 */ 1709 if (setup_pipe(pipe_fds)) 1710 return -EFAULT; 1711 } 1712 1713 /* 1714 * If we want to write to the child process' standard input, 1715 * create the pipe(2) now. 1716 */ 1717 if (pstdin_fd) { 1718 if (pipe(stdin_fds)) 1719 return -EFAULT; 1720 } 1721 1722 /* 1723 * If we want to read from the child process' standard output, 1724 * create the pipe(2) now. 1725 */ 1726 if (pstdout_fd) { 1727 if (pipe(stdout_fds)) 1728 return -EFAULT; 1729 } 1730 1731 /* 1732 * If we want to read from the child process' standard error, 1733 * create the pipe(2) now. 1734 */ 1735 if (pstderr_fd) { 1736 if (pipe(stderr_fds)) 1737 return -EFAULT; 1738 } 1739 1740 /* 1741 * If we want to set up a new uid/gid mapping in the user namespace, 1742 * or if we need to add the child process to cgroups, create the pipe(2) 1743 * to sync between parent and child. 1744 */ 1745 if (j->flags.userns || j->flags.cgroups) { 1746 sync_child = 1; 1747 if (pipe(child_sync_pipe_fds)) 1748 return -EFAULT; 1749 } 1750 1751 /* 1752 * Use sys_clone() if and only if we're creating a pid namespace. 1753 * 1754 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1755 * 1756 * In multithreaded programs, there are a bunch of locks inside libc, 1757 * some of which may be held by other threads at the time that we call 1758 * minijail_run_pid(). If we call fork(), glibc does its level best to 1759 * ensure that we hold all of these locks before it calls clone() 1760 * internally and drop them after clone() returns, but when we call 1761 * sys_clone(2) directly, all that gets bypassed and we end up with a 1762 * child address space where some of libc's important locks are held by 1763 * other threads (which did not get cloned, and hence will never release 1764 * those locks). This is okay so long as we call exec() immediately 1765 * after, but a bunch of seemingly-innocent libc functions like setenv() 1766 * take locks. 1767 * 1768 * Hence, only call sys_clone() if we need to, in order to get at pid 1769 * namespacing. If we follow this path, the child's address space might 1770 * have broken locks; you may only call functions that do not acquire 1771 * any locks. 1772 * 1773 * Unfortunately, fork() acquires every lock it can get its hands on, as 1774 * previously detailed, so this function is highly likely to deadlock 1775 * later on (see "deadlock here") if we're multithreaded. 1776 * 1777 * We might hack around this by having the clone()d child (init of the 1778 * pid namespace) return directly, rather than leaving the clone()d 1779 * process hanging around to be init for the new namespace (and having 1780 * its fork()ed child return in turn), but that process would be crippled 1781 * with its libc locks potentially broken. We might try fork()ing in the 1782 * parent before we clone() to ensure that we own all the locks, but 1783 * then we have to have the forked child hanging around consuming 1784 * resources (and possibly having file descriptors / shared memory 1785 * regions / etc attached). We'd need to keep the child around to avoid 1786 * having its children get reparented to init. 1787 * 1788 * TODO(ellyjones): figure out if the "forked child hanging around" 1789 * problem is fixable or not. It would be nice if we worked in this 1790 * case. 1791 */ 1792 if (pid_namespace) { 1793 int clone_flags = CLONE_NEWPID | SIGCHLD; 1794 if (j->flags.userns) 1795 clone_flags |= CLONE_NEWUSER; 1796 child_pid = syscall(SYS_clone, clone_flags, NULL); 1797 } else { 1798 child_pid = fork(); 1799 } 1800 1801 if (child_pid < 0) { 1802 if (use_preload) { 1803 free(oldenv_copy); 1804 } 1805 die("failed to fork child"); 1806 } 1807 1808 if (child_pid) { 1809 if (use_preload) { 1810 /* Restore parent's LD_PRELOAD. */ 1811 if (oldenv_copy) { 1812 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1813 free(oldenv_copy); 1814 } else { 1815 unsetenv(kLdPreloadEnvVar); 1816 } 1817 unsetenv(kFdEnvVar); 1818 } 1819 1820 j->initpid = child_pid; 1821 1822 if (j->flags.pid_file) 1823 write_pid_file(j); 1824 1825 if (j->flags.cgroups) 1826 add_to_cgroups(j); 1827 1828 if (j->flags.userns) 1829 write_ugid_mappings(j); 1830 1831 if (sync_child) 1832 parent_setup_complete(child_sync_pipe_fds); 1833 1834 if (use_preload) { 1835 /* Send marshalled minijail. */ 1836 close(pipe_fds[0]); /* read endpoint */ 1837 ret = minijail_to_fd(j, pipe_fds[1]); 1838 close(pipe_fds[1]); /* write endpoint */ 1839 if (ret) { 1840 kill(j->initpid, SIGKILL); 1841 die("failed to send marshalled minijail"); 1842 } 1843 } 1844 1845 if (pchild_pid) 1846 *pchild_pid = child_pid; 1847 1848 /* 1849 * If we want to write to the child process' standard input, 1850 * set up the write end of the pipe. 1851 */ 1852 if (pstdin_fd) 1853 *pstdin_fd = setup_pipe_end(stdin_fds, 1854 1 /* write end */); 1855 1856 /* 1857 * If we want to read from the child process' standard output, 1858 * set up the read end of the pipe. 1859 */ 1860 if (pstdout_fd) 1861 *pstdout_fd = setup_pipe_end(stdout_fds, 1862 0 /* read end */); 1863 1864 /* 1865 * If we want to read from the child process' standard error, 1866 * set up the read end of the pipe. 1867 */ 1868 if (pstderr_fd) 1869 *pstderr_fd = setup_pipe_end(stderr_fds, 1870 0 /* read end */); 1871 1872 return 0; 1873 } 1874 free(oldenv_copy); 1875 1876 if (j->flags.reset_signal_mask) { 1877 sigset_t signal_mask; 1878 if (sigemptyset(&signal_mask) != 0) 1879 pdie("sigemptyset failed"); 1880 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1881 pdie("sigprocmask failed"); 1882 } 1883 1884 if (sync_child) 1885 wait_for_parent_setup(child_sync_pipe_fds); 1886 1887 if (j->flags.userns) 1888 enter_user_namespace(j); 1889 1890 /* 1891 * If we want to write to the jailed process' standard input, 1892 * set up the read end of the pipe. 1893 */ 1894 if (pstdin_fd) { 1895 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1896 STDIN_FILENO) < 0) 1897 die("failed to set up stdin pipe"); 1898 } 1899 1900 /* 1901 * If we want to read from the jailed process' standard output, 1902 * set up the write end of the pipe. 1903 */ 1904 if (pstdout_fd) { 1905 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1906 STDOUT_FILENO) < 0) 1907 die("failed to set up stdout pipe"); 1908 } 1909 1910 /* 1911 * If we want to read from the jailed process' standard error, 1912 * set up the write end of the pipe. 1913 */ 1914 if (pstderr_fd) { 1915 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1916 STDERR_FILENO) < 0) 1917 die("failed to set up stderr pipe"); 1918 } 1919 1920 /* If running an init program, let it decide when/how to mount /proc. */ 1921 if (pid_namespace && !do_init) 1922 j->flags.remount_proc_ro = 0; 1923 1924 if (use_preload) { 1925 /* Strip out flags that cannot be inherited across execve(2). */ 1926 minijail_preexec(j); 1927 } else { 1928 j->flags.pids = 0; 1929 } 1930 /* Jail this process, then execve() the target. */ 1931 minijail_enter(j); 1932 1933 if (pid_namespace && do_init) { 1934 /* 1935 * pid namespace: this process will become init inside the new 1936 * namespace. We don't want all programs we might exec to have 1937 * to know how to be init. Normally (do_init == 1) we fork off 1938 * a child to actually run the program. If |do_init == 0|, we 1939 * let the program keep pid 1 and be init. 1940 * 1941 * If we're multithreaded, we'll probably deadlock here. See 1942 * WARNING above. 1943 */ 1944 child_pid = fork(); 1945 if (child_pid < 0) 1946 _exit(child_pid); 1947 else if (child_pid > 0) 1948 init(child_pid); /* never returns */ 1949 } 1950 1951 /* 1952 * If we aren't pid-namespaced, or the jailed program asked to be init: 1953 * calling process 1954 * -> execve()-ing process 1955 * If we are: 1956 * calling process 1957 * -> init()-ing process 1958 * -> execve()-ing process 1959 */ 1960 _exit(execve(filename, argv, environ)); 1961} 1962 1963int API minijail_kill(struct minijail *j) 1964{ 1965 int st; 1966 if (kill(j->initpid, SIGTERM)) 1967 return -errno; 1968 if (waitpid(j->initpid, &st, 0) < 0) 1969 return -errno; 1970 return st; 1971} 1972 1973int API minijail_wait(struct minijail *j) 1974{ 1975 int st; 1976 if (waitpid(j->initpid, &st, 0) < 0) 1977 return -errno; 1978 1979 if (!WIFEXITED(st)) { 1980 int error_status = st; 1981 if (WIFSIGNALED(st)) { 1982 int signum = WTERMSIG(st); 1983 warn("child process %d received signal %d", 1984 j->initpid, signum); 1985 /* 1986 * We return MINIJAIL_ERR_JAIL if the process received 1987 * SIGSYS, which happens when a syscall is blocked by 1988 * seccomp filters. 1989 * If not, we do what bash(1) does: 1990 * $? = 128 + signum 1991 */ 1992 if (signum == SIGSYS) { 1993 error_status = MINIJAIL_ERR_JAIL; 1994 } else { 1995 error_status = 128 + signum; 1996 } 1997 } 1998 return error_status; 1999 } 2000 2001 int exit_status = WEXITSTATUS(st); 2002 if (exit_status != 0) 2003 info("child process %d exited with status %d", 2004 j->initpid, exit_status); 2005 2006 return exit_status; 2007} 2008 2009void API minijail_destroy(struct minijail *j) 2010{ 2011 size_t i; 2012 2013 if (j->flags.seccomp_filter && j->filter_prog) { 2014 free(j->filter_prog->filter); 2015 free(j->filter_prog); 2016 } 2017 while (j->mounts_head) { 2018 struct mountpoint *m = j->mounts_head; 2019 j->mounts_head = j->mounts_head->next; 2020 free(m->type); 2021 free(m->dest); 2022 free(m->src); 2023 free(m); 2024 } 2025 j->mounts_tail = NULL; 2026 if (j->user) 2027 free(j->user); 2028 if (j->suppl_gid_list) 2029 free(j->suppl_gid_list); 2030 if (j->chrootdir) 2031 free(j->chrootdir); 2032 if (j->alt_syscall_table) 2033 free(j->alt_syscall_table); 2034 for (i = 0; i < j->cgroup_count; ++i) 2035 free(j->cgroups[i]); 2036 free(j); 2037} 2038