libminijail.c revision 565e978e80d693c077ec71caf45f6e06636a1a11
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _DEFAULT_SOURCE 8#define _GNU_SOURCE 9 10#include <asm/unistd.h> 11#include <ctype.h> 12#include <errno.h> 13#include <fcntl.h> 14#include <grp.h> 15#include <inttypes.h> 16#include <limits.h> 17#include <linux/capability.h> 18#include <pwd.h> 19#include <sched.h> 20#include <signal.h> 21#include <stdarg.h> 22#include <stdbool.h> 23#include <stddef.h> 24#include <stdio.h> 25#include <stdlib.h> 26#include <string.h> 27#include <syscall.h> 28#include <sys/capability.h> 29#include <sys/mount.h> 30#include <sys/param.h> 31#include <sys/prctl.h> 32#include <sys/stat.h> 33#include <sys/types.h> 34#include <sys/user.h> 35#include <sys/wait.h> 36#include <unistd.h> 37 38#include "libminijail.h" 39#include "libminijail-private.h" 40 41#include "signal_handler.h" 42#include "syscall_filter.h" 43#include "util.h" 44 45#ifdef HAVE_SECUREBITS_H 46# include <linux/securebits.h> 47#else 48# define SECURE_ALL_BITS 0x55 49# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 50#endif 51/* For kernels < 4.3. */ 52#define OLD_SECURE_ALL_BITS 0x15 53#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1) 54 55/* 56 * Assert the value of SECURE_ALL_BITS at compile-time. 57 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3 58 * added a new securebit. 59 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM 60 * when used on older kernels. The compile-time assert will catch this situation 61 * at compile time. 62 */ 63#ifdef __BRILLO__ 64_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55."); 65#endif 66 67/* Until these are reliably available in linux/prctl.h. */ 68#ifndef PR_SET_SECCOMP 69# define PR_SET_SECCOMP 22 70#endif 71 72#ifndef PR_ALT_SYSCALL 73# define PR_ALT_SYSCALL 0x43724f53 74#endif 75 76/* For seccomp_filter using BPF. */ 77#ifndef PR_SET_NO_NEW_PRIVS 78# define PR_SET_NO_NEW_PRIVS 38 79#endif 80#ifndef SECCOMP_MODE_FILTER 81# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 82#endif 83 84/* New cgroup namespace might not be in linux-headers yet. */ 85#ifndef CLONE_NEWCGROUP 86# define CLONE_NEWCGROUP 0x02000000 87#endif 88 89#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 90 91struct mountpoint { 92 char *src; 93 char *dest; 94 char *type; 95 char *data; 96 int has_data; 97 unsigned long flags; 98 struct mountpoint *next; 99}; 100 101struct minijail { 102 /* 103 * WARNING: if you add a flag here you need to make sure it's 104 * accounted for in minijail_pre{enter|exec}() below. 105 */ 106 struct { 107 int uid:1; 108 int gid:1; 109 int usergroups:1; 110 int suppl_gids:1; 111 int use_caps:1; 112 int capbset_drop:1; 113 int vfs:1; 114 int enter_vfs:1; 115 int skip_remount_private:1; 116 int pids:1; 117 int ipc:1; 118 int net:1; 119 int enter_net:1; 120 int ns_cgroups:1; 121 int userns:1; 122 int seccomp:1; 123 int remount_proc_ro:1; 124 int no_new_privs:1; 125 int seccomp_filter:1; 126 int log_seccomp_filter:1; 127 int chroot:1; 128 int pivot_root:1; 129 int mount_tmp:1; 130 int do_init:1; 131 int pid_file:1; 132 int cgroups:1; 133 int alt_syscall:1; 134 int reset_signal_mask:1; 135 } flags; 136 uid_t uid; 137 gid_t gid; 138 gid_t usergid; 139 char *user; 140 size_t suppl_gid_count; 141 gid_t *suppl_gid_list; 142 uint64_t caps; 143 uint64_t cap_bset; 144 pid_t initpid; 145 int mountns_fd; 146 int netns_fd; 147 char *chrootdir; 148 char *pid_file_path; 149 char *uidmap; 150 char *gidmap; 151 size_t filter_len; 152 struct sock_fprog *filter_prog; 153 char *alt_syscall_table; 154 struct mountpoint *mounts_head; 155 struct mountpoint *mounts_tail; 156 size_t mounts_count; 157 char *cgroups[MAX_CGROUPS]; 158 size_t cgroup_count; 159}; 160 161/* 162 * Strip out flags meant for the parent. 163 * We keep things that are not inherited across execve(2) (e.g. capabilities), 164 * or are easier to set after execve(2) (e.g. seccomp filters). 165 */ 166void minijail_preenter(struct minijail *j) 167{ 168 j->flags.vfs = 0; 169 j->flags.enter_vfs = 0; 170 j->flags.skip_remount_private = 0; 171 j->flags.remount_proc_ro = 0; 172 j->flags.pids = 0; 173 j->flags.do_init = 0; 174 j->flags.pid_file = 0; 175 j->flags.cgroups = 0; 176} 177 178/* 179 * Strip out flags meant for the child. 180 * We keep things that are inherited across execve(2). 181 */ 182void minijail_preexec(struct minijail *j) 183{ 184 int vfs = j->flags.vfs; 185 int enter_vfs = j->flags.enter_vfs; 186 int skip_remount_private = j->flags.skip_remount_private; 187 int remount_proc_ro = j->flags.remount_proc_ro; 188 int userns = j->flags.userns; 189 if (j->user) 190 free(j->user); 191 j->user = NULL; 192 if (j->suppl_gid_list) 193 free(j->suppl_gid_list); 194 j->suppl_gid_list = NULL; 195 memset(&j->flags, 0, sizeof(j->flags)); 196 /* Now restore anything we meant to keep. */ 197 j->flags.vfs = vfs; 198 j->flags.enter_vfs = enter_vfs; 199 j->flags.skip_remount_private = skip_remount_private; 200 j->flags.remount_proc_ro = remount_proc_ro; 201 j->flags.userns = userns; 202 /* Note, |pids| will already have been used before this call. */ 203} 204 205/* Minijail API. */ 206 207struct minijail API *minijail_new(void) 208{ 209 return calloc(1, sizeof(struct minijail)); 210} 211 212void API minijail_change_uid(struct minijail *j, uid_t uid) 213{ 214 if (uid == 0) 215 die("useless change to uid 0"); 216 j->uid = uid; 217 j->flags.uid = 1; 218} 219 220void API minijail_change_gid(struct minijail *j, gid_t gid) 221{ 222 if (gid == 0) 223 die("useless change to gid 0"); 224 j->gid = gid; 225 j->flags.gid = 1; 226} 227 228void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 229 const gid_t *list) 230{ 231 size_t i; 232 233 if (j->flags.usergroups) 234 die("cannot inherit *and* set supplementary groups"); 235 236 if (size == 0) { 237 /* Clear supplementary groups. */ 238 j->suppl_gid_list = NULL; 239 j->suppl_gid_count = 0; 240 j->flags.suppl_gids = 1; 241 return; 242 } 243 244 /* Copy the gid_t array. */ 245 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 246 if (!j->suppl_gid_list) { 247 die("failed to allocate internal supplementary group array"); 248 } 249 for (i = 0; i < size; i++) { 250 j->suppl_gid_list[i] = list[i]; 251 } 252 j->suppl_gid_count = size; 253 j->flags.suppl_gids = 1; 254} 255 256int API minijail_change_user(struct minijail *j, const char *user) 257{ 258 char *buf = NULL; 259 struct passwd pw; 260 struct passwd *ppw = NULL; 261 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 262 if (sz == -1) 263 sz = 65536; /* your guess is as good as mine... */ 264 265 /* 266 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 267 * the maximum needed size of the buffer, so we don't have to search. 268 */ 269 buf = malloc(sz); 270 if (!buf) 271 return -ENOMEM; 272 getpwnam_r(user, &pw, buf, sz, &ppw); 273 /* 274 * We're safe to free the buffer here. The strings inside |pw| point 275 * inside |buf|, but we don't use any of them; this leaves the pointers 276 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) 277 * succeeded. 278 */ 279 free(buf); 280 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 281 if (!ppw) 282 return -1; 283 minijail_change_uid(j, ppw->pw_uid); 284 j->user = strdup(user); 285 if (!j->user) 286 return -ENOMEM; 287 j->usergid = ppw->pw_gid; 288 return 0; 289} 290 291int API minijail_change_group(struct minijail *j, const char *group) 292{ 293 char *buf = NULL; 294 struct group gr; 295 struct group *pgr = NULL; 296 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 297 if (sz == -1) 298 sz = 65536; /* and mine is as good as yours, really */ 299 300 /* 301 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 302 * the maximum needed size of the buffer, so we don't have to search. 303 */ 304 buf = malloc(sz); 305 if (!buf) 306 return -ENOMEM; 307 getgrnam_r(group, &gr, buf, sz, &pgr); 308 /* 309 * We're safe to free the buffer here. The strings inside gr point 310 * inside buf, but we don't use any of them; this leaves the pointers 311 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 312 */ 313 free(buf); 314 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 315 if (!pgr) 316 return -1; 317 minijail_change_gid(j, pgr->gr_gid); 318 return 0; 319} 320 321void API minijail_use_seccomp(struct minijail *j) 322{ 323 j->flags.seccomp = 1; 324} 325 326void API minijail_no_new_privs(struct minijail *j) 327{ 328 j->flags.no_new_privs = 1; 329} 330 331void API minijail_use_seccomp_filter(struct minijail *j) 332{ 333 j->flags.seccomp_filter = 1; 334} 335 336void API minijail_log_seccomp_filter_failures(struct minijail *j) 337{ 338 j->flags.log_seccomp_filter = 1; 339} 340 341void API minijail_use_caps(struct minijail *j, uint64_t capmask) 342{ 343 /* 344 * 'minijail_use_caps' configures a runtime-capabilities-only 345 * environment, including a bounding set matching the thread's runtime 346 * (permitted|inheritable|effective) sets. 347 * Therefore, it will override any existing bounding set configurations 348 * since the latter would allow gaining extra runtime capabilities from 349 * file capabilities. 350 */ 351 if (j->flags.capbset_drop) { 352 warn("overriding bounding set configuration"); 353 j->cap_bset = 0; 354 j->flags.capbset_drop = 0; 355 } 356 j->caps = capmask; 357 j->flags.use_caps = 1; 358} 359 360void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 361{ 362 if (j->flags.use_caps) { 363 /* 364 * 'minijail_use_caps' will have already configured a capability 365 * bounding set matching the (permitted|inheritable|effective) 366 * sets. Abort if the user tries to configure a separate 367 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 368 * are mutually exclusive. 369 */ 370 die("runtime capabilities already configured, can't drop " 371 "bounding set separately"); 372 } 373 j->cap_bset = capmask; 374 j->flags.capbset_drop = 1; 375} 376 377void API minijail_reset_signal_mask(struct minijail *j) 378{ 379 j->flags.reset_signal_mask = 1; 380} 381 382void API minijail_namespace_vfs(struct minijail *j) 383{ 384 j->flags.vfs = 1; 385} 386 387void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 388{ 389 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 390 if (ns_fd < 0) { 391 pdie("failed to open namespace '%s'", ns_path); 392 } 393 j->mountns_fd = ns_fd; 394 j->flags.enter_vfs = 1; 395} 396 397void API minijail_skip_remount_private(struct minijail *j) 398{ 399 j->flags.skip_remount_private = 1; 400} 401 402void API minijail_namespace_pids(struct minijail *j) 403{ 404 j->flags.vfs = 1; 405 j->flags.remount_proc_ro = 1; 406 j->flags.pids = 1; 407 j->flags.do_init = 1; 408} 409 410void API minijail_namespace_ipc(struct minijail *j) 411{ 412 j->flags.ipc = 1; 413} 414 415void API minijail_namespace_net(struct minijail *j) 416{ 417 j->flags.net = 1; 418} 419 420void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 421{ 422 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 423 if (ns_fd < 0) { 424 pdie("failed to open namespace '%s'", ns_path); 425 } 426 j->netns_fd = ns_fd; 427 j->flags.enter_net = 1; 428} 429 430void API minijail_namespace_cgroups(struct minijail *j) 431{ 432 j->flags.ns_cgroups = 1; 433} 434 435void API minijail_remount_proc_readonly(struct minijail *j) 436{ 437 j->flags.vfs = 1; 438 j->flags.remount_proc_ro = 1; 439} 440 441void API minijail_namespace_user(struct minijail *j) 442{ 443 j->flags.userns = 1; 444} 445 446int API minijail_uidmap(struct minijail *j, const char *uidmap) 447{ 448 j->uidmap = strdup(uidmap); 449 if (!j->uidmap) 450 return -ENOMEM; 451 char *ch; 452 for (ch = j->uidmap; *ch; ch++) { 453 if (*ch == ',') 454 *ch = '\n'; 455 } 456 return 0; 457} 458 459int API minijail_gidmap(struct minijail *j, const char *gidmap) 460{ 461 j->gidmap = strdup(gidmap); 462 if (!j->gidmap) 463 return -ENOMEM; 464 char *ch; 465 for (ch = j->gidmap; *ch; ch++) { 466 if (*ch == ',') 467 *ch = '\n'; 468 } 469 return 0; 470} 471 472void API minijail_inherit_usergroups(struct minijail *j) 473{ 474 j->flags.usergroups = 1; 475} 476 477void API minijail_run_as_init(struct minijail *j) 478{ 479 /* 480 * Since the jailed program will become 'init' in the new PID namespace, 481 * Minijail does not need to fork an 'init' process. 482 */ 483 j->flags.do_init = 0; 484} 485 486int API minijail_enter_chroot(struct minijail *j, const char *dir) 487{ 488 if (j->chrootdir) 489 return -EINVAL; 490 j->chrootdir = strdup(dir); 491 if (!j->chrootdir) 492 return -ENOMEM; 493 j->flags.chroot = 1; 494 return 0; 495} 496 497int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 498{ 499 if (j->chrootdir) 500 return -EINVAL; 501 j->chrootdir = strdup(dir); 502 if (!j->chrootdir) 503 return -ENOMEM; 504 j->flags.pivot_root = 1; 505 return 0; 506} 507 508char API *minijail_get_original_path(struct minijail *j, 509 const char *path_inside_chroot) 510{ 511 struct mountpoint *b; 512 513 b = j->mounts_head; 514 while (b) { 515 /* 516 * If |path_inside_chroot| is the exact destination of a 517 * mount, then the original path is exactly the source of 518 * the mount. 519 * for example: "-b /some/path/exe,/chroot/path/exe" 520 * mount source = /some/path/exe, mount dest = 521 * /chroot/path/exe Then when getting the original path of 522 * "/chroot/path/exe", the source of that mount, 523 * "/some/path/exe" is what should be returned. 524 */ 525 if (!strcmp(b->dest, path_inside_chroot)) 526 return strdup(b->src); 527 528 /* 529 * If |path_inside_chroot| is within the destination path of a 530 * mount, take the suffix of the chroot path relative to the 531 * mount destination path, and append it to the mount source 532 * path. 533 */ 534 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 535 const char *relative_path = 536 path_inside_chroot + strlen(b->dest); 537 return path_join(b->src, relative_path); 538 } 539 b = b->next; 540 } 541 542 /* If there is a chroot path, append |path_inside_chroot| to that. */ 543 if (j->chrootdir) 544 return path_join(j->chrootdir, path_inside_chroot); 545 546 /* No chroot, so the path outside is the same as it is inside. */ 547 return strdup(path_inside_chroot); 548} 549 550void API minijail_mount_tmp(struct minijail *j) 551{ 552 j->flags.mount_tmp = 1; 553} 554 555int API minijail_write_pid_file(struct minijail *j, const char *path) 556{ 557 j->pid_file_path = strdup(path); 558 if (!j->pid_file_path) 559 return -ENOMEM; 560 j->flags.pid_file = 1; 561 return 0; 562} 563 564int API minijail_add_to_cgroup(struct minijail *j, const char *path) 565{ 566 if (j->cgroup_count >= MAX_CGROUPS) 567 return -ENOMEM; 568 j->cgroups[j->cgroup_count] = strdup(path); 569 if (!j->cgroups[j->cgroup_count]) 570 return -ENOMEM; 571 j->cgroup_count++; 572 j->flags.cgroups = 1; 573 return 0; 574} 575 576int API minijail_mount_with_data(struct minijail *j, const char *src, 577 const char *dest, const char *type, 578 unsigned long flags, const char *data) 579{ 580 struct mountpoint *m; 581 582 if (*dest != '/') 583 return -EINVAL; 584 m = calloc(1, sizeof(*m)); 585 if (!m) 586 return -ENOMEM; 587 m->dest = strdup(dest); 588 if (!m->dest) 589 goto error; 590 m->src = strdup(src); 591 if (!m->src) 592 goto error; 593 m->type = strdup(type); 594 if (!m->type) 595 goto error; 596 if (data) { 597 m->data = strdup(data); 598 if (!m->data) 599 goto error; 600 m->has_data = 1; 601 } 602 m->flags = flags; 603 604 info("mount %s -> %s type '%s'", src, dest, type); 605 606 /* 607 * Force vfs namespacing so the mounts don't leak out into the 608 * containing vfs namespace. 609 */ 610 minijail_namespace_vfs(j); 611 612 if (j->mounts_tail) 613 j->mounts_tail->next = m; 614 else 615 j->mounts_head = m; 616 j->mounts_tail = m; 617 j->mounts_count++; 618 619 return 0; 620 621error: 622 free(m->type); 623 free(m->src); 624 free(m->dest); 625 free(m); 626 return -ENOMEM; 627} 628 629int API minijail_mount(struct minijail *j, const char *src, const char *dest, 630 const char *type, unsigned long flags) 631{ 632 return minijail_mount_with_data(j, src, dest, type, flags, NULL); 633} 634 635int API minijail_bind(struct minijail *j, const char *src, const char *dest, 636 int writeable) 637{ 638 unsigned long flags = MS_BIND; 639 640 if (!writeable) 641 flags |= MS_RDONLY; 642 643 return minijail_mount(j, src, dest, "", flags); 644} 645 646void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 647{ 648 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 649 if ((errno == EINVAL) && seccomp_can_softfail()) { 650 warn("not loading seccomp filter," 651 " seccomp not supported"); 652 j->flags.seccomp_filter = 0; 653 j->flags.log_seccomp_filter = 0; 654 j->filter_len = 0; 655 j->filter_prog = NULL; 656 j->flags.no_new_privs = 0; 657 } 658 } 659 FILE *file = fopen(path, "r"); 660 if (!file) { 661 pdie("failed to open seccomp filter file '%s'", path); 662 } 663 664 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 665 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 666 die("failed to compile seccomp filter BPF program in '%s'", 667 path); 668 } 669 670 j->filter_len = fprog->len; 671 j->filter_prog = fprog; 672 673 fclose(file); 674} 675 676int API minijail_use_alt_syscall(struct minijail *j, const char *table) 677{ 678 j->alt_syscall_table = strdup(table); 679 if (!j->alt_syscall_table) 680 return -ENOMEM; 681 j->flags.alt_syscall = 1; 682 return 0; 683} 684 685struct marshal_state { 686 size_t available; 687 size_t total; 688 char *buf; 689}; 690 691void marshal_state_init(struct marshal_state *state, char *buf, 692 size_t available) 693{ 694 state->available = available; 695 state->buf = buf; 696 state->total = 0; 697} 698 699void marshal_append(struct marshal_state *state, void *src, size_t length) 700{ 701 size_t copy_len = MIN(state->available, length); 702 703 /* Up to |available| will be written. */ 704 if (copy_len) { 705 memcpy(state->buf, src, copy_len); 706 state->buf += copy_len; 707 state->available -= copy_len; 708 } 709 /* |total| will contain the expected length. */ 710 state->total += length; 711} 712 713void marshal_mount(struct marshal_state *state, const struct mountpoint *m) 714{ 715 marshal_append(state, m->src, strlen(m->src) + 1); 716 marshal_append(state, m->dest, strlen(m->dest) + 1); 717 marshal_append(state, m->type, strlen(m->type) + 1); 718 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data)); 719 if (m->has_data) 720 marshal_append(state, m->data, strlen(m->data) + 1); 721 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 722} 723 724void minijail_marshal_helper(struct marshal_state *state, 725 const struct minijail *j) 726{ 727 struct mountpoint *m = NULL; 728 size_t i; 729 730 marshal_append(state, (char *)j, sizeof(*j)); 731 if (j->user) 732 marshal_append(state, j->user, strlen(j->user) + 1); 733 if (j->suppl_gid_list) { 734 marshal_append(state, j->suppl_gid_list, 735 j->suppl_gid_count * sizeof(gid_t)); 736 } 737 if (j->chrootdir) 738 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 739 if (j->alt_syscall_table) { 740 marshal_append(state, j->alt_syscall_table, 741 strlen(j->alt_syscall_table) + 1); 742 } 743 if (j->flags.seccomp_filter && j->filter_prog) { 744 struct sock_fprog *fp = j->filter_prog; 745 marshal_append(state, (char *)fp->filter, 746 fp->len * sizeof(struct sock_filter)); 747 } 748 for (m = j->mounts_head; m; m = m->next) { 749 marshal_mount(state, m); 750 } 751 for (i = 0; i < j->cgroup_count; ++i) 752 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1); 753} 754 755size_t API minijail_size(const struct minijail *j) 756{ 757 struct marshal_state state; 758 marshal_state_init(&state, NULL, 0); 759 minijail_marshal_helper(&state, j); 760 return state.total; 761} 762 763int minijail_marshal(const struct minijail *j, char *buf, size_t available) 764{ 765 struct marshal_state state; 766 marshal_state_init(&state, buf, available); 767 minijail_marshal_helper(&state, j); 768 return (state.total > available); 769} 770 771int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 772{ 773 size_t i; 774 size_t count; 775 int ret = -EINVAL; 776 777 if (length < sizeof(*j)) 778 goto out; 779 memcpy((void *)j, serialized, sizeof(*j)); 780 serialized += sizeof(*j); 781 length -= sizeof(*j); 782 783 /* Potentially stale pointers not used as signals. */ 784 j->pid_file_path = NULL; 785 j->uidmap = NULL; 786 j->gidmap = NULL; 787 j->mounts_head = NULL; 788 j->mounts_tail = NULL; 789 j->filter_prog = NULL; 790 791 if (j->user) { /* stale pointer */ 792 char *user = consumestr(&serialized, &length); 793 if (!user) 794 goto clear_pointers; 795 j->user = strdup(user); 796 if (!j->user) 797 goto clear_pointers; 798 } 799 800 if (j->suppl_gid_list) { /* stale pointer */ 801 if (j->suppl_gid_count > NGROUPS_MAX) { 802 goto bad_gid_list; 803 } 804 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 805 void *gid_list_bytes = 806 consumebytes(gid_list_size, &serialized, &length); 807 if (!gid_list_bytes) 808 goto bad_gid_list; 809 810 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 811 if (!j->suppl_gid_list) 812 goto bad_gid_list; 813 814 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 815 } 816 817 if (j->chrootdir) { /* stale pointer */ 818 char *chrootdir = consumestr(&serialized, &length); 819 if (!chrootdir) 820 goto bad_chrootdir; 821 j->chrootdir = strdup(chrootdir); 822 if (!j->chrootdir) 823 goto bad_chrootdir; 824 } 825 826 if (j->alt_syscall_table) { /* stale pointer */ 827 char *alt_syscall_table = consumestr(&serialized, &length); 828 if (!alt_syscall_table) 829 goto bad_syscall_table; 830 j->alt_syscall_table = strdup(alt_syscall_table); 831 if (!j->alt_syscall_table) 832 goto bad_syscall_table; 833 } 834 835 if (j->flags.seccomp_filter && j->filter_len > 0) { 836 size_t ninstrs = j->filter_len; 837 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 838 ninstrs > USHRT_MAX) 839 goto bad_filters; 840 841 size_t program_len = ninstrs * sizeof(struct sock_filter); 842 void *program = consumebytes(program_len, &serialized, &length); 843 if (!program) 844 goto bad_filters; 845 846 j->filter_prog = malloc(sizeof(struct sock_fprog)); 847 if (!j->filter_prog) 848 goto bad_filters; 849 850 j->filter_prog->len = ninstrs; 851 j->filter_prog->filter = malloc(program_len); 852 if (!j->filter_prog->filter) 853 goto bad_filter_prog_instrs; 854 855 memcpy(j->filter_prog->filter, program, program_len); 856 } 857 858 count = j->mounts_count; 859 j->mounts_count = 0; 860 for (i = 0; i < count; ++i) { 861 unsigned long *flags; 862 int *has_data; 863 const char *dest; 864 const char *type; 865 const char *data = NULL; 866 const char *src = consumestr(&serialized, &length); 867 if (!src) 868 goto bad_mounts; 869 dest = consumestr(&serialized, &length); 870 if (!dest) 871 goto bad_mounts; 872 type = consumestr(&serialized, &length); 873 if (!type) 874 goto bad_mounts; 875 has_data = consumebytes(sizeof(*has_data), &serialized, 876 &length); 877 if (!has_data) 878 goto bad_mounts; 879 if (*has_data) { 880 data = consumestr(&serialized, &length); 881 if (!data) 882 goto bad_mounts; 883 } 884 flags = consumebytes(sizeof(*flags), &serialized, &length); 885 if (!flags) 886 goto bad_mounts; 887 if (minijail_mount_with_data(j, src, dest, type, *flags, data)) 888 goto bad_mounts; 889 } 890 891 count = j->cgroup_count; 892 j->cgroup_count = 0; 893 for (i = 0; i < count; ++i) { 894 char *cgroup = consumestr(&serialized, &length); 895 if (!cgroup) 896 goto bad_cgroups; 897 j->cgroups[i] = strdup(cgroup); 898 if (!j->cgroups[i]) 899 goto bad_cgroups; 900 ++j->cgroup_count; 901 } 902 903 return 0; 904 905bad_cgroups: 906 while (j->mounts_head) { 907 struct mountpoint *m = j->mounts_head; 908 j->mounts_head = j->mounts_head->next; 909 free(m->data); 910 free(m->type); 911 free(m->dest); 912 free(m->src); 913 free(m); 914 } 915 for (i = 0; i < j->cgroup_count; ++i) 916 free(j->cgroups[i]); 917bad_mounts: 918 if (j->flags.seccomp_filter && j->filter_len > 0) { 919 free(j->filter_prog->filter); 920 free(j->filter_prog); 921 } 922bad_filter_prog_instrs: 923 if (j->filter_prog) 924 free(j->filter_prog); 925bad_filters: 926 if (j->alt_syscall_table) 927 free(j->alt_syscall_table); 928bad_syscall_table: 929 if (j->chrootdir) 930 free(j->chrootdir); 931bad_chrootdir: 932 if (j->suppl_gid_list) 933 free(j->suppl_gid_list); 934bad_gid_list: 935 if (j->user) 936 free(j->user); 937clear_pointers: 938 j->user = NULL; 939 j->suppl_gid_list = NULL; 940 j->chrootdir = NULL; 941 j->alt_syscall_table = NULL; 942 j->cgroup_count = 0; 943out: 944 return ret; 945} 946 947static void write_ugid_mappings(const struct minijail *j) 948{ 949 int fd, ret, len; 950 size_t sz; 951 char fname[32]; 952 953 sz = sizeof(fname); 954 if (j->uidmap) { 955 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 956 if (ret < 0 || (size_t)ret >= sz) 957 die("failed to write file name of uid_map"); 958 fd = open(fname, O_WRONLY | O_CLOEXEC); 959 if (fd < 0) 960 pdie("failed to open '%s'", fname); 961 len = strlen(j->uidmap); 962 if (write(fd, j->uidmap, len) < len) 963 die("failed to set uid_map"); 964 close(fd); 965 } 966 if (j->gidmap) { 967 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 968 if (ret < 0 || (size_t)ret >= sz) 969 die("failed to write file name of gid_map"); 970 fd = open(fname, O_WRONLY | O_CLOEXEC); 971 if (fd < 0) 972 pdie("failed to open '%s'", fname); 973 len = strlen(j->gidmap); 974 if (write(fd, j->gidmap, len) < len) 975 die("failed to set gid_map"); 976 close(fd); 977 } 978} 979 980static void parent_setup_complete(int *pipe_fds) 981{ 982 close(pipe_fds[0]); 983 close(pipe_fds[1]); 984} 985 986/* 987 * wait_for_parent_setup: Called by the child process to wait for any 988 * further parent-side setup to complete before continuing. 989 */ 990static void wait_for_parent_setup(int *pipe_fds) 991{ 992 char buf; 993 994 close(pipe_fds[1]); 995 996 /* Wait for parent to complete setup and close the pipe. */ 997 if (read(pipe_fds[0], &buf, 1) != 0) 998 die("failed to sync with parent"); 999 close(pipe_fds[0]); 1000} 1001 1002static void enter_user_namespace(const struct minijail *j) 1003{ 1004 if (j->uidmap && setresuid(0, 0, 0)) 1005 pdie("setresuid"); 1006 if (j->gidmap && setresgid(0, 0, 0)) 1007 pdie("setresgid"); 1008} 1009 1010/* 1011 * setup_mount_destination: Ensures the mount target exists. 1012 * Creates it if needed and possible. 1013 */ 1014int setup_mount_destination(const char *source, const char *dest, uid_t uid, 1015 uid_t gid) 1016{ 1017 int rc; 1018 struct stat st_buf; 1019 1020 rc = stat(dest, &st_buf); 1021 if (rc == 0) /* destination exists */ 1022 return 0; 1023 1024 /* 1025 * Try to create the destination. 1026 * Either make a directory or touch a file depending on the source type. 1027 * If the source doesn't exist, assume it is a filesystem type such as 1028 * "tmpfs" and create a directory to mount it on. 1029 */ 1030 rc = stat(source, &st_buf); 1031 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode)) { 1032 if (mkdir(dest, 0700)) 1033 return -errno; 1034 } else { 1035 int fd = open(dest, O_RDWR | O_CREAT, 0700); 1036 if (fd < 0) 1037 return -errno; 1038 close(fd); 1039 } 1040 return chown(dest, uid, gid); 1041} 1042 1043/* 1044 * mount_one: Applies mounts from @m for @j, recursing as needed. 1045 * @j Minijail these mounts are for 1046 * @m Head of list of mounts 1047 * 1048 * Returns 0 for success. 1049 */ 1050static int mount_one(const struct minijail *j, struct mountpoint *m) 1051{ 1052 int ret; 1053 char *dest; 1054 int remount_ro = 0; 1055 1056 /* |dest| has a leading "/". */ 1057 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 1058 return -ENOMEM; 1059 1060 if (setup_mount_destination(m->src, dest, j->uid, j->gid)) 1061 pdie("creating mount target '%s' failed", dest); 1062 1063 /* 1064 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1065 * can't both be specified in the original bind mount. 1066 * Remount R/O after the initial mount. 1067 */ 1068 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1069 remount_ro = 1; 1070 m->flags &= ~MS_RDONLY; 1071 } 1072 1073 ret = mount(m->src, dest, m->type, m->flags, m->data); 1074 if (ret) 1075 pdie("mount: %s -> %s", m->src, dest); 1076 1077 if (remount_ro) { 1078 m->flags |= MS_RDONLY; 1079 ret = mount(m->src, dest, NULL, 1080 m->flags | MS_REMOUNT, m->data); 1081 if (ret) 1082 pdie("bind ro: %s -> %s", m->src, dest); 1083 } 1084 1085 free(dest); 1086 if (m->next) 1087 return mount_one(j, m->next); 1088 return ret; 1089} 1090 1091int enter_chroot(const struct minijail *j) 1092{ 1093 int ret; 1094 1095 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1096 return ret; 1097 1098 if (chroot(j->chrootdir)) 1099 return -errno; 1100 1101 if (chdir("/")) 1102 return -errno; 1103 1104 return 0; 1105} 1106 1107int enter_pivot_root(const struct minijail *j) 1108{ 1109 int ret, oldroot, newroot; 1110 1111 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1112 return ret; 1113 1114 /* 1115 * Keep the fd for both old and new root. 1116 * It will be used in fchdir(2) later. 1117 */ 1118 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1119 if (oldroot < 0) 1120 pdie("failed to open / for fchdir"); 1121 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1122 if (newroot < 0) 1123 pdie("failed to open %s for fchdir", j->chrootdir); 1124 1125 /* 1126 * To ensure j->chrootdir is the root of a filesystem, 1127 * do a self bind mount. 1128 */ 1129 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1130 pdie("failed to bind mount '%s'", j->chrootdir); 1131 if (chdir(j->chrootdir)) 1132 return -errno; 1133 if (syscall(SYS_pivot_root, ".", ".")) 1134 pdie("pivot_root"); 1135 1136 /* 1137 * Now the old root is mounted on top of the new root. Use fchdir(2) to 1138 * change to the old root and unmount it. 1139 */ 1140 if (fchdir(oldroot)) 1141 pdie("failed to fchdir to old /"); 1142 1143 /* 1144 * If j->flags.skip_remount_private was enabled for minijail_enter(), 1145 * there could be a shared mount point under |oldroot|. In that case, 1146 * mounts under this shared mount point will be unmounted below, and 1147 * this unmounting will propagate to the original mount namespace 1148 * (because the mount point is shared). To prevent this unexpected 1149 * unmounting, remove these mounts from their peer groups by recursively 1150 * remounting them as MS_PRIVATE. 1151 */ 1152 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL)) 1153 pdie("failed to mount(/, private) before umount(/)"); 1154 /* The old root might be busy, so use lazy unmount. */ 1155 if (umount2(".", MNT_DETACH)) 1156 pdie("umount(/)"); 1157 /* Change back to the new root. */ 1158 if (fchdir(newroot)) 1159 return -errno; 1160 if (close(oldroot)) 1161 return -errno; 1162 if (close(newroot)) 1163 return -errno; 1164 if (chroot("/")) 1165 return -errno; 1166 /* Set correct CWD for getcwd(3). */ 1167 if (chdir("/")) 1168 return -errno; 1169 1170 return 0; 1171} 1172 1173int mount_tmp(void) 1174{ 1175 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1176} 1177 1178int remount_proc_readonly(const struct minijail *j) 1179{ 1180 const char *kProcPath = "/proc"; 1181 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1182 /* 1183 * Right now, we're holding a reference to our parent's old mount of 1184 * /proc in our namespace, which means using MS_REMOUNT here would 1185 * mutate our parent's mount as well, even though we're in a VFS 1186 * namespace (!). Instead, remove their mount from our namespace lazily 1187 * (MNT_DETACH) and make our own. 1188 */ 1189 if (umount2(kProcPath, MNT_DETACH)) { 1190 /* 1191 * If we are in a new user namespace, umount(2) will fail. 1192 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html 1193 */ 1194 if (j->flags.userns) { 1195 info("umount(/proc, MNT_DETACH) failed, " 1196 "this is expected when using user namespaces"); 1197 } else { 1198 return -errno; 1199 } 1200 } 1201 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1202 return -errno; 1203 return 0; 1204} 1205 1206static void write_pid_to_path(pid_t pid, const char *path) 1207{ 1208 FILE *fp = fopen(path, "w"); 1209 1210 if (!fp) 1211 pdie("failed to open '%s'", path); 1212 if (fprintf(fp, "%d\n", (int)pid) < 0) 1213 pdie("fprintf(%s)", path); 1214 if (fclose(fp)) 1215 pdie("fclose(%s)", path); 1216} 1217 1218static void write_pid_file(const struct minijail *j) 1219{ 1220 write_pid_to_path(j->initpid, j->pid_file_path); 1221} 1222 1223static void add_to_cgroups(const struct minijail *j) 1224{ 1225 size_t i; 1226 1227 for (i = 0; i < j->cgroup_count; ++i) 1228 write_pid_to_path(j->initpid, j->cgroups[i]); 1229} 1230 1231void drop_ugid(const struct minijail *j) 1232{ 1233 if (j->flags.usergroups && j->flags.suppl_gids) { 1234 die("tried to inherit *and* set supplementary groups;" 1235 " can only do one"); 1236 } 1237 1238 if (j->flags.usergroups) { 1239 if (initgroups(j->user, j->usergid)) 1240 pdie("initgroups"); 1241 } else if (j->flags.suppl_gids) { 1242 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1243 pdie("setgroups"); 1244 } 1245 } else { 1246 /* 1247 * Only attempt to clear supplementary groups if we are changing 1248 * users. 1249 */ 1250 if ((j->uid || j->gid) && setgroups(0, NULL)) 1251 pdie("setgroups"); 1252 } 1253 1254 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1255 pdie("setresgid"); 1256 1257 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1258 pdie("setresuid"); 1259} 1260 1261/* 1262 * We specifically do not use cap_valid() as that only tells us the last 1263 * valid cap we were *compiled* against (i.e. what the version of kernel 1264 * headers says). If we run on a different kernel version, then it's not 1265 * uncommon for that to be less (if an older kernel) or more (if a newer 1266 * kernel). 1267 * Normally, we suck up the answer via /proc. On Android, not all processes are 1268 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1269 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1270 */ 1271static unsigned int get_last_valid_cap() 1272{ 1273 unsigned int last_valid_cap = 0; 1274 if (is_android()) { 1275 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1276 ++last_valid_cap); 1277 1278 /* |last_valid_cap| will be the first failing value. */ 1279 if (last_valid_cap > 0) { 1280 last_valid_cap--; 1281 } 1282 } else { 1283 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1284 FILE *fp = fopen(cap_file, "re"); 1285 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1286 pdie("fscanf(%s)", cap_file); 1287 fclose(fp); 1288 } 1289 return last_valid_cap; 1290} 1291 1292static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1293{ 1294 const uint64_t one = 1; 1295 unsigned int i; 1296 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1297 if (keep_mask & (one << i)) 1298 continue; 1299 if (prctl(PR_CAPBSET_DROP, i)) 1300 pdie("could not drop capability from bounding set"); 1301 } 1302} 1303 1304void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1305{ 1306 if (!j->flags.use_caps) 1307 return; 1308 1309 cap_t caps = cap_get_proc(); 1310 cap_value_t flag[1]; 1311 const uint64_t one = 1; 1312 unsigned int i; 1313 if (!caps) 1314 die("can't get process caps"); 1315 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1316 die("can't clear inheritable caps"); 1317 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1318 die("can't clear effective caps"); 1319 if (cap_clear_flag(caps, CAP_PERMITTED)) 1320 die("can't clear permitted caps"); 1321 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1322 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1323 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1324 continue; 1325 flag[0] = i; 1326 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1327 die("can't add effective cap"); 1328 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1329 die("can't add permitted cap"); 1330 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1331 die("can't add inheritable cap"); 1332 } 1333 if (cap_set_proc(caps)) 1334 die("can't apply initial cleaned capset"); 1335 1336 /* 1337 * Instead of dropping bounding set first, do it here in case 1338 * the caller had a more permissive bounding set which could 1339 * have been used above to raise a capability that wasn't already 1340 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1341 */ 1342 drop_capbset(j->caps, last_valid_cap); 1343 1344 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1345 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1346 flag[0] = CAP_SETPCAP; 1347 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1348 die("can't clear effective cap"); 1349 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1350 die("can't clear permitted cap"); 1351 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1352 die("can't clear inheritable cap"); 1353 } 1354 1355 if (cap_set_proc(caps)) 1356 die("can't apply final cleaned capset"); 1357 1358 cap_free(caps); 1359} 1360 1361void set_seccomp_filter(const struct minijail *j) 1362{ 1363 /* 1364 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1365 * in the kernel source tree for an explanation of the parameters. 1366 */ 1367 if (j->flags.no_new_privs) { 1368 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1369 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1370 } 1371 1372 /* 1373 * Code running with ASan 1374 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1375 * will make system calls not included in the syscall filter policy, 1376 * which will likely crash the program. Skip setting seccomp filter in 1377 * that case. 1378 * 'running_with_asan()' has no inputs and is completely defined at 1379 * build time, so this cannot be used by an attacker to skip setting 1380 * seccomp filter. 1381 */ 1382 if (j->flags.seccomp_filter && running_with_asan()) { 1383 warn("running with ASan, not setting seccomp filter"); 1384 return; 1385 } 1386 1387 /* 1388 * If we're logging seccomp filter failures, 1389 * install the SIGSYS handler first. 1390 */ 1391 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1392 if (install_sigsys_handler()) 1393 pdie("install SIGSYS handler"); 1394 warn("logging seccomp filter failures"); 1395 } 1396 1397 /* 1398 * Install the syscall filter. 1399 */ 1400 if (j->flags.seccomp_filter) { 1401 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1402 j->filter_prog)) { 1403 if ((errno == EINVAL) && seccomp_can_softfail()) { 1404 warn("seccomp not supported"); 1405 return; 1406 } 1407 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1408 } 1409 } 1410} 1411 1412void API minijail_enter(const struct minijail *j) 1413{ 1414 /* 1415 * If we're dropping caps, get the last valid cap from /proc now, 1416 * since /proc can be unmounted before drop_caps() is called. 1417 */ 1418 unsigned int last_valid_cap = 0; 1419 if (j->flags.capbset_drop || j->flags.use_caps) 1420 last_valid_cap = get_last_valid_cap(); 1421 1422 if (j->flags.pids) 1423 die("tried to enter a pid-namespaced jail;" 1424 " try minijail_run()?"); 1425 1426 if (j->flags.usergroups && !j->user) 1427 die("usergroup inheritance without username"); 1428 1429 /* 1430 * We can't recover from failures if we've dropped privileges partially, 1431 * so we don't even try. If any of our operations fail, we abort() the 1432 * entire process. 1433 */ 1434 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1435 pdie("setns(CLONE_NEWNS)"); 1436 1437 if (j->flags.vfs) { 1438 if (unshare(CLONE_NEWNS)) 1439 pdie("unshare(vfs)"); 1440 /* 1441 * Unless asked not to, remount all filesystems as private. 1442 * If they are shared, new bind mounts will creep out of our 1443 * namespace. 1444 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1445 */ 1446 if (!j->flags.skip_remount_private) { 1447 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1448 pdie("mount(/, private)"); 1449 } 1450 } 1451 1452 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1453 pdie("unshare(ipc)"); 1454 } 1455 1456 if (j->flags.enter_net) { 1457 if (setns(j->netns_fd, CLONE_NEWNET)) 1458 pdie("setns(CLONE_NEWNET)"); 1459 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1460 pdie("unshare(net)"); 1461 } 1462 1463 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP)) 1464 pdie("unshare(cgroups)"); 1465 1466 if (j->flags.chroot && enter_chroot(j)) 1467 pdie("chroot"); 1468 1469 if (j->flags.pivot_root && enter_pivot_root(j)) 1470 pdie("pivot_root"); 1471 1472 if (j->flags.mount_tmp && mount_tmp()) 1473 pdie("mount_tmp"); 1474 1475 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1476 pdie("remount"); 1477 1478 /* 1479 * If we're only dropping capabilities from the bounding set, but not 1480 * from the thread's (permitted|inheritable|effective) sets, do it now. 1481 */ 1482 if (j->flags.capbset_drop) { 1483 drop_capbset(j->cap_bset, last_valid_cap); 1484 } 1485 1486 if (j->flags.use_caps) { 1487 /* 1488 * POSIX capabilities are a bit tricky. If we drop our 1489 * capability to change uids, our attempt to use setuid() 1490 * below will fail. Hang on to root caps across setuid(), then 1491 * lock securebits. 1492 */ 1493 if (prctl(PR_SET_KEEPCAPS, 1)) 1494 pdie("prctl(PR_SET_KEEPCAPS)"); 1495 1496 /* 1497 * Kernels 4.3+ define a new securebit 1498 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS 1499 * and SECURE_ALL_LOCKS masks from newer kernel headers will 1500 * return EPERM on older kernels. Detect this, and retry with 1501 * the right mask for older (2.6.26-4.2) kernels. 1502 */ 1503 int securebits_ret = prctl(PR_SET_SECUREBITS, 1504 SECURE_ALL_BITS | SECURE_ALL_LOCKS); 1505 if (securebits_ret < 0) { 1506 if (errno == EPERM) { 1507 /* Possibly running on kernel < 4.3. */ 1508 securebits_ret = prctl( 1509 PR_SET_SECUREBITS, 1510 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS); 1511 } 1512 } 1513 if (securebits_ret < 0) 1514 pdie("prctl(PR_SET_SECUREBITS)"); 1515 } 1516 1517 if (j->flags.no_new_privs) { 1518 /* 1519 * If we're setting no_new_privs, we can drop privileges 1520 * before setting seccomp filter. This way filter policies 1521 * don't need to allow privilege-dropping syscalls. 1522 */ 1523 drop_ugid(j); 1524 drop_caps(j, last_valid_cap); 1525 set_seccomp_filter(j); 1526 } else { 1527 /* 1528 * If we're not setting no_new_privs, 1529 * we need to set seccomp filter *before* dropping privileges. 1530 * WARNING: this means that filter policies *must* allow 1531 * setgroups()/setresgid()/setresuid() for dropping root and 1532 * capget()/capset()/prctl() for dropping caps. 1533 */ 1534 set_seccomp_filter(j); 1535 drop_ugid(j); 1536 drop_caps(j, last_valid_cap); 1537 } 1538 1539 /* 1540 * Select the specified alternate syscall table. The table must not 1541 * block prctl(2) if we're using seccomp as well. 1542 */ 1543 if (j->flags.alt_syscall) { 1544 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1545 pdie("prctl(PR_ALT_SYSCALL)"); 1546 } 1547 1548 /* 1549 * seccomp has to come last since it cuts off all the other 1550 * privilege-dropping syscalls :) 1551 */ 1552 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1553 if ((errno == EINVAL) && seccomp_can_softfail()) { 1554 warn("seccomp not supported"); 1555 return; 1556 } 1557 pdie("prctl(PR_SET_SECCOMP)"); 1558 } 1559} 1560 1561/* TODO(wad): will visibility affect this variable? */ 1562static int init_exitstatus = 0; 1563 1564void init_term(int __attribute__ ((unused)) sig) 1565{ 1566 _exit(init_exitstatus); 1567} 1568 1569int init(pid_t rootpid) 1570{ 1571 pid_t pid; 1572 int status; 1573 /* So that we exit with the right status. */ 1574 signal(SIGTERM, init_term); 1575 /* TODO(wad): self jail with seccomp filters here. */ 1576 while ((pid = wait(&status)) > 0) { 1577 /* 1578 * This loop will only end when either there are no processes 1579 * left inside our pid namespace or we get a signal. 1580 */ 1581 if (pid == rootpid) 1582 init_exitstatus = status; 1583 } 1584 if (!WIFEXITED(init_exitstatus)) 1585 _exit(MINIJAIL_ERR_INIT); 1586 _exit(WEXITSTATUS(init_exitstatus)); 1587} 1588 1589int API minijail_from_fd(int fd, struct minijail *j) 1590{ 1591 size_t sz = 0; 1592 size_t bytes = read(fd, &sz, sizeof(sz)); 1593 char *buf; 1594 int r; 1595 if (sizeof(sz) != bytes) 1596 return -EINVAL; 1597 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1598 return -E2BIG; 1599 buf = malloc(sz); 1600 if (!buf) 1601 return -ENOMEM; 1602 bytes = read(fd, buf, sz); 1603 if (bytes != sz) { 1604 free(buf); 1605 return -EINVAL; 1606 } 1607 r = minijail_unmarshal(j, buf, sz); 1608 free(buf); 1609 return r; 1610} 1611 1612int API minijail_to_fd(struct minijail *j, int fd) 1613{ 1614 char *buf; 1615 size_t sz = minijail_size(j); 1616 ssize_t written; 1617 int r; 1618 1619 if (!sz) 1620 return -EINVAL; 1621 buf = malloc(sz); 1622 r = minijail_marshal(j, buf, sz); 1623 if (r) { 1624 free(buf); 1625 return r; 1626 } 1627 /* Sends [size][minijail]. */ 1628 written = write(fd, &sz, sizeof(sz)); 1629 if (written != sizeof(sz)) { 1630 free(buf); 1631 return -EFAULT; 1632 } 1633 written = write(fd, buf, sz); 1634 if (written < 0 || (size_t) written != sz) { 1635 free(buf); 1636 return -EFAULT; 1637 } 1638 free(buf); 1639 return 0; 1640} 1641 1642int setup_preload(void) 1643{ 1644#if defined(__ANDROID__) 1645 /* Don't use LDPRELOAD on Brillo. */ 1646 return 0; 1647#else 1648 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1649 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1650 if (!newenv) 1651 return -ENOMEM; 1652 1653 /* Only insert a separating space if we have something to separate... */ 1654 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1655 PRELOADPATH); 1656 1657 /* setenv() makes a copy of the string we give it. */ 1658 setenv(kLdPreloadEnvVar, newenv, 1); 1659 free(newenv); 1660 return 0; 1661#endif 1662} 1663 1664int setup_pipe(int fds[2]) 1665{ 1666 int r = pipe(fds); 1667 char fd_buf[11]; 1668 if (r) 1669 return r; 1670 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1671 if (r <= 0) 1672 return -EINVAL; 1673 setenv(kFdEnvVar, fd_buf, 1); 1674 return 0; 1675} 1676 1677int setup_pipe_end(int fds[2], size_t index) 1678{ 1679 if (index > 1) 1680 return -1; 1681 1682 close(fds[1 - index]); 1683 return fds[index]; 1684} 1685 1686int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1687{ 1688 if (index > 1) 1689 return -1; 1690 1691 close(fds[1 - index]); 1692 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1693 return dup2(fds[index], fd); 1694} 1695 1696int minijail_run_internal(struct minijail *j, const char *filename, 1697 char *const argv[], pid_t *pchild_pid, 1698 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1699 int use_preload); 1700 1701int API minijail_run(struct minijail *j, const char *filename, 1702 char *const argv[]) 1703{ 1704 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1705 true); 1706} 1707 1708int API minijail_run_pid(struct minijail *j, const char *filename, 1709 char *const argv[], pid_t *pchild_pid) 1710{ 1711 return minijail_run_internal(j, filename, argv, pchild_pid, 1712 NULL, NULL, NULL, true); 1713} 1714 1715int API minijail_run_pipe(struct minijail *j, const char *filename, 1716 char *const argv[], int *pstdin_fd) 1717{ 1718 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1719 NULL, NULL, true); 1720} 1721 1722int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1723 char *const argv[], pid_t *pchild_pid, 1724 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1725{ 1726 return minijail_run_internal(j, filename, argv, pchild_pid, 1727 pstdin_fd, pstdout_fd, pstderr_fd, true); 1728} 1729 1730int API minijail_run_no_preload(struct minijail *j, const char *filename, 1731 char *const argv[]) 1732{ 1733 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1734 false); 1735} 1736 1737int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1738 const char *filename, 1739 char *const argv[], 1740 pid_t *pchild_pid, 1741 int *pstdin_fd, int *pstdout_fd, 1742 int *pstderr_fd) 1743{ 1744 return minijail_run_internal(j, filename, argv, pchild_pid, 1745 pstdin_fd, pstdout_fd, pstderr_fd, false); 1746} 1747 1748int minijail_run_internal(struct minijail *j, const char *filename, 1749 char *const argv[], pid_t *pchild_pid, 1750 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1751 int use_preload) 1752{ 1753 char *oldenv, *oldenv_copy = NULL; 1754 pid_t child_pid; 1755 int pipe_fds[2]; 1756 int stdin_fds[2]; 1757 int stdout_fds[2]; 1758 int stderr_fds[2]; 1759 int child_sync_pipe_fds[2]; 1760 int sync_child = 0; 1761 int ret; 1762 /* We need to remember this across the minijail_preexec() call. */ 1763 int pid_namespace = j->flags.pids; 1764 int do_init = j->flags.do_init; 1765 1766 if (use_preload) { 1767 oldenv = getenv(kLdPreloadEnvVar); 1768 if (oldenv) { 1769 oldenv_copy = strdup(oldenv); 1770 if (!oldenv_copy) 1771 return -ENOMEM; 1772 } 1773 1774 if (setup_preload()) 1775 return -EFAULT; 1776 } 1777 1778 if (!use_preload) { 1779 if (j->flags.use_caps && j->caps != 0) 1780 die("non-empty capabilities are not supported without LD_PRELOAD"); 1781 } 1782 1783 /* 1784 * Make the process group ID of this process equal to its PID, so that 1785 * both the Minijail process and the jailed process can be killed 1786 * together. 1787 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1788 * the process is already a process group leader. 1789 */ 1790 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1791 if (errno != EPERM) { 1792 pdie("setpgid(0, 0)"); 1793 } 1794 } 1795 1796 if (use_preload) { 1797 /* 1798 * Before we fork(2) and execve(2) the child process, we need 1799 * to open a pipe(2) to send the minijail configuration over. 1800 */ 1801 if (setup_pipe(pipe_fds)) 1802 return -EFAULT; 1803 } 1804 1805 /* 1806 * If we want to write to the child process' standard input, 1807 * create the pipe(2) now. 1808 */ 1809 if (pstdin_fd) { 1810 if (pipe(stdin_fds)) 1811 return -EFAULT; 1812 } 1813 1814 /* 1815 * If we want to read from the child process' standard output, 1816 * create the pipe(2) now. 1817 */ 1818 if (pstdout_fd) { 1819 if (pipe(stdout_fds)) 1820 return -EFAULT; 1821 } 1822 1823 /* 1824 * If we want to read from the child process' standard error, 1825 * create the pipe(2) now. 1826 */ 1827 if (pstderr_fd) { 1828 if (pipe(stderr_fds)) 1829 return -EFAULT; 1830 } 1831 1832 /* 1833 * If we want to set up a new uid/gid mapping in the user namespace, 1834 * or if we need to add the child process to cgroups, create the pipe(2) 1835 * to sync between parent and child. 1836 */ 1837 if (j->flags.userns || j->flags.cgroups) { 1838 sync_child = 1; 1839 if (pipe(child_sync_pipe_fds)) 1840 return -EFAULT; 1841 } 1842 1843 /* 1844 * Use sys_clone() if and only if we're creating a pid namespace. 1845 * 1846 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1847 * 1848 * In multithreaded programs, there are a bunch of locks inside libc, 1849 * some of which may be held by other threads at the time that we call 1850 * minijail_run_pid(). If we call fork(), glibc does its level best to 1851 * ensure that we hold all of these locks before it calls clone() 1852 * internally and drop them after clone() returns, but when we call 1853 * sys_clone(2) directly, all that gets bypassed and we end up with a 1854 * child address space where some of libc's important locks are held by 1855 * other threads (which did not get cloned, and hence will never release 1856 * those locks). This is okay so long as we call exec() immediately 1857 * after, but a bunch of seemingly-innocent libc functions like setenv() 1858 * take locks. 1859 * 1860 * Hence, only call sys_clone() if we need to, in order to get at pid 1861 * namespacing. If we follow this path, the child's address space might 1862 * have broken locks; you may only call functions that do not acquire 1863 * any locks. 1864 * 1865 * Unfortunately, fork() acquires every lock it can get its hands on, as 1866 * previously detailed, so this function is highly likely to deadlock 1867 * later on (see "deadlock here") if we're multithreaded. 1868 * 1869 * We might hack around this by having the clone()d child (init of the 1870 * pid namespace) return directly, rather than leaving the clone()d 1871 * process hanging around to be init for the new namespace (and having 1872 * its fork()ed child return in turn), but that process would be 1873 * crippled with its libc locks potentially broken. We might try 1874 * fork()ing in the parent before we clone() to ensure that we own all 1875 * the locks, but then we have to have the forked child hanging around 1876 * consuming resources (and possibly having file descriptors / shared 1877 * memory regions / etc attached). We'd need to keep the child around to 1878 * avoid having its children get reparented to init. 1879 * 1880 * TODO(ellyjones): figure out if the "forked child hanging around" 1881 * problem is fixable or not. It would be nice if we worked in this 1882 * case. 1883 */ 1884 if (pid_namespace) { 1885 int clone_flags = CLONE_NEWPID | SIGCHLD; 1886 if (j->flags.userns) 1887 clone_flags |= CLONE_NEWUSER; 1888 child_pid = syscall(SYS_clone, clone_flags, NULL); 1889 } else { 1890 child_pid = fork(); 1891 } 1892 1893 if (child_pid < 0) { 1894 if (use_preload) { 1895 free(oldenv_copy); 1896 } 1897 die("failed to fork child"); 1898 } 1899 1900 if (child_pid) { 1901 if (use_preload) { 1902 /* Restore parent's LD_PRELOAD. */ 1903 if (oldenv_copy) { 1904 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1905 free(oldenv_copy); 1906 } else { 1907 unsetenv(kLdPreloadEnvVar); 1908 } 1909 unsetenv(kFdEnvVar); 1910 } 1911 1912 j->initpid = child_pid; 1913 1914 if (j->flags.pid_file) 1915 write_pid_file(j); 1916 1917 if (j->flags.cgroups) 1918 add_to_cgroups(j); 1919 1920 if (j->flags.userns) 1921 write_ugid_mappings(j); 1922 1923 if (sync_child) 1924 parent_setup_complete(child_sync_pipe_fds); 1925 1926 if (use_preload) { 1927 /* Send marshalled minijail. */ 1928 close(pipe_fds[0]); /* read endpoint */ 1929 ret = minijail_to_fd(j, pipe_fds[1]); 1930 close(pipe_fds[1]); /* write endpoint */ 1931 if (ret) { 1932 kill(j->initpid, SIGKILL); 1933 die("failed to send marshalled minijail"); 1934 } 1935 } 1936 1937 if (pchild_pid) 1938 *pchild_pid = child_pid; 1939 1940 /* 1941 * If we want to write to the child process' standard input, 1942 * set up the write end of the pipe. 1943 */ 1944 if (pstdin_fd) 1945 *pstdin_fd = setup_pipe_end(stdin_fds, 1946 1 /* write end */); 1947 1948 /* 1949 * If we want to read from the child process' standard output, 1950 * set up the read end of the pipe. 1951 */ 1952 if (pstdout_fd) 1953 *pstdout_fd = setup_pipe_end(stdout_fds, 1954 0 /* read end */); 1955 1956 /* 1957 * If we want to read from the child process' standard error, 1958 * set up the read end of the pipe. 1959 */ 1960 if (pstderr_fd) 1961 *pstderr_fd = setup_pipe_end(stderr_fds, 1962 0 /* read end */); 1963 1964 return 0; 1965 } 1966 /* Child process. */ 1967 free(oldenv_copy); 1968 1969 if (j->flags.reset_signal_mask) { 1970 sigset_t signal_mask; 1971 if (sigemptyset(&signal_mask) != 0) 1972 pdie("sigemptyset failed"); 1973 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1974 pdie("sigprocmask failed"); 1975 } 1976 1977 if (sync_child) 1978 wait_for_parent_setup(child_sync_pipe_fds); 1979 1980 if (j->flags.userns) 1981 enter_user_namespace(j); 1982 1983 /* 1984 * If we want to write to the jailed process' standard input, 1985 * set up the read end of the pipe. 1986 */ 1987 if (pstdin_fd) { 1988 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1989 STDIN_FILENO) < 0) 1990 die("failed to set up stdin pipe"); 1991 } 1992 1993 /* 1994 * If we want to read from the jailed process' standard output, 1995 * set up the write end of the pipe. 1996 */ 1997 if (pstdout_fd) { 1998 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1999 STDOUT_FILENO) < 0) 2000 die("failed to set up stdout pipe"); 2001 } 2002 2003 /* 2004 * If we want to read from the jailed process' standard error, 2005 * set up the write end of the pipe. 2006 */ 2007 if (pstderr_fd) { 2008 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 2009 STDERR_FILENO) < 0) 2010 die("failed to set up stderr pipe"); 2011 } 2012 2013 /* If running an init program, let it decide when/how to mount /proc. */ 2014 if (pid_namespace && !do_init) 2015 j->flags.remount_proc_ro = 0; 2016 2017 if (use_preload) { 2018 /* Strip out flags that cannot be inherited across execve(2). */ 2019 minijail_preexec(j); 2020 } else { 2021 /* 2022 * If not using LD_PRELOAD, do all jailing before execve(2). 2023 * Note that PID namespaces can only be entered on fork(2), 2024 * so that flag is still cleared. 2025 */ 2026 j->flags.pids = 0; 2027 } 2028 /* Jail this process, then execve(2) the target. */ 2029 minijail_enter(j); 2030 2031 if (pid_namespace && do_init) { 2032 /* 2033 * pid namespace: this process will become init inside the new 2034 * namespace. We don't want all programs we might exec to have 2035 * to know how to be init. Normally (do_init == 1) we fork off 2036 * a child to actually run the program. If |do_init == 0|, we 2037 * let the program keep pid 1 and be init. 2038 * 2039 * If we're multithreaded, we'll probably deadlock here. See 2040 * WARNING above. 2041 */ 2042 child_pid = fork(); 2043 if (child_pid < 0) 2044 _exit(child_pid); 2045 else if (child_pid > 0) 2046 init(child_pid); /* never returns */ 2047 } 2048 2049 /* 2050 * If we aren't pid-namespaced, or the jailed program asked to be init: 2051 * calling process 2052 * -> execve()-ing process 2053 * If we are: 2054 * calling process 2055 * -> init()-ing process 2056 * -> execve()-ing process 2057 */ 2058 ret = execve(filename, argv, environ); 2059 if (ret == -1) { 2060 pwarn("execve(%s) failed", filename); 2061 } 2062 _exit(ret); 2063} 2064 2065int API minijail_kill(struct minijail *j) 2066{ 2067 int st; 2068 if (kill(j->initpid, SIGTERM)) 2069 return -errno; 2070 if (waitpid(j->initpid, &st, 0) < 0) 2071 return -errno; 2072 return st; 2073} 2074 2075int API minijail_wait(struct minijail *j) 2076{ 2077 int st; 2078 if (waitpid(j->initpid, &st, 0) < 0) 2079 return -errno; 2080 2081 if (!WIFEXITED(st)) { 2082 int error_status = st; 2083 if (WIFSIGNALED(st)) { 2084 int signum = WTERMSIG(st); 2085 warn("child process %d received signal %d", 2086 j->initpid, signum); 2087 /* 2088 * We return MINIJAIL_ERR_JAIL if the process received 2089 * SIGSYS, which happens when a syscall is blocked by 2090 * seccomp filters. 2091 * If not, we do what bash(1) does: 2092 * $? = 128 + signum 2093 */ 2094 if (signum == SIGSYS) { 2095 error_status = MINIJAIL_ERR_JAIL; 2096 } else { 2097 error_status = 128 + signum; 2098 } 2099 } 2100 return error_status; 2101 } 2102 2103 int exit_status = WEXITSTATUS(st); 2104 if (exit_status != 0) 2105 info("child process %d exited with status %d", 2106 j->initpid, exit_status); 2107 2108 return exit_status; 2109} 2110 2111void API minijail_destroy(struct minijail *j) 2112{ 2113 size_t i; 2114 2115 if (j->flags.seccomp_filter && j->filter_prog) { 2116 free(j->filter_prog->filter); 2117 free(j->filter_prog); 2118 } 2119 while (j->mounts_head) { 2120 struct mountpoint *m = j->mounts_head; 2121 j->mounts_head = j->mounts_head->next; 2122 free(m->data); 2123 free(m->type); 2124 free(m->dest); 2125 free(m->src); 2126 free(m); 2127 } 2128 j->mounts_tail = NULL; 2129 if (j->user) 2130 free(j->user); 2131 if (j->suppl_gid_list) 2132 free(j->suppl_gid_list); 2133 if (j->chrootdir) 2134 free(j->chrootdir); 2135 if (j->pid_file_path) 2136 free(j->pid_file_path); 2137 if (j->uidmap) 2138 free(j->uidmap); 2139 if (j->gidmap) 2140 free(j->gidmap); 2141 if (j->alt_syscall_table) 2142 free(j->alt_syscall_table); 2143 for (i = 0; i < j->cgroup_count; ++i) 2144 free(j->cgroups[i]); 2145 free(j); 2146} 2147