libminijail.c revision 4cbc2a522e1bc88424905bee32199af1c0fdbd20
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _DEFAULT_SOURCE 8#define _GNU_SOURCE 9 10#include <asm/unistd.h> 11#include <ctype.h> 12#include <errno.h> 13#include <fcntl.h> 14#include <grp.h> 15#include <inttypes.h> 16#include <limits.h> 17#include <linux/capability.h> 18#include <pwd.h> 19#include <sched.h> 20#include <signal.h> 21#include <stdarg.h> 22#include <stdbool.h> 23#include <stddef.h> 24#include <stdio.h> 25#include <stdlib.h> 26#include <string.h> 27#include <syscall.h> 28#include <sys/capability.h> 29#include <sys/mount.h> 30#include <sys/param.h> 31#include <sys/prctl.h> 32#include <sys/stat.h> 33#include <sys/types.h> 34#include <sys/user.h> 35#include <sys/utsname.h> 36#include <sys/wait.h> 37#include <unistd.h> 38 39#include "libminijail.h" 40#include "libminijail-private.h" 41 42#include "signal_handler.h" 43#include "syscall_filter.h" 44#include "util.h" 45 46#ifdef HAVE_SECUREBITS_H 47# include <linux/securebits.h> 48#else 49# define SECURE_ALL_BITS 0x55 50# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 51#endif 52/* For kernels < 4.3. */ 53#define OLD_SECURE_ALL_BITS 0x15 54#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1) 55 56/* 57 * Assert the value of SECURE_ALL_BITS at compile-time. 58 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3 59 * added a new securebit. 60 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM 61 * when used on older kernels. The compile-time assert will catch this situation 62 * at compile time. 63 */ 64#ifdef __BRILLO__ 65_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55."); 66#endif 67 68/* Until these are reliably available in linux/prctl.h. */ 69#ifndef PR_SET_SECCOMP 70# define PR_SET_SECCOMP 22 71#endif 72 73#ifndef PR_ALT_SYSCALL 74# define PR_ALT_SYSCALL 0x43724f53 75#endif 76 77/* For seccomp_filter using BPF. */ 78#ifndef PR_SET_NO_NEW_PRIVS 79# define PR_SET_NO_NEW_PRIVS 38 80#endif 81#ifndef SECCOMP_MODE_FILTER 82# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 83#endif 84 85#ifdef USE_SECCOMP_SOFTFAIL 86# define SECCOMP_SOFTFAIL 1 87#else 88# define SECCOMP_SOFTFAIL 0 89#endif 90 91/* New cgroup namespace might not be in linux-headers yet. */ 92#ifndef CLONE_NEWCGROUP 93# define CLONE_NEWCGROUP 0x02000000 94#endif 95 96#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 97 98struct mountpoint { 99 char *src; 100 char *dest; 101 char *type; 102 unsigned long flags; 103 struct mountpoint *next; 104}; 105 106struct minijail { 107 /* 108 * WARNING: if you add a flag here you need to make sure it's 109 * accounted for in minijail_pre{enter|exec}() below. 110 */ 111 struct { 112 int uid:1; 113 int gid:1; 114 int usergroups:1; 115 int suppl_gids:1; 116 int use_caps:1; 117 int capbset_drop:1; 118 int vfs:1; 119 int enter_vfs:1; 120 int skip_remount_private:1; 121 int pids:1; 122 int ipc:1; 123 int net:1; 124 int enter_net:1; 125 int ns_cgroups:1; 126 int userns:1; 127 int seccomp:1; 128 int remount_proc_ro:1; 129 int no_new_privs:1; 130 int seccomp_filter:1; 131 int log_seccomp_filter:1; 132 int chroot:1; 133 int pivot_root:1; 134 int mount_tmp:1; 135 int do_init:1; 136 int pid_file:1; 137 int cgroups:1; 138 int alt_syscall:1; 139 int reset_signal_mask:1; 140 } flags; 141 uid_t uid; 142 gid_t gid; 143 gid_t usergid; 144 char *user; 145 size_t suppl_gid_count; 146 gid_t *suppl_gid_list; 147 uint64_t caps; 148 uint64_t cap_bset; 149 pid_t initpid; 150 int mountns_fd; 151 int netns_fd; 152 char *chrootdir; 153 char *pid_file_path; 154 char *uidmap; 155 char *gidmap; 156 size_t filter_len; 157 struct sock_fprog *filter_prog; 158 char *alt_syscall_table; 159 struct mountpoint *mounts_head; 160 struct mountpoint *mounts_tail; 161 size_t mounts_count; 162 char *cgroups[MAX_CGROUPS]; 163 size_t cgroup_count; 164}; 165 166/* 167 * Strip out flags meant for the parent. 168 * We keep things that are not inherited across execve(2) (e.g. capabilities), 169 * or are easier to set after execve(2) (e.g. seccomp filters). 170 */ 171void minijail_preenter(struct minijail *j) 172{ 173 j->flags.vfs = 0; 174 j->flags.enter_vfs = 0; 175 j->flags.skip_remount_private = 0; 176 j->flags.remount_proc_ro = 0; 177 j->flags.pids = 0; 178 j->flags.do_init = 0; 179 j->flags.pid_file = 0; 180 j->flags.cgroups = 0; 181} 182 183/* 184 * Strip out flags meant for the child. 185 * We keep things that are inherited across execve(2). 186 */ 187void minijail_preexec(struct minijail *j) 188{ 189 int vfs = j->flags.vfs; 190 int enter_vfs = j->flags.enter_vfs; 191 int skip_remount_private = j->flags.skip_remount_private; 192 int remount_proc_ro = j->flags.remount_proc_ro; 193 int userns = j->flags.userns; 194 if (j->user) 195 free(j->user); 196 j->user = NULL; 197 if (j->suppl_gid_list) 198 free(j->suppl_gid_list); 199 j->suppl_gid_list = NULL; 200 memset(&j->flags, 0, sizeof(j->flags)); 201 /* Now restore anything we meant to keep. */ 202 j->flags.vfs = vfs; 203 j->flags.enter_vfs = enter_vfs; 204 j->flags.skip_remount_private = skip_remount_private; 205 j->flags.remount_proc_ro = remount_proc_ro; 206 j->flags.userns = userns; 207 /* Note, |pids| will already have been used before this call. */ 208} 209 210/* Returns true if the kernel version is less than 3.8. */ 211int seccomp_kernel_support_not_required() 212{ 213 int major, minor; 214 struct utsname uts; 215 return (uname(&uts) != -1 && 216 sscanf(uts.release, "%d.%d", &major, &minor) == 2 && 217 ((major < 3) || ((major == 3) && (minor < 8)))); 218} 219 220/* Allow seccomp soft-fail on Android devices with kernel version < 3.8. */ 221int can_softfail() 222{ 223#if SECCOMP_SOFTFAIL 224 if (is_android()) { 225 if (seccomp_kernel_support_not_required()) 226 return 1; 227 else 228 return 0; 229 } else { 230 return 1; 231 } 232#endif 233 return 0; 234} 235 236/* Minijail API. */ 237 238struct minijail API *minijail_new(void) 239{ 240 return calloc(1, sizeof(struct minijail)); 241} 242 243void API minijail_change_uid(struct minijail *j, uid_t uid) 244{ 245 if (uid == 0) 246 die("useless change to uid 0"); 247 j->uid = uid; 248 j->flags.uid = 1; 249} 250 251void API minijail_change_gid(struct minijail *j, gid_t gid) 252{ 253 if (gid == 0) 254 die("useless change to gid 0"); 255 j->gid = gid; 256 j->flags.gid = 1; 257} 258 259void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 260 const gid_t *list) 261{ 262 size_t i; 263 264 if (j->flags.usergroups) 265 die("cannot inherit *and* set supplementary groups"); 266 267 if (size == 0) { 268 /* Clear supplementary groups. */ 269 j->suppl_gid_list = NULL; 270 j->suppl_gid_count = 0; 271 j->flags.suppl_gids = 1; 272 return; 273 } 274 275 /* Copy the gid_t array. */ 276 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 277 if (!j->suppl_gid_list) { 278 die("failed to allocate internal supplementary group array"); 279 } 280 for (i = 0; i < size; i++) { 281 j->suppl_gid_list[i] = list[i]; 282 } 283 j->suppl_gid_count = size; 284 j->flags.suppl_gids = 1; 285} 286 287int API minijail_change_user(struct minijail *j, const char *user) 288{ 289 char *buf = NULL; 290 struct passwd pw; 291 struct passwd *ppw = NULL; 292 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 293 if (sz == -1) 294 sz = 65536; /* your guess is as good as mine... */ 295 296 /* 297 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 298 * the maximum needed size of the buffer, so we don't have to search. 299 */ 300 buf = malloc(sz); 301 if (!buf) 302 return -ENOMEM; 303 getpwnam_r(user, &pw, buf, sz, &ppw); 304 /* 305 * We're safe to free the buffer here. The strings inside |pw| point 306 * inside |buf|, but we don't use any of them; this leaves the pointers 307 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) 308 * succeeded. 309 */ 310 free(buf); 311 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 312 if (!ppw) 313 return -1; 314 minijail_change_uid(j, ppw->pw_uid); 315 j->user = strdup(user); 316 if (!j->user) 317 return -ENOMEM; 318 j->usergid = ppw->pw_gid; 319 return 0; 320} 321 322int API minijail_change_group(struct minijail *j, const char *group) 323{ 324 char *buf = NULL; 325 struct group gr; 326 struct group *pgr = NULL; 327 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 328 if (sz == -1) 329 sz = 65536; /* and mine is as good as yours, really */ 330 331 /* 332 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 333 * the maximum needed size of the buffer, so we don't have to search. 334 */ 335 buf = malloc(sz); 336 if (!buf) 337 return -ENOMEM; 338 getgrnam_r(group, &gr, buf, sz, &pgr); 339 /* 340 * We're safe to free the buffer here. The strings inside gr point 341 * inside buf, but we don't use any of them; this leaves the pointers 342 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 343 */ 344 free(buf); 345 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 346 if (!pgr) 347 return -1; 348 minijail_change_gid(j, pgr->gr_gid); 349 return 0; 350} 351 352void API minijail_use_seccomp(struct minijail *j) 353{ 354 j->flags.seccomp = 1; 355} 356 357void API minijail_no_new_privs(struct minijail *j) 358{ 359 j->flags.no_new_privs = 1; 360} 361 362void API minijail_use_seccomp_filter(struct minijail *j) 363{ 364 j->flags.seccomp_filter = 1; 365} 366 367void API minijail_log_seccomp_filter_failures(struct minijail *j) 368{ 369 j->flags.log_seccomp_filter = 1; 370} 371 372void API minijail_use_caps(struct minijail *j, uint64_t capmask) 373{ 374 /* 375 * 'minijail_use_caps' configures a runtime-capabilities-only 376 * environment, including a bounding set matching the thread's runtime 377 * (permitted|inheritable|effective) sets. 378 * Therefore, it will override any existing bounding set configurations 379 * since the latter would allow gaining extra runtime capabilities from 380 * file capabilities. 381 */ 382 if (j->flags.capbset_drop) { 383 warn("overriding bounding set configuration"); 384 j->cap_bset = 0; 385 j->flags.capbset_drop = 0; 386 } 387 j->caps = capmask; 388 j->flags.use_caps = 1; 389} 390 391void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 392{ 393 if (j->flags.use_caps) { 394 /* 395 * 'minijail_use_caps' will have already configured a capability 396 * bounding set matching the (permitted|inheritable|effective) 397 * sets. Abort if the user tries to configure a separate 398 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 399 * are mutually exclusive. 400 */ 401 die("runtime capabilities already configured, can't drop " 402 "bounding set separately"); 403 } 404 j->cap_bset = capmask; 405 j->flags.capbset_drop = 1; 406} 407 408void API minijail_reset_signal_mask(struct minijail *j) 409{ 410 j->flags.reset_signal_mask = 1; 411} 412 413void API minijail_namespace_vfs(struct minijail *j) 414{ 415 j->flags.vfs = 1; 416} 417 418void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 419{ 420 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 421 if (ns_fd < 0) { 422 pdie("failed to open namespace '%s'", ns_path); 423 } 424 j->mountns_fd = ns_fd; 425 j->flags.enter_vfs = 1; 426} 427 428void API minijail_skip_remount_private(struct minijail *j) 429{ 430 j->flags.skip_remount_private = 1; 431} 432 433void API minijail_namespace_pids(struct minijail *j) 434{ 435 j->flags.vfs = 1; 436 j->flags.remount_proc_ro = 1; 437 j->flags.pids = 1; 438 j->flags.do_init = 1; 439} 440 441void API minijail_namespace_ipc(struct minijail *j) 442{ 443 j->flags.ipc = 1; 444} 445 446void API minijail_namespace_net(struct minijail *j) 447{ 448 j->flags.net = 1; 449} 450 451void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 452{ 453 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 454 if (ns_fd < 0) { 455 pdie("failed to open namespace '%s'", ns_path); 456 } 457 j->netns_fd = ns_fd; 458 j->flags.enter_net = 1; 459} 460 461void API minijail_namespace_cgroups(struct minijail *j) 462{ 463 j->flags.ns_cgroups = 1; 464} 465 466void API minijail_remount_proc_readonly(struct minijail *j) 467{ 468 j->flags.vfs = 1; 469 j->flags.remount_proc_ro = 1; 470} 471 472void API minijail_namespace_user(struct minijail *j) 473{ 474 j->flags.userns = 1; 475} 476 477int API minijail_uidmap(struct minijail *j, const char *uidmap) 478{ 479 j->uidmap = strdup(uidmap); 480 if (!j->uidmap) 481 return -ENOMEM; 482 char *ch; 483 for (ch = j->uidmap; *ch; ch++) { 484 if (*ch == ',') 485 *ch = '\n'; 486 } 487 return 0; 488} 489 490int API minijail_gidmap(struct minijail *j, const char *gidmap) 491{ 492 j->gidmap = strdup(gidmap); 493 if (!j->gidmap) 494 return -ENOMEM; 495 char *ch; 496 for (ch = j->gidmap; *ch; ch++) { 497 if (*ch == ',') 498 *ch = '\n'; 499 } 500 return 0; 501} 502 503void API minijail_inherit_usergroups(struct minijail *j) 504{ 505 j->flags.usergroups = 1; 506} 507 508void API minijail_run_as_init(struct minijail *j) 509{ 510 /* 511 * Since the jailed program will become 'init' in the new PID namespace, 512 * Minijail does not need to fork an 'init' process. 513 */ 514 j->flags.do_init = 0; 515} 516 517int API minijail_enter_chroot(struct minijail *j, const char *dir) 518{ 519 if (j->chrootdir) 520 return -EINVAL; 521 j->chrootdir = strdup(dir); 522 if (!j->chrootdir) 523 return -ENOMEM; 524 j->flags.chroot = 1; 525 return 0; 526} 527 528int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 529{ 530 if (j->chrootdir) 531 return -EINVAL; 532 j->chrootdir = strdup(dir); 533 if (!j->chrootdir) 534 return -ENOMEM; 535 j->flags.pivot_root = 1; 536 return 0; 537} 538 539static char *append_external_path(const char *external_path, 540 const char *path_inside_chroot) 541{ 542 char *path; 543 size_t pathlen; 544 545 /* One extra char for '/' and one for '\0', hence + 2. */ 546 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 547 path = malloc(pathlen); 548 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 549 550 return path; 551} 552 553char API *minijail_get_original_path(struct minijail *j, 554 const char *path_inside_chroot) 555{ 556 struct mountpoint *b; 557 558 b = j->mounts_head; 559 while (b) { 560 /* 561 * If |path_inside_chroot| is the exact destination of a 562 * mount, then the original path is exactly the source of 563 * the mount. 564 * for example: "-b /some/path/exe,/chroot/path/exe" 565 * mount source = /some/path/exe, mount dest = 566 * /chroot/path/exe Then when getting the original path of 567 * "/chroot/path/exe", the source of that mount, 568 * "/some/path/exe" is what should be returned. 569 */ 570 if (!strcmp(b->dest, path_inside_chroot)) 571 return strdup(b->src); 572 573 /* 574 * If |path_inside_chroot| is within the destination path of a 575 * mount, take the suffix of the chroot path relative to the 576 * mount destination path, and append it to the mount source 577 * path. 578 */ 579 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 580 const char *relative_path = 581 path_inside_chroot + strlen(b->dest); 582 return append_external_path(b->src, relative_path); 583 } 584 b = b->next; 585 } 586 587 /* If there is a chroot path, append |path_inside_chroot| to that. */ 588 if (j->chrootdir) 589 return append_external_path(j->chrootdir, path_inside_chroot); 590 591 /* No chroot, so the path outside is the same as it is inside. */ 592 return strdup(path_inside_chroot); 593} 594 595void API minijail_mount_tmp(struct minijail *j) 596{ 597 j->flags.mount_tmp = 1; 598} 599 600int API minijail_write_pid_file(struct minijail *j, const char *path) 601{ 602 j->pid_file_path = strdup(path); 603 if (!j->pid_file_path) 604 return -ENOMEM; 605 j->flags.pid_file = 1; 606 return 0; 607} 608 609int API minijail_add_to_cgroup(struct minijail *j, const char *path) 610{ 611 if (j->cgroup_count >= MAX_CGROUPS) 612 return -ENOMEM; 613 j->cgroups[j->cgroup_count] = strdup(path); 614 if (!j->cgroups[j->cgroup_count]) 615 return -ENOMEM; 616 j->cgroup_count++; 617 j->flags.cgroups = 1; 618 return 0; 619} 620 621int API minijail_mount(struct minijail *j, const char *src, const char *dest, 622 const char *type, unsigned long flags) 623{ 624 struct mountpoint *m; 625 626 if (*dest != '/') 627 return -EINVAL; 628 m = calloc(1, sizeof(*m)); 629 if (!m) 630 return -ENOMEM; 631 m->dest = strdup(dest); 632 if (!m->dest) 633 goto error; 634 m->src = strdup(src); 635 if (!m->src) 636 goto error; 637 m->type = strdup(type); 638 if (!m->type) 639 goto error; 640 m->flags = flags; 641 642 info("mount %s -> %s type '%s'", src, dest, type); 643 644 /* 645 * Force vfs namespacing so the mounts don't leak out into the 646 * containing vfs namespace. 647 */ 648 minijail_namespace_vfs(j); 649 650 if (j->mounts_tail) 651 j->mounts_tail->next = m; 652 else 653 j->mounts_head = m; 654 j->mounts_tail = m; 655 j->mounts_count++; 656 657 return 0; 658 659error: 660 free(m->src); 661 free(m->dest); 662 free(m); 663 return -ENOMEM; 664} 665 666int API minijail_bind(struct minijail *j, const char *src, const char *dest, 667 int writeable) 668{ 669 unsigned long flags = MS_BIND; 670 671 if (!writeable) 672 flags |= MS_RDONLY; 673 674 return minijail_mount(j, src, dest, "", flags); 675} 676 677void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 678{ 679 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 680 if ((errno == EINVAL) && can_softfail()) { 681 warn("not loading seccomp filter," 682 " seccomp not supported"); 683 j->flags.seccomp_filter = 0; 684 j->flags.log_seccomp_filter = 0; 685 j->filter_len = 0; 686 j->filter_prog = NULL; 687 j->flags.no_new_privs = 0; 688 } 689 } 690 FILE *file = fopen(path, "r"); 691 if (!file) { 692 pdie("failed to open seccomp filter file '%s'", path); 693 } 694 695 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 696 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 697 die("failed to compile seccomp filter BPF program in '%s'", 698 path); 699 } 700 701 j->filter_len = fprog->len; 702 j->filter_prog = fprog; 703 704 fclose(file); 705} 706 707int API minijail_use_alt_syscall(struct minijail *j, const char *table) 708{ 709 j->alt_syscall_table = strdup(table); 710 if (!j->alt_syscall_table) 711 return -ENOMEM; 712 j->flags.alt_syscall = 1; 713 return 0; 714} 715 716struct marshal_state { 717 size_t available; 718 size_t total; 719 char *buf; 720}; 721 722void marshal_state_init(struct marshal_state *state, char *buf, 723 size_t available) 724{ 725 state->available = available; 726 state->buf = buf; 727 state->total = 0; 728} 729 730void marshal_append(struct marshal_state *state, void *src, size_t length) 731{ 732 size_t copy_len = MIN(state->available, length); 733 734 /* Up to |available| will be written. */ 735 if (copy_len) { 736 memcpy(state->buf, src, copy_len); 737 state->buf += copy_len; 738 state->available -= copy_len; 739 } 740 /* |total| will contain the expected length. */ 741 state->total += length; 742} 743 744void minijail_marshal_helper(struct marshal_state *state, 745 const struct minijail *j) 746{ 747 struct mountpoint *m = NULL; 748 size_t i; 749 750 marshal_append(state, (char *)j, sizeof(*j)); 751 if (j->user) 752 marshal_append(state, j->user, strlen(j->user) + 1); 753 if (j->suppl_gid_list) { 754 marshal_append(state, j->suppl_gid_list, 755 j->suppl_gid_count * sizeof(gid_t)); 756 } 757 if (j->chrootdir) 758 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 759 if (j->alt_syscall_table) { 760 marshal_append(state, j->alt_syscall_table, 761 strlen(j->alt_syscall_table) + 1); 762 } 763 if (j->flags.seccomp_filter && j->filter_prog) { 764 struct sock_fprog *fp = j->filter_prog; 765 marshal_append(state, (char *)fp->filter, 766 fp->len * sizeof(struct sock_filter)); 767 } 768 for (m = j->mounts_head; m; m = m->next) { 769 marshal_append(state, m->src, strlen(m->src) + 1); 770 marshal_append(state, m->dest, strlen(m->dest) + 1); 771 marshal_append(state, m->type, strlen(m->type) + 1); 772 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 773 } 774 for (i = 0; i < j->cgroup_count; ++i) 775 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1); 776} 777 778size_t API minijail_size(const struct minijail *j) 779{ 780 struct marshal_state state; 781 marshal_state_init(&state, NULL, 0); 782 minijail_marshal_helper(&state, j); 783 return state.total; 784} 785 786int minijail_marshal(const struct minijail *j, char *buf, size_t available) 787{ 788 struct marshal_state state; 789 marshal_state_init(&state, buf, available); 790 minijail_marshal_helper(&state, j); 791 return (state.total > available); 792} 793 794/* 795 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength 796 * @length Number of bytes to consume 797 * @buf Buffer to consume from 798 * @buflength Size of @buf 799 * 800 * Returns a pointer to the base of the bytes, or NULL for errors. 801 */ 802void *consumebytes(size_t length, char **buf, size_t *buflength) 803{ 804 char *p = *buf; 805 if (length > *buflength) 806 return NULL; 807 *buf += length; 808 *buflength -= length; 809 return p; 810} 811 812/* 813 * consumestr: consumes a C string from a buffer @buf of length @length 814 * @buf Buffer to consume 815 * @length Length of buffer 816 * 817 * Returns a pointer to the base of the string, or NULL for errors. 818 */ 819char *consumestr(char **buf, size_t *buflength) 820{ 821 size_t len = strnlen(*buf, *buflength); 822 if (len == *buflength) 823 /* There's no null-terminator. */ 824 return NULL; 825 return consumebytes(len + 1, buf, buflength); 826} 827 828int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 829{ 830 size_t i; 831 size_t count; 832 int ret = -EINVAL; 833 834 if (length < sizeof(*j)) 835 goto out; 836 memcpy((void *)j, serialized, sizeof(*j)); 837 serialized += sizeof(*j); 838 length -= sizeof(*j); 839 840 /* Potentially stale pointers not used as signals. */ 841 j->mounts_head = NULL; 842 j->mounts_tail = NULL; 843 j->filter_prog = NULL; 844 845 if (j->user) { /* stale pointer */ 846 char *user = consumestr(&serialized, &length); 847 if (!user) 848 goto clear_pointers; 849 j->user = strdup(user); 850 if (!j->user) 851 goto clear_pointers; 852 } 853 854 if (j->suppl_gid_list) { /* stale pointer */ 855 if (j->suppl_gid_count > NGROUPS_MAX) { 856 goto bad_gid_list; 857 } 858 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 859 void *gid_list_bytes = 860 consumebytes(gid_list_size, &serialized, &length); 861 if (!gid_list_bytes) 862 goto bad_gid_list; 863 864 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 865 if (!j->suppl_gid_list) 866 goto bad_gid_list; 867 868 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 869 } 870 871 if (j->chrootdir) { /* stale pointer */ 872 char *chrootdir = consumestr(&serialized, &length); 873 if (!chrootdir) 874 goto bad_chrootdir; 875 j->chrootdir = strdup(chrootdir); 876 if (!j->chrootdir) 877 goto bad_chrootdir; 878 } 879 880 if (j->alt_syscall_table) { /* stale pointer */ 881 char *alt_syscall_table = consumestr(&serialized, &length); 882 if (!alt_syscall_table) 883 goto bad_syscall_table; 884 j->alt_syscall_table = strdup(alt_syscall_table); 885 if (!j->alt_syscall_table) 886 goto bad_syscall_table; 887 } 888 889 if (j->flags.seccomp_filter && j->filter_len > 0) { 890 size_t ninstrs = j->filter_len; 891 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 892 ninstrs > USHRT_MAX) 893 goto bad_filters; 894 895 size_t program_len = ninstrs * sizeof(struct sock_filter); 896 void *program = consumebytes(program_len, &serialized, &length); 897 if (!program) 898 goto bad_filters; 899 900 j->filter_prog = malloc(sizeof(struct sock_fprog)); 901 if (!j->filter_prog) 902 goto bad_filters; 903 904 j->filter_prog->len = ninstrs; 905 j->filter_prog->filter = malloc(program_len); 906 if (!j->filter_prog->filter) 907 goto bad_filter_prog_instrs; 908 909 memcpy(j->filter_prog->filter, program, program_len); 910 } 911 912 count = j->mounts_count; 913 j->mounts_count = 0; 914 for (i = 0; i < count; ++i) { 915 unsigned long *flags; 916 const char *dest; 917 const char *type; 918 const char *src = consumestr(&serialized, &length); 919 if (!src) 920 goto bad_mounts; 921 dest = consumestr(&serialized, &length); 922 if (!dest) 923 goto bad_mounts; 924 type = consumestr(&serialized, &length); 925 if (!type) 926 goto bad_mounts; 927 flags = consumebytes(sizeof(*flags), &serialized, &length); 928 if (!flags) 929 goto bad_mounts; 930 if (minijail_mount(j, src, dest, type, *flags)) 931 goto bad_mounts; 932 } 933 934 count = j->cgroup_count; 935 j->cgroup_count = 0; 936 for (i = 0; i < count; ++i) { 937 char *cgroup = consumestr(&serialized, &length); 938 if (!cgroup) 939 goto bad_cgroups; 940 j->cgroups[i] = strdup(cgroup); 941 if (!j->cgroups[i]) 942 goto bad_cgroups; 943 ++j->cgroup_count; 944 } 945 946 return 0; 947 948bad_cgroups: 949 while (j->mounts_head) { 950 struct mountpoint *m = j->mounts_head; 951 j->mounts_head = j->mounts_head->next; 952 free(m->type); 953 free(m->dest); 954 free(m->src); 955 free(m); 956 } 957 for (i = 0; i < j->cgroup_count; ++i) 958 free(j->cgroups[i]); 959bad_mounts: 960 if (j->flags.seccomp_filter && j->filter_len > 0) { 961 free(j->filter_prog->filter); 962 free(j->filter_prog); 963 } 964bad_filter_prog_instrs: 965 if (j->filter_prog) 966 free(j->filter_prog); 967bad_filters: 968 if (j->alt_syscall_table) 969 free(j->alt_syscall_table); 970bad_syscall_table: 971 if (j->chrootdir) 972 free(j->chrootdir); 973bad_chrootdir: 974 if (j->suppl_gid_list) 975 free(j->suppl_gid_list); 976bad_gid_list: 977 if (j->user) 978 free(j->user); 979clear_pointers: 980 j->user = NULL; 981 j->suppl_gid_list = NULL; 982 j->chrootdir = NULL; 983 j->alt_syscall_table = NULL; 984 j->cgroup_count = 0; 985out: 986 return ret; 987} 988 989static void write_ugid_mappings(const struct minijail *j) 990{ 991 int fd, ret, len; 992 size_t sz; 993 char fname[32]; 994 995 sz = sizeof(fname); 996 if (j->uidmap) { 997 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 998 if (ret < 0 || (size_t)ret >= sz) 999 die("failed to write file name of uid_map"); 1000 fd = open(fname, O_WRONLY | O_CLOEXEC); 1001 if (fd < 0) 1002 pdie("failed to open '%s'", fname); 1003 len = strlen(j->uidmap); 1004 if (write(fd, j->uidmap, len) < len) 1005 die("failed to set uid_map"); 1006 close(fd); 1007 } 1008 if (j->gidmap) { 1009 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 1010 if (ret < 0 || (size_t)ret >= sz) 1011 die("failed to write file name of gid_map"); 1012 fd = open(fname, O_WRONLY | O_CLOEXEC); 1013 if (fd < 0) 1014 pdie("failed to open '%s'", fname); 1015 len = strlen(j->gidmap); 1016 if (write(fd, j->gidmap, len) < len) 1017 die("failed to set gid_map"); 1018 close(fd); 1019 } 1020} 1021 1022static void parent_setup_complete(int *pipe_fds) 1023{ 1024 close(pipe_fds[0]); 1025 close(pipe_fds[1]); 1026} 1027 1028/* 1029 * wait_for_parent_setup: Called by the child process to wait for any 1030 * further parent-side setup to complete before continuing. 1031 */ 1032static void wait_for_parent_setup(int *pipe_fds) 1033{ 1034 char buf; 1035 1036 close(pipe_fds[1]); 1037 1038 /* Wait for parent to complete setup and close the pipe. */ 1039 if (read(pipe_fds[0], &buf, 1) != 0) 1040 die("failed to sync with parent"); 1041 close(pipe_fds[0]); 1042} 1043 1044static void enter_user_namespace(const struct minijail *j) 1045{ 1046 if (j->uidmap && setresuid(0, 0, 0)) 1047 pdie("setresuid"); 1048 if (j->gidmap && setresgid(0, 0, 0)) 1049 pdie("setresgid"); 1050} 1051 1052/* 1053 * mount_one: Applies mounts from @m for @j, recursing as needed. 1054 * @j Minijail these mounts are for 1055 * @m Head of list of mounts 1056 * 1057 * Returns 0 for success. 1058 */ 1059static int mount_one(const struct minijail *j, struct mountpoint *m) 1060{ 1061 int ret; 1062 char *dest; 1063 int remount_ro = 0; 1064 1065 /* |dest| has a leading "/". */ 1066 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 1067 return -ENOMEM; 1068 1069 /* 1070 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1071 * can't both be specified in the original bind mount. 1072 * Remount R/O after the initial mount. 1073 */ 1074 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1075 remount_ro = 1; 1076 m->flags &= ~MS_RDONLY; 1077 } 1078 1079 ret = mount(m->src, dest, m->type, m->flags, NULL); 1080 if (ret) 1081 pdie("mount: %s -> %s", m->src, dest); 1082 1083 if (remount_ro) { 1084 m->flags |= MS_RDONLY; 1085 ret = mount(m->src, dest, NULL, 1086 m->flags | MS_REMOUNT, NULL); 1087 if (ret) 1088 pdie("bind ro: %s -> %s", m->src, dest); 1089 } 1090 1091 free(dest); 1092 if (m->next) 1093 return mount_one(j, m->next); 1094 return ret; 1095} 1096 1097int enter_chroot(const struct minijail *j) 1098{ 1099 int ret; 1100 1101 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1102 return ret; 1103 1104 if (chroot(j->chrootdir)) 1105 return -errno; 1106 1107 if (chdir("/")) 1108 return -errno; 1109 1110 return 0; 1111} 1112 1113int enter_pivot_root(const struct minijail *j) 1114{ 1115 int ret, oldroot, newroot; 1116 1117 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1118 return ret; 1119 1120 /* 1121 * Keep the fd for both old and new root. 1122 * It will be used in fchdir(2) later. 1123 */ 1124 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1125 if (oldroot < 0) 1126 pdie("failed to open / for fchdir"); 1127 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1128 if (newroot < 0) 1129 pdie("failed to open %s for fchdir", j->chrootdir); 1130 1131 /* 1132 * To ensure j->chrootdir is the root of a filesystem, 1133 * do a self bind mount. 1134 */ 1135 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1136 pdie("failed to bind mount '%s'", j->chrootdir); 1137 if (chdir(j->chrootdir)) 1138 return -errno; 1139 if (syscall(SYS_pivot_root, ".", ".")) 1140 pdie("pivot_root"); 1141 1142 /* 1143 * Now the old root is mounted on top of the new root. Use fchdir(2) to 1144 * change to the old root and unmount it. 1145 */ 1146 if (fchdir(oldroot)) 1147 pdie("failed to fchdir to old /"); 1148 1149 /* 1150 * If j->flags.skip_remount_private was enabled for minijail_enter(), 1151 * there could be a shared mount point under |oldroot|. In that case, 1152 * mounts under this shared mount point will be unmounted below, and 1153 * this unmounting will propagate to the original mount namespace 1154 * (because the mount point is shared). To prevent this unexpected 1155 * unmounting, remove these mounts from their peer groups by recursively 1156 * remounting them as MS_PRIVATE. 1157 */ 1158 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL)) 1159 pdie("failed to mount(/, private) before umount(/)"); 1160 /* The old root might be busy, so use lazy unmount. */ 1161 if (umount2(".", MNT_DETACH)) 1162 pdie("umount(/)"); 1163 /* Change back to the new root. */ 1164 if (fchdir(newroot)) 1165 return -errno; 1166 if (close(oldroot)) 1167 return -errno; 1168 if (close(newroot)) 1169 return -errno; 1170 if (chroot("/")) 1171 return -errno; 1172 /* Set correct CWD for getcwd(3). */ 1173 if (chdir("/")) 1174 return -errno; 1175 1176 return 0; 1177} 1178 1179int mount_tmp(void) 1180{ 1181 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1182} 1183 1184int remount_proc_readonly(const struct minijail *j) 1185{ 1186 const char *kProcPath = "/proc"; 1187 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1188 /* 1189 * Right now, we're holding a reference to our parent's old mount of 1190 * /proc in our namespace, which means using MS_REMOUNT here would 1191 * mutate our parent's mount as well, even though we're in a VFS 1192 * namespace (!). Instead, remove their mount from our namespace lazily 1193 * (MNT_DETACH) and make our own. 1194 */ 1195 if (umount2(kProcPath, MNT_DETACH)) { 1196 /* 1197 * If we are in a new user namespace, umount(2) will fail. 1198 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html 1199 */ 1200 if (j->flags.userns) { 1201 info("umount(/proc, MNT_DETACH) failed, " 1202 "this is expected when using user namespaces"); 1203 } else { 1204 return -errno; 1205 } 1206 } 1207 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1208 return -errno; 1209 return 0; 1210} 1211 1212static void write_pid_to_path(pid_t pid, const char *path) 1213{ 1214 FILE *fp = fopen(path, "w"); 1215 1216 if (!fp) 1217 pdie("failed to open '%s'", path); 1218 if (fprintf(fp, "%d\n", (int)pid) < 0) 1219 pdie("fprintf(%s)", path); 1220 if (fclose(fp)) 1221 pdie("fclose(%s)", path); 1222} 1223 1224static void write_pid_file(const struct minijail *j) 1225{ 1226 write_pid_to_path(j->initpid, j->pid_file_path); 1227} 1228 1229static void add_to_cgroups(const struct minijail *j) 1230{ 1231 size_t i; 1232 1233 for (i = 0; i < j->cgroup_count; ++i) 1234 write_pid_to_path(j->initpid, j->cgroups[i]); 1235} 1236 1237void drop_ugid(const struct minijail *j) 1238{ 1239 if (j->flags.usergroups && j->flags.suppl_gids) { 1240 die("tried to inherit *and* set supplementary groups;" 1241 " can only do one"); 1242 } 1243 1244 if (j->flags.usergroups) { 1245 if (initgroups(j->user, j->usergid)) 1246 pdie("initgroups"); 1247 } else if (j->flags.suppl_gids) { 1248 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1249 pdie("setgroups"); 1250 } 1251 } else { 1252 /* 1253 * Only attempt to clear supplementary groups if we are changing 1254 * users. 1255 */ 1256 if ((j->uid || j->gid) && setgroups(0, NULL)) 1257 pdie("setgroups"); 1258 } 1259 1260 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1261 pdie("setresgid"); 1262 1263 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1264 pdie("setresuid"); 1265} 1266 1267/* 1268 * We specifically do not use cap_valid() as that only tells us the last 1269 * valid cap we were *compiled* against (i.e. what the version of kernel 1270 * headers says). If we run on a different kernel version, then it's not 1271 * uncommon for that to be less (if an older kernel) or more (if a newer 1272 * kernel). 1273 * Normally, we suck up the answer via /proc. On Android, not all processes are 1274 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1275 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1276 */ 1277static unsigned int get_last_valid_cap() 1278{ 1279 unsigned int last_valid_cap = 0; 1280 if (is_android()) { 1281 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1282 ++last_valid_cap); 1283 1284 /* |last_valid_cap| will be the first failing value. */ 1285 if (last_valid_cap > 0) { 1286 last_valid_cap--; 1287 } 1288 } else { 1289 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1290 FILE *fp = fopen(cap_file, "re"); 1291 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1292 pdie("fscanf(%s)", cap_file); 1293 fclose(fp); 1294 } 1295 return last_valid_cap; 1296} 1297 1298static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1299{ 1300 const uint64_t one = 1; 1301 unsigned int i; 1302 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1303 if (keep_mask & (one << i)) 1304 continue; 1305 if (prctl(PR_CAPBSET_DROP, i)) 1306 pdie("could not drop capability from bounding set"); 1307 } 1308} 1309 1310void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1311{ 1312 if (!j->flags.use_caps) 1313 return; 1314 1315 cap_t caps = cap_get_proc(); 1316 cap_value_t flag[1]; 1317 const uint64_t one = 1; 1318 unsigned int i; 1319 if (!caps) 1320 die("can't get process caps"); 1321 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1322 die("can't clear inheritable caps"); 1323 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1324 die("can't clear effective caps"); 1325 if (cap_clear_flag(caps, CAP_PERMITTED)) 1326 die("can't clear permitted caps"); 1327 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1328 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1329 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1330 continue; 1331 flag[0] = i; 1332 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1333 die("can't add effective cap"); 1334 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1335 die("can't add permitted cap"); 1336 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1337 die("can't add inheritable cap"); 1338 } 1339 if (cap_set_proc(caps)) 1340 die("can't apply initial cleaned capset"); 1341 1342 /* 1343 * Instead of dropping bounding set first, do it here in case 1344 * the caller had a more permissive bounding set which could 1345 * have been used above to raise a capability that wasn't already 1346 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1347 */ 1348 drop_capbset(j->caps, last_valid_cap); 1349 1350 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1351 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1352 flag[0] = CAP_SETPCAP; 1353 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1354 die("can't clear effective cap"); 1355 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1356 die("can't clear permitted cap"); 1357 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1358 die("can't clear inheritable cap"); 1359 } 1360 1361 if (cap_set_proc(caps)) 1362 die("can't apply final cleaned capset"); 1363 1364 cap_free(caps); 1365} 1366 1367void set_seccomp_filter(const struct minijail *j) 1368{ 1369 /* 1370 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1371 * in the kernel source tree for an explanation of the parameters. 1372 */ 1373 if (j->flags.no_new_privs) { 1374 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1375 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1376 } 1377 1378 /* 1379 * Code running with ASan 1380 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1381 * will make system calls not included in the syscall filter policy, 1382 * which will likely crash the program. Skip setting seccomp filter in 1383 * that case. 1384 * 'running_with_asan()' has no inputs and is completely defined at 1385 * build time, so this cannot be used by an attacker to skip setting 1386 * seccomp filter. 1387 */ 1388 if (j->flags.seccomp_filter && running_with_asan()) { 1389 warn("running with ASan, not setting seccomp filter"); 1390 return; 1391 } 1392 1393 /* 1394 * If we're logging seccomp filter failures, 1395 * install the SIGSYS handler first. 1396 */ 1397 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1398 if (install_sigsys_handler()) 1399 pdie("install SIGSYS handler"); 1400 warn("logging seccomp filter failures"); 1401 } 1402 1403 /* 1404 * Install the syscall filter. 1405 */ 1406 if (j->flags.seccomp_filter) { 1407 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1408 j->filter_prog)) { 1409 if ((errno == EINVAL) && can_softfail()) { 1410 warn("seccomp not supported"); 1411 return; 1412 } 1413 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1414 } 1415 } 1416} 1417 1418void API minijail_enter(const struct minijail *j) 1419{ 1420 /* 1421 * If we're dropping caps, get the last valid cap from /proc now, 1422 * since /proc can be unmounted before drop_caps() is called. 1423 */ 1424 unsigned int last_valid_cap = 0; 1425 if (j->flags.capbset_drop || j->flags.use_caps) 1426 last_valid_cap = get_last_valid_cap(); 1427 1428 if (j->flags.pids) 1429 die("tried to enter a pid-namespaced jail;" 1430 " try minijail_run()?"); 1431 1432 if (j->flags.usergroups && !j->user) 1433 die("usergroup inheritance without username"); 1434 1435 /* 1436 * We can't recover from failures if we've dropped privileges partially, 1437 * so we don't even try. If any of our operations fail, we abort() the 1438 * entire process. 1439 */ 1440 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1441 pdie("setns(CLONE_NEWNS)"); 1442 1443 if (j->flags.vfs) { 1444 if (unshare(CLONE_NEWNS)) 1445 pdie("unshare(vfs)"); 1446 /* 1447 * Unless asked not to, remount all filesystems as private. 1448 * If they are shared, new bind mounts will creep out of our 1449 * namespace. 1450 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1451 */ 1452 if (!j->flags.skip_remount_private) { 1453 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1454 pdie("mount(/, private)"); 1455 } 1456 } 1457 1458 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1459 pdie("unshare(ipc)"); 1460 } 1461 1462 if (j->flags.enter_net) { 1463 if (setns(j->netns_fd, CLONE_NEWNET)) 1464 pdie("setns(CLONE_NEWNET)"); 1465 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1466 pdie("unshare(net)"); 1467 } 1468 1469 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP)) 1470 pdie("unshare(cgroups)"); 1471 1472 if (j->flags.chroot && enter_chroot(j)) 1473 pdie("chroot"); 1474 1475 if (j->flags.pivot_root && enter_pivot_root(j)) 1476 pdie("pivot_root"); 1477 1478 if (j->flags.mount_tmp && mount_tmp()) 1479 pdie("mount_tmp"); 1480 1481 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1482 pdie("remount"); 1483 1484 /* 1485 * If we're only dropping capabilities from the bounding set, but not 1486 * from the thread's (permitted|inheritable|effective) sets, do it now. 1487 */ 1488 if (j->flags.capbset_drop) { 1489 drop_capbset(j->cap_bset, last_valid_cap); 1490 } 1491 1492 if (j->flags.use_caps) { 1493 /* 1494 * POSIX capabilities are a bit tricky. If we drop our 1495 * capability to change uids, our attempt to use setuid() 1496 * below will fail. Hang on to root caps across setuid(), then 1497 * lock securebits. 1498 */ 1499 if (prctl(PR_SET_KEEPCAPS, 1)) 1500 pdie("prctl(PR_SET_KEEPCAPS)"); 1501 1502 /* 1503 * Kernels 4.3+ define a new securebit 1504 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS 1505 * and SECURE_ALL_LOCKS masks from newer kernel headers will 1506 * return EPERM on older kernels. Detect this, and retry with 1507 * the right mask for older (2.6.26-4.2) kernels. 1508 */ 1509 int securebits_ret = prctl(PR_SET_SECUREBITS, 1510 SECURE_ALL_BITS | SECURE_ALL_LOCKS); 1511 if (securebits_ret < 0) { 1512 if (errno == EPERM) { 1513 /* Possibly running on kernel < 4.3. */ 1514 securebits_ret = prctl( 1515 PR_SET_SECUREBITS, 1516 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS); 1517 } 1518 } 1519 if (securebits_ret < 0) 1520 pdie("prctl(PR_SET_SECUREBITS)"); 1521 } 1522 1523 if (j->flags.no_new_privs) { 1524 /* 1525 * If we're setting no_new_privs, we can drop privileges 1526 * before setting seccomp filter. This way filter policies 1527 * don't need to allow privilege-dropping syscalls. 1528 */ 1529 drop_ugid(j); 1530 drop_caps(j, last_valid_cap); 1531 set_seccomp_filter(j); 1532 } else { 1533 /* 1534 * If we're not setting no_new_privs, 1535 * we need to set seccomp filter *before* dropping privileges. 1536 * WARNING: this means that filter policies *must* allow 1537 * setgroups()/setresgid()/setresuid() for dropping root and 1538 * capget()/capset()/prctl() for dropping caps. 1539 */ 1540 set_seccomp_filter(j); 1541 drop_ugid(j); 1542 drop_caps(j, last_valid_cap); 1543 } 1544 1545 /* 1546 * Select the specified alternate syscall table. The table must not 1547 * block prctl(2) if we're using seccomp as well. 1548 */ 1549 if (j->flags.alt_syscall) { 1550 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1551 pdie("prctl(PR_ALT_SYSCALL)"); 1552 } 1553 1554 /* 1555 * seccomp has to come last since it cuts off all the other 1556 * privilege-dropping syscalls :) 1557 */ 1558 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1559 if ((errno == EINVAL) && can_softfail()) { 1560 warn("seccomp not supported"); 1561 return; 1562 } 1563 pdie("prctl(PR_SET_SECCOMP)"); 1564 } 1565} 1566 1567/* TODO(wad) will visibility affect this variable? */ 1568static int init_exitstatus = 0; 1569 1570void init_term(int __attribute__ ((unused)) sig) 1571{ 1572 _exit(init_exitstatus); 1573} 1574 1575int init(pid_t rootpid) 1576{ 1577 pid_t pid; 1578 int status; 1579 /* so that we exit with the right status */ 1580 signal(SIGTERM, init_term); 1581 /* TODO(wad) self jail with seccomp_filters here. */ 1582 while ((pid = wait(&status)) > 0) { 1583 /* 1584 * This loop will only end when either there are no processes 1585 * left inside our pid namespace or we get a signal. 1586 */ 1587 if (pid == rootpid) 1588 init_exitstatus = status; 1589 } 1590 if (!WIFEXITED(init_exitstatus)) 1591 _exit(MINIJAIL_ERR_INIT); 1592 _exit(WEXITSTATUS(init_exitstatus)); 1593} 1594 1595int API minijail_from_fd(int fd, struct minijail *j) 1596{ 1597 size_t sz = 0; 1598 size_t bytes = read(fd, &sz, sizeof(sz)); 1599 char *buf; 1600 int r; 1601 if (sizeof(sz) != bytes) 1602 return -EINVAL; 1603 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1604 return -E2BIG; 1605 buf = malloc(sz); 1606 if (!buf) 1607 return -ENOMEM; 1608 bytes = read(fd, buf, sz); 1609 if (bytes != sz) { 1610 free(buf); 1611 return -EINVAL; 1612 } 1613 r = minijail_unmarshal(j, buf, sz); 1614 free(buf); 1615 return r; 1616} 1617 1618int API minijail_to_fd(struct minijail *j, int fd) 1619{ 1620 char *buf; 1621 size_t sz = minijail_size(j); 1622 ssize_t written; 1623 int r; 1624 1625 if (!sz) 1626 return -EINVAL; 1627 buf = malloc(sz); 1628 r = minijail_marshal(j, buf, sz); 1629 if (r) { 1630 free(buf); 1631 return r; 1632 } 1633 /* Sends [size][minijail]. */ 1634 written = write(fd, &sz, sizeof(sz)); 1635 if (written != sizeof(sz)) { 1636 free(buf); 1637 return -EFAULT; 1638 } 1639 written = write(fd, buf, sz); 1640 if (written < 0 || (size_t) written != sz) { 1641 free(buf); 1642 return -EFAULT; 1643 } 1644 free(buf); 1645 return 0; 1646} 1647 1648int setup_preload(void) 1649{ 1650#if defined(__ANDROID__) 1651 /* Don't use LDPRELOAD on Brillo. */ 1652 return 0; 1653#else 1654 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1655 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1656 if (!newenv) 1657 return -ENOMEM; 1658 1659 /* Only insert a separating space if we have something to separate... */ 1660 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1661 PRELOADPATH); 1662 1663 /* setenv() makes a copy of the string we give it. */ 1664 setenv(kLdPreloadEnvVar, newenv, 1); 1665 free(newenv); 1666 return 0; 1667#endif 1668} 1669 1670int setup_pipe(int fds[2]) 1671{ 1672 int r = pipe(fds); 1673 char fd_buf[11]; 1674 if (r) 1675 return r; 1676 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1677 if (r <= 0) 1678 return -EINVAL; 1679 setenv(kFdEnvVar, fd_buf, 1); 1680 return 0; 1681} 1682 1683int setup_pipe_end(int fds[2], size_t index) 1684{ 1685 if (index > 1) 1686 return -1; 1687 1688 close(fds[1 - index]); 1689 return fds[index]; 1690} 1691 1692int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1693{ 1694 if (index > 1) 1695 return -1; 1696 1697 close(fds[1 - index]); 1698 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1699 return dup2(fds[index], fd); 1700} 1701 1702int minijail_run_internal(struct minijail *j, const char *filename, 1703 char *const argv[], pid_t *pchild_pid, 1704 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1705 int use_preload); 1706 1707int API minijail_run(struct minijail *j, const char *filename, 1708 char *const argv[]) 1709{ 1710 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1711 true); 1712} 1713 1714int API minijail_run_pid(struct minijail *j, const char *filename, 1715 char *const argv[], pid_t *pchild_pid) 1716{ 1717 return minijail_run_internal(j, filename, argv, pchild_pid, 1718 NULL, NULL, NULL, true); 1719} 1720 1721int API minijail_run_pipe(struct minijail *j, const char *filename, 1722 char *const argv[], int *pstdin_fd) 1723{ 1724 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1725 NULL, NULL, true); 1726} 1727 1728int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1729 char *const argv[], pid_t *pchild_pid, 1730 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1731{ 1732 return minijail_run_internal(j, filename, argv, pchild_pid, 1733 pstdin_fd, pstdout_fd, pstderr_fd, true); 1734} 1735 1736int API minijail_run_no_preload(struct minijail *j, const char *filename, 1737 char *const argv[]) 1738{ 1739 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1740 false); 1741} 1742 1743int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1744 const char *filename, 1745 char *const argv[], 1746 pid_t *pchild_pid, 1747 int *pstdin_fd, int *pstdout_fd, 1748 int *pstderr_fd) 1749{ 1750 return minijail_run_internal(j, filename, argv, pchild_pid, 1751 pstdin_fd, pstdout_fd, pstderr_fd, false); 1752} 1753 1754int minijail_run_internal(struct minijail *j, const char *filename, 1755 char *const argv[], pid_t *pchild_pid, 1756 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1757 int use_preload) 1758{ 1759 char *oldenv, *oldenv_copy = NULL; 1760 pid_t child_pid; 1761 int pipe_fds[2]; 1762 int stdin_fds[2]; 1763 int stdout_fds[2]; 1764 int stderr_fds[2]; 1765 int child_sync_pipe_fds[2]; 1766 int sync_child = 0; 1767 int ret; 1768 /* We need to remember this across the minijail_preexec() call. */ 1769 int pid_namespace = j->flags.pids; 1770 int do_init = j->flags.do_init; 1771 1772 if (use_preload) { 1773 oldenv = getenv(kLdPreloadEnvVar); 1774 if (oldenv) { 1775 oldenv_copy = strdup(oldenv); 1776 if (!oldenv_copy) 1777 return -ENOMEM; 1778 } 1779 1780 if (setup_preload()) 1781 return -EFAULT; 1782 } 1783 1784 if (!use_preload) { 1785 if (j->flags.use_caps) 1786 die("capabilities are not supported without " 1787 "LD_PRELOAD"); 1788 } 1789 1790 /* 1791 * Make the process group ID of this process equal to its PID, so that 1792 * both the Minijail process and the jailed process can be killed 1793 * together. 1794 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1795 * the process is already a process group leader. 1796 */ 1797 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1798 if (errno != EPERM) { 1799 pdie("setpgid(0, 0)"); 1800 } 1801 } 1802 1803 if (use_preload) { 1804 /* 1805 * Before we fork(2) and execve(2) the child process, we need 1806 * to open a pipe(2) to send the minijail configuration over. 1807 */ 1808 if (setup_pipe(pipe_fds)) 1809 return -EFAULT; 1810 } 1811 1812 /* 1813 * If we want to write to the child process' standard input, 1814 * create the pipe(2) now. 1815 */ 1816 if (pstdin_fd) { 1817 if (pipe(stdin_fds)) 1818 return -EFAULT; 1819 } 1820 1821 /* 1822 * If we want to read from the child process' standard output, 1823 * create the pipe(2) now. 1824 */ 1825 if (pstdout_fd) { 1826 if (pipe(stdout_fds)) 1827 return -EFAULT; 1828 } 1829 1830 /* 1831 * If we want to read from the child process' standard error, 1832 * create the pipe(2) now. 1833 */ 1834 if (pstderr_fd) { 1835 if (pipe(stderr_fds)) 1836 return -EFAULT; 1837 } 1838 1839 /* 1840 * If we want to set up a new uid/gid mapping in the user namespace, 1841 * or if we need to add the child process to cgroups, create the pipe(2) 1842 * to sync between parent and child. 1843 */ 1844 if (j->flags.userns || j->flags.cgroups) { 1845 sync_child = 1; 1846 if (pipe(child_sync_pipe_fds)) 1847 return -EFAULT; 1848 } 1849 1850 /* 1851 * Use sys_clone() if and only if we're creating a pid namespace. 1852 * 1853 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1854 * 1855 * In multithreaded programs, there are a bunch of locks inside libc, 1856 * some of which may be held by other threads at the time that we call 1857 * minijail_run_pid(). If we call fork(), glibc does its level best to 1858 * ensure that we hold all of these locks before it calls clone() 1859 * internally and drop them after clone() returns, but when we call 1860 * sys_clone(2) directly, all that gets bypassed and we end up with a 1861 * child address space where some of libc's important locks are held by 1862 * other threads (which did not get cloned, and hence will never release 1863 * those locks). This is okay so long as we call exec() immediately 1864 * after, but a bunch of seemingly-innocent libc functions like setenv() 1865 * take locks. 1866 * 1867 * Hence, only call sys_clone() if we need to, in order to get at pid 1868 * namespacing. If we follow this path, the child's address space might 1869 * have broken locks; you may only call functions that do not acquire 1870 * any locks. 1871 * 1872 * Unfortunately, fork() acquires every lock it can get its hands on, as 1873 * previously detailed, so this function is highly likely to deadlock 1874 * later on (see "deadlock here") if we're multithreaded. 1875 * 1876 * We might hack around this by having the clone()d child (init of the 1877 * pid namespace) return directly, rather than leaving the clone()d 1878 * process hanging around to be init for the new namespace (and having 1879 * its fork()ed child return in turn), but that process would be 1880 * crippled with its libc locks potentially broken. We might try 1881 * fork()ing in the parent before we clone() to ensure that we own all 1882 * the locks, but then we have to have the forked child hanging around 1883 * consuming resources (and possibly having file descriptors / shared 1884 * memory regions / etc attached). We'd need to keep the child around to 1885 * avoid having its children get reparented to init. 1886 * 1887 * TODO(ellyjones): figure out if the "forked child hanging around" 1888 * problem is fixable or not. It would be nice if we worked in this 1889 * case. 1890 */ 1891 if (pid_namespace) { 1892 int clone_flags = CLONE_NEWPID | SIGCHLD; 1893 if (j->flags.userns) 1894 clone_flags |= CLONE_NEWUSER; 1895 child_pid = syscall(SYS_clone, clone_flags, NULL); 1896 } else { 1897 child_pid = fork(); 1898 } 1899 1900 if (child_pid < 0) { 1901 if (use_preload) { 1902 free(oldenv_copy); 1903 } 1904 die("failed to fork child"); 1905 } 1906 1907 if (child_pid) { 1908 if (use_preload) { 1909 /* Restore parent's LD_PRELOAD. */ 1910 if (oldenv_copy) { 1911 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1912 free(oldenv_copy); 1913 } else { 1914 unsetenv(kLdPreloadEnvVar); 1915 } 1916 unsetenv(kFdEnvVar); 1917 } 1918 1919 j->initpid = child_pid; 1920 1921 if (j->flags.pid_file) 1922 write_pid_file(j); 1923 1924 if (j->flags.cgroups) 1925 add_to_cgroups(j); 1926 1927 if (j->flags.userns) 1928 write_ugid_mappings(j); 1929 1930 if (sync_child) 1931 parent_setup_complete(child_sync_pipe_fds); 1932 1933 if (use_preload) { 1934 /* Send marshalled minijail. */ 1935 close(pipe_fds[0]); /* read endpoint */ 1936 ret = minijail_to_fd(j, pipe_fds[1]); 1937 close(pipe_fds[1]); /* write endpoint */ 1938 if (ret) { 1939 kill(j->initpid, SIGKILL); 1940 die("failed to send marshalled minijail"); 1941 } 1942 } 1943 1944 if (pchild_pid) 1945 *pchild_pid = child_pid; 1946 1947 /* 1948 * If we want to write to the child process' standard input, 1949 * set up the write end of the pipe. 1950 */ 1951 if (pstdin_fd) 1952 *pstdin_fd = setup_pipe_end(stdin_fds, 1953 1 /* write end */); 1954 1955 /* 1956 * If we want to read from the child process' standard output, 1957 * set up the read end of the pipe. 1958 */ 1959 if (pstdout_fd) 1960 *pstdout_fd = setup_pipe_end(stdout_fds, 1961 0 /* read end */); 1962 1963 /* 1964 * If we want to read from the child process' standard error, 1965 * set up the read end of the pipe. 1966 */ 1967 if (pstderr_fd) 1968 *pstderr_fd = setup_pipe_end(stderr_fds, 1969 0 /* read end */); 1970 1971 return 0; 1972 } 1973 free(oldenv_copy); 1974 1975 if (j->flags.reset_signal_mask) { 1976 sigset_t signal_mask; 1977 if (sigemptyset(&signal_mask) != 0) 1978 pdie("sigemptyset failed"); 1979 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1980 pdie("sigprocmask failed"); 1981 } 1982 1983 if (sync_child) 1984 wait_for_parent_setup(child_sync_pipe_fds); 1985 1986 if (j->flags.userns) 1987 enter_user_namespace(j); 1988 1989 /* 1990 * If we want to write to the jailed process' standard input, 1991 * set up the read end of the pipe. 1992 */ 1993 if (pstdin_fd) { 1994 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1995 STDIN_FILENO) < 0) 1996 die("failed to set up stdin pipe"); 1997 } 1998 1999 /* 2000 * If we want to read from the jailed process' standard output, 2001 * set up the write end of the pipe. 2002 */ 2003 if (pstdout_fd) { 2004 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 2005 STDOUT_FILENO) < 0) 2006 die("failed to set up stdout pipe"); 2007 } 2008 2009 /* 2010 * If we want to read from the jailed process' standard error, 2011 * set up the write end of the pipe. 2012 */ 2013 if (pstderr_fd) { 2014 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 2015 STDERR_FILENO) < 0) 2016 die("failed to set up stderr pipe"); 2017 } 2018 2019 /* If running an init program, let it decide when/how to mount /proc. */ 2020 if (pid_namespace && !do_init) 2021 j->flags.remount_proc_ro = 0; 2022 2023 if (use_preload) { 2024 /* Strip out flags that cannot be inherited across execve(2). */ 2025 minijail_preexec(j); 2026 } else { 2027 j->flags.pids = 0; 2028 } 2029 /* Jail this process, then execve() the target. */ 2030 minijail_enter(j); 2031 2032 if (pid_namespace && do_init) { 2033 /* 2034 * pid namespace: this process will become init inside the new 2035 * namespace. We don't want all programs we might exec to have 2036 * to know how to be init. Normally (do_init == 1) we fork off 2037 * a child to actually run the program. If |do_init == 0|, we 2038 * let the program keep pid 1 and be init. 2039 * 2040 * If we're multithreaded, we'll probably deadlock here. See 2041 * WARNING above. 2042 */ 2043 child_pid = fork(); 2044 if (child_pid < 0) 2045 _exit(child_pid); 2046 else if (child_pid > 0) 2047 init(child_pid); /* never returns */ 2048 } 2049 2050 /* 2051 * If we aren't pid-namespaced, or the jailed program asked to be init: 2052 * calling process 2053 * -> execve()-ing process 2054 * If we are: 2055 * calling process 2056 * -> init()-ing process 2057 * -> execve()-ing process 2058 */ 2059 _exit(execve(filename, argv, environ)); 2060} 2061 2062int API minijail_kill(struct minijail *j) 2063{ 2064 int st; 2065 if (kill(j->initpid, SIGTERM)) 2066 return -errno; 2067 if (waitpid(j->initpid, &st, 0) < 0) 2068 return -errno; 2069 return st; 2070} 2071 2072int API minijail_wait(struct minijail *j) 2073{ 2074 int st; 2075 if (waitpid(j->initpid, &st, 0) < 0) 2076 return -errno; 2077 2078 if (!WIFEXITED(st)) { 2079 int error_status = st; 2080 if (WIFSIGNALED(st)) { 2081 int signum = WTERMSIG(st); 2082 warn("child process %d received signal %d", 2083 j->initpid, signum); 2084 /* 2085 * We return MINIJAIL_ERR_JAIL if the process received 2086 * SIGSYS, which happens when a syscall is blocked by 2087 * seccomp filters. 2088 * If not, we do what bash(1) does: 2089 * $? = 128 + signum 2090 */ 2091 if (signum == SIGSYS) { 2092 error_status = MINIJAIL_ERR_JAIL; 2093 } else { 2094 error_status = 128 + signum; 2095 } 2096 } 2097 return error_status; 2098 } 2099 2100 int exit_status = WEXITSTATUS(st); 2101 if (exit_status != 0) 2102 info("child process %d exited with status %d", 2103 j->initpid, exit_status); 2104 2105 return exit_status; 2106} 2107 2108void API minijail_destroy(struct minijail *j) 2109{ 2110 size_t i; 2111 2112 if (j->flags.seccomp_filter && j->filter_prog) { 2113 free(j->filter_prog->filter); 2114 free(j->filter_prog); 2115 } 2116 while (j->mounts_head) { 2117 struct mountpoint *m = j->mounts_head; 2118 j->mounts_head = j->mounts_head->next; 2119 free(m->type); 2120 free(m->dest); 2121 free(m->src); 2122 free(m); 2123 } 2124 j->mounts_tail = NULL; 2125 if (j->user) 2126 free(j->user); 2127 if (j->suppl_gid_list) 2128 free(j->suppl_gid_list); 2129 if (j->chrootdir) 2130 free(j->chrootdir); 2131 if (j->alt_syscall_table) 2132 free(j->alt_syscall_table); 2133 for (i = 0; i < j->cgroup_count; ++i) 2134 free(j->cgroups[i]); 2135 free(j); 2136} 2137