/* libminijail.c revision 7a569073b95af7532892dc726c2f33cd40edfb57 */
1/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6#define _BSD_SOURCE 7#define _DEFAULT_SOURCE 8#define _GNU_SOURCE 9 10#include <asm/unistd.h> 11#include <ctype.h> 12#include <errno.h> 13#include <fcntl.h> 14#include <grp.h> 15#include <inttypes.h> 16#include <limits.h> 17#include <linux/capability.h> 18#include <pwd.h> 19#include <sched.h> 20#include <signal.h> 21#include <stdarg.h> 22#include <stdbool.h> 23#include <stddef.h> 24#include <stdio.h> 25#include <stdlib.h> 26#include <string.h> 27#include <syscall.h> 28#include <sys/capability.h> 29#include <sys/mount.h> 30#include <sys/param.h> 31#include <sys/prctl.h> 32#include <sys/stat.h> 33#include <sys/types.h> 34#include <sys/user.h> 35#include <sys/utsname.h> 36#include <sys/wait.h> 37#include <unistd.h> 38 39#include "libminijail.h" 40#include "libminijail-private.h" 41 42#include "signal_handler.h" 43#include "syscall_filter.h" 44#include "util.h" 45 46#ifdef HAVE_SECUREBITS_H 47# include <linux/securebits.h> 48#else 49# define SECURE_ALL_BITS 0x55 50# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 51#endif 52/* For kernels < 4.3. */ 53#define OLD_SECURE_ALL_BITS 0x15 54#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1) 55 56/* 57 * Assert the value of SECURE_ALL_BITS at compile-time. 58 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3 59 * added a new securebit. 60 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM 61 * when used on older kernels. The compile-time assert will catch this situation 62 * at compile time. 63 */ 64#ifdef __BRILLO__ 65_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55."); 66#endif 67 68/* Until these are reliably available in linux/prctl.h. 
*/ 69#ifndef PR_SET_SECCOMP 70# define PR_SET_SECCOMP 22 71#endif 72 73#ifndef PR_ALT_SYSCALL 74# define PR_ALT_SYSCALL 0x43724f53 75#endif 76 77/* For seccomp_filter using BPF. */ 78#ifndef PR_SET_NO_NEW_PRIVS 79# define PR_SET_NO_NEW_PRIVS 38 80#endif 81#ifndef SECCOMP_MODE_FILTER 82# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 83#endif 84 85#ifdef USE_SECCOMP_SOFTFAIL 86# define SECCOMP_SOFTFAIL 1 87#else 88# define SECCOMP_SOFTFAIL 0 89#endif 90 91#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 92 93struct mountpoint { 94 char *src; 95 char *dest; 96 char *type; 97 unsigned long flags; 98 struct mountpoint *next; 99}; 100 101struct minijail { 102 /* 103 * WARNING: if you add a flag here you need to make sure it's 104 * accounted for in minijail_pre{enter|exec}() below. 105 */ 106 struct { 107 int uid:1; 108 int gid:1; 109 int usergroups:1; 110 int suppl_gids:1; 111 int use_caps:1; 112 int capbset_drop:1; 113 int vfs:1; 114 int enter_vfs:1; 115 int skip_remount_private:1; 116 int pids:1; 117 int ipc:1; 118 int net:1; 119 int enter_net:1; 120 int userns:1; 121 int seccomp:1; 122 int remount_proc_ro:1; 123 int no_new_privs:1; 124 int seccomp_filter:1; 125 int log_seccomp_filter:1; 126 int chroot:1; 127 int pivot_root:1; 128 int mount_tmp:1; 129 int do_init:1; 130 int pid_file:1; 131 int cgroups:1; 132 int alt_syscall:1; 133 int reset_signal_mask:1; 134 } flags; 135 uid_t uid; 136 gid_t gid; 137 gid_t usergid; 138 char *user; 139 size_t suppl_gid_count; 140 gid_t *suppl_gid_list; 141 uint64_t caps; 142 uint64_t cap_bset; 143 pid_t initpid; 144 int mountns_fd; 145 int netns_fd; 146 char *chrootdir; 147 char *pid_file_path; 148 char *uidmap; 149 char *gidmap; 150 size_t filter_len; 151 struct sock_fprog *filter_prog; 152 char *alt_syscall_table; 153 struct mountpoint *mounts_head; 154 struct mountpoint *mounts_tail; 155 size_t mounts_count; 156 char *cgroups[MAX_CGROUPS]; 157 size_t cgroup_count; 158}; 159 160/* 161 * Strip 
out flags meant for the parent. 162 * We keep things that are not inherited across execve(2) (e.g. capabilities), 163 * or are easier to set after execve(2) (e.g. seccomp filters). 164 */ 165void minijail_preenter(struct minijail *j) 166{ 167 j->flags.vfs = 0; 168 j->flags.enter_vfs = 0; 169 j->flags.skip_remount_private = 0; 170 j->flags.remount_proc_ro = 0; 171 j->flags.pids = 0; 172 j->flags.do_init = 0; 173 j->flags.pid_file = 0; 174 j->flags.cgroups = 0; 175} 176 177/* 178 * Strip out flags meant for the child. 179 * We keep things that are inherited across execve(2). 180 */ 181void minijail_preexec(struct minijail *j) 182{ 183 int vfs = j->flags.vfs; 184 int enter_vfs = j->flags.enter_vfs; 185 int skip_remount_private = j->flags.skip_remount_private; 186 int remount_proc_ro = j->flags.remount_proc_ro; 187 int userns = j->flags.userns; 188 if (j->user) 189 free(j->user); 190 j->user = NULL; 191 if (j->suppl_gid_list) 192 free(j->suppl_gid_list); 193 j->suppl_gid_list = NULL; 194 memset(&j->flags, 0, sizeof(j->flags)); 195 /* Now restore anything we meant to keep. */ 196 j->flags.vfs = vfs; 197 j->flags.enter_vfs = enter_vfs; 198 j->flags.skip_remount_private = skip_remount_private; 199 j->flags.remount_proc_ro = remount_proc_ro; 200 j->flags.userns = userns; 201 /* Note, |pids| will already have been used before this call. */ 202} 203 204/* Returns true if the kernel version is less than 3.8. */ 205int seccomp_kernel_support_not_required() 206{ 207 int major, minor; 208 struct utsname uts; 209 return (uname(&uts) != -1 && 210 sscanf(uts.release, "%d.%d", &major, &minor) == 2 && 211 ((major < 3) || ((major == 3) && (minor < 8)))); 212} 213 214/* Allow seccomp soft-fail on Android devices with kernel version < 3.8. */ 215int can_softfail() 216{ 217#if SECCOMP_SOFTFAIL 218 if (is_android()) { 219 if (seccomp_kernel_support_not_required()) 220 return 1; 221 else 222 return 0; 223 } else { 224 return 1; 225 } 226#endif 227 return 0; 228} 229 230/* Minijail API. 
*/ 231 232struct minijail API *minijail_new(void) 233{ 234 return calloc(1, sizeof(struct minijail)); 235} 236 237void API minijail_change_uid(struct minijail *j, uid_t uid) 238{ 239 if (uid == 0) 240 die("useless change to uid 0"); 241 j->uid = uid; 242 j->flags.uid = 1; 243} 244 245void API minijail_change_gid(struct minijail *j, gid_t gid) 246{ 247 if (gid == 0) 248 die("useless change to gid 0"); 249 j->gid = gid; 250 j->flags.gid = 1; 251} 252 253void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 254 const gid_t *list) 255{ 256 size_t i; 257 258 if (j->flags.usergroups) 259 die("cannot inherit *and* set supplementary groups"); 260 261 if (size == 0) { 262 /* Clear supplementary groups. */ 263 j->suppl_gid_list = NULL; 264 j->suppl_gid_count = 0; 265 j->flags.suppl_gids = 1; 266 return; 267 } 268 269 /* Copy the gid_t array. */ 270 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 271 if (!j->suppl_gid_list) { 272 die("failed to allocate internal supplementary group array"); 273 } 274 for (i = 0; i < size; i++) { 275 j->suppl_gid_list[i] = list[i]; 276 } 277 j->suppl_gid_count = size; 278 j->flags.suppl_gids = 1; 279} 280 281int API minijail_change_user(struct minijail *j, const char *user) 282{ 283 char *buf = NULL; 284 struct passwd pw; 285 struct passwd *ppw = NULL; 286 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 287 if (sz == -1) 288 sz = 65536; /* your guess is as good as mine... */ 289 290 /* 291 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 292 * the maximum needed size of the buffer, so we don't have to search. 293 */ 294 buf = malloc(sz); 295 if (!buf) 296 return -ENOMEM; 297 getpwnam_r(user, &pw, buf, sz, &ppw); 298 /* 299 * We're safe to free the buffer here. The strings inside |pw| point 300 * inside |buf|, but we don't use any of them; this leaves the pointers 301 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) 302 * succeeded. 
303 */ 304 free(buf); 305 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 306 if (!ppw) 307 return -1; 308 minijail_change_uid(j, ppw->pw_uid); 309 j->user = strdup(user); 310 if (!j->user) 311 return -ENOMEM; 312 j->usergid = ppw->pw_gid; 313 return 0; 314} 315 316int API minijail_change_group(struct minijail *j, const char *group) 317{ 318 char *buf = NULL; 319 struct group gr; 320 struct group *pgr = NULL; 321 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 322 if (sz == -1) 323 sz = 65536; /* and mine is as good as yours, really */ 324 325 /* 326 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 327 * the maximum needed size of the buffer, so we don't have to search. 328 */ 329 buf = malloc(sz); 330 if (!buf) 331 return -ENOMEM; 332 getgrnam_r(group, &gr, buf, sz, &pgr); 333 /* 334 * We're safe to free the buffer here. The strings inside gr point 335 * inside buf, but we don't use any of them; this leaves the pointers 336 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 337 */ 338 free(buf); 339 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */ 340 if (!pgr) 341 return -1; 342 minijail_change_gid(j, pgr->gr_gid); 343 return 0; 344} 345 346void API minijail_use_seccomp(struct minijail *j) 347{ 348 j->flags.seccomp = 1; 349} 350 351void API minijail_no_new_privs(struct minijail *j) 352{ 353 j->flags.no_new_privs = 1; 354} 355 356void API minijail_use_seccomp_filter(struct minijail *j) 357{ 358 j->flags.seccomp_filter = 1; 359} 360 361void API minijail_log_seccomp_filter_failures(struct minijail *j) 362{ 363 j->flags.log_seccomp_filter = 1; 364} 365 366void API minijail_use_caps(struct minijail *j, uint64_t capmask) 367{ 368 /* 369 * 'minijail_use_caps' configures a runtime-capabilities-only 370 * environment, including a bounding set matching the thread's runtime 371 * (permitted|inheritable|effective) sets. 
372 * Therefore, it will override any existing bounding set configurations 373 * since the latter would allow gaining extra runtime capabilities from 374 * file capabilities. 375 */ 376 if (j->flags.capbset_drop) { 377 warn("overriding bounding set configuration"); 378 j->cap_bset = 0; 379 j->flags.capbset_drop = 0; 380 } 381 j->caps = capmask; 382 j->flags.use_caps = 1; 383} 384 385void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 386{ 387 if (j->flags.use_caps) { 388 /* 389 * 'minijail_use_caps' will have already configured a capability 390 * bounding set matching the (permitted|inheritable|effective) 391 * sets. Abort if the user tries to configure a separate 392 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 393 * are mutually exclusive. 394 */ 395 die("runtime capabilities already configured, can't drop " 396 "bounding set separately"); 397 } 398 j->cap_bset = capmask; 399 j->flags.capbset_drop = 1; 400} 401 402void API minijail_reset_signal_mask(struct minijail *j) 403{ 404 j->flags.reset_signal_mask = 1; 405} 406 407void API minijail_namespace_vfs(struct minijail *j) 408{ 409 j->flags.vfs = 1; 410} 411 412void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 413{ 414 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 415 if (ns_fd < 0) { 416 pdie("failed to open namespace '%s'", ns_path); 417 } 418 j->mountns_fd = ns_fd; 419 j->flags.enter_vfs = 1; 420} 421 422void API minijail_skip_remount_private(struct minijail *j) 423{ 424 j->flags.skip_remount_private = 1; 425} 426 427void API minijail_namespace_pids(struct minijail *j) 428{ 429 j->flags.vfs = 1; 430 j->flags.remount_proc_ro = 1; 431 j->flags.pids = 1; 432 j->flags.do_init = 1; 433} 434 435void API minijail_namespace_ipc(struct minijail *j) 436{ 437 j->flags.ipc = 1; 438} 439 440void API minijail_namespace_net(struct minijail *j) 441{ 442 j->flags.net = 1; 443} 444 445void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 
446{ 447 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC); 448 if (ns_fd < 0) { 449 pdie("failed to open namespace '%s'", ns_path); 450 } 451 j->netns_fd = ns_fd; 452 j->flags.enter_net = 1; 453} 454 455void API minijail_remount_proc_readonly(struct minijail *j) 456{ 457 j->flags.vfs = 1; 458 j->flags.remount_proc_ro = 1; 459} 460 461void API minijail_namespace_user(struct minijail *j) 462{ 463 j->flags.userns = 1; 464} 465 466int API minijail_uidmap(struct minijail *j, const char *uidmap) 467{ 468 j->uidmap = strdup(uidmap); 469 if (!j->uidmap) 470 return -ENOMEM; 471 char *ch; 472 for (ch = j->uidmap; *ch; ch++) { 473 if (*ch == ',') 474 *ch = '\n'; 475 } 476 return 0; 477} 478 479int API minijail_gidmap(struct minijail *j, const char *gidmap) 480{ 481 j->gidmap = strdup(gidmap); 482 if (!j->gidmap) 483 return -ENOMEM; 484 char *ch; 485 for (ch = j->gidmap; *ch; ch++) { 486 if (*ch == ',') 487 *ch = '\n'; 488 } 489 return 0; 490} 491 492void API minijail_inherit_usergroups(struct minijail *j) 493{ 494 j->flags.usergroups = 1; 495} 496 497void API minijail_run_as_init(struct minijail *j) 498{ 499 /* 500 * Since the jailed program will become 'init' in the new PID namespace, 501 * Minijail does not need to fork an 'init' process. 502 */ 503 j->flags.do_init = 0; 504} 505 506int API minijail_enter_chroot(struct minijail *j, const char *dir) 507{ 508 if (j->chrootdir) 509 return -EINVAL; 510 j->chrootdir = strdup(dir); 511 if (!j->chrootdir) 512 return -ENOMEM; 513 j->flags.chroot = 1; 514 return 0; 515} 516 517int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 518{ 519 if (j->chrootdir) 520 return -EINVAL; 521 j->chrootdir = strdup(dir); 522 if (!j->chrootdir) 523 return -ENOMEM; 524 j->flags.pivot_root = 1; 525 return 0; 526} 527 528static char *append_external_path(const char *external_path, 529 const char *path_inside_chroot) 530{ 531 char *path; 532 size_t pathlen; 533 534 /* One extra char for '/' and one for '\0', hence + 2. 
*/ 535 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 536 path = malloc(pathlen); 537 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 538 539 return path; 540} 541 542char API *minijail_get_original_path(struct minijail *j, 543 const char *path_inside_chroot) 544{ 545 struct mountpoint *b; 546 547 b = j->mounts_head; 548 while (b) { 549 /* 550 * If |path_inside_chroot| is the exact destination of a 551 * mount, then the original path is exactly the source of 552 * the mount. 553 * for example: "-b /some/path/exe,/chroot/path/exe" 554 * mount source = /some/path/exe, mount dest = 555 * /chroot/path/exe Then when getting the original path of 556 * "/chroot/path/exe", the source of that mount, 557 * "/some/path/exe" is what should be returned. 558 */ 559 if (!strcmp(b->dest, path_inside_chroot)) 560 return strdup(b->src); 561 562 /* 563 * If |path_inside_chroot| is within the destination path of a 564 * mount, take the suffix of the chroot path relative to the 565 * mount destination path, and append it to the mount source 566 * path. 567 */ 568 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 569 const char *relative_path = 570 path_inside_chroot + strlen(b->dest); 571 return append_external_path(b->src, relative_path); 572 } 573 b = b->next; 574 } 575 576 /* If there is a chroot path, append |path_inside_chroot| to that. */ 577 if (j->chrootdir) 578 return append_external_path(j->chrootdir, path_inside_chroot); 579 580 /* No chroot, so the path outside is the same as it is inside. 
*/ 581 return strdup(path_inside_chroot); 582} 583 584void API minijail_mount_tmp(struct minijail *j) 585{ 586 j->flags.mount_tmp = 1; 587} 588 589int API minijail_write_pid_file(struct minijail *j, const char *path) 590{ 591 j->pid_file_path = strdup(path); 592 if (!j->pid_file_path) 593 return -ENOMEM; 594 j->flags.pid_file = 1; 595 return 0; 596} 597 598int API minijail_add_to_cgroup(struct minijail *j, const char *path) 599{ 600 if (j->cgroup_count >= MAX_CGROUPS) 601 return -ENOMEM; 602 j->cgroups[j->cgroup_count] = strdup(path); 603 if (!j->cgroups[j->cgroup_count]) 604 return -ENOMEM; 605 j->cgroup_count++; 606 j->flags.cgroups = 1; 607 return 0; 608} 609 610int API minijail_mount(struct minijail *j, const char *src, const char *dest, 611 const char *type, unsigned long flags) 612{ 613 struct mountpoint *m; 614 615 if (*dest != '/') 616 return -EINVAL; 617 m = calloc(1, sizeof(*m)); 618 if (!m) 619 return -ENOMEM; 620 m->dest = strdup(dest); 621 if (!m->dest) 622 goto error; 623 m->src = strdup(src); 624 if (!m->src) 625 goto error; 626 m->type = strdup(type); 627 if (!m->type) 628 goto error; 629 m->flags = flags; 630 631 info("mount %s -> %s type '%s'", src, dest, type); 632 633 /* 634 * Force vfs namespacing so the mounts don't leak out into the 635 * containing vfs namespace. 
636 */ 637 minijail_namespace_vfs(j); 638 639 if (j->mounts_tail) 640 j->mounts_tail->next = m; 641 else 642 j->mounts_head = m; 643 j->mounts_tail = m; 644 j->mounts_count++; 645 646 return 0; 647 648error: 649 free(m->src); 650 free(m->dest); 651 free(m); 652 return -ENOMEM; 653} 654 655int API minijail_bind(struct minijail *j, const char *src, const char *dest, 656 int writeable) 657{ 658 unsigned long flags = MS_BIND; 659 660 if (!writeable) 661 flags |= MS_RDONLY; 662 663 return minijail_mount(j, src, dest, "", flags); 664} 665 666void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 667{ 668 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 669 if ((errno == EINVAL) && can_softfail()) { 670 warn("not loading seccomp filter," 671 " seccomp not supported"); 672 j->flags.seccomp_filter = 0; 673 j->flags.log_seccomp_filter = 0; 674 j->filter_len = 0; 675 j->filter_prog = NULL; 676 j->flags.no_new_privs = 0; 677 } 678 } 679 FILE *file = fopen(path, "r"); 680 if (!file) { 681 pdie("failed to open seccomp filter file '%s'", path); 682 } 683 684 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 685 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 686 die("failed to compile seccomp filter BPF program in '%s'", 687 path); 688 } 689 690 j->filter_len = fprog->len; 691 j->filter_prog = fprog; 692 693 fclose(file); 694} 695 696int API minijail_use_alt_syscall(struct minijail *j, const char *table) 697{ 698 j->alt_syscall_table = strdup(table); 699 if (!j->alt_syscall_table) 700 return -ENOMEM; 701 j->flags.alt_syscall = 1; 702 return 0; 703} 704 705struct marshal_state { 706 size_t available; 707 size_t total; 708 char *buf; 709}; 710 711void marshal_state_init(struct marshal_state *state, char *buf, 712 size_t available) 713{ 714 state->available = available; 715 state->buf = buf; 716 state->total = 0; 717} 718 719void marshal_append(struct marshal_state *state, void *src, size_t length) 720{ 721 size_t copy_len = 
	    MIN(state->available, length);

	/* Up to |available| will be written. */
	if (copy_len) {
		memcpy(state->buf, src, copy_len);
		state->buf += copy_len;
		state->available -= copy_len;
	}
	/* |total| will contain the expected length. */
	state->total += length;
}

/*
 * Serializes |j| into |state|: the struct itself first, then every
 * out-of-line allocation it points at (user name, supplementary gids,
 * chroot dir, alt-syscall table, BPF filter, mounts, cgroup paths), in
 * exactly the order minijail_unmarshal() consumes them.
 */
void minijail_marshal_helper(struct marshal_state *state,
			     const struct minijail *j)
{
	struct mountpoint *m = NULL;
	size_t i;

	marshal_append(state, (char *)j, sizeof(*j));
	if (j->user)
		marshal_append(state, j->user, strlen(j->user) + 1);
	if (j->suppl_gid_list) {
		marshal_append(state, j->suppl_gid_list,
			       j->suppl_gid_count * sizeof(gid_t));
	}
	if (j->chrootdir)
		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
	if (j->alt_syscall_table) {
		marshal_append(state, j->alt_syscall_table,
			       strlen(j->alt_syscall_table) + 1);
	}
	if (j->flags.seccomp_filter && j->filter_prog) {
		struct sock_fprog *fp = j->filter_prog;
		marshal_append(state, (char *)fp->filter,
			       fp->len * sizeof(struct sock_filter));
	}
	/* Each mount contributes three NUL-terminated strings plus its flags. */
	for (m = j->mounts_head; m; m = m->next) {
		marshal_append(state, m->src, strlen(m->src) + 1);
		marshal_append(state, m->dest, strlen(m->dest) + 1);
		marshal_append(state, m->type, strlen(m->type) + 1);
		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
	}
	for (i = 0; i < j->cgroup_count; ++i)
		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
}

/* Returns the number of bytes minijail_marshal() needs to serialize |j|. */
size_t API minijail_size(const struct minijail *j)
{
	struct marshal_state state;
	marshal_state_init(&state, NULL, 0);
	minijail_marshal_helper(&state, j);
	return state.total;
}

/* Serializes |j| into |buf|; returns non-zero if |available| was too small. */
int minijail_marshal(const struct minijail *j, char *buf, size_t available)
{
	struct marshal_state state;
	marshal_state_init(&state, buf, available);
	minijail_marshal_helper(&state, j);
	return (state.total > available);
}

/*
 * consumebytes: consumes @length bytes from a buffer @buf of length @buflength
 * @length    Number of bytes to consume
 * @buf       Buffer to consume from
 * @buflength Size of @buf
 *
 * Advances *@buf and shrinks *@buflength on success.
 * Returns a pointer to the base of the bytes, or NULL for errors.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *p = *buf;
	if (length > *buflength)
		return NULL;
	*buf += length;
	*buflength -= length;
	return p;
}

/*
 * consumestr: consumes a C string from a buffer @buf of length @length
 * @buf    Buffer to consume
 * @length Length of buffer
 *
 * Returns a pointer to the base of the string, or NULL for errors
 * (i.e. no NUL terminator within the remaining bytes).
 */
char *consumestr(char **buf, size_t *buflength)
{
	size_t len = strnlen(*buf, *buflength);
	if (len == *buflength)
		/* There's no null-terminator. */
		return NULL;
	return consumebytes(len + 1, buf, buflength);
}

/*
 * Rebuilds |j| from the byte stream produced by minijail_marshal(),
 * re-allocating every out-of-line object. Returns 0 on success or
 * -EINVAL on a truncated/malformed stream.
 */
int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
{
	size_t i;
	size_t count;
	int ret = -EINVAL;

	if (length < sizeof(*j))
		goto out;
	memcpy((void *)j, serialized, sizeof(*j));
	serialized += sizeof(*j);
	length -= sizeof(*j);

	/* Potentially stale pointers not used as signals.
*/ 830 j->mounts_head = NULL; 831 j->mounts_tail = NULL; 832 j->filter_prog = NULL; 833 834 if (j->user) { /* stale pointer */ 835 char *user = consumestr(&serialized, &length); 836 if (!user) 837 goto clear_pointers; 838 j->user = strdup(user); 839 if (!j->user) 840 goto clear_pointers; 841 } 842 843 if (j->suppl_gid_list) { /* stale pointer */ 844 if (j->suppl_gid_count > NGROUPS_MAX) { 845 goto bad_gid_list; 846 } 847 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 848 void *gid_list_bytes = 849 consumebytes(gid_list_size, &serialized, &length); 850 if (!gid_list_bytes) 851 goto bad_gid_list; 852 853 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 854 if (!j->suppl_gid_list) 855 goto bad_gid_list; 856 857 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 858 } 859 860 if (j->chrootdir) { /* stale pointer */ 861 char *chrootdir = consumestr(&serialized, &length); 862 if (!chrootdir) 863 goto bad_chrootdir; 864 j->chrootdir = strdup(chrootdir); 865 if (!j->chrootdir) 866 goto bad_chrootdir; 867 } 868 869 if (j->alt_syscall_table) { /* stale pointer */ 870 char *alt_syscall_table = consumestr(&serialized, &length); 871 if (!alt_syscall_table) 872 goto bad_syscall_table; 873 j->alt_syscall_table = strdup(alt_syscall_table); 874 if (!j->alt_syscall_table) 875 goto bad_syscall_table; 876 } 877 878 if (j->flags.seccomp_filter && j->filter_len > 0) { 879 size_t ninstrs = j->filter_len; 880 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 881 ninstrs > USHRT_MAX) 882 goto bad_filters; 883 884 size_t program_len = ninstrs * sizeof(struct sock_filter); 885 void *program = consumebytes(program_len, &serialized, &length); 886 if (!program) 887 goto bad_filters; 888 889 j->filter_prog = malloc(sizeof(struct sock_fprog)); 890 if (!j->filter_prog) 891 goto bad_filters; 892 893 j->filter_prog->len = ninstrs; 894 j->filter_prog->filter = malloc(program_len); 895 if (!j->filter_prog->filter) 896 goto bad_filter_prog_instrs; 897 898 
memcpy(j->filter_prog->filter, program, program_len); 899 } 900 901 count = j->mounts_count; 902 j->mounts_count = 0; 903 for (i = 0; i < count; ++i) { 904 unsigned long *flags; 905 const char *dest; 906 const char *type; 907 const char *src = consumestr(&serialized, &length); 908 if (!src) 909 goto bad_mounts; 910 dest = consumestr(&serialized, &length); 911 if (!dest) 912 goto bad_mounts; 913 type = consumestr(&serialized, &length); 914 if (!type) 915 goto bad_mounts; 916 flags = consumebytes(sizeof(*flags), &serialized, &length); 917 if (!flags) 918 goto bad_mounts; 919 if (minijail_mount(j, src, dest, type, *flags)) 920 goto bad_mounts; 921 } 922 923 count = j->cgroup_count; 924 j->cgroup_count = 0; 925 for (i = 0; i < count; ++i) { 926 char *cgroup = consumestr(&serialized, &length); 927 if (!cgroup) 928 goto bad_cgroups; 929 j->cgroups[i] = strdup(cgroup); 930 if (!j->cgroups[i]) 931 goto bad_cgroups; 932 ++j->cgroup_count; 933 } 934 935 return 0; 936 937bad_cgroups: 938 while (j->mounts_head) { 939 struct mountpoint *m = j->mounts_head; 940 j->mounts_head = j->mounts_head->next; 941 free(m->type); 942 free(m->dest); 943 free(m->src); 944 free(m); 945 } 946 for (i = 0; i < j->cgroup_count; ++i) 947 free(j->cgroups[i]); 948bad_mounts: 949 if (j->flags.seccomp_filter && j->filter_len > 0) { 950 free(j->filter_prog->filter); 951 free(j->filter_prog); 952 } 953bad_filter_prog_instrs: 954 if (j->filter_prog) 955 free(j->filter_prog); 956bad_filters: 957 if (j->alt_syscall_table) 958 free(j->alt_syscall_table); 959bad_syscall_table: 960 if (j->chrootdir) 961 free(j->chrootdir); 962bad_chrootdir: 963 if (j->suppl_gid_list) 964 free(j->suppl_gid_list); 965bad_gid_list: 966 if (j->user) 967 free(j->user); 968clear_pointers: 969 j->user = NULL; 970 j->suppl_gid_list = NULL; 971 j->chrootdir = NULL; 972 j->alt_syscall_table = NULL; 973 j->cgroup_count = 0; 974out: 975 return ret; 976} 977 978static void write_ugid_mappings(const struct minijail *j) 979{ 980 int fd, ret, 
len; 981 size_t sz; 982 char fname[32]; 983 984 sz = sizeof(fname); 985 if (j->uidmap) { 986 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid); 987 if (ret < 0 || (size_t)ret >= sz) 988 die("failed to write file name of uid_map"); 989 fd = open(fname, O_WRONLY | O_CLOEXEC); 990 if (fd < 0) 991 pdie("failed to open '%s'", fname); 992 len = strlen(j->uidmap); 993 if (write(fd, j->uidmap, len) < len) 994 die("failed to set uid_map"); 995 close(fd); 996 } 997 if (j->gidmap) { 998 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid); 999 if (ret < 0 || (size_t)ret >= sz) 1000 die("failed to write file name of gid_map"); 1001 fd = open(fname, O_WRONLY | O_CLOEXEC); 1002 if (fd < 0) 1003 pdie("failed to open '%s'", fname); 1004 len = strlen(j->gidmap); 1005 if (write(fd, j->gidmap, len) < len) 1006 die("failed to set gid_map"); 1007 close(fd); 1008 } 1009} 1010 1011static void parent_setup_complete(int *pipe_fds) 1012{ 1013 close(pipe_fds[0]); 1014 close(pipe_fds[1]); 1015} 1016 1017/* 1018 * wait_for_parent_setup: Called by the child process to wait for any 1019 * further parent-side setup to complete before continuing. 1020 */ 1021static void wait_for_parent_setup(int *pipe_fds) 1022{ 1023 char buf; 1024 1025 close(pipe_fds[1]); 1026 1027 /* Wait for parent to complete setup and close the pipe. */ 1028 if (read(pipe_fds[0], &buf, 1) != 0) 1029 die("failed to sync with parent"); 1030 close(pipe_fds[0]); 1031} 1032 1033static void enter_user_namespace(const struct minijail *j) 1034{ 1035 if (j->uidmap && setresuid(0, 0, 0)) 1036 pdie("setresuid"); 1037 if (j->gidmap && setresgid(0, 0, 0)) 1038 pdie("setresgid"); 1039} 1040 1041/* 1042 * mount_one: Applies mounts from @m for @j, recursing as needed. 1043 * @j Minijail these mounts are for 1044 * @m Head of list of mounts 1045 * 1046 * Returns 0 for success. 
1047 */ 1048static int mount_one(const struct minijail *j, struct mountpoint *m) 1049{ 1050 int ret; 1051 char *dest; 1052 int remount_ro = 0; 1053 1054 /* |dest| has a leading "/". */ 1055 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 1056 return -ENOMEM; 1057 1058 /* 1059 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1060 * can't both be specified in the original bind mount. 1061 * Remount R/O after the initial mount. 1062 */ 1063 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1064 remount_ro = 1; 1065 m->flags &= ~MS_RDONLY; 1066 } 1067 1068 ret = mount(m->src, dest, m->type, m->flags, NULL); 1069 if (ret) 1070 pdie("mount: %s -> %s", m->src, dest); 1071 1072 if (remount_ro) { 1073 m->flags |= MS_RDONLY; 1074 ret = mount(m->src, dest, NULL, 1075 m->flags | MS_REMOUNT, NULL); 1076 if (ret) 1077 pdie("bind ro: %s -> %s", m->src, dest); 1078 } 1079 1080 free(dest); 1081 if (m->next) 1082 return mount_one(j, m->next); 1083 return ret; 1084} 1085 1086int enter_chroot(const struct minijail *j) 1087{ 1088 int ret; 1089 1090 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1091 return ret; 1092 1093 if (chroot(j->chrootdir)) 1094 return -errno; 1095 1096 if (chdir("/")) 1097 return -errno; 1098 1099 return 0; 1100} 1101 1102int enter_pivot_root(const struct minijail *j) 1103{ 1104 int ret, oldroot, newroot; 1105 1106 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1107 return ret; 1108 1109 /* 1110 * Keep the fd for both old and new root. 1111 * It will be used in fchdir(2) later. 1112 */ 1113 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1114 if (oldroot < 0) 1115 pdie("failed to open / for fchdir"); 1116 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1117 if (newroot < 0) 1118 pdie("failed to open %s for fchdir", j->chrootdir); 1119 1120 /* 1121 * To ensure j->chrootdir is the root of a filesystem, 1122 * do a self bind mount. 
1123 */ 1124 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1125 pdie("failed to bind mount '%s'", j->chrootdir); 1126 if (chdir(j->chrootdir)) 1127 return -errno; 1128 if (syscall(SYS_pivot_root, ".", ".")) 1129 pdie("pivot_root"); 1130 1131 /* 1132 * Now the old root is mounted on top of the new root. Use fchdir(2) to 1133 * change to the old root and unmount it. 1134 */ 1135 if (fchdir(oldroot)) 1136 pdie("failed to fchdir to old /"); 1137 1138 /* 1139 * If j->flags.skip_remount_private was enabled for minijail_enter(), there 1140 * could be a shared mount point under |oldroot|. In that case, mounts 1141 * under this shared mount point will be unmounted below, and this 1142 * unmounting will propagate to the original mount namespace (because the 1143 * mount point is shared). To prevent this unexpected unmounting, remove 1144 * these mounts from their peer groups by recursively remounting them as 1145 * MS_PRIVATE. 1146 */ 1147 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL)) 1148 pdie("failed to mount(/, private) before umount(/)"); 1149 /* The old root might be busy, so use lazy unmount. */ 1150 if (umount2(".", MNT_DETACH)) 1151 pdie("umount(/)"); 1152 /* Change back to the new root. */ 1153 if (fchdir(newroot)) 1154 return -errno; 1155 if (close(oldroot)) 1156 return -errno; 1157 if (close(newroot)) 1158 return -errno; 1159 if (chroot("/")) 1160 return -errno; 1161 /* Set correct CWD for getcwd(3). 
*/ 1162 if (chdir("/")) 1163 return -errno; 1164 1165 return 0; 1166} 1167 1168int mount_tmp(void) 1169{ 1170 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1171} 1172 1173int remount_proc_readonly(const struct minijail *j) 1174{ 1175 const char *kProcPath = "/proc"; 1176 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1177 /* 1178 * Right now, we're holding a reference to our parent's old mount of 1179 * /proc in our namespace, which means using MS_REMOUNT here would 1180 * mutate our parent's mount as well, even though we're in a VFS 1181 * namespace (!). Instead, remove their mount from our namespace 1182 * and make our own. However, if we are in a new user namespace, /proc 1183 * is not seen as mounted, so don't return error if umount() fails. 1184 */ 1185 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns) 1186 return -errno; 1187 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1188 return -errno; 1189 return 0; 1190} 1191 1192static void write_pid_to_path(pid_t pid, const char *path) 1193{ 1194 FILE *fp = fopen(path, "w"); 1195 1196 if (!fp) 1197 pdie("failed to open '%s'", path); 1198 if (fprintf(fp, "%d\n", (int)pid) < 0) 1199 pdie("fprintf(%s)", path); 1200 if (fclose(fp)) 1201 pdie("fclose(%s)", path); 1202} 1203 1204static void write_pid_file(const struct minijail *j) 1205{ 1206 write_pid_to_path(j->initpid, j->pid_file_path); 1207} 1208 1209static void add_to_cgroups(const struct minijail *j) 1210{ 1211 size_t i; 1212 1213 for (i = 0; i < j->cgroup_count; ++i) 1214 write_pid_to_path(j->initpid, j->cgroups[i]); 1215} 1216 1217void drop_ugid(const struct minijail *j) 1218{ 1219 if (j->flags.usergroups && j->flags.suppl_gids) { 1220 die("tried to inherit *and* set supplementary groups;" 1221 " can only do one"); 1222 } 1223 1224 if (j->flags.usergroups) { 1225 if (initgroups(j->user, j->usergid)) 1226 pdie("initgroups"); 1227 } else if (j->flags.suppl_gids) { 1228 if (setgroups(j->suppl_gid_count, 
j->suppl_gid_list)) { 1229 pdie("setgroups"); 1230 } 1231 } else { 1232 /* 1233 * Only attempt to clear supplementary groups if we are changing 1234 * users. 1235 */ 1236 if ((j->uid || j->gid) && setgroups(0, NULL)) 1237 pdie("setgroups"); 1238 } 1239 1240 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1241 pdie("setresgid"); 1242 1243 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1244 pdie("setresuid"); 1245} 1246 1247/* 1248 * We specifically do not use cap_valid() as that only tells us the last 1249 * valid cap we were *compiled* against (i.e. what the version of kernel 1250 * headers says). If we run on a different kernel version, then it's not 1251 * uncommon for that to be less (if an older kernel) or more (if a newer 1252 * kernel). 1253 * Normally, we suck up the answer via /proc. On Android, not all processes are 1254 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1255 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1256 */ 1257static unsigned int get_last_valid_cap() 1258{ 1259 unsigned int last_valid_cap = 0; 1260 if (is_android()) { 1261 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1262 ++last_valid_cap); 1263 1264 /* |last_valid_cap| will be the first failing value. 
*/ 1265 if (last_valid_cap > 0) { 1266 last_valid_cap--; 1267 } 1268 } else { 1269 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1270 FILE *fp = fopen(cap_file, "re"); 1271 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1272 pdie("fscanf(%s)", cap_file); 1273 fclose(fp); 1274 } 1275 return last_valid_cap; 1276} 1277 1278static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1279{ 1280 const uint64_t one = 1; 1281 unsigned int i; 1282 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1283 if (keep_mask & (one << i)) 1284 continue; 1285 if (prctl(PR_CAPBSET_DROP, i)) 1286 pdie("could not drop capability from bounding set"); 1287 } 1288} 1289 1290void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1291{ 1292 if (!j->flags.use_caps) 1293 return; 1294 1295 cap_t caps = cap_get_proc(); 1296 cap_value_t flag[1]; 1297 const uint64_t one = 1; 1298 unsigned int i; 1299 if (!caps) 1300 die("can't get process caps"); 1301 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1302 die("can't clear inheritable caps"); 1303 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1304 die("can't clear effective caps"); 1305 if (cap_clear_flag(caps, CAP_PERMITTED)) 1306 die("can't clear permitted caps"); 1307 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1308 /* Keep CAP_SETPCAP for dropping bounding set bits. 
*/ 1309 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1310 continue; 1311 flag[0] = i; 1312 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1313 die("can't add effective cap"); 1314 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1315 die("can't add permitted cap"); 1316 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1317 die("can't add inheritable cap"); 1318 } 1319 if (cap_set_proc(caps)) 1320 die("can't apply initial cleaned capset"); 1321 1322 /* 1323 * Instead of dropping bounding set first, do it here in case 1324 * the caller had a more permissive bounding set which could 1325 * have been used above to raise a capability that wasn't already 1326 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1327 */ 1328 drop_capbset(j->caps, last_valid_cap); 1329 1330 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1331 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1332 flag[0] = CAP_SETPCAP; 1333 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1334 die("can't clear effective cap"); 1335 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1336 die("can't clear permitted cap"); 1337 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1338 die("can't clear inheritable cap"); 1339 } 1340 1341 if (cap_set_proc(caps)) 1342 die("can't apply final cleaned capset"); 1343 1344 cap_free(caps); 1345} 1346 1347void set_seccomp_filter(const struct minijail *j) 1348{ 1349 /* 1350 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1351 * in the kernel source tree for an explanation of the parameters. 1352 */ 1353 if (j->flags.no_new_privs) { 1354 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1355 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1356 } 1357 1358 /* 1359 * Code running with ASan 1360 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1361 * will make system calls not included in the syscall filter policy, 1362 * which will likely crash the program. 
Skip setting seccomp filter in 1363 * that case. 1364 * 'running_with_asan()' has no inputs and is completely defined at 1365 * build time, so this cannot be used by an attacker to skip setting 1366 * seccomp filter. 1367 */ 1368 if (j->flags.seccomp_filter && running_with_asan()) { 1369 warn("running with ASan, not setting seccomp filter"); 1370 return; 1371 } 1372 1373 /* 1374 * If we're logging seccomp filter failures, 1375 * install the SIGSYS handler first. 1376 */ 1377 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1378 if (install_sigsys_handler()) 1379 pdie("install SIGSYS handler"); 1380 warn("logging seccomp filter failures"); 1381 } 1382 1383 /* 1384 * Install the syscall filter. 1385 */ 1386 if (j->flags.seccomp_filter) { 1387 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1388 j->filter_prog)) { 1389 if ((errno == EINVAL) && can_softfail()) { 1390 warn("seccomp not supported"); 1391 return; 1392 } 1393 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1394 } 1395 } 1396} 1397 1398void API minijail_enter(const struct minijail *j) 1399{ 1400 /* 1401 * If we're dropping caps, get the last valid cap from /proc now, 1402 * since /proc can be unmounted before drop_caps() is called. 1403 */ 1404 unsigned int last_valid_cap = 0; 1405 if (j->flags.capbset_drop || j->flags.use_caps) 1406 last_valid_cap = get_last_valid_cap(); 1407 1408 if (j->flags.pids) 1409 die("tried to enter a pid-namespaced jail;" 1410 " try minijail_run()?"); 1411 1412 if (j->flags.usergroups && !j->user) 1413 die("usergroup inheritance without username"); 1414 1415 /* 1416 * We can't recover from failures if we've dropped privileges partially, 1417 * so we don't even try. If any of our operations fail, we abort() the 1418 * entire process. 
1419 */ 1420 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1421 pdie("setns(CLONE_NEWNS)"); 1422 1423 if (j->flags.vfs) { 1424 if (unshare(CLONE_NEWNS)) 1425 pdie("unshare(vfs)"); 1426 /* 1427 * Unless asked not to, remount all filesystems as private. 1428 * If they are shared, new bind mounts will creep out of our 1429 * namespace. 1430 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1431 */ 1432 if (!j->flags.skip_remount_private) { 1433 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1434 pdie("mount(/, private)"); 1435 } 1436 } 1437 1438 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1439 pdie("unshare(ipc)"); 1440 } 1441 1442 if (j->flags.enter_net) { 1443 if (setns(j->netns_fd, CLONE_NEWNET)) 1444 pdie("setns(CLONE_NEWNET)"); 1445 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1446 pdie("unshare(net)"); 1447 } 1448 1449 if (j->flags.chroot && enter_chroot(j)) 1450 pdie("chroot"); 1451 1452 if (j->flags.pivot_root && enter_pivot_root(j)) 1453 pdie("pivot_root"); 1454 1455 if (j->flags.mount_tmp && mount_tmp()) 1456 pdie("mount_tmp"); 1457 1458 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1459 pdie("remount"); 1460 1461 /* 1462 * If we're only dropping capabilities from the bounding set, but not 1463 * from the thread's (permitted|inheritable|effective) sets, do it now. 1464 */ 1465 if (j->flags.capbset_drop) { 1466 drop_capbset(j->cap_bset, last_valid_cap); 1467 } 1468 1469 if (j->flags.use_caps) { 1470 /* 1471 * POSIX capabilities are a bit tricky. If we drop our 1472 * capability to change uids, our attempt to use setuid() 1473 * below will fail. Hang on to root caps across setuid(), then 1474 * lock securebits. 
1475 */ 1476 if (prctl(PR_SET_KEEPCAPS, 1)) 1477 pdie("prctl(PR_SET_KEEPCAPS)"); 1478 1479 /* 1480 * Kernels 4.3+ define a new securebit 1481 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS 1482 * and SECURE_ALL_LOCKS masks from newer kernel headers will 1483 * return EPERM on older kernels. Detect this, and retry with 1484 * the right mask for older (2.6.26-4.2) kernels. 1485 */ 1486 int securebits_ret = prctl(PR_SET_SECUREBITS, 1487 SECURE_ALL_BITS | SECURE_ALL_LOCKS); 1488 if (securebits_ret < 0) { 1489 if (errno == EPERM) { 1490 /* Possibly running on kernel < 4.3. */ 1491 securebits_ret = prctl( 1492 PR_SET_SECUREBITS, 1493 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS); 1494 } 1495 } 1496 if (securebits_ret < 0) 1497 pdie("prctl(PR_SET_SECUREBITS)"); 1498 } 1499 1500 if (j->flags.no_new_privs) { 1501 /* 1502 * If we're setting no_new_privs, we can drop privileges 1503 * before setting seccomp filter. This way filter policies 1504 * don't need to allow privilege-dropping syscalls. 1505 */ 1506 drop_ugid(j); 1507 drop_caps(j, last_valid_cap); 1508 set_seccomp_filter(j); 1509 } else { 1510 /* 1511 * If we're not setting no_new_privs, 1512 * we need to set seccomp filter *before* dropping privileges. 1513 * WARNING: this means that filter policies *must* allow 1514 * setgroups()/setresgid()/setresuid() for dropping root and 1515 * capget()/capset()/prctl() for dropping caps. 1516 */ 1517 set_seccomp_filter(j); 1518 drop_ugid(j); 1519 drop_caps(j, last_valid_cap); 1520 } 1521 1522 /* 1523 * Select the specified alternate syscall table. The table must not 1524 * block prctl(2) if we're using seccomp as well. 
1525 */ 1526 if (j->flags.alt_syscall) { 1527 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1528 pdie("prctl(PR_ALT_SYSCALL)"); 1529 } 1530 1531 /* 1532 * seccomp has to come last since it cuts off all the other 1533 * privilege-dropping syscalls :) 1534 */ 1535 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1536 if ((errno == EINVAL) && can_softfail()) { 1537 warn("seccomp not supported"); 1538 return; 1539 } 1540 pdie("prctl(PR_SET_SECCOMP)"); 1541 } 1542} 1543 1544/* TODO(wad) will visibility affect this variable? */ 1545static int init_exitstatus = 0; 1546 1547void init_term(int __attribute__ ((unused)) sig) 1548{ 1549 _exit(init_exitstatus); 1550} 1551 1552int init(pid_t rootpid) 1553{ 1554 pid_t pid; 1555 int status; 1556 /* so that we exit with the right status */ 1557 signal(SIGTERM, init_term); 1558 /* TODO(wad) self jail with seccomp_filters here. */ 1559 while ((pid = wait(&status)) > 0) { 1560 /* 1561 * This loop will only end when either there are no processes 1562 * left inside our pid namespace or we get a signal. 
1563 */ 1564 if (pid == rootpid) 1565 init_exitstatus = status; 1566 } 1567 if (!WIFEXITED(init_exitstatus)) 1568 _exit(MINIJAIL_ERR_INIT); 1569 _exit(WEXITSTATUS(init_exitstatus)); 1570} 1571 1572int API minijail_from_fd(int fd, struct minijail *j) 1573{ 1574 size_t sz = 0; 1575 size_t bytes = read(fd, &sz, sizeof(sz)); 1576 char *buf; 1577 int r; 1578 if (sizeof(sz) != bytes) 1579 return -EINVAL; 1580 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1581 return -E2BIG; 1582 buf = malloc(sz); 1583 if (!buf) 1584 return -ENOMEM; 1585 bytes = read(fd, buf, sz); 1586 if (bytes != sz) { 1587 free(buf); 1588 return -EINVAL; 1589 } 1590 r = minijail_unmarshal(j, buf, sz); 1591 free(buf); 1592 return r; 1593} 1594 1595int API minijail_to_fd(struct minijail *j, int fd) 1596{ 1597 char *buf; 1598 size_t sz = minijail_size(j); 1599 ssize_t written; 1600 int r; 1601 1602 if (!sz) 1603 return -EINVAL; 1604 buf = malloc(sz); 1605 r = minijail_marshal(j, buf, sz); 1606 if (r) { 1607 free(buf); 1608 return r; 1609 } 1610 /* Sends [size][minijail]. */ 1611 written = write(fd, &sz, sizeof(sz)); 1612 if (written != sizeof(sz)) { 1613 free(buf); 1614 return -EFAULT; 1615 } 1616 written = write(fd, buf, sz); 1617 if (written < 0 || (size_t) written != sz) { 1618 free(buf); 1619 return -EFAULT; 1620 } 1621 free(buf); 1622 return 0; 1623} 1624 1625int setup_preload(void) 1626{ 1627#if defined(__ANDROID__) 1628 /* Don't use LDPRELOAD on Brillo. */ 1629 return 0; 1630#else 1631 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1632 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1633 if (!newenv) 1634 return -ENOMEM; 1635 1636 /* Only insert a separating space if we have something to separate... */ 1637 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1638 PRELOADPATH); 1639 1640 /* setenv() makes a copy of the string we give it. 
*/ 1641 setenv(kLdPreloadEnvVar, newenv, 1); 1642 free(newenv); 1643 return 0; 1644#endif 1645} 1646 1647int setup_pipe(int fds[2]) 1648{ 1649 int r = pipe(fds); 1650 char fd_buf[11]; 1651 if (r) 1652 return r; 1653 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1654 if (r <= 0) 1655 return -EINVAL; 1656 setenv(kFdEnvVar, fd_buf, 1); 1657 return 0; 1658} 1659 1660int setup_pipe_end(int fds[2], size_t index) 1661{ 1662 if (index > 1) 1663 return -1; 1664 1665 close(fds[1 - index]); 1666 return fds[index]; 1667} 1668 1669int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1670{ 1671 if (index > 1) 1672 return -1; 1673 1674 close(fds[1 - index]); 1675 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1676 return dup2(fds[index], fd); 1677} 1678 1679int minijail_run_internal(struct minijail *j, const char *filename, 1680 char *const argv[], pid_t *pchild_pid, 1681 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1682 int use_preload); 1683 1684int API minijail_run(struct minijail *j, const char *filename, 1685 char *const argv[]) 1686{ 1687 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1688 true); 1689} 1690 1691int API minijail_run_pid(struct minijail *j, const char *filename, 1692 char *const argv[], pid_t *pchild_pid) 1693{ 1694 return minijail_run_internal(j, filename, argv, pchild_pid, 1695 NULL, NULL, NULL, true); 1696} 1697 1698int API minijail_run_pipe(struct minijail *j, const char *filename, 1699 char *const argv[], int *pstdin_fd) 1700{ 1701 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1702 NULL, NULL, true); 1703} 1704 1705int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1706 char *const argv[], pid_t *pchild_pid, 1707 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1708{ 1709 return minijail_run_internal(j, filename, argv, pchild_pid, 1710 pstdin_fd, pstdout_fd, pstderr_fd, true); 1711} 1712 1713int API minijail_run_no_preload(struct minijail *j, const char 
*filename, 1714 char *const argv[]) 1715{ 1716 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1717 false); 1718} 1719 1720int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1721 const char *filename, 1722 char *const argv[], 1723 pid_t *pchild_pid, 1724 int *pstdin_fd, int *pstdout_fd, 1725 int *pstderr_fd) 1726{ 1727 return minijail_run_internal(j, filename, argv, pchild_pid, 1728 pstdin_fd, pstdout_fd, pstderr_fd, false); 1729} 1730 1731int minijail_run_internal(struct minijail *j, const char *filename, 1732 char *const argv[], pid_t *pchild_pid, 1733 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1734 int use_preload) 1735{ 1736 char *oldenv, *oldenv_copy = NULL; 1737 pid_t child_pid; 1738 int pipe_fds[2]; 1739 int stdin_fds[2]; 1740 int stdout_fds[2]; 1741 int stderr_fds[2]; 1742 int child_sync_pipe_fds[2]; 1743 int sync_child = 0; 1744 int ret; 1745 /* We need to remember this across the minijail_preexec() call. */ 1746 int pid_namespace = j->flags.pids; 1747 int do_init = j->flags.do_init; 1748 1749 if (use_preload) { 1750 oldenv = getenv(kLdPreloadEnvVar); 1751 if (oldenv) { 1752 oldenv_copy = strdup(oldenv); 1753 if (!oldenv_copy) 1754 return -ENOMEM; 1755 } 1756 1757 if (setup_preload()) 1758 return -EFAULT; 1759 } 1760 1761 if (!use_preload) { 1762 if (j->flags.use_caps) 1763 die("capabilities are not supported without " 1764 "LD_PRELOAD"); 1765 } 1766 1767 /* 1768 * Make the process group ID of this process equal to its PID, so that 1769 * both the Minijail process and the jailed process can be killed 1770 * together. 1771 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1772 * the process is already a process group leader. 
1773 */ 1774 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1775 if (errno != EPERM) { 1776 pdie("setpgid(0, 0)"); 1777 } 1778 } 1779 1780 if (use_preload) { 1781 /* 1782 * Before we fork(2) and execve(2) the child process, we need 1783 * to open a pipe(2) to send the minijail configuration over. 1784 */ 1785 if (setup_pipe(pipe_fds)) 1786 return -EFAULT; 1787 } 1788 1789 /* 1790 * If we want to write to the child process' standard input, 1791 * create the pipe(2) now. 1792 */ 1793 if (pstdin_fd) { 1794 if (pipe(stdin_fds)) 1795 return -EFAULT; 1796 } 1797 1798 /* 1799 * If we want to read from the child process' standard output, 1800 * create the pipe(2) now. 1801 */ 1802 if (pstdout_fd) { 1803 if (pipe(stdout_fds)) 1804 return -EFAULT; 1805 } 1806 1807 /* 1808 * If we want to read from the child process' standard error, 1809 * create the pipe(2) now. 1810 */ 1811 if (pstderr_fd) { 1812 if (pipe(stderr_fds)) 1813 return -EFAULT; 1814 } 1815 1816 /* 1817 * If we want to set up a new uid/gid mapping in the user namespace, 1818 * or if we need to add the child process to cgroups, create the pipe(2) 1819 * to sync between parent and child. 1820 */ 1821 if (j->flags.userns || j->flags.cgroups) { 1822 sync_child = 1; 1823 if (pipe(child_sync_pipe_fds)) 1824 return -EFAULT; 1825 } 1826 1827 /* 1828 * Use sys_clone() if and only if we're creating a pid namespace. 1829 * 1830 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1831 * 1832 * In multithreaded programs, there are a bunch of locks inside libc, 1833 * some of which may be held by other threads at the time that we call 1834 * minijail_run_pid(). 
If we call fork(), glibc does its level best to 1835 * ensure that we hold all of these locks before it calls clone() 1836 * internally and drop them after clone() returns, but when we call 1837 * sys_clone(2) directly, all that gets bypassed and we end up with a 1838 * child address space where some of libc's important locks are held by 1839 * other threads (which did not get cloned, and hence will never release 1840 * those locks). This is okay so long as we call exec() immediately 1841 * after, but a bunch of seemingly-innocent libc functions like setenv() 1842 * take locks. 1843 * 1844 * Hence, only call sys_clone() if we need to, in order to get at pid 1845 * namespacing. If we follow this path, the child's address space might 1846 * have broken locks; you may only call functions that do not acquire 1847 * any locks. 1848 * 1849 * Unfortunately, fork() acquires every lock it can get its hands on, as 1850 * previously detailed, so this function is highly likely to deadlock 1851 * later on (see "deadlock here") if we're multithreaded. 1852 * 1853 * We might hack around this by having the clone()d child (init of the 1854 * pid namespace) return directly, rather than leaving the clone()d 1855 * process hanging around to be init for the new namespace (and having 1856 * its fork()ed child return in turn), but that process would be 1857 * crippled with its libc locks potentially broken. We might try 1858 * fork()ing in the parent before we clone() to ensure that we own all 1859 * the locks, but then we have to have the forked child hanging around 1860 * consuming resources (and possibly having file descriptors / shared 1861 * memory regions / etc attached). We'd need to keep the child around to 1862 * avoid having its children get reparented to init. 1863 * 1864 * TODO(ellyjones): figure out if the "forked child hanging around" 1865 * problem is fixable or not. It would be nice if we worked in this 1866 * case. 
1867 */ 1868 if (pid_namespace) { 1869 int clone_flags = CLONE_NEWPID | SIGCHLD; 1870 if (j->flags.userns) 1871 clone_flags |= CLONE_NEWUSER; 1872 child_pid = syscall(SYS_clone, clone_flags, NULL); 1873 } else { 1874 child_pid = fork(); 1875 } 1876 1877 if (child_pid < 0) { 1878 if (use_preload) { 1879 free(oldenv_copy); 1880 } 1881 die("failed to fork child"); 1882 } 1883 1884 if (child_pid) { 1885 if (use_preload) { 1886 /* Restore parent's LD_PRELOAD. */ 1887 if (oldenv_copy) { 1888 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1889 free(oldenv_copy); 1890 } else { 1891 unsetenv(kLdPreloadEnvVar); 1892 } 1893 unsetenv(kFdEnvVar); 1894 } 1895 1896 j->initpid = child_pid; 1897 1898 if (j->flags.pid_file) 1899 write_pid_file(j); 1900 1901 if (j->flags.cgroups) 1902 add_to_cgroups(j); 1903 1904 if (j->flags.userns) 1905 write_ugid_mappings(j); 1906 1907 if (sync_child) 1908 parent_setup_complete(child_sync_pipe_fds); 1909 1910 if (use_preload) { 1911 /* Send marshalled minijail. */ 1912 close(pipe_fds[0]); /* read endpoint */ 1913 ret = minijail_to_fd(j, pipe_fds[1]); 1914 close(pipe_fds[1]); /* write endpoint */ 1915 if (ret) { 1916 kill(j->initpid, SIGKILL); 1917 die("failed to send marshalled minijail"); 1918 } 1919 } 1920 1921 if (pchild_pid) 1922 *pchild_pid = child_pid; 1923 1924 /* 1925 * If we want to write to the child process' standard input, 1926 * set up the write end of the pipe. 1927 */ 1928 if (pstdin_fd) 1929 *pstdin_fd = setup_pipe_end(stdin_fds, 1930 1 /* write end */); 1931 1932 /* 1933 * If we want to read from the child process' standard output, 1934 * set up the read end of the pipe. 1935 */ 1936 if (pstdout_fd) 1937 *pstdout_fd = setup_pipe_end(stdout_fds, 1938 0 /* read end */); 1939 1940 /* 1941 * If we want to read from the child process' standard error, 1942 * set up the read end of the pipe. 
1943 */ 1944 if (pstderr_fd) 1945 *pstderr_fd = setup_pipe_end(stderr_fds, 1946 0 /* read end */); 1947 1948 return 0; 1949 } 1950 free(oldenv_copy); 1951 1952 if (j->flags.reset_signal_mask) { 1953 sigset_t signal_mask; 1954 if (sigemptyset(&signal_mask) != 0) 1955 pdie("sigemptyset failed"); 1956 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1957 pdie("sigprocmask failed"); 1958 } 1959 1960 if (sync_child) 1961 wait_for_parent_setup(child_sync_pipe_fds); 1962 1963 if (j->flags.userns) 1964 enter_user_namespace(j); 1965 1966 /* 1967 * If we want to write to the jailed process' standard input, 1968 * set up the read end of the pipe. 1969 */ 1970 if (pstdin_fd) { 1971 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1972 STDIN_FILENO) < 0) 1973 die("failed to set up stdin pipe"); 1974 } 1975 1976 /* 1977 * If we want to read from the jailed process' standard output, 1978 * set up the write end of the pipe. 1979 */ 1980 if (pstdout_fd) { 1981 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1982 STDOUT_FILENO) < 0) 1983 die("failed to set up stdout pipe"); 1984 } 1985 1986 /* 1987 * If we want to read from the jailed process' standard error, 1988 * set up the write end of the pipe. 1989 */ 1990 if (pstderr_fd) { 1991 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1992 STDERR_FILENO) < 0) 1993 die("failed to set up stderr pipe"); 1994 } 1995 1996 /* If running an init program, let it decide when/how to mount /proc. */ 1997 if (pid_namespace && !do_init) 1998 j->flags.remount_proc_ro = 0; 1999 2000 if (use_preload) { 2001 /* Strip out flags that cannot be inherited across execve(2). */ 2002 minijail_preexec(j); 2003 } else { 2004 j->flags.pids = 0; 2005 } 2006 /* Jail this process, then execve() the target. */ 2007 minijail_enter(j); 2008 2009 if (pid_namespace && do_init) { 2010 /* 2011 * pid namespace: this process will become init inside the new 2012 * namespace. 
We don't want all programs we might exec to have 2013 * to know how to be init. Normally (do_init == 1) we fork off 2014 * a child to actually run the program. If |do_init == 0|, we 2015 * let the program keep pid 1 and be init. 2016 * 2017 * If we're multithreaded, we'll probably deadlock here. See 2018 * WARNING above. 2019 */ 2020 child_pid = fork(); 2021 if (child_pid < 0) 2022 _exit(child_pid); 2023 else if (child_pid > 0) 2024 init(child_pid); /* never returns */ 2025 } 2026 2027 /* 2028 * If we aren't pid-namespaced, or the jailed program asked to be init: 2029 * calling process 2030 * -> execve()-ing process 2031 * If we are: 2032 * calling process 2033 * -> init()-ing process 2034 * -> execve()-ing process 2035 */ 2036 _exit(execve(filename, argv, environ)); 2037} 2038 2039int API minijail_kill(struct minijail *j) 2040{ 2041 int st; 2042 if (kill(j->initpid, SIGTERM)) 2043 return -errno; 2044 if (waitpid(j->initpid, &st, 0) < 0) 2045 return -errno; 2046 return st; 2047} 2048 2049int API minijail_wait(struct minijail *j) 2050{ 2051 int st; 2052 if (waitpid(j->initpid, &st, 0) < 0) 2053 return -errno; 2054 2055 if (!WIFEXITED(st)) { 2056 int error_status = st; 2057 if (WIFSIGNALED(st)) { 2058 int signum = WTERMSIG(st); 2059 warn("child process %d received signal %d", 2060 j->initpid, signum); 2061 /* 2062 * We return MINIJAIL_ERR_JAIL if the process received 2063 * SIGSYS, which happens when a syscall is blocked by 2064 * seccomp filters. 2065 * If not, we do what bash(1) does: 2066 * $? 
= 128 + signum 2067 */ 2068 if (signum == SIGSYS) { 2069 error_status = MINIJAIL_ERR_JAIL; 2070 } else { 2071 error_status = 128 + signum; 2072 } 2073 } 2074 return error_status; 2075 } 2076 2077 int exit_status = WEXITSTATUS(st); 2078 if (exit_status != 0) 2079 info("child process %d exited with status %d", 2080 j->initpid, exit_status); 2081 2082 return exit_status; 2083} 2084 2085void API minijail_destroy(struct minijail *j) 2086{ 2087 size_t i; 2088 2089 if (j->flags.seccomp_filter && j->filter_prog) { 2090 free(j->filter_prog->filter); 2091 free(j->filter_prog); 2092 } 2093 while (j->mounts_head) { 2094 struct mountpoint *m = j->mounts_head; 2095 j->mounts_head = j->mounts_head->next; 2096 free(m->type); 2097 free(m->dest); 2098 free(m->src); 2099 free(m); 2100 } 2101 j->mounts_tail = NULL; 2102 if (j->user) 2103 free(j->user); 2104 if (j->suppl_gid_list) 2105 free(j->suppl_gid_list); 2106 if (j->chrootdir) 2107 free(j->chrootdir); 2108 if (j->alt_syscall_table) 2109 free(j->alt_syscall_table); 2110 for (i = 0; i < j->cgroup_count; ++i) 2111 free(j->cgroups[i]); 2112 free(j); 2113} 2114