libminijail.c revision 200299c81d043606bf1290408251c01d46c51baf
/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#define _BSD_SOURCE
#define _DEFAULT_SOURCE
#define _GNU_SOURCE

#include <asm/unistd.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/capability.h>
#include <pwd.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

#include "libminijail.h"
#include "libminijail-private.h"

#include "signal_handler.h"
#include "syscall_filter.h"
#include "syscall_wrapper.h"
#include "util.h"

#ifdef HAVE_SECUREBITS_H
# include <linux/securebits.h>
#else
/* Fallback values for toolchains without linux/securebits.h. */
# define SECURE_ALL_BITS 0x55
# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif
/* For kernels < 4.3. */
#define OLD_SECURE_ALL_BITS 0x15
#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1)

/*
 * Assert the value of SECURE_ALL_BITS at compile-time.
 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3
 * added a new securebit.
 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
 * when used on older kernels. The compile-time assert will catch this situation
 * at compile time.
 */
#ifdef __BRILLO__
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
#endif

/* Until these are reliably available in linux/prctl.h.
 */
#ifndef PR_SET_SECCOMP
# define PR_SET_SECCOMP 22
#endif

#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/* Seccomp filter related flags. */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif

#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

#ifndef SECCOMP_SET_MODE_STRICT
# define SECCOMP_SET_MODE_STRICT 0
#endif
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif

#ifndef SECCOMP_FILTER_FLAG_TSYNC
# define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
/* End seccomp filter related flags. */

/* New cgroup namespace might not be in linux-headers yet. */
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
#endif

#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */

/* One requested mount; kept in a singly-linked list on the minijail. */
struct mountpoint {
	char *src;
	char *dest;
	char *type;
	char *data;
	int has_data;
	unsigned long flags;
	struct mountpoint *next;
};

struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 */
	struct {
		int uid:1;
		int gid:1;
		int usergroups:1;
		int suppl_gids:1;
		int use_caps:1;
		int capbset_drop:1;
		int vfs:1;
		int enter_vfs:1;
		int skip_remount_private:1;
		int pids:1;
		int ipc:1;
		int net:1;
		int enter_net:1;
		int ns_cgroups:1;
		int userns:1;
		int disable_setgroups:1;
		int seccomp:1;
		int remount_proc_ro:1;
		int no_new_privs:1;
		int seccomp_filter:1;
		int seccomp_filter_tsync:1;
		int log_seccomp_filter:1;
		int chroot:1;
		int pivot_root:1;
		int mount_tmp:1;
		int do_init:1;
		int pid_file:1;
		int cgroups:1;
		int alt_syscall:1;
		int reset_signal_mask:1;
	} flags;
	uid_t uid;
	gid_t gid;
	gid_t usergid;
	char *user;
	size_t suppl_gid_count;
	gid_t *suppl_gid_list;
	uint64_t caps;
	uint64_t cap_bset;
	pid_t initpid;
	int mountns_fd;
	int netns_fd;
	char *chrootdir;
	char *pid_file_path;
	char *uidmap;
	char *gidmap;
	size_t filter_len;
	struct sock_fprog *filter_prog;
	char *alt_syscall_table;
	struct mountpoint *mounts_head;
	struct mountpoint *mounts_tail;
	size_t mounts_count;
	char *cgroups[MAX_CGROUPS];
	size_t cgroup_count;
};

/*
 * Strip out flags meant for the parent.
 * We keep things that are not inherited across execve(2) (e.g. capabilities),
 * or are easier to set after execve(2) (e.g. seccomp filters).
 */
void minijail_preenter(struct minijail *j)
{
	j->flags.vfs = 0;
	j->flags.enter_vfs = 0;
	j->flags.skip_remount_private = 0;
	j->flags.remount_proc_ro = 0;
	j->flags.pids = 0;
	j->flags.do_init = 0;
	j->flags.pid_file = 0;
	j->flags.cgroups = 0;
}

/*
 * Strip out flags meant for the child.
 * We keep things that are inherited across execve(2).
197 */ 198void minijail_preexec(struct minijail *j) 199{ 200 int vfs = j->flags.vfs; 201 int enter_vfs = j->flags.enter_vfs; 202 int skip_remount_private = j->flags.skip_remount_private; 203 int remount_proc_ro = j->flags.remount_proc_ro; 204 int userns = j->flags.userns; 205 if (j->user) 206 free(j->user); 207 j->user = NULL; 208 if (j->suppl_gid_list) 209 free(j->suppl_gid_list); 210 j->suppl_gid_list = NULL; 211 memset(&j->flags, 0, sizeof(j->flags)); 212 /* Now restore anything we meant to keep. */ 213 j->flags.vfs = vfs; 214 j->flags.enter_vfs = enter_vfs; 215 j->flags.skip_remount_private = skip_remount_private; 216 j->flags.remount_proc_ro = remount_proc_ro; 217 j->flags.userns = userns; 218 /* Note, |pids| will already have been used before this call. */ 219} 220 221/* Minijail API. */ 222 223struct minijail API *minijail_new(void) 224{ 225 return calloc(1, sizeof(struct minijail)); 226} 227 228void API minijail_change_uid(struct minijail *j, uid_t uid) 229{ 230 if (uid == 0) 231 die("useless change to uid 0"); 232 j->uid = uid; 233 j->flags.uid = 1; 234} 235 236void API minijail_change_gid(struct minijail *j, gid_t gid) 237{ 238 if (gid == 0) 239 die("useless change to gid 0"); 240 j->gid = gid; 241 j->flags.gid = 1; 242} 243 244void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 245 const gid_t *list) 246{ 247 size_t i; 248 249 if (j->flags.usergroups) 250 die("cannot inherit *and* set supplementary groups"); 251 252 if (size == 0) { 253 /* Clear supplementary groups. */ 254 j->suppl_gid_list = NULL; 255 j->suppl_gid_count = 0; 256 j->flags.suppl_gids = 1; 257 return; 258 } 259 260 /* Copy the gid_t array. 
*/ 261 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 262 if (!j->suppl_gid_list) { 263 die("failed to allocate internal supplementary group array"); 264 } 265 for (i = 0; i < size; i++) { 266 j->suppl_gid_list[i] = list[i]; 267 } 268 j->suppl_gid_count = size; 269 j->flags.suppl_gids = 1; 270} 271 272int API minijail_change_user(struct minijail *j, const char *user) 273{ 274 char *buf = NULL; 275 struct passwd pw; 276 struct passwd *ppw = NULL; 277 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 278 if (sz == -1) 279 sz = 65536; /* your guess is as good as mine... */ 280 281 /* 282 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 283 * the maximum needed size of the buffer, so we don't have to search. 284 */ 285 buf = malloc(sz); 286 if (!buf) 287 return -ENOMEM; 288 getpwnam_r(user, &pw, buf, sz, &ppw); 289 /* 290 * We're safe to free the buffer here. The strings inside |pw| point 291 * inside |buf|, but we don't use any of them; this leaves the pointers 292 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) 293 * succeeded. 294 */ 295 free(buf); 296 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */ 297 if (!ppw) 298 return -1; 299 minijail_change_uid(j, ppw->pw_uid); 300 j->user = strdup(user); 301 if (!j->user) 302 return -ENOMEM; 303 j->usergid = ppw->pw_gid; 304 return 0; 305} 306 307int API minijail_change_group(struct minijail *j, const char *group) 308{ 309 char *buf = NULL; 310 struct group gr; 311 struct group *pgr = NULL; 312 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 313 if (sz == -1) 314 sz = 65536; /* and mine is as good as yours, really */ 315 316 /* 317 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 318 * the maximum needed size of the buffer, so we don't have to search. 319 */ 320 buf = malloc(sz); 321 if (!buf) 322 return -ENOMEM; 323 getgrnam_r(group, &gr, buf, sz, &pgr); 324 /* 325 * We're safe to free the buffer here. 
The strings inside gr point
	 * inside buf, but we don't use any of them; this leaves the pointers
	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
	 */
	free(buf);
	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
	if (!pgr)
		return -1;
	minijail_change_gid(j, pgr->gr_gid);
	return 0;
}

void API minijail_use_seccomp(struct minijail *j)
{
	j->flags.seccomp = 1;
}

void API minijail_no_new_privs(struct minijail *j)
{
	j->flags.no_new_privs = 1;
}

void API minijail_use_seccomp_filter(struct minijail *j)
{
	j->flags.seccomp_filter = 1;
}

void API minijail_set_seccomp_filter_tsync(struct minijail *j)
{
	j->flags.seccomp_filter_tsync = 1;
}

void API minijail_log_seccomp_filter_failures(struct minijail *j)
{
	j->flags.log_seccomp_filter = 1;
}

void API minijail_use_caps(struct minijail *j, uint64_t capmask)
{
	/*
	 * 'minijail_use_caps' configures a runtime-capabilities-only
	 * environment, including a bounding set matching the thread's runtime
	 * (permitted|inheritable|effective) sets.
	 * Therefore, it will override any existing bounding set configurations
	 * since the latter would allow gaining extra runtime capabilities from
	 * file capabilities.
	 */
	if (j->flags.capbset_drop) {
		warn("overriding bounding set configuration");
		j->cap_bset = 0;
		j->flags.capbset_drop = 0;
	}
	j->caps = capmask;
	j->flags.use_caps = 1;
}

void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
{
	if (j->flags.use_caps) {
		/*
		 * 'minijail_use_caps' will have already configured a capability
		 * bounding set matching the (permitted|inheritable|effective)
		 * sets. Abort if the user tries to configure a separate
		 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
		 * are mutually exclusive.
		 */
		die("runtime capabilities already configured, can't drop "
		    "bounding set separately");
	}
	j->cap_bset = capmask;
	j->flags.capbset_drop = 1;
}

void API minijail_reset_signal_mask(struct minijail *j)
{
	j->flags.reset_signal_mask = 1;
}

void API minijail_namespace_vfs(struct minijail *j)
{
	j->flags.vfs = 1;
}

void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
{
	/* Aborts (pdie) on failure; the fd is kept open until entered. */
	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
	if (ns_fd < 0) {
		pdie("failed to open namespace '%s'", ns_path);
	}
	j->mountns_fd = ns_fd;
	j->flags.enter_vfs = 1;
}

void API minijail_skip_remount_private(struct minijail *j)
{
	j->flags.skip_remount_private = 1;
}

void API minijail_namespace_pids(struct minijail *j)
{
	/* A PID namespace implies a VFS namespace with /proc remounted R/O. */
	j->flags.vfs = 1;
	j->flags.remount_proc_ro = 1;
	j->flags.pids = 1;
	j->flags.do_init = 1;
}

void API minijail_namespace_ipc(struct minijail *j)
{
	j->flags.ipc = 1;
}

void API minijail_namespace_net(struct minijail *j)
{
	j->flags.net = 1;
}

void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
{
	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
	if (ns_fd < 0) {
		pdie("failed to open namespace '%s'", ns_path);
	}
	j->netns_fd = ns_fd;
	j->flags.enter_net = 1;
}

void API minijail_namespace_cgroups(struct minijail *j)
{
	j->flags.ns_cgroups = 1;
}

void API minijail_remount_proc_readonly(struct minijail *j)
{
	j->flags.vfs = 1;
	j->flags.remount_proc_ro = 1;
}

void API minijail_namespace_user(struct minijail *j)
{
	j->flags.userns = 1;
}

void API minijail_namespace_user_disable_setgroups(struct minijail *j)
{
	j->flags.disable_setgroups = 1;
}

int API minijail_uidmap(struct minijail *j, const char *uidmap)
{
	j->uidmap = strdup(uidmap);
	if (!j->uidmap)
		return
-ENOMEM;
	/* The map is passed comma-separated; the kernel wants newlines. */
	char *ch;
	for (ch = j->uidmap; *ch; ch++) {
		if (*ch == ',')
			*ch = '\n';
	}
	return 0;
}

int API minijail_gidmap(struct minijail *j, const char *gidmap)
{
	j->gidmap = strdup(gidmap);
	if (!j->gidmap)
		return -ENOMEM;
	/* The map is passed comma-separated; the kernel wants newlines. */
	char *ch;
	for (ch = j->gidmap; *ch; ch++) {
		if (*ch == ',')
			*ch = '\n';
	}
	return 0;
}

void API minijail_inherit_usergroups(struct minijail *j)
{
	j->flags.usergroups = 1;
}

void API minijail_run_as_init(struct minijail *j)
{
	/*
	 * Since the jailed program will become 'init' in the new PID namespace,
	 * Minijail does not need to fork an 'init' process.
	 */
	j->flags.do_init = 0;
}

int API minijail_enter_chroot(struct minijail *j, const char *dir)
{
	if (j->chrootdir)
		return -EINVAL;
	j->chrootdir = strdup(dir);
	if (!j->chrootdir)
		return -ENOMEM;
	j->flags.chroot = 1;
	return 0;
}

int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
{
	if (j->chrootdir)
		return -EINVAL;
	j->chrootdir = strdup(dir);
	if (!j->chrootdir)
		return -ENOMEM;
	j->flags.pivot_root = 1;
	return 0;
}

char API *minijail_get_original_path(struct minijail *j,
				     const char *path_inside_chroot)
{
	struct mountpoint *b;

	b = j->mounts_head;
	while (b) {
		/*
		 * If |path_inside_chroot| is the exact destination of a
		 * mount, then the original path is exactly the source of
		 * the mount.
		 * for example: "-b /some/path/exe,/chroot/path/exe"
		 * mount source = /some/path/exe, mount dest =
		 * /chroot/path/exe Then when getting the original path of
		 * "/chroot/path/exe", the source of that mount,
		 * "/some/path/exe" is what should be returned.
		 */
		if (!strcmp(b->dest, path_inside_chroot))
			return strdup(b->src);

		/*
		 * If |path_inside_chroot| is within the destination path of a
		 * mount, take the suffix of the chroot path relative to the
		 * mount destination path, and append it to the mount source
		 * path.
		 */
		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
			const char *relative_path =
				path_inside_chroot + strlen(b->dest);
			return path_join(b->src, relative_path);
		}
		b = b->next;
	}

	/* If there is a chroot path, append |path_inside_chroot| to that. */
	if (j->chrootdir)
		return path_join(j->chrootdir, path_inside_chroot);

	/* No chroot, so the path outside is the same as it is inside. */
	return strdup(path_inside_chroot);
}

void API minijail_mount_tmp(struct minijail *j)
{
	j->flags.mount_tmp = 1;
}

int API minijail_write_pid_file(struct minijail *j, const char *path)
{
	j->pid_file_path = strdup(path);
	if (!j->pid_file_path)
		return -ENOMEM;
	j->flags.pid_file = 1;
	return 0;
}

int API minijail_add_to_cgroup(struct minijail *j, const char *path)
{
	if (j->cgroup_count >= MAX_CGROUPS)
		return -ENOMEM;
	j->cgroups[j->cgroup_count] = strdup(path);
	if (!j->cgroups[j->cgroup_count])
		return -ENOMEM;
	j->cgroup_count++;
	j->flags.cgroups = 1;
	return 0;
}

int API minijail_mount_with_data(struct minijail *j, const char *src,
				 const char *dest, const char *type,
				 unsigned long flags, const char *data)
{
	struct mountpoint *m;

	/* Mount destinations must be absolute paths inside the jail. */
	if (*dest != '/')
		return -EINVAL;
	m = calloc(1, sizeof(*m));
	if (!m)
		return -ENOMEM;
	m->dest = strdup(dest);
	if (!m->dest)
		goto error;
	m->src = strdup(src);
	if (!m->src)
		goto error;
	m->type = strdup(type);
	if (!m->type)
		goto error;
	if (data) {
		m->data = strdup(data);
		if (!m->data)
			goto error;
		m->has_data = 1;
	}
628 m->flags = flags; 629 630 info("mount %s -> %s type '%s'", src, dest, type); 631 632 /* 633 * Force vfs namespacing so the mounts don't leak out into the 634 * containing vfs namespace. 635 */ 636 minijail_namespace_vfs(j); 637 638 if (j->mounts_tail) 639 j->mounts_tail->next = m; 640 else 641 j->mounts_head = m; 642 j->mounts_tail = m; 643 j->mounts_count++; 644 645 return 0; 646 647error: 648 free(m->type); 649 free(m->src); 650 free(m->dest); 651 free(m); 652 return -ENOMEM; 653} 654 655int API minijail_mount(struct minijail *j, const char *src, const char *dest, 656 const char *type, unsigned long flags) 657{ 658 return minijail_mount_with_data(j, src, dest, type, flags, NULL); 659} 660 661int API minijail_bind(struct minijail *j, const char *src, const char *dest, 662 int writeable) 663{ 664 unsigned long flags = MS_BIND; 665 666 if (!writeable) 667 flags |= MS_RDONLY; 668 669 return minijail_mount(j, src, dest, "", flags); 670} 671 672static void clear_seccomp_options(struct minijail *j) 673{ 674 j->flags.seccomp_filter = 0; 675 j->flags.seccomp_filter_tsync = 0; 676 j->flags.log_seccomp_filter = 0; 677 j->filter_len = 0; 678 j->filter_prog = NULL; 679 j->flags.no_new_privs = 0; 680} 681 682static int seccomp_should_parse_filters(struct minijail *j) 683{ 684 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) { 685 /* 686 * |errno| will be set to EINVAL when seccomp has not been 687 * compiled into the kernel. On certain platforms and kernel 688 * versions this is not a fatal failure. In that case, and only 689 * in that case, disable seccomp and skip loading the filters. 690 */ 691 if ((errno == EINVAL) && seccomp_can_softfail()) { 692 warn("not loading seccomp filters, seccomp filter not " 693 "supported"); 694 clear_seccomp_options(j); 695 return 0; 696 } 697 /* 698 * If |errno| != EINVAL or seccomp_can_softfail() is false, 699 * we can proceed. Worst case scenario minijail_enter() will 700 * abort() if seccomp fails. 
701 */ 702 } 703 if (j->flags.seccomp_filter_tsync) { 704 /* Are the seccomp(2) syscall and the TSYNC option supported? */ 705 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 706 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) { 707 int saved_errno = errno; 708 if (seccomp_can_softfail()) { 709 if (saved_errno == ENOSYS) { 710 warn( 711 "seccomp(2) syscall not supported"); 712 clear_seccomp_options(j); 713 } 714 if (saved_errno == EINVAL) { 715 warn("seccomp filter thread sync not " 716 "supported"); 717 clear_seccomp_options(j); 718 } 719 return 0; 720 } 721 /* 722 * Similar logic here. If seccomp_can_softfail() is 723 * false, or |errno| != ENOSYS, or |errno| != EINVAL, 724 * we can proceed. Worst case scenario minijail_enter() 725 * will abort() if seccomp or TSYNC fail. 726 */ 727 } 728 } 729 return 1; 730} 731 732static int parse_seccomp_filters(struct minijail *j, FILE *policy_file) 733{ 734 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 735 if (compile_filter(policy_file, fprog, j->flags.log_seccomp_filter)) { 736 free(fprog); 737 return -1; 738 } 739 740 j->filter_len = fprog->len; 741 j->filter_prog = fprog; 742 return 0; 743} 744 745void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 746{ 747 if (!seccomp_should_parse_filters(j)) 748 return; 749 750 FILE *file = fopen(path, "r"); 751 if (!file) { 752 pdie("failed to open seccomp filter file '%s'", path); 753 } 754 755 if (parse_seccomp_filters(j, file) != 0) { 756 die("failed to compile seccomp filter BPF program in '%s'", 757 path); 758 } 759 fclose(file); 760} 761 762void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd) 763{ 764 if (!seccomp_should_parse_filters(j)) 765 return; 766 767 FILE *file = fdopen(fd, "r"); 768 if (!file) { 769 pdie("failed to associate stream with fd %d", fd); 770 } 771 772 if (parse_seccomp_filters(j, file) != 0) { 773 die("failed to compile seccomp filter BPF program from fd %d", 774 fd); 775 } 776 fclose(file); 777} 778 779int 
API minijail_use_alt_syscall(struct minijail *j, const char *table)
{
	j->alt_syscall_table = strdup(table);
	if (!j->alt_syscall_table)
		return -ENOMEM;
	j->flags.alt_syscall = 1;
	return 0;
}

/* Running totals while serializing a minijail into a flat byte buffer. */
struct marshal_state {
	size_t available;
	size_t total;
	char *buf;
};

void marshal_state_init(struct marshal_state *state, char *buf,
			size_t available)
{
	state->available = available;
	state->buf = buf;
	state->total = 0;
}

/*
 * Appends |length| bytes from |src| to the marshal buffer, truncating at
 * |available|; |total| always advances so callers can size a second pass.
 */
void marshal_append(struct marshal_state *state, void *src, size_t length)
{
	size_t copy_len = MIN(state->available, length);

	/* Up to |available| will be written. */
	if (copy_len) {
		memcpy(state->buf, src, copy_len);
		state->buf += copy_len;
		state->available -= copy_len;
	}
	/* |total| will contain the expected length. */
	state->total += length;
}

/* Serializes one mountpoint; order must match minijail_unmarshal(). */
void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
{
	marshal_append(state, m->src, strlen(m->src) + 1);
	marshal_append(state, m->dest, strlen(m->dest) + 1);
	marshal_append(state, m->type, strlen(m->type) + 1);
	marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
	if (m->has_data)
		marshal_append(state, m->data, strlen(m->data) + 1);
	marshal_append(state, (char *)&m->flags, sizeof(m->flags));
}

/*
 * Serializes |j|: the raw struct first (its pointers act as presence flags
 * for unmarshal), then each pointed-to payload in a fixed order.
 */
void minijail_marshal_helper(struct marshal_state *state,
			     const struct minijail *j)
{
	struct mountpoint *m = NULL;
	size_t i;

	marshal_append(state, (char *)j, sizeof(*j));
	if (j->user)
		marshal_append(state, j->user, strlen(j->user) + 1);
	if (j->suppl_gid_list) {
		marshal_append(state, j->suppl_gid_list,
			       j->suppl_gid_count * sizeof(gid_t));
	}
	if (j->chrootdir)
		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
	if (j->alt_syscall_table) {
		marshal_append(state, j->alt_syscall_table,
			       strlen(j->alt_syscall_table) + 1);
	}
	if (j->flags.seccomp_filter && j->filter_prog) {
		struct sock_fprog *fp = j->filter_prog;
		marshal_append(state, (char *)fp->filter,
			       fp->len * sizeof(struct sock_filter));
	}
	for (m = j->mounts_head; m; m = m->next) {
		marshal_mount(state, m);
	}
	for (i = 0; i < j->cgroup_count; ++i)
		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
}

size_t API minijail_size(const struct minijail *j)
{
	/* Dry run with a NULL buffer to measure the serialized size. */
	struct marshal_state state;
	marshal_state_init(&state, NULL, 0);
	minijail_marshal_helper(&state, j);
	return state.total;
}

int minijail_marshal(const struct minijail *j, char *buf, size_t available)
{
	/* Returns nonzero if |available| was too small. */
	struct marshal_state state;
	marshal_state_init(&state, buf, available);
	minijail_marshal_helper(&state, j);
	return (state.total > available);
}

int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
{
	size_t i;
	size_t count;
	int ret = -EINVAL;

	if (length < sizeof(*j))
		goto out;
	memcpy((void *)j, serialized, sizeof(*j));
	serialized += sizeof(*j);
	length -= sizeof(*j);

	/* Potentially stale pointers not used as signals.
*/ 887 j->pid_file_path = NULL; 888 j->uidmap = NULL; 889 j->gidmap = NULL; 890 j->mounts_head = NULL; 891 j->mounts_tail = NULL; 892 j->filter_prog = NULL; 893 894 if (j->user) { /* stale pointer */ 895 char *user = consumestr(&serialized, &length); 896 if (!user) 897 goto clear_pointers; 898 j->user = strdup(user); 899 if (!j->user) 900 goto clear_pointers; 901 } 902 903 if (j->suppl_gid_list) { /* stale pointer */ 904 if (j->suppl_gid_count > NGROUPS_MAX) { 905 goto bad_gid_list; 906 } 907 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 908 void *gid_list_bytes = 909 consumebytes(gid_list_size, &serialized, &length); 910 if (!gid_list_bytes) 911 goto bad_gid_list; 912 913 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 914 if (!j->suppl_gid_list) 915 goto bad_gid_list; 916 917 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 918 } 919 920 if (j->chrootdir) { /* stale pointer */ 921 char *chrootdir = consumestr(&serialized, &length); 922 if (!chrootdir) 923 goto bad_chrootdir; 924 j->chrootdir = strdup(chrootdir); 925 if (!j->chrootdir) 926 goto bad_chrootdir; 927 } 928 929 if (j->alt_syscall_table) { /* stale pointer */ 930 char *alt_syscall_table = consumestr(&serialized, &length); 931 if (!alt_syscall_table) 932 goto bad_syscall_table; 933 j->alt_syscall_table = strdup(alt_syscall_table); 934 if (!j->alt_syscall_table) 935 goto bad_syscall_table; 936 } 937 938 if (j->flags.seccomp_filter && j->filter_len > 0) { 939 size_t ninstrs = j->filter_len; 940 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 941 ninstrs > USHRT_MAX) 942 goto bad_filters; 943 944 size_t program_len = ninstrs * sizeof(struct sock_filter); 945 void *program = consumebytes(program_len, &serialized, &length); 946 if (!program) 947 goto bad_filters; 948 949 j->filter_prog = malloc(sizeof(struct sock_fprog)); 950 if (!j->filter_prog) 951 goto bad_filters; 952 953 j->filter_prog->len = ninstrs; 954 j->filter_prog->filter = malloc(program_len); 955 if 
(!j->filter_prog->filter) 956 goto bad_filter_prog_instrs; 957 958 memcpy(j->filter_prog->filter, program, program_len); 959 } 960 961 count = j->mounts_count; 962 j->mounts_count = 0; 963 for (i = 0; i < count; ++i) { 964 unsigned long *flags; 965 int *has_data; 966 const char *dest; 967 const char *type; 968 const char *data = NULL; 969 const char *src = consumestr(&serialized, &length); 970 if (!src) 971 goto bad_mounts; 972 dest = consumestr(&serialized, &length); 973 if (!dest) 974 goto bad_mounts; 975 type = consumestr(&serialized, &length); 976 if (!type) 977 goto bad_mounts; 978 has_data = consumebytes(sizeof(*has_data), &serialized, 979 &length); 980 if (!has_data) 981 goto bad_mounts; 982 if (*has_data) { 983 data = consumestr(&serialized, &length); 984 if (!data) 985 goto bad_mounts; 986 } 987 flags = consumebytes(sizeof(*flags), &serialized, &length); 988 if (!flags) 989 goto bad_mounts; 990 if (minijail_mount_with_data(j, src, dest, type, *flags, data)) 991 goto bad_mounts; 992 } 993 994 count = j->cgroup_count; 995 j->cgroup_count = 0; 996 for (i = 0; i < count; ++i) { 997 char *cgroup = consumestr(&serialized, &length); 998 if (!cgroup) 999 goto bad_cgroups; 1000 j->cgroups[i] = strdup(cgroup); 1001 if (!j->cgroups[i]) 1002 goto bad_cgroups; 1003 ++j->cgroup_count; 1004 } 1005 1006 return 0; 1007 1008bad_cgroups: 1009 while (j->mounts_head) { 1010 struct mountpoint *m = j->mounts_head; 1011 j->mounts_head = j->mounts_head->next; 1012 free(m->data); 1013 free(m->type); 1014 free(m->dest); 1015 free(m->src); 1016 free(m); 1017 } 1018 for (i = 0; i < j->cgroup_count; ++i) 1019 free(j->cgroups[i]); 1020bad_mounts: 1021 if (j->flags.seccomp_filter && j->filter_len > 0) { 1022 free(j->filter_prog->filter); 1023 free(j->filter_prog); 1024 } 1025bad_filter_prog_instrs: 1026 if (j->filter_prog) 1027 free(j->filter_prog); 1028bad_filters: 1029 if (j->alt_syscall_table) 1030 free(j->alt_syscall_table); 1031bad_syscall_table: 1032 if (j->chrootdir) 1033 
free(j->chrootdir); 1034bad_chrootdir: 1035 if (j->suppl_gid_list) 1036 free(j->suppl_gid_list); 1037bad_gid_list: 1038 if (j->user) 1039 free(j->user); 1040clear_pointers: 1041 j->user = NULL; 1042 j->suppl_gid_list = NULL; 1043 j->chrootdir = NULL; 1044 j->alt_syscall_table = NULL; 1045 j->cgroup_count = 0; 1046out: 1047 return ret; 1048} 1049 1050/* 1051 * setup_mount_destination: Ensures the mount target exists. 1052 * Creates it if needed and possible. 1053 */ 1054static int setup_mount_destination(const char *source, const char *dest, 1055 uid_t uid, uid_t gid) 1056{ 1057 int rc; 1058 struct stat st_buf; 1059 1060 rc = stat(dest, &st_buf); 1061 if (rc == 0) /* destination exists */ 1062 return 0; 1063 1064 /* 1065 * Try to create the destination. 1066 * Either make a directory or touch a file depending on the source type. 1067 * If the source doesn't exist, assume it is a filesystem type such as 1068 * "tmpfs" and create a directory to mount it on. 1069 */ 1070 rc = stat(source, &st_buf); 1071 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode)) { 1072 if (mkdir(dest, 0700)) 1073 return -errno; 1074 } else { 1075 int fd = open(dest, O_RDWR | O_CREAT, 0700); 1076 if (fd < 0) 1077 return -errno; 1078 close(fd); 1079 } 1080 return chown(dest, uid, gid); 1081} 1082 1083/* 1084 * mount_one: Applies mounts from @m for @j, recursing as needed. 1085 * @j Minijail these mounts are for 1086 * @m Head of list of mounts 1087 * 1088 * Returns 0 for success. 1089 */ 1090static int mount_one(const struct minijail *j, struct mountpoint *m) 1091{ 1092 int ret; 1093 char *dest; 1094 int remount_ro = 0; 1095 1096 /* |dest| has a leading "/". 
 */
	if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
		return -ENOMEM;

	if (setup_mount_destination(m->src, dest, j->uid, j->gid))
		pdie("creating mount target '%s' failed", dest);

	/*
	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
	 * can't both be specified in the original bind mount.
	 * Remount R/O after the initial mount.
	 */
	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
		remount_ro = 1;
		m->flags &= ~MS_RDONLY;
	}

	ret = mount(m->src, dest, m->type, m->flags, m->data);
	if (ret)
		pdie("mount: %s -> %s", m->src, dest);

	if (remount_ro) {
		m->flags |= MS_RDONLY;
		ret = mount(m->src, dest, NULL,
			    m->flags | MS_REMOUNT, m->data);
		if (ret)
			pdie("bind ro: %s -> %s", m->src, dest);
	}

	free(dest);
	/* Tail-recurse down the rest of the mount list. */
	if (m->next)
		return mount_one(j, m->next);
	return ret;
}

static int enter_chroot(const struct minijail *j)
{
	int ret;

	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
		return ret;

	if (chroot(j->chrootdir))
		return -errno;

	if (chdir("/"))
		return -errno;

	return 0;
}

static int enter_pivot_root(const struct minijail *j)
{
	int ret, oldroot, newroot;

	if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
		return ret;

	/*
	 * Keep the fd for both old and new root.
	 * It will be used in fchdir(2) later.
	 */
	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (oldroot < 0)
		pdie("failed to open / for fchdir");
	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (newroot < 0)
		pdie("failed to open %s for fchdir", j->chrootdir);

	/*
	 * To ensure j->chrootdir is the root of a filesystem,
	 * do a self bind mount.
	 */
	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
		pdie("failed to bind mount '%s'", j->chrootdir);
	if (chdir(j->chrootdir))
		return -errno;
	/* pivot_root(".", ".") stacks the old root on top of the new one. */
	if (syscall(SYS_pivot_root, ".", "."))
		pdie("pivot_root");

	/*
	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
	 * change to the old root and unmount it.
	 */
	if (fchdir(oldroot))
		pdie("failed to fchdir to old /");

	/*
	 * If j->flags.skip_remount_private was enabled for minijail_enter(),
	 * there could be a shared mount point under |oldroot|. In that case,
	 * mounts under this shared mount point will be unmounted below, and
	 * this unmounting will propagate to the original mount namespace
	 * (because the mount point is shared). To prevent this unexpected
	 * unmounting, remove these mounts from their peer groups by recursively
	 * remounting them as MS_PRIVATE.
	 */
	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
		pdie("failed to mount(/, private) before umount(/)");
	/* The old root might be busy, so use lazy unmount. */
	if (umount2(".", MNT_DETACH))
		pdie("umount(/)");
	/* Change back to the new root. */
	if (fchdir(newroot))
		return -errno;
	if (close(oldroot))
		return -errno;
	if (close(newroot))
		return -errno;
	if (chroot("/"))
		return -errno;
	/* Set correct CWD for getcwd(3).
*/ 1207 if (chdir("/")) 1208 return -errno; 1209 1210 return 0; 1211} 1212 1213static int mount_tmp(void) 1214{ 1215 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1216} 1217 1218static int remount_proc_readonly(const struct minijail *j) 1219{ 1220 const char *kProcPath = "/proc"; 1221 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1222 /* 1223 * Right now, we're holding a reference to our parent's old mount of 1224 * /proc in our namespace, which means using MS_REMOUNT here would 1225 * mutate our parent's mount as well, even though we're in a VFS 1226 * namespace (!). Instead, remove their mount from our namespace lazily 1227 * (MNT_DETACH) and make our own. 1228 */ 1229 if (umount2(kProcPath, MNT_DETACH)) { 1230 /* 1231 * If we are in a new user namespace, umount(2) will fail. 1232 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html 1233 */ 1234 if (j->flags.userns) { 1235 info("umount(/proc, MNT_DETACH) failed, " 1236 "this is expected when using user namespaces"); 1237 } else { 1238 return -errno; 1239 } 1240 } 1241 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1242 return -errno; 1243 return 0; 1244} 1245 1246static void kill_child_and_die(const struct minijail *j, const char *msg) 1247{ 1248 kill(j->initpid, SIGKILL); 1249 die("%s", msg); 1250} 1251 1252static void write_pid_file_or_die(const struct minijail *j) 1253{ 1254 if (write_pid_to_path(j->initpid, j->pid_file_path)) 1255 kill_child_and_die(j, "failed to write pid file"); 1256} 1257 1258static void add_to_cgroups_or_die(const struct minijail *j) 1259{ 1260 size_t i; 1261 1262 for (i = 0; i < j->cgroup_count; ++i) { 1263 if (write_pid_to_path(j->initpid, j->cgroups[i])) 1264 kill_child_and_die(j, "failed to add to cgroups"); 1265 } 1266} 1267 1268static void write_ugid_maps_or_die(const struct minijail *j) 1269{ 1270 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0) 1271 kill_child_and_die(j, "failed to write 
uid_map"); 1272 if (j->gidmap && j->flags.disable_setgroups && 1273 write_proc_file(j->initpid, "deny", "setgroups") != 0) 1274 kill_child_and_die(j, "failed to disable setgroups(2)"); 1275 if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0) 1276 kill_child_and_die(j, "failed to write gid_map"); 1277} 1278 1279static void enter_user_namespace(const struct minijail *j) 1280{ 1281 if (j->uidmap && setresuid(0, 0, 0)) 1282 pdie("user_namespaces: setresuid(0, 0, 0) failed"); 1283 if (j->gidmap && setresgid(0, 0, 0)) 1284 pdie("user_namespaces: setresgid(0, 0, 0) failed"); 1285} 1286 1287static void parent_setup_complete(int *pipe_fds) 1288{ 1289 close(pipe_fds[0]); 1290 close(pipe_fds[1]); 1291} 1292 1293/* 1294 * wait_for_parent_setup: Called by the child process to wait for any 1295 * further parent-side setup to complete before continuing. 1296 */ 1297static void wait_for_parent_setup(int *pipe_fds) 1298{ 1299 char buf; 1300 1301 close(pipe_fds[1]); 1302 1303 /* Wait for parent to complete setup and close the pipe. */ 1304 if (read(pipe_fds[0], &buf, 1) != 0) 1305 die("failed to sync with parent"); 1306 close(pipe_fds[0]); 1307} 1308 1309static void drop_ugid(const struct minijail *j) 1310{ 1311 if (j->flags.usergroups && j->flags.suppl_gids) { 1312 die("tried to inherit *and* set supplementary groups;" 1313 " can only do one"); 1314 } 1315 1316 if (j->flags.usergroups) { 1317 if (initgroups(j->user, j->usergid)) 1318 pdie("initgroups"); 1319 } else if (j->flags.suppl_gids) { 1320 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) { 1321 pdie("setgroups"); 1322 } 1323 } else { 1324 /* 1325 * Only attempt to clear supplementary groups if we are changing 1326 * users. 
1327 */ 1328 if ((j->uid || j->gid) && setgroups(0, NULL)) 1329 pdie("setgroups"); 1330 } 1331 1332 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1333 pdie("setresgid"); 1334 1335 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1336 pdie("setresuid"); 1337} 1338 1339/* 1340 * We specifically do not use cap_valid() as that only tells us the last 1341 * valid cap we were *compiled* against (i.e. what the version of kernel 1342 * headers says). If we run on a different kernel version, then it's not 1343 * uncommon for that to be less (if an older kernel) or more (if a newer 1344 * kernel). 1345 * Normally, we suck up the answer via /proc. On Android, not all processes are 1346 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1347 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1348 */ 1349static unsigned int get_last_valid_cap() 1350{ 1351 unsigned int last_valid_cap = 0; 1352 if (is_android()) { 1353 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1354 ++last_valid_cap); 1355 1356 /* |last_valid_cap| will be the first failing value. 
*/ 1357 if (last_valid_cap > 0) { 1358 last_valid_cap--; 1359 } 1360 } else { 1361 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1362 FILE *fp = fopen(cap_file, "re"); 1363 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1364 pdie("fscanf(%s)", cap_file); 1365 fclose(fp); 1366 } 1367 return last_valid_cap; 1368} 1369 1370static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1371{ 1372 const uint64_t one = 1; 1373 unsigned int i; 1374 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1375 if (keep_mask & (one << i)) 1376 continue; 1377 if (prctl(PR_CAPBSET_DROP, i)) 1378 pdie("could not drop capability from bounding set"); 1379 } 1380} 1381 1382static void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1383{ 1384 if (!j->flags.use_caps) 1385 return; 1386 1387 cap_t caps = cap_get_proc(); 1388 cap_value_t flag[1]; 1389 const uint64_t one = 1; 1390 unsigned int i; 1391 if (!caps) 1392 die("can't get process caps"); 1393 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1394 die("can't clear inheritable caps"); 1395 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1396 die("can't clear effective caps"); 1397 if (cap_clear_flag(caps, CAP_PERMITTED)) 1398 die("can't clear permitted caps"); 1399 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1400 /* Keep CAP_SETPCAP for dropping bounding set bits. 
*/ 1401 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1402 continue; 1403 flag[0] = i; 1404 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1405 die("can't add effective cap"); 1406 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1407 die("can't add permitted cap"); 1408 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1409 die("can't add inheritable cap"); 1410 } 1411 if (cap_set_proc(caps)) 1412 die("can't apply initial cleaned capset"); 1413 1414 /* 1415 * Instead of dropping bounding set first, do it here in case 1416 * the caller had a more permissive bounding set which could 1417 * have been used above to raise a capability that wasn't already 1418 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1419 */ 1420 drop_capbset(j->caps, last_valid_cap); 1421 1422 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1423 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1424 flag[0] = CAP_SETPCAP; 1425 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1426 die("can't clear effective cap"); 1427 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1428 die("can't clear permitted cap"); 1429 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1430 die("can't clear inheritable cap"); 1431 } 1432 1433 if (cap_set_proc(caps)) 1434 die("can't apply final cleaned capset"); 1435 1436 cap_free(caps); 1437} 1438 1439static void set_seccomp_filter(const struct minijail *j) 1440{ 1441 /* 1442 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1443 * in the kernel source tree for an explanation of the parameters. 1444 */ 1445 if (j->flags.no_new_privs) { 1446 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1447 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1448 } 1449 1450 /* 1451 * Code running with ASan 1452 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1453 * will make system calls not included in the syscall filter policy, 1454 * which will likely crash the program. 
Skip setting seccomp filter in 1455 * that case. 1456 * 'running_with_asan()' has no inputs and is completely defined at 1457 * build time, so this cannot be used by an attacker to skip setting 1458 * seccomp filter. 1459 */ 1460 if (j->flags.seccomp_filter && running_with_asan()) { 1461 warn("running with ASan, not setting seccomp filter"); 1462 return; 1463 } 1464 1465 /* 1466 * If we're logging seccomp filter failures, 1467 * install the SIGSYS handler first. 1468 */ 1469 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1470 if (install_sigsys_handler()) 1471 pdie("install SIGSYS handler"); 1472 warn("logging seccomp filter failures"); 1473 } 1474 1475 /* 1476 * Install the syscall filter. 1477 */ 1478 if (j->flags.seccomp_filter) { 1479 if (j->flags.seccomp_filter_tsync) { 1480 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 1481 SECCOMP_FILTER_FLAG_TSYNC, 1482 j->filter_prog)) { 1483 pdie("seccomp(tsync) failed"); 1484 } 1485 } else { 1486 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1487 j->filter_prog)) { 1488 pdie("prctl(seccomp_filter) failed"); 1489 } 1490 } 1491 } 1492} 1493 1494void API minijail_enter(const struct minijail *j) 1495{ 1496 /* 1497 * If we're dropping caps, get the last valid cap from /proc now, 1498 * since /proc can be unmounted before drop_caps() is called. 1499 */ 1500 unsigned int last_valid_cap = 0; 1501 if (j->flags.capbset_drop || j->flags.use_caps) 1502 last_valid_cap = get_last_valid_cap(); 1503 1504 if (j->flags.pids) 1505 die("tried to enter a pid-namespaced jail;" 1506 " try minijail_run()?"); 1507 1508 if (j->flags.usergroups && !j->user) 1509 die("usergroup inheritance without username"); 1510 1511 /* 1512 * We can't recover from failures if we've dropped privileges partially, 1513 * so we don't even try. If any of our operations fail, we abort() the 1514 * entire process. 
1515 */ 1516 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1517 pdie("setns(CLONE_NEWNS)"); 1518 1519 if (j->flags.vfs) { 1520 if (unshare(CLONE_NEWNS)) 1521 pdie("unshare(vfs)"); 1522 /* 1523 * Unless asked not to, remount all filesystems as private. 1524 * If they are shared, new bind mounts will creep out of our 1525 * namespace. 1526 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1527 */ 1528 if (!j->flags.skip_remount_private) { 1529 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1530 pdie("mount(/, private)"); 1531 } 1532 } 1533 1534 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1535 pdie("unshare(ipc)"); 1536 } 1537 1538 if (j->flags.enter_net) { 1539 if (setns(j->netns_fd, CLONE_NEWNET)) 1540 pdie("setns(CLONE_NEWNET)"); 1541 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1542 pdie("unshare(net)"); 1543 } 1544 1545 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP)) 1546 pdie("unshare(cgroups)"); 1547 1548 if (j->flags.chroot && enter_chroot(j)) 1549 pdie("chroot"); 1550 1551 if (j->flags.pivot_root && enter_pivot_root(j)) 1552 pdie("pivot_root"); 1553 1554 if (j->flags.mount_tmp && mount_tmp()) 1555 pdie("mount_tmp"); 1556 1557 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1558 pdie("remount"); 1559 1560 /* 1561 * If we're only dropping capabilities from the bounding set, but not 1562 * from the thread's (permitted|inheritable|effective) sets, do it now. 1563 */ 1564 if (j->flags.capbset_drop) { 1565 drop_capbset(j->cap_bset, last_valid_cap); 1566 } 1567 1568 if (j->flags.use_caps) { 1569 /* 1570 * POSIX capabilities are a bit tricky. If we drop our 1571 * capability to change uids, our attempt to use setuid() 1572 * below will fail. Hang on to root caps across setuid(), then 1573 * lock securebits. 
1574 */ 1575 if (prctl(PR_SET_KEEPCAPS, 1)) 1576 pdie("prctl(PR_SET_KEEPCAPS)"); 1577 1578 /* 1579 * Kernels 4.3+ define a new securebit 1580 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS 1581 * and SECURE_ALL_LOCKS masks from newer kernel headers will 1582 * return EPERM on older kernels. Detect this, and retry with 1583 * the right mask for older (2.6.26-4.2) kernels. 1584 */ 1585 int securebits_ret = prctl(PR_SET_SECUREBITS, 1586 SECURE_ALL_BITS | SECURE_ALL_LOCKS); 1587 if (securebits_ret < 0) { 1588 if (errno == EPERM) { 1589 /* Possibly running on kernel < 4.3. */ 1590 securebits_ret = prctl( 1591 PR_SET_SECUREBITS, 1592 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS); 1593 } 1594 } 1595 if (securebits_ret < 0) 1596 pdie("prctl(PR_SET_SECUREBITS)"); 1597 } 1598 1599 if (j->flags.no_new_privs) { 1600 /* 1601 * If we're setting no_new_privs, we can drop privileges 1602 * before setting seccomp filter. This way filter policies 1603 * don't need to allow privilege-dropping syscalls. 1604 */ 1605 drop_ugid(j); 1606 drop_caps(j, last_valid_cap); 1607 set_seccomp_filter(j); 1608 } else { 1609 /* 1610 * If we're not setting no_new_privs, 1611 * we need to set seccomp filter *before* dropping privileges. 1612 * WARNING: this means that filter policies *must* allow 1613 * setgroups()/setresgid()/setresuid() for dropping root and 1614 * capget()/capset()/prctl() for dropping caps. 1615 */ 1616 set_seccomp_filter(j); 1617 drop_ugid(j); 1618 drop_caps(j, last_valid_cap); 1619 } 1620 1621 /* 1622 * Select the specified alternate syscall table. The table must not 1623 * block prctl(2) if we're using seccomp as well. 
1624 */ 1625 if (j->flags.alt_syscall) { 1626 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1627 pdie("prctl(PR_ALT_SYSCALL)"); 1628 } 1629 1630 /* 1631 * seccomp has to come last since it cuts off all the other 1632 * privilege-dropping syscalls :) 1633 */ 1634 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1635 if ((errno == EINVAL) && seccomp_can_softfail()) { 1636 warn("seccomp not supported"); 1637 return; 1638 } 1639 pdie("prctl(PR_SET_SECCOMP)"); 1640 } 1641} 1642 1643/* TODO(wad): will visibility affect this variable? */ 1644static int init_exitstatus = 0; 1645 1646void init_term(int __attribute__ ((unused)) sig) 1647{ 1648 _exit(init_exitstatus); 1649} 1650 1651void init(pid_t rootpid) 1652{ 1653 pid_t pid; 1654 int status; 1655 /* So that we exit with the right status. */ 1656 signal(SIGTERM, init_term); 1657 /* TODO(wad): self jail with seccomp filters here. */ 1658 while ((pid = wait(&status)) > 0) { 1659 /* 1660 * This loop will only end when either there are no processes 1661 * left inside our pid namespace or we get a signal. 
1662 */ 1663 if (pid == rootpid) 1664 init_exitstatus = status; 1665 } 1666 if (!WIFEXITED(init_exitstatus)) 1667 _exit(MINIJAIL_ERR_INIT); 1668 _exit(WEXITSTATUS(init_exitstatus)); 1669} 1670 1671int API minijail_from_fd(int fd, struct minijail *j) 1672{ 1673 size_t sz = 0; 1674 size_t bytes = read(fd, &sz, sizeof(sz)); 1675 char *buf; 1676 int r; 1677 if (sizeof(sz) != bytes) 1678 return -EINVAL; 1679 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1680 return -E2BIG; 1681 buf = malloc(sz); 1682 if (!buf) 1683 return -ENOMEM; 1684 bytes = read(fd, buf, sz); 1685 if (bytes != sz) { 1686 free(buf); 1687 return -EINVAL; 1688 } 1689 r = minijail_unmarshal(j, buf, sz); 1690 free(buf); 1691 return r; 1692} 1693 1694int API minijail_to_fd(struct minijail *j, int fd) 1695{ 1696 char *buf; 1697 size_t sz = minijail_size(j); 1698 ssize_t written; 1699 int r; 1700 1701 if (!sz) 1702 return -EINVAL; 1703 buf = malloc(sz); 1704 r = minijail_marshal(j, buf, sz); 1705 if (r) { 1706 free(buf); 1707 return r; 1708 } 1709 /* Sends [size][minijail]. */ 1710 written = write(fd, &sz, sizeof(sz)); 1711 if (written != sizeof(sz)) { 1712 free(buf); 1713 return -EFAULT; 1714 } 1715 written = write(fd, buf, sz); 1716 if (written < 0 || (size_t) written != sz) { 1717 free(buf); 1718 return -EFAULT; 1719 } 1720 free(buf); 1721 return 0; 1722} 1723 1724int setup_preload(void) 1725{ 1726#if defined(__ANDROID__) 1727 /* Don't use LDPRELOAD on Brillo. */ 1728 return 0; 1729#else 1730 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1731 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1732 if (!newenv) 1733 return -ENOMEM; 1734 1735 /* Only insert a separating space if we have something to separate... */ 1736 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1737 PRELOADPATH); 1738 1739 /* setenv() makes a copy of the string we give it. 
*/ 1740 setenv(kLdPreloadEnvVar, newenv, 1); 1741 free(newenv); 1742 return 0; 1743#endif 1744} 1745 1746int setup_pipe(int fds[2]) 1747{ 1748 int r = pipe(fds); 1749 char fd_buf[11]; 1750 if (r) 1751 return r; 1752 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1753 if (r <= 0) 1754 return -EINVAL; 1755 setenv(kFdEnvVar, fd_buf, 1); 1756 return 0; 1757} 1758 1759int setup_pipe_end(int fds[2], size_t index) 1760{ 1761 if (index > 1) 1762 return -1; 1763 1764 close(fds[1 - index]); 1765 return fds[index]; 1766} 1767 1768int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1769{ 1770 if (index > 1) 1771 return -1; 1772 1773 close(fds[1 - index]); 1774 /* dup2(2) the corresponding end of the pipe into |fd|. */ 1775 return dup2(fds[index], fd); 1776} 1777 1778int minijail_run_internal(struct minijail *j, const char *filename, 1779 char *const argv[], pid_t *pchild_pid, 1780 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1781 int use_preload); 1782 1783int API minijail_run(struct minijail *j, const char *filename, 1784 char *const argv[]) 1785{ 1786 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1787 true); 1788} 1789 1790int API minijail_run_pid(struct minijail *j, const char *filename, 1791 char *const argv[], pid_t *pchild_pid) 1792{ 1793 return minijail_run_internal(j, filename, argv, pchild_pid, 1794 NULL, NULL, NULL, true); 1795} 1796 1797int API minijail_run_pipe(struct minijail *j, const char *filename, 1798 char *const argv[], int *pstdin_fd) 1799{ 1800 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1801 NULL, NULL, true); 1802} 1803 1804int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1805 char *const argv[], pid_t *pchild_pid, 1806 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1807{ 1808 return minijail_run_internal(j, filename, argv, pchild_pid, 1809 pstdin_fd, pstdout_fd, pstderr_fd, true); 1810} 1811 1812int API minijail_run_no_preload(struct minijail *j, const char 
*filename, 1813 char *const argv[]) 1814{ 1815 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1816 false); 1817} 1818 1819int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1820 const char *filename, 1821 char *const argv[], 1822 pid_t *pchild_pid, 1823 int *pstdin_fd, int *pstdout_fd, 1824 int *pstderr_fd) 1825{ 1826 return minijail_run_internal(j, filename, argv, pchild_pid, 1827 pstdin_fd, pstdout_fd, pstderr_fd, false); 1828} 1829 1830int minijail_run_internal(struct minijail *j, const char *filename, 1831 char *const argv[], pid_t *pchild_pid, 1832 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1833 int use_preload) 1834{ 1835 char *oldenv, *oldenv_copy = NULL; 1836 pid_t child_pid; 1837 int pipe_fds[2]; 1838 int stdin_fds[2]; 1839 int stdout_fds[2]; 1840 int stderr_fds[2]; 1841 int child_sync_pipe_fds[2]; 1842 int sync_child = 0; 1843 int ret; 1844 /* We need to remember this across the minijail_preexec() call. */ 1845 int pid_namespace = j->flags.pids; 1846 int do_init = j->flags.do_init; 1847 1848 if (use_preload) { 1849 oldenv = getenv(kLdPreloadEnvVar); 1850 if (oldenv) { 1851 oldenv_copy = strdup(oldenv); 1852 if (!oldenv_copy) 1853 return -ENOMEM; 1854 } 1855 1856 if (setup_preload()) 1857 return -EFAULT; 1858 } 1859 1860 if (!use_preload) { 1861 if (j->flags.use_caps && j->caps != 0) 1862 die("non-empty capabilities are not supported without LD_PRELOAD"); 1863 } 1864 1865 /* 1866 * Make the process group ID of this process equal to its PID, so that 1867 * both the Minijail process and the jailed process can be killed 1868 * together. 1869 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1870 * the process is already a process group leader. 
1871 */ 1872 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1873 if (errno != EPERM) { 1874 pdie("setpgid(0, 0)"); 1875 } 1876 } 1877 1878 if (use_preload) { 1879 /* 1880 * Before we fork(2) and execve(2) the child process, we need 1881 * to open a pipe(2) to send the minijail configuration over. 1882 */ 1883 if (setup_pipe(pipe_fds)) 1884 return -EFAULT; 1885 } 1886 1887 /* 1888 * If we want to write to the child process' standard input, 1889 * create the pipe(2) now. 1890 */ 1891 if (pstdin_fd) { 1892 if (pipe(stdin_fds)) 1893 return -EFAULT; 1894 } 1895 1896 /* 1897 * If we want to read from the child process' standard output, 1898 * create the pipe(2) now. 1899 */ 1900 if (pstdout_fd) { 1901 if (pipe(stdout_fds)) 1902 return -EFAULT; 1903 } 1904 1905 /* 1906 * If we want to read from the child process' standard error, 1907 * create the pipe(2) now. 1908 */ 1909 if (pstderr_fd) { 1910 if (pipe(stderr_fds)) 1911 return -EFAULT; 1912 } 1913 1914 /* 1915 * If we want to set up a new uid/gid map in the user namespace, 1916 * or if we need to add the child process to cgroups, create the pipe(2) 1917 * to sync between parent and child. 1918 */ 1919 if (j->flags.userns || j->flags.cgroups) { 1920 sync_child = 1; 1921 if (pipe(child_sync_pipe_fds)) 1922 return -EFAULT; 1923 } 1924 1925 /* 1926 * Use sys_clone() if and only if we're creating a pid namespace. 1927 * 1928 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1929 * 1930 * In multithreaded programs, there are a bunch of locks inside libc, 1931 * some of which may be held by other threads at the time that we call 1932 * minijail_run_pid(). 
If we call fork(), glibc does its level best to 1933 * ensure that we hold all of these locks before it calls clone() 1934 * internally and drop them after clone() returns, but when we call 1935 * sys_clone(2) directly, all that gets bypassed and we end up with a 1936 * child address space where some of libc's important locks are held by 1937 * other threads (which did not get cloned, and hence will never release 1938 * those locks). This is okay so long as we call exec() immediately 1939 * after, but a bunch of seemingly-innocent libc functions like setenv() 1940 * take locks. 1941 * 1942 * Hence, only call sys_clone() if we need to, in order to get at pid 1943 * namespacing. If we follow this path, the child's address space might 1944 * have broken locks; you may only call functions that do not acquire 1945 * any locks. 1946 * 1947 * Unfortunately, fork() acquires every lock it can get its hands on, as 1948 * previously detailed, so this function is highly likely to deadlock 1949 * later on (see "deadlock here") if we're multithreaded. 1950 * 1951 * We might hack around this by having the clone()d child (init of the 1952 * pid namespace) return directly, rather than leaving the clone()d 1953 * process hanging around to be init for the new namespace (and having 1954 * its fork()ed child return in turn), but that process would be 1955 * crippled with its libc locks potentially broken. We might try 1956 * fork()ing in the parent before we clone() to ensure that we own all 1957 * the locks, but then we have to have the forked child hanging around 1958 * consuming resources (and possibly having file descriptors / shared 1959 * memory regions / etc attached). We'd need to keep the child around to 1960 * avoid having its children get reparented to init. 1961 * 1962 * TODO(ellyjones): figure out if the "forked child hanging around" 1963 * problem is fixable or not. It would be nice if we worked in this 1964 * case. 
1965 */ 1966 if (pid_namespace) { 1967 int clone_flags = CLONE_NEWPID | SIGCHLD; 1968 if (j->flags.userns) 1969 clone_flags |= CLONE_NEWUSER; 1970 child_pid = syscall(SYS_clone, clone_flags, NULL); 1971 } else { 1972 child_pid = fork(); 1973 } 1974 1975 if (child_pid < 0) { 1976 if (use_preload) { 1977 free(oldenv_copy); 1978 } 1979 die("failed to fork child"); 1980 } 1981 1982 if (child_pid) { 1983 if (use_preload) { 1984 /* Restore parent's LD_PRELOAD. */ 1985 if (oldenv_copy) { 1986 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1987 free(oldenv_copy); 1988 } else { 1989 unsetenv(kLdPreloadEnvVar); 1990 } 1991 unsetenv(kFdEnvVar); 1992 } 1993 1994 j->initpid = child_pid; 1995 1996 if (j->flags.pid_file) 1997 write_pid_file_or_die(j); 1998 1999 if (j->flags.cgroups) 2000 add_to_cgroups_or_die(j); 2001 2002 if (j->flags.userns) 2003 write_ugid_maps_or_die(j); 2004 2005 if (sync_child) 2006 parent_setup_complete(child_sync_pipe_fds); 2007 2008 if (use_preload) { 2009 /* Send marshalled minijail. */ 2010 close(pipe_fds[0]); /* read endpoint */ 2011 ret = minijail_to_fd(j, pipe_fds[1]); 2012 close(pipe_fds[1]); /* write endpoint */ 2013 if (ret) { 2014 kill(j->initpid, SIGKILL); 2015 die("failed to send marshalled minijail"); 2016 } 2017 } 2018 2019 if (pchild_pid) 2020 *pchild_pid = child_pid; 2021 2022 /* 2023 * If we want to write to the child process' standard input, 2024 * set up the write end of the pipe. 2025 */ 2026 if (pstdin_fd) 2027 *pstdin_fd = setup_pipe_end(stdin_fds, 2028 1 /* write end */); 2029 2030 /* 2031 * If we want to read from the child process' standard output, 2032 * set up the read end of the pipe. 2033 */ 2034 if (pstdout_fd) 2035 *pstdout_fd = setup_pipe_end(stdout_fds, 2036 0 /* read end */); 2037 2038 /* 2039 * If we want to read from the child process' standard error, 2040 * set up the read end of the pipe. 
2041 */ 2042 if (pstderr_fd) 2043 *pstderr_fd = setup_pipe_end(stderr_fds, 2044 0 /* read end */); 2045 2046 return 0; 2047 } 2048 /* Child process. */ 2049 free(oldenv_copy); 2050 2051 if (j->flags.reset_signal_mask) { 2052 sigset_t signal_mask; 2053 if (sigemptyset(&signal_mask) != 0) 2054 pdie("sigemptyset failed"); 2055 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 2056 pdie("sigprocmask failed"); 2057 } 2058 2059 if (sync_child) 2060 wait_for_parent_setup(child_sync_pipe_fds); 2061 2062 if (j->flags.userns) 2063 enter_user_namespace(j); 2064 2065 /* 2066 * If we want to write to the jailed process' standard input, 2067 * set up the read end of the pipe. 2068 */ 2069 if (pstdin_fd) { 2070 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 2071 STDIN_FILENO) < 0) 2072 die("failed to set up stdin pipe"); 2073 } 2074 2075 /* 2076 * If we want to read from the jailed process' standard output, 2077 * set up the write end of the pipe. 2078 */ 2079 if (pstdout_fd) { 2080 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 2081 STDOUT_FILENO) < 0) 2082 die("failed to set up stdout pipe"); 2083 } 2084 2085 /* 2086 * If we want to read from the jailed process' standard error, 2087 * set up the write end of the pipe. 2088 */ 2089 if (pstderr_fd) { 2090 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 2091 STDERR_FILENO) < 0) 2092 die("failed to set up stderr pipe"); 2093 } 2094 2095 /* If running an init program, let it decide when/how to mount /proc. */ 2096 if (pid_namespace && !do_init) 2097 j->flags.remount_proc_ro = 0; 2098 2099 if (use_preload) { 2100 /* Strip out flags that cannot be inherited across execve(2). */ 2101 minijail_preexec(j); 2102 } else { 2103 /* 2104 * If not using LD_PRELOAD, do all jailing before execve(2). 2105 * Note that PID namespaces can only be entered on fork(2), 2106 * so that flag is still cleared. 2107 */ 2108 j->flags.pids = 0; 2109 } 2110 /* Jail this process, then execve(2) the target. 
*/ 2111 minijail_enter(j); 2112 2113 if (pid_namespace && do_init) { 2114 /* 2115 * pid namespace: this process will become init inside the new 2116 * namespace. We don't want all programs we might exec to have 2117 * to know how to be init. Normally (do_init == 1) we fork off 2118 * a child to actually run the program. If |do_init == 0|, we 2119 * let the program keep pid 1 and be init. 2120 * 2121 * If we're multithreaded, we'll probably deadlock here. See 2122 * WARNING above. 2123 */ 2124 child_pid = fork(); 2125 if (child_pid < 0) { 2126 _exit(child_pid); 2127 } else if (child_pid > 0) { 2128 /* 2129 * Best effort. Don't bother checking the return value. 2130 */ 2131 prctl(PR_SET_NAME, "minijail-init"); 2132 init(child_pid); /* Never returns. */ 2133 } 2134 } 2135 2136 /* 2137 * If we aren't pid-namespaced, or the jailed program asked to be init: 2138 * calling process 2139 * -> execve()-ing process 2140 * If we are: 2141 * calling process 2142 * -> init()-ing process 2143 * -> execve()-ing process 2144 */ 2145 ret = execve(filename, argv, environ); 2146 if (ret == -1) { 2147 pwarn("execve(%s) failed", filename); 2148 } 2149 _exit(ret); 2150} 2151 2152int API minijail_kill(struct minijail *j) 2153{ 2154 int st; 2155 if (kill(j->initpid, SIGTERM)) 2156 return -errno; 2157 if (waitpid(j->initpid, &st, 0) < 0) 2158 return -errno; 2159 return st; 2160} 2161 2162int API minijail_wait(struct minijail *j) 2163{ 2164 int st; 2165 if (waitpid(j->initpid, &st, 0) < 0) 2166 return -errno; 2167 2168 if (!WIFEXITED(st)) { 2169 int error_status = st; 2170 if (WIFSIGNALED(st)) { 2171 int signum = WTERMSIG(st); 2172 warn("child process %d received signal %d", 2173 j->initpid, signum); 2174 /* 2175 * We return MINIJAIL_ERR_JAIL if the process received 2176 * SIGSYS, which happens when a syscall is blocked by 2177 * seccomp filters. 2178 * If not, we do what bash(1) does: 2179 * $? 
= 128 + signum 2180 */ 2181 if (signum == SIGSYS) { 2182 error_status = MINIJAIL_ERR_JAIL; 2183 } else { 2184 error_status = 128 + signum; 2185 } 2186 } 2187 return error_status; 2188 } 2189 2190 int exit_status = WEXITSTATUS(st); 2191 if (exit_status != 0) 2192 info("child process %d exited with status %d", 2193 j->initpid, exit_status); 2194 2195 return exit_status; 2196} 2197 2198void API minijail_destroy(struct minijail *j) 2199{ 2200 size_t i; 2201 2202 if (j->flags.seccomp_filter && j->filter_prog) { 2203 free(j->filter_prog->filter); 2204 free(j->filter_prog); 2205 } 2206 while (j->mounts_head) { 2207 struct mountpoint *m = j->mounts_head; 2208 j->mounts_head = j->mounts_head->next; 2209 free(m->data); 2210 free(m->type); 2211 free(m->dest); 2212 free(m->src); 2213 free(m); 2214 } 2215 j->mounts_tail = NULL; 2216 if (j->user) 2217 free(j->user); 2218 if (j->suppl_gid_list) 2219 free(j->suppl_gid_list); 2220 if (j->chrootdir) 2221 free(j->chrootdir); 2222 if (j->pid_file_path) 2223 free(j->pid_file_path); 2224 if (j->uidmap) 2225 free(j->uidmap); 2226 if (j->gidmap) 2227 free(j->gidmap); 2228 if (j->alt_syscall_table) 2229 free(j->alt_syscall_table); 2230 for (i = 0; i < j->cgroup_count; ++i) 2231 free(j->cgroups[i]); 2232 free(j); 2233} 2234