/* libminijail.c revision 6537a568125667e8db44a0af38fd04fc8fd07ef7 */
1/* 2 * Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7#define _BSD_SOURCE 8#define _GNU_SOURCE 9 10#include <asm/unistd.h> 11#include <ctype.h> 12#include <errno.h> 13#include <grp.h> 14#include <inttypes.h> 15#include <limits.h> 16#include <linux/capability.h> 17#include <linux/securebits.h> 18#include <pwd.h> 19#include <sched.h> 20#include <signal.h> 21#include <stdarg.h> 22#include <stddef.h> 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include <syscall.h> 27#include <sys/capability.h> 28#include <sys/mount.h> 29#include <sys/param.h> 30#include <sys/prctl.h> 31#include <sys/user.h> 32#include <sys/wait.h> 33#include <unistd.h> 34 35#include "libminijail.h" 36#include "libminijail-private.h" 37 38#include "signal.h" 39#include "syscall_filter.h" 40#include "util.h" 41 42/* Until these are reliably available in linux/prctl.h */ 43#ifndef PR_SET_SECCOMP 44# define PR_SET_SECCOMP 22 45#endif 46 47/* For seccomp_filter using BPF. */ 48#ifndef PR_SET_NO_NEW_PRIVS 49# define PR_SET_NO_NEW_PRIVS 38 50#endif 51#ifndef SECCOMP_MODE_FILTER 52# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. 
*/ 53#endif 54 55struct binding { 56 char *src; 57 char *dest; 58 int writeable; 59 struct binding *next; 60}; 61 62struct minijail { 63 struct { 64 int uid:1; 65 int gid:1; 66 int caps:1; 67 int vfs:1; 68 int pids:1; 69 int seccomp:1; 70 int readonly:1; 71 int usergroups:1; 72 int ptrace:1; 73 int no_new_privs:1; 74 int seccomp_filter:1; 75 int log_seccomp_filter:1; 76 int chroot:1; 77 } flags; 78 uid_t uid; 79 gid_t gid; 80 gid_t usergid; 81 char *user; 82 uint64_t caps; 83 pid_t initpid; 84 int filter_len; 85 int binding_count; 86 char *chrootdir; 87 struct sock_fprog *filter_prog; 88 struct binding *bindings_head; 89 struct binding *bindings_tail; 90}; 91 92struct minijail API *minijail_new(void) 93{ 94 return calloc(1, sizeof(struct minijail)); 95} 96 97void API minijail_change_uid(struct minijail *j, uid_t uid) 98{ 99 if (uid == 0) 100 die("useless change to uid 0"); 101 j->uid = uid; 102 j->flags.uid = 1; 103} 104 105void API minijail_change_gid(struct minijail *j, gid_t gid) 106{ 107 if (gid == 0) 108 die("useless change to gid 0"); 109 j->gid = gid; 110 j->flags.gid = 1; 111} 112 113int API minijail_change_user(struct minijail *j, const char *user) 114{ 115 char *buf = NULL; 116 struct passwd pw; 117 struct passwd *ppw = NULL; 118 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 119 if (sz == -1) 120 sz = 65536; /* your guess is as good as mine... */ 121 122 /* 123 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 124 * the maximum needed size of the buffer, so we don't have to search. 125 */ 126 buf = malloc(sz); 127 if (!buf) 128 return -ENOMEM; 129 getpwnam_r(user, &pw, buf, sz, &ppw); 130 /* 131 * We're safe to free the buffer here. The strings inside pw point 132 * inside buf, but we don't use any of them; this leaves the pointers 133 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded. 
134 */ 135 free(buf); 136 if (!ppw) 137 return -errno; 138 minijail_change_uid(j, ppw->pw_uid); 139 j->user = strdup(user); 140 if (!j->user) 141 return -ENOMEM; 142 j->usergid = ppw->pw_gid; 143 return 0; 144} 145 146int API minijail_change_group(struct minijail *j, const char *group) 147{ 148 char *buf = NULL; 149 struct group gr; 150 struct group *pgr = NULL; 151 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); 152 if (sz == -1) 153 sz = 65536; /* and mine is as good as yours, really */ 154 155 /* 156 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return 157 * the maximum needed size of the buffer, so we don't have to search. 158 */ 159 buf = malloc(sz); 160 if (!buf) 161 return -ENOMEM; 162 getgrnam_r(group, &gr, buf, sz, &pgr); 163 /* 164 * We're safe to free the buffer here. The strings inside gr point 165 * inside buf, but we don't use any of them; this leaves the pointers 166 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded. 167 */ 168 free(buf); 169 if (!pgr) 170 return -errno; 171 minijail_change_gid(j, pgr->gr_gid); 172 return 0; 173} 174 175void API minijail_use_seccomp(struct minijail *j) 176{ 177 j->flags.seccomp = 1; 178} 179 180void API minijail_no_new_privs(struct minijail *j) 181{ 182 j->flags.no_new_privs = 1; 183} 184 185void API minijail_use_seccomp_filter(struct minijail *j) 186{ 187 j->flags.seccomp_filter = 1; 188} 189 190void API minijail_log_seccomp_filter_failures(struct minijail *j) 191{ 192 j->flags.log_seccomp_filter = 1; 193} 194 195void API minijail_use_caps(struct minijail *j, uint64_t capmask) 196{ 197 j->caps = capmask; 198 j->flags.caps = 1; 199} 200 201void API minijail_namespace_vfs(struct minijail *j) 202{ 203 j->flags.vfs = 1; 204} 205 206void API minijail_namespace_pids(struct minijail *j) 207{ 208 j->flags.vfs = 1; 209 j->flags.readonly = 1; 210 j->flags.pids = 1; 211} 212 213void API minijail_remount_readonly(struct minijail *j) 214{ 215 j->flags.vfs = 1; 216 j->flags.readonly = 1; 217} 218 
219void API minijail_inherit_usergroups(struct minijail *j) 220{ 221 j->flags.usergroups = 1; 222} 223 224void API minijail_disable_ptrace(struct minijail *j) 225{ 226 j->flags.ptrace = 1; 227} 228 229int API minijail_enter_chroot(struct minijail *j, const char *dir) { 230 if (j->chrootdir) 231 return -EINVAL; 232 j->chrootdir = strdup(dir); 233 if (!j->chrootdir) 234 return -ENOMEM; 235 j->flags.chroot = 1; 236 return 0; 237} 238 239int API minijail_bind(struct minijail *j, const char *src, const char *dest, 240 int writeable) { 241 struct binding *b; 242 243 if (*dest != '/') 244 return -EINVAL; 245 b = calloc(1, sizeof(*b)); 246 if (!b) 247 return -ENOMEM; 248 b->dest = strdup(dest); 249 if (!b->dest) 250 goto error; 251 b->src = strdup(src); 252 if (!b->src) 253 goto error; 254 b->writeable = writeable; 255 256 info("bind %s -> %s", src, dest); 257 258 /* 259 * Force vfs namespacing so the bind mounts don't leak out into the 260 * containing vfs namespace. 261 */ 262 minijail_namespace_vfs(j); 263 264 if (j->bindings_tail) 265 j->bindings_tail->next = b; 266 else 267 j->bindings_head = b; 268 j->bindings_tail = b; 269 j->binding_count++; 270 271 return 0; 272 273error: 274 free(b->src); 275 free(b->dest); 276 free(b); 277 return -ENOMEM; 278} 279 280void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 281{ 282 FILE *file = fopen(path, "r"); 283 if (!file) { 284 pdie("failed to open seccomp filter file '%s'", path); 285 } 286 287 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 288 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 289 die("failed to compile seccomp filter BPF program in '%s'", 290 path); 291 } 292 293 j->filter_len = fprog->len; 294 j->filter_prog = fprog; 295 296 fclose(file); 297} 298 299struct marshal_state { 300 size_t available; 301 size_t total; 302 char *buf; 303}; 304 305void marshal_state_init(struct marshal_state *state, 306 char *buf, size_t available) 307{ 308 state->available = 
available; 309 state->buf = buf; 310 state->total = 0; 311} 312 313void marshal_append(struct marshal_state *state, 314 char *src, size_t length) 315{ 316 size_t copy_len = MIN(state->available, length); 317 318 /* Up to |available| will be written. */ 319 if (copy_len) { 320 memcpy(state->buf, src, copy_len); 321 state->buf += copy_len; 322 state->available -= copy_len; 323 } 324 /* |total| will contain the expected length. */ 325 state->total += length; 326} 327 328void minijail_marshal_helper(struct marshal_state *state, 329 const struct minijail *j) 330{ 331 struct binding *b = NULL; 332 marshal_append(state, (char *)j, sizeof(*j)); 333 if (j->user) 334 marshal_append(state, j->user, strlen(j->user) + 1); 335 if (j->chrootdir) 336 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 337 if (j->flags.seccomp_filter && j->filter_prog) { 338 struct sock_fprog *fp = j->filter_prog; 339 marshal_append(state, (char *)fp->filter, 340 fp->len * sizeof(struct sock_filter)); 341 } 342 for (b = j->bindings_head; b; b = b->next) { 343 marshal_append(state, b->src, strlen(b->src) + 1); 344 marshal_append(state, b->dest, strlen(b->dest) + 1); 345 marshal_append(state, (char *)&b->writeable, 346 sizeof(b->writeable)); 347 } 348} 349 350size_t API minijail_size(const struct minijail *j) 351{ 352 struct marshal_state state; 353 marshal_state_init(&state, NULL, 0); 354 minijail_marshal_helper(&state, j); 355 return state.total; 356} 357 358int minijail_marshal(const struct minijail *j, char *buf, size_t available) 359{ 360 struct marshal_state state; 361 marshal_state_init(&state, buf, available); 362 minijail_marshal_helper(&state, j); 363 return (state.total > available); 364} 365 366/* consumebytes: consumes @length bytes from a buffer @buf of length @buflength 367 * @length Number of bytes to consume 368 * @buf Buffer to consume from 369 * @buflength Size of @buf 370 * 371 * Returns a pointer to the base of the bytes, or NULL for errors. 
372 */ 373void *consumebytes(size_t length, char **buf, size_t *buflength) { 374 char *p = *buf; 375 if (length > *buflength) 376 return NULL; 377 *buf += length; 378 *buflength -= length; 379 return p; 380} 381 382/* consumestr: consumes a C string from a buffer @buf of length @length 383 * @buf Buffer to consume 384 * @length Length of buffer 385 * 386 * Returns a pointer to the base of the string, or NULL for errors. 387 */ 388char *consumestr(char **buf, size_t *buflength) { 389 size_t len = strnlen(*buf, *buflength); 390 if (len == *buflength) 391 /* There's no null-terminator */ 392 return NULL; 393 return consumebytes(len + 1, buf, buflength); 394} 395 396int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 397{ 398 int i; 399 int count; 400 int ret = -EINVAL; 401 402 if (length < sizeof(*j)) 403 goto out; 404 memcpy((void *)j, serialized, sizeof(*j)); 405 serialized += sizeof(*j); 406 length -= sizeof(*j); 407 408 /* Potentially stale pointers not used as signals. 
*/ 409 j->bindings_head = NULL; 410 j->bindings_tail = NULL; 411 j->filter_prog = NULL; 412 413 if (j->user) { /* stale pointer */ 414 char *user = consumestr(&serialized, &length); 415 if (!user) 416 goto clear_pointers; 417 j->user = strdup(user); 418 if (!j->user) 419 goto clear_pointers; 420 } 421 422 if (j->chrootdir) { /* stale pointer */ 423 char *chrootdir = consumestr(&serialized, &length); 424 if (!chrootdir) 425 goto bad_chrootdir; 426 j->chrootdir = strdup(chrootdir); 427 if (!j->chrootdir) 428 goto bad_chrootdir; 429 } 430 431 if (j->flags.seccomp_filter && j->filter_len > 0) { 432 size_t ninstrs = j->filter_len; 433 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 434 ninstrs > USHRT_MAX) 435 goto bad_filters; 436 437 size_t program_len = ninstrs * sizeof(struct sock_filter); 438 void *program = consumebytes(program_len, &serialized, &length); 439 if (!program) 440 goto bad_filters; 441 442 j->filter_prog = malloc(sizeof(struct sock_fprog)); 443 j->filter_prog->len = ninstrs; 444 j->filter_prog->filter = malloc(program_len); 445 memcpy(j->filter_prog->filter, program, program_len); 446 } 447 448 count = j->binding_count; 449 j->binding_count = 0; 450 for (i = 0; i < count; ++i) { 451 int *writeable; 452 const char *dest; 453 const char *src = consumestr(&serialized, &length); 454 if (!src) 455 goto bad_bindings; 456 dest = consumestr(&serialized, &length); 457 if (!dest) 458 goto bad_bindings; 459 writeable = consumebytes(sizeof(*writeable), &serialized, &length); 460 if (!writeable) 461 goto bad_bindings; 462 if (minijail_bind(j, src, dest, *writeable)) 463 goto bad_bindings; 464 } 465 466 return 0; 467 468bad_bindings: 469 if (j->flags.seccomp_filter && j->filter_len > 0) { 470 free(j->filter_prog->filter); 471 free(j->filter_prog); 472 } 473bad_filters: 474 if (j->chrootdir) 475 free(j->chrootdir); 476bad_chrootdir: 477 if (j->user) 478 free(j->user); 479clear_pointers: 480 j->user = NULL; 481 j->chrootdir = NULL; 482out: 483 return ret; 
484} 485 486void minijail_preenter(struct minijail *j) 487{ 488 /* Strip out options which are minijail_run() only. */ 489 j->flags.vfs = 0; 490 j->flags.readonly = 0; 491 j->flags.pids = 0; 492} 493 494void minijail_preexec(struct minijail *j) 495{ 496 int vfs = j->flags.vfs; 497 int readonly = j->flags.readonly; 498 if (j->user) 499 free(j->user); 500 j->user = NULL; 501 memset(&j->flags, 0, sizeof(j->flags)); 502 /* Now restore anything we meant to keep. */ 503 j->flags.vfs = vfs; 504 j->flags.readonly = readonly; 505 /* Note, pidns will already have been used before this call. */ 506} 507 508/* bind_one: Applies bindings from @b for @j, recursing as needed. 509 * @j Minijail these bindings are for 510 * @b Head of list of bindings 511 * 512 * Returns 0 for success. 513 */ 514int bind_one(const struct minijail *j, struct binding *b) { 515 int ret = 0; 516 char *dest = NULL; 517 if (ret) 518 return ret; 519 /* dest has a leading "/" */ 520 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0) 521 return -ENOMEM; 522 ret = mount(b->src, dest, NULL, MS_BIND, NULL); 523 if (ret) 524 pdie("bind: %s -> %s", b->src, dest); 525 if (!b->writeable) { 526 ret = mount(b->src, dest, NULL, 527 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL); 528 if (ret) 529 pdie("bind ro: %s -> %s", b->src, dest); 530 } 531 free(dest); 532 if (b->next) 533 return bind_one(j, b->next); 534 return ret; 535} 536 537int enter_chroot(const struct minijail *j) { 538 int ret; 539 if (j->bindings_head && (ret = bind_one(j, j->bindings_head))) 540 return ret; 541 542 if (chroot(j->chrootdir)) 543 return -errno; 544 545 if (chdir("/")) 546 return -errno; 547 548 return 0; 549} 550 551int remount_readonly(void) 552{ 553 const char *kProcPath = "/proc"; 554 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 555 /* 556 * Right now, we're holding a reference to our parent's old mount of 557 * /proc in our namespace, which means using MS_REMOUNT here would 558 * mutate our parent's mount as well, 
even though we're in a VFS 559 * namespace (!). Instead, remove their mount from our namespace 560 * and make our own. 561 */ 562 if (umount(kProcPath)) 563 return -errno; 564 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 565 return -errno; 566 return 0; 567} 568 569void drop_ugid(const struct minijail *j) 570{ 571 if (j->flags.usergroups) { 572 if (initgroups(j->user, j->usergid)) 573 pdie("initgroups"); 574 } else { 575 /* Only attempt to clear supplemental groups if we are changing 576 * users. */ 577 if ((j->uid || j->gid) && setgroups(0, NULL)) 578 pdie("setgroups"); 579 } 580 581 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 582 pdie("setresgid"); 583 584 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 585 pdie("setresuid"); 586} 587 588void drop_caps(const struct minijail *j) 589{ 590 cap_t caps = cap_get_proc(); 591 cap_value_t raise_flag[1]; 592 unsigned int i; 593 if (!caps) 594 die("can't get process caps"); 595 if (cap_clear_flag(caps, CAP_INHERITABLE)) 596 die("can't clear inheritable caps"); 597 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 598 die("can't clear effective caps"); 599 if (cap_clear_flag(caps, CAP_PERMITTED)) 600 die("can't clear permitted caps"); 601 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 602 if (i != CAP_SETPCAP && !(j->caps & (1 << i))) 603 continue; 604 raise_flag[0] = i; 605 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET)) 606 die("can't add effective cap"); 607 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET)) 608 die("can't add permitted cap"); 609 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET)) 610 die("can't add inheritable cap"); 611 } 612 if (cap_set_proc(caps)) 613 die("can't apply cleaned capset"); 614 cap_free(caps); 615 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 616 if (j->caps & (1 << i)) 617 continue; 618 if (prctl(PR_CAPBSET_DROP, i)) 619 pdie("prctl(PR_CAPBSET_DROP)"); 620 } 621} 622 623void 
set_seccomp_filter(const struct minijail *j) 624{ 625 /* 626 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 627 * in the kernel source tree for an explanation of the parameters. 628 */ 629 if (j->flags.no_new_privs) { 630 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 631 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 632 } 633 634 /* 635 * If we're logging seccomp filter failures, 636 * install the SIGSYS handler first. 637 */ 638 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 639 if (install_sigsys_handler()) 640 pdie("install SIGSYS handler"); 641 warn("logging seccomp filter failures"); 642 } 643 644 /* 645 * Install the syscall filter. 646 */ 647 if (j->flags.seccomp_filter) { 648 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) 649 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 650 } 651} 652 653void API minijail_enter(const struct minijail *j) 654{ 655 if (j->flags.pids) 656 die("tried to enter a pid-namespaced jail;" 657 "try minijail_run()?"); 658 659 if (j->flags.usergroups && !j->user) 660 die("usergroup inheritance without username"); 661 662 /* 663 * We can't recover from failures if we've dropped privileges partially, 664 * so we don't even try. If any of our operations fail, we abort() the 665 * entire process. 666 */ 667 if (j->flags.vfs && unshare(CLONE_NEWNS)) 668 pdie("unshare"); 669 670 if (j->flags.chroot && enter_chroot(j)) 671 pdie("chroot"); 672 673 if (j->flags.readonly && remount_readonly()) 674 pdie("remount"); 675 676 if (j->flags.caps) { 677 /* 678 * POSIX capabilities are a bit tricky. If we drop our 679 * capability to change uids, our attempt to use setuid() 680 * below will fail. Hang on to root caps across setuid(), then 681 * lock securebits. 
682 */ 683 if (prctl(PR_SET_KEEPCAPS, 1)) 684 pdie("prctl(PR_SET_KEEPCAPS)"); 685 if (prctl 686 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 687 pdie("prctl(PR_SET_SECUREBITS)"); 688 } 689 690 /* 691 * If we're setting no_new_privs, we can drop privileges 692 * before setting seccomp filter. This way filter policies 693 * don't need to allow privilege-dropping syscalls. 694 */ 695 if (j->flags.no_new_privs) { 696 drop_ugid(j); 697 if (j->flags.caps) 698 drop_caps(j); 699 700 set_seccomp_filter(j); 701 } else { 702 /* 703 * If we're not setting no_new_privs, 704 * we need to set seccomp filter *before* dropping privileges. 705 * WARNING: this means that filter policies *must* allow 706 * setgroups()/setresgid()/setresuid() for dropping root and 707 * capget()/capset()/prctl() for dropping caps. 708 */ 709 set_seccomp_filter(j); 710 711 drop_ugid(j); 712 if (j->flags.caps) 713 drop_caps(j); 714 } 715 716 /* 717 * seccomp has to come last since it cuts off all the other 718 * privilege-dropping syscalls :) 719 */ 720 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) 721 pdie("prctl(PR_SET_SECCOMP)"); 722} 723 724/* TODO(wad) will visibility affect this variable? */ 725static int init_exitstatus = 0; 726 727void init_term(int __attribute__ ((unused)) sig) 728{ 729 _exit(init_exitstatus); 730} 731 732int init(pid_t rootpid) 733{ 734 pid_t pid; 735 int status; 736 /* so that we exit with the right status */ 737 signal(SIGTERM, init_term); 738 /* TODO(wad) self jail with seccomp_filters here. */ 739 while ((pid = wait(&status)) > 0) { 740 /* 741 * This loop will only end when either there are no processes 742 * left inside our pid namespace or we get a signal. 
743 */ 744 if (pid == rootpid) 745 init_exitstatus = status; 746 } 747 if (!WIFEXITED(init_exitstatus)) 748 _exit(MINIJAIL_ERR_INIT); 749 _exit(WEXITSTATUS(init_exitstatus)); 750} 751 752int API minijail_from_fd(int fd, struct minijail *j) 753{ 754 size_t sz = 0; 755 size_t bytes = read(fd, &sz, sizeof(sz)); 756 char *buf; 757 int r; 758 if (sizeof(sz) != bytes) 759 return -EINVAL; 760 if (sz > USHRT_MAX) /* Arbitrary sanity check */ 761 return -E2BIG; 762 buf = malloc(sz); 763 if (!buf) 764 return -ENOMEM; 765 bytes = read(fd, buf, sz); 766 if (bytes != sz) { 767 free(buf); 768 return -EINVAL; 769 } 770 r = minijail_unmarshal(j, buf, sz); 771 free(buf); 772 return r; 773} 774 775int API minijail_to_fd(struct minijail *j, int fd) 776{ 777 char *buf; 778 size_t sz = minijail_size(j); 779 ssize_t written; 780 int r; 781 782 if (!sz) 783 return -EINVAL; 784 buf = malloc(sz); 785 r = minijail_marshal(j, buf, sz); 786 if (r) { 787 free(buf); 788 return r; 789 } 790 /* Sends [size][minijail]. */ 791 written = write(fd, &sz, sizeof(sz)); 792 if (written != sizeof(sz)) { 793 free(buf); 794 return -EFAULT; 795 } 796 written = write(fd, buf, sz); 797 if (written < 0 || (size_t) written != sz) { 798 free(buf); 799 return -EFAULT; 800 } 801 free(buf); 802 return 0; 803} 804 805int setup_preload(void) 806{ 807 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 808 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 809 if (!newenv) 810 return -ENOMEM; 811 812 /* Only insert a separating space if we have something to separate... */ 813 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? 
" " : "", 814 PRELOADPATH); 815 816 /* setenv() makes a copy of the string we give it */ 817 setenv(kLdPreloadEnvVar, newenv, 1); 818 free(newenv); 819 return 0; 820} 821 822int setup_pipe(int fds[2]) 823{ 824 int r = pipe(fds); 825 char fd_buf[11]; 826 if (r) 827 return r; 828 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 829 if (r <= 0) 830 return -EINVAL; 831 setenv(kFdEnvVar, fd_buf, 1); 832 return 0; 833} 834 835int API minijail_run(struct minijail *j, const char *filename, 836 char *const argv[]) 837{ 838 return minijail_run_pid_pipe(j, filename, argv, NULL, NULL); 839} 840 841int API minijail_run_pid(struct minijail *j, const char *filename, 842 char *const argv[], pid_t *pchild_pid) 843{ 844 return minijail_run_pid_pipe(j, filename, argv, pchild_pid, NULL); 845} 846 847int API minijail_run_pipe(struct minijail *j, const char *filename, 848 char *const argv[], int *pstdin_fd) 849{ 850 return minijail_run_pid_pipe(j, filename, argv, NULL, pstdin_fd); 851} 852 853int API minijail_run_pid_pipe(struct minijail *j, const char *filename, 854 char *const argv[], pid_t *pchild_pid, 855 int *pstdin_fd) 856{ 857 char *oldenv, *oldenv_copy = NULL; 858 pid_t child_pid; 859 int pipe_fds[2]; 860 int stdin_fds[2]; 861 int ret; 862 /* We need to remember this across the minijail_preexec() call. */ 863 int pid_namespace = j->flags.pids; 864 865 oldenv = getenv(kLdPreloadEnvVar); 866 if (oldenv) { 867 oldenv_copy = strdup(oldenv); 868 if (!oldenv_copy) 869 return -ENOMEM; 870 } 871 872 if (setup_preload()) 873 return -EFAULT; 874 875 /* 876 * Before we fork(2) and execve(2) the child process, we need to open 877 * a pipe(2) to send the minijail configuration over. 878 */ 879 if (setup_pipe(pipe_fds)) 880 return -EFAULT; 881 882 /* 883 * If we want to write to the child process' standard input, 884 * create the pipe(2) now. 
885 */ 886 if (pstdin_fd) { 887 if (pipe(stdin_fds)) 888 return -EFAULT; 889 } 890 891 /* Use sys_clone() if and only if we're creating a pid namespace. 892 * 893 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 894 * 895 * In multithreaded programs, there are a bunch of locks inside libc, 896 * some of which may be held by other threads at the time that we call 897 * minijail_run_pid(). If we call fork(), glibc does its level best to 898 * ensure that we hold all of these locks before it calls clone() 899 * internally and drop them after clone() returns, but when we call 900 * sys_clone(2) directly, all that gets bypassed and we end up with a 901 * child address space where some of libc's important locks are held by 902 * other threads (which did not get cloned, and hence will never release 903 * those locks). This is okay so long as we call exec() immediately 904 * after, but a bunch of seemingly-innocent libc functions like setenv() 905 * take locks. 906 * 907 * Hence, only call sys_clone() if we need to, in order to get at pid 908 * namespacing. If we follow this path, the child's address space might 909 * have broken locks; you may only call functions that do not acquire 910 * any locks. 911 * 912 * Unfortunately, fork() acquires every lock it can get its hands on, as 913 * previously detailed, so this function is highly likely to deadlock 914 * later on (see "deadlock here") if we're multithreaded. 915 * 916 * We might hack around this by having the clone()d child (init of the 917 * pid namespace) return directly, rather than leaving the clone()d 918 * process hanging around to be init for the new namespace (and having 919 * its fork()ed child return in turn), but that process would be crippled 920 * with its libc locks potentially broken. 
We might try fork()ing in the 921 * parent before we clone() to ensure that we own all the locks, but 922 * then we have to have the forked child hanging around consuming 923 * resources (and possibly having file descriptors / shared memory 924 * regions / etc attached). We'd need to keep the child around to avoid 925 * having its children get reparented to init. 926 * 927 * TODO(ellyjones): figure out if the "forked child hanging around" 928 * problem is fixable or not. It would be nice if we worked in this 929 * case. 930 */ 931 if (pid_namespace) 932 child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL); 933 else 934 child_pid = fork(); 935 936 if (child_pid < 0) { 937 free(oldenv_copy); 938 return child_pid; 939 } 940 941 if (child_pid) { 942 /* Restore parent's LD_PRELOAD. */ 943 if (oldenv_copy) { 944 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 945 free(oldenv_copy); 946 } else { 947 unsetenv(kLdPreloadEnvVar); 948 } 949 unsetenv(kFdEnvVar); 950 951 j->initpid = child_pid; 952 953 /* Send marshalled minijail. */ 954 close(pipe_fds[0]); /* read endpoint */ 955 ret = minijail_to_fd(j, pipe_fds[1]); 956 close(pipe_fds[1]); /* write endpoint */ 957 if (ret) { 958 kill(j->initpid, SIGKILL); 959 die("failed to send marshalled minijail"); 960 } 961 962 if (pchild_pid) 963 *pchild_pid = child_pid; 964 965 /* 966 * If we want to write to the child process' standard input, 967 * set up the write end of the pipe. 968 */ 969 if (pstdin_fd) { 970 close(stdin_fds[0]); /* read endpoint */ 971 *pstdin_fd = stdin_fds[1]; 972 } 973 974 return 0; 975 } 976 free(oldenv_copy); 977 978 /* 979 * If we want to write to the jailed process' standard input, 980 * set up the read end of the pipe. 981 */ 982 if (pstdin_fd) { 983 close(stdin_fds[1]); /* write endpoint */ 984 /* dup2(2) the read end of the pipe into stdin. */ 985 if (dup2(stdin_fds[0], 0)) 986 die("failed to set up stdin pipe"); 987 } 988 989 /* Drop everything that cannot be inherited across execve. 
*/ 990 minijail_preexec(j); 991 /* Jail this process and its descendants... */ 992 minijail_enter(j); 993 994 if (pid_namespace) { 995 /* 996 * pid namespace: this process will become init inside the new 997 * namespace, so fork off a child to actually run the program 998 * (we don't want all programs we might exec to have to know 999 * how to be init). 1000 * 1001 * If we're multithreaded, we'll probably deadlock here. See 1002 * WARNING above. 1003 */ 1004 child_pid = fork(); 1005 if (child_pid < 0) 1006 _exit(child_pid); 1007 else if (child_pid > 0) 1008 init(child_pid); /* never returns */ 1009 } 1010 1011 /* 1012 * If we aren't pid-namespaced: 1013 * calling process 1014 * -> execve()-ing process 1015 * If we are: 1016 * calling process 1017 * -> init()-ing process 1018 * -> execve()-ing process 1019 */ 1020 _exit(execve(filename, argv, environ)); 1021} 1022 1023int API minijail_kill(struct minijail *j) 1024{ 1025 int st; 1026 if (kill(j->initpid, SIGTERM)) 1027 return -errno; 1028 if (waitpid(j->initpid, &st, 0) < 0) 1029 return -errno; 1030 return st; 1031} 1032 1033int API minijail_wait(struct minijail *j) 1034{ 1035 int st; 1036 if (waitpid(j->initpid, &st, 0) < 0) 1037 return -errno; 1038 if (!WIFEXITED(st)) { 1039 if (WIFSIGNALED(st)) 1040 warn("child process received signal %d", WTERMSIG(st)); 1041 return MINIJAIL_ERR_JAIL; 1042 } 1043 return WEXITSTATUS(st); 1044} 1045 1046void API minijail_destroy(struct minijail *j) 1047{ 1048 if (j->flags.seccomp_filter && j->filter_prog) { 1049 free(j->filter_prog->filter); 1050 free(j->filter_prog); 1051 } 1052 while (j->bindings_head) { 1053 struct binding *b = j->bindings_head; 1054 j->bindings_head = j->bindings_head->next; 1055 free(b->dest); 1056 free(b->src); 1057 free(b); 1058 } 1059 j->bindings_tail = NULL; 1060 if (j->user) 1061 free(j->user); 1062 if (j->chrootdir) 1063 free(j->chrootdir); 1064 free(j); 1065} 1066