libminijail.c revision cd7a9046e61e243fca916a286e49d58e2331eaa7
1/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. */ 4 5#define _BSD_SOURCE 6#define _GNU_SOURCE 7#include <errno.h> 8#include <grp.h> 9#include <inttypes.h> 10#include <linux/capability.h> 11#include <linux/securebits.h> 12#include <pwd.h> 13#include <sched.h> 14#include <signal.h> 15#include <stdio.h> 16#include <stdlib.h> 17#include <string.h> 18#include <syscall.h> 19#include <sys/capability.h> 20#include <sys/mount.h> 21#include <sys/prctl.h> 22#include <sys/wait.h> 23#include <syslog.h> 24#include <unistd.h> 25 26#include "libminijail.h" 27#include "libminijail-private.h" 28 29struct minijail { 30 struct { 31 int uid : 1; 32 int gid : 1; 33 int caps : 1; 34 int vfs : 1; 35 int pids : 1; 36 int seccomp : 1; 37 int readonly : 1; 38 int usergroups : 1; 39 int ptrace : 1; 40 } flags; 41 uid_t uid; 42 gid_t gid; 43 gid_t usergid; 44 const char *user; 45 uint64_t caps; 46 pid_t initpid; 47}; 48 49static void pdie(const char *failed) { 50 syslog(LOG_ERR, "libminijail: %s failed: %s", failed, strerror(errno)); 51 abort(); 52} 53 54static void die(const char *failed) { 55 syslog(LOG_ERR, "libminijail: %s", failed); 56 abort(); 57} 58 59struct minijail *minijail_new(void) { 60 struct minijail *j = malloc(sizeof(*j)); 61 if (j) 62 memset(j, 0, sizeof(*j)); 63 return j; 64} 65 66void minijail_change_uid(struct minijail *j, uid_t uid) { 67 if (uid == 0) 68 die("useless change to uid 0"); 69 j->uid = uid; 70 j->flags.uid = 1; 71} 72 73void minijail_change_gid(struct minijail *j, gid_t gid) { 74 if (gid == 0) 75 die("useless change to gid 0"); 76 j->gid = gid; 77 j->flags.gid = 1; 78} 79 80int minijail_change_user(struct minijail *j, const char *user) { 81 /* In principle this should use getpwnam(), but: 82 * 1) getpwnam_r() isn't actually reentrant anyway, since it uses a 83 * statically-allocated file descriptor internally 84 * 2) fgetpwnam() (by analogy with fgetpwent) would solve (1) except that it 85 * doesn't exist 86 * 3) sysconf() (see getpwnam_r(3)) is allowed to return a size that is not 87 * large enough, which means having to loop on growing the buffer we pass 88 * in 89 */ 90 struct passwd *pw = getpwnam(user); 91 if (!pw) 92 return errno; 93 minijail_change_uid(j, pw->pw_uid); 94 j->user = user; 95 j->usergid = pw->pw_gid; 96 return 0; 97} 98 99int minijail_change_group(struct minijail *j, const char *group) { 100 /* In principle this should use getgrnam(), but: 101 * 1) getgrnam_r() isn't actually reentrant anyway, since it uses a 102 * statically-allocated file descriptor internally 103 * 2) fgetgrnam() (by analogy with fgetgrent) would solve (1) except that it 104 * doesn't exist 105 * 3) sysconf() (see getgrnam_r(3)) is allowed to return a size that is not 106 * large enough, which means having to loop on growing the buffer we pass 107 * in 108 */ 109 struct group *gr = getgrnam(group); 110 if (!gr) 111 return errno; 112 minijail_change_gid(j, gr->gr_gid); 113 return 0; 114} 115 116void minijail_use_seccomp(struct minijail *j) { 117 j->flags.seccomp = 1; 118} 119 120void minijail_use_caps(struct minijail *j, uint64_t capmask) { 121 j->caps = capmask; 122 j->flags.caps = 1; 123} 124 125void minijail_namespace_vfs(struct minijail *j) { 126 j->flags.vfs = 1; 127} 128 129void minijail_namespace_pids(struct minijail *j) { 130 j->flags.pids = 1; 131} 132 133void minijail_remount_readonly(struct minijail *j) { 134 j->flags.vfs = 1; 135 j->flags.readonly = 1; 136} 137 138void minijail_inherit_usergroups(struct minijail *j) { 139 j->flags.usergroups = 1; 140} 141 142void minijail_disable_ptrace(struct minijail *j) { 143 j->flags.ptrace = 1; 144} 145 146static int remount_readonly(void) { 147 const char *kProcPath = "/proc"; 148 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 149 /* Right now, we're holding a reference to our parent's old mount of /proc in 150 * our namespace, which means using MS_REMOUNT here would mutate our parent's 151 * mount as well, even though we're in a VFS namespace (!). Instead, remove 152 * their mount from our namespace and make our own. */ 153 if (umount(kProcPath)) 154 return errno; 155 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 156 return errno; 157 return 0; 158} 159 160static void drop_caps(const struct minijail *j) { 161 cap_t caps = cap_get_proc(); 162 cap_value_t raise_flag[1]; 163 unsigned int i; 164 if (!caps) 165 die("can't get process caps"); 166 if (cap_clear_flag(caps, CAP_INHERITABLE)) 167 die("can't clear inheritable caps"); 168 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 169 die("can't clear effective caps"); 170 if (cap_clear_flag(caps, CAP_PERMITTED)) 171 die("can't clear permitted caps"); 172 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 173 if (i != CAP_SETPCAP && !(j->caps & (1 << i))) 174 continue; 175 raise_flag[0] = i; 176 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET)) 177 die("can't add effective cap"); 178 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET)) 179 die("can't add permitted cap"); 180 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET)) 181 die("can't add inheritable cap"); 182 } 183 if (cap_set_proc(caps)) 184 die("can't apply cleaned capset"); 185 cap_free(caps); 186 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) { 187 if (j->caps & (1 << i)) 188 continue; 189 if (prctl(PR_CAPBSET_DROP, i)) 190 pdie("prctl(PR_CAPBSET_DROP)"); 191 } 192} 193 194void minijail_enter(const struct minijail *j) { 195 if (j->flags.pids) 196 die("tried to enter a pid-namespaced jail; try minijail_run()?"); 197 198 if (j->flags.usergroups && !j->user) 199 die("usergroup inheritance without username"); 200 201 /* We can't recover from failures if we've dropped privileges partially, 202 * so we don't even try. If any of our operations fail, we abort() the 203 * entire process. */ 204 if (j->flags.vfs && unshare(CLONE_NEWNS)) 205 pdie("unshare"); 206 207 if (j->flags.readonly && remount_readonly()) 208 pdie("remount"); 209 210 if (j->flags.caps) { 211 /* POSIX capabilities are a bit tricky. If we drop our capability to change 212 * uids, our attempt to use setuid() below will fail. Hang on to root caps 213 * across setuid(), then lock securebits. */ 214 if (prctl(PR_SET_KEEPCAPS, 1)) 215 pdie("prctl(PR_SET_KEEPCAPS)"); 216 if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 217 pdie("prctl(PR_SET_SECUREBITS)"); 218 } 219 220 if (j->flags.usergroups && initgroups(j->user, j->usergid)) 221 pdie("initgroups"); 222 else if (!j->flags.usergroups && setgroups(0, NULL)) 223 pdie("setgroups"); 224 225 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 226 pdie("setresgid"); 227 228 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 229 pdie("setresuid"); 230 231 if (j->flags.caps) 232 drop_caps(j); 233 234 /* seccomp has to come last since it cuts off all the other 235 * privilege-dropping syscalls :) */ 236 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) 237 pdie("prctl(PR_SET_SECCOMP)"); 238} 239 240static int init_exitstatus = 0; 241 242static void init_term(int __attribute__((unused)) sig) { 243 _exit(init_exitstatus); 244} 245 246static int init(pid_t rootpid) { 247 pid_t pid; 248 int status; 249 signal(SIGTERM, init_term); /* so that we exit with the right status */ 250 while ((pid = wait(&status)) > 0) { 251 /* This loop will only end when either there are no processes left inside 252 * our pid namespace or we get a signal. */ 253 if (pid == rootpid) 254 init_exitstatus = status; 255 } 256 if (!WIFEXITED(init_exitstatus)) 257 _exit(MINIJAIL_ERR_INIT); 258 _exit(WEXITSTATUS(init_exitstatus)); 259} 260 261/** @brief Move any commands that need to be done post-exec into an environment 262 * variable 263 * @param j Jail to move commands from. 264 * 265 * Serializes post-exec() commands into a string, removes them from the jail, 266 * and adds them to the environment; they will be deserialized later (see 267 * __minijail_preloaded) and executed inside the execve()'d process. 268 */ 269static int move_commands_to_env(struct minijail *j) { 270 const int kEnvBufSize = 256; 271 const char *ptrace = j->flags.ptrace ? "ptrace " : ""; 272 const char *seccomp = j->flags.seccomp ? "seccomp " : ""; 273 char setuid[64] = ""; 274 char caps[32] = ""; 275 char *newenv; 276 char *oldenv; 277 char *envbuf = malloc(kEnvBufSize); 278 int r; 279 280 if (!envbuf) 281 return -ENOMEM; 282 283 if (j->flags.caps) 284 snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps); 285 286 if (j->flags.uid && j->flags.caps) { 287 snprintf(setuid, sizeof(setuid), "uid=%d ", j->uid); 288 j->flags.uid = 0; 289 } 290 291 j->flags.caps = 0; 292 j->flags.ptrace = 0; 293 j->flags.seccomp = 0; 294 295 r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps); 296 if (!r) { 297 /* No commands generated, so no preload needed :) */ 298 free(envbuf); 299 return 0; 300 } 301 if (r == kEnvBufSize) { 302 free(envbuf); 303 return -E2BIG; 304 } 305 306 oldenv = getenv("LD_PRELOAD") ? : ""; 307 newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 308 if (!newenv) { 309 free(envbuf); 310 return -ENOMEM; 311 } 312 313 /* Only insert a separating space if we have something to separate... */ 314 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH); 315 316 /* setenv() makes a copy of the string we give it */ 317 setenv("LD_PRELOAD", newenv, 1); 318 setenv(kCommandEnvVar, envbuf, 1); 319 free(newenv); 320 free(envbuf); 321 return 0; 322} 323 324int minijail_run(struct minijail *j, const char *filename, char *const argv[]) { 325 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0; 326 pid_t r; 327 r = move_commands_to_env(j); 328 if (r) 329 return r; 330 331 r = syscall(SYS_clone, pidns | SIGCHLD, NULL); 332 if (r > 0) { 333 j->initpid = r; 334 return 0; 335 } 336 if (r < 0) 337 return r; 338 339 j->flags.pids = 0; 340 341 /* Jail this process and its descendants... */ 342 minijail_enter(j); 343 344 if (pidns) { 345 /* pid namespace: this process will become init inside the new namespace, so 346 * fork off a child to actually run the program (we don't want all programs 347 * we might exec to have to know how to be init). */ 348 r = fork(); 349 if (r < 0) 350 _exit(r); 351 else if (r > 0) 352 init(r); /* never returns */ 353 } 354 355 /* If we aren't pid-namespaced: 356 * calling process 357 * -> execve()-ing process 358 * If we are: 359 * calling process 360 * -> init()-ing process 361 * -> execve()-ing process 362 */ 363 _exit(execve(filename, argv, environ)); 364} 365 366int minijail_kill(struct minijail *j) { 367 int st; 368 if (kill(j->initpid, SIGTERM)) 369 return errno; 370 if (waitpid(j->initpid, &st, 0) < 0) 371 return errno; 372 return st; 373} 374 375int minijail_wait(struct minijail *j) { 376 int st; 377 if (waitpid(j->initpid, &st, 0) < 0) 378 return errno; 379 if (!WIFEXITED(st)) 380 return MINIJAIL_ERR_JAIL; 381 return WEXITSTATUS(st); 382} 383 384void minijail_destroy(struct minijail *j) { 385 free(j); 386} 387 388