libminijail.c revision cd7a9046e61e243fca916a286e49d58e2331eaa7
1/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file. */
4
5#define _BSD_SOURCE
6#define _GNU_SOURCE
7#include <errno.h>
8#include <grp.h>
9#include <inttypes.h>
10#include <linux/capability.h>
11#include <linux/securebits.h>
12#include <pwd.h>
13#include <sched.h>
14#include <signal.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <syscall.h>
19#include <sys/capability.h>
20#include <sys/mount.h>
21#include <sys/prctl.h>
22#include <sys/wait.h>
23#include <syslog.h>
24#include <unistd.h>
25
26#include "libminijail.h"
27#include "libminijail-private.h"
28
/* Description of a jail: which restrictions have been requested, plus the
 * parameters those restrictions need. */
struct minijail {
  /* One-bit switches, one per restriction. They are deliberately unsigned: a
   * signed 1-bit bit-field can only represent 0 and -1, so storing 1 into it
   * is implementation-defined and comparing it against 1 fails. */
  struct {
    unsigned int uid : 1;        /* change to 'uid' below */
    unsigned int gid : 1;        /* change to 'gid' below */
    unsigned int caps : 1;       /* restrict capabilities to 'caps' below */
    unsigned int vfs : 1;        /* enter a new VFS (mount) namespace */
    unsigned int pids : 1;       /* enter a new pid namespace */
    unsigned int seccomp : 1;    /* enable seccomp */
    unsigned int readonly : 1;   /* remount /proc read-only */
    unsigned int usergroups : 1; /* inherit the groups of 'user' */
    unsigned int ptrace : 1;     /* disable ptrace */
  } flags;
  uid_t uid;        /* target uid, meaningful when flags.uid is set */
  gid_t gid;        /* target gid, meaningful when flags.gid is set */
  gid_t usergid;    /* primary gid of 'user', filled by minijail_change_user() */
  const char *user; /* user name given to minijail_change_user(); not copied */
  uint64_t caps;    /* capability mask, meaningful when flags.caps is set */
  pid_t initpid;    /* pid of the child created by minijail_run() */
};
48
/* Fatal-error helper for failed system calls: logs the name of the failed
 * operation together with the text for the current errno at LOG_ERR, then
 * abort()s. Never returns. */
static void pdie(const char *failed) {
  const char *reason = strerror(errno);

  syslog(LOG_ERR, "libminijail: %s failed: %s", failed, reason);
  abort();
}
53
/* Fatal-error helper for failures that have no meaningful errno: logs the
 * given message at LOG_ERR, then abort()s. Never returns. */
static void die(const char *failed) {
  syslog(LOG_ERR,
         "libminijail: %s",
         failed);
  abort();
}
58
59struct minijail *minijail_new(void) {
60  struct minijail *j = malloc(sizeof(*j));
61  if (j)
62    memset(j, 0, sizeof(*j));
63  return j;
64}
65
66void minijail_change_uid(struct minijail *j, uid_t uid) {
67  if (uid == 0)
68    die("useless change to uid 0");
69  j->uid = uid;
70  j->flags.uid = 1;
71}
72
73void minijail_change_gid(struct minijail *j, gid_t gid) {
74  if (gid == 0)
75    die("useless change to gid 0");
76  j->gid = gid;
77  j->flags.gid = 1;
78}
79
80int minijail_change_user(struct minijail *j, const char *user) {
81  /* In principle this should use getpwnam(), but:
82   * 1) getpwnam_r() isn't actually reentrant anyway, since it uses a
83   *    statically-allocated file descriptor internally
84   * 2) fgetpwnam() (by analogy with fgetpwent) would solve (1) except that it
85   *    doesn't exist
86   * 3) sysconf() (see getpwnam_r(3)) is allowed to return a size that is not
87   *    large enough, which means having to loop on growing the buffer we pass
88   *    in
89   */
90  struct passwd *pw = getpwnam(user);
91  if (!pw)
92    return errno;
93  minijail_change_uid(j, pw->pw_uid);
94  j->user = user;
95  j->usergid = pw->pw_gid;
96  return 0;
97}
98
/* Looks up |group| in the group database and requests a change to that
 * group's gid.
 *
 * Returns 0 on success, or a positive errno value on failure (ENOENT when the
 * lookup simply found no such group).
 */
int minijail_change_group(struct minijail *j, const char *group) {
  struct group *gr;

  /* In principle this should use getgrnam_r(), but:
   * 1) getgrnam_r() isn't actually reentrant anyway, since it uses a
   *    statically-allocated file descriptor internally
   * 2) fgetgrnam() (by analogy with fgetgrent) would solve (1) except that it
   *    doesn't exist
   * 3) sysconf() (see getgrnam_r(3)) is allowed to return a size that is not
   *    large enough, which means having to loop on growing the buffer we pass
   *    in
   */
  /* getgrnam() returns NULL without touching errno when the group simply does
   * not exist, so clear errno first and map that case to ENOENT; otherwise a
   * stale or zero errno could be reported, the latter looking like success. */
  errno = 0;
  gr = getgrnam(group);
  if (!gr)
    return errno ? errno : ENOENT;

  minijail_change_gid(j, gr->gr_gid);
  return 0;
}
115
116void minijail_use_seccomp(struct minijail *j) {
117  j->flags.seccomp = 1;
118}
119
120void minijail_use_caps(struct minijail *j, uint64_t capmask) {
121  j->caps = capmask;
122  j->flags.caps = 1;
123}
124
125void minijail_namespace_vfs(struct minijail *j) {
126  j->flags.vfs = 1;
127}
128
129void minijail_namespace_pids(struct minijail *j) {
130  j->flags.pids = 1;
131}
132
133void minijail_remount_readonly(struct minijail *j) {
134  j->flags.vfs = 1;
135  j->flags.readonly = 1;
136}
137
138void minijail_inherit_usergroups(struct minijail *j) {
139  j->flags.usergroups = 1;
140}
141
142void minijail_disable_ptrace(struct minijail *j) {
143  j->flags.ptrace = 1;
144}
145
/* Replaces the /proc mount visible to this process with a fresh read-only
 * one (also nodev, noexec and nosuid).
 *
 * Returns 0 on success, or the errno of the failing umount()/mount() call.
 */
static int remount_readonly(void) {
  static const char proc_dir[] = "/proc";
  const unsigned int mount_flags =
      MS_RDONLY | MS_NODEV | MS_NOEXEC | MS_NOSUID;

  /* Right now, we're holding a reference to our parent's old mount of /proc in
   * our namespace, which means using MS_REMOUNT here would mutate our parent's
   * mount as well, even though we're in a VFS namespace (!). Instead, remove
   * their mount from our namespace and make our own. */
  if (umount(proc_dir) != 0)
    return errno;

  if (mount("", proc_dir, "proc", mount_flags, "") != 0)
    return errno;

  return 0;
}
159
160static void drop_caps(const struct minijail *j) {
161  cap_t caps = cap_get_proc();
162  cap_value_t raise_flag[1];
163  unsigned int i;
164  if (!caps)
165    die("can't get process caps");
166  if (cap_clear_flag(caps, CAP_INHERITABLE))
167    die("can't clear inheritable caps");
168  if (cap_clear_flag(caps, CAP_EFFECTIVE))
169    die("can't clear effective caps");
170  if (cap_clear_flag(caps, CAP_PERMITTED))
171    die("can't clear permitted caps");
172  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
173    if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
174      continue;
175    raise_flag[0] = i;
176    if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
177      die("can't add effective cap");
178    if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
179      die("can't add permitted cap");
180    if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
181      die("can't add inheritable cap");
182  }
183  if (cap_set_proc(caps))
184    die("can't apply cleaned capset");
185  cap_free(caps);
186  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
187    if (j->caps & (1 << i))
188      continue;
189    if (prctl(PR_CAPBSET_DROP, i))
190      pdie("prctl(PR_CAPBSET_DROP)");
191  }
192}
193
194void minijail_enter(const struct minijail *j) {
195  if (j->flags.pids)
196    die("tried to enter a pid-namespaced jail; try minijail_run()?");
197
198  if (j->flags.usergroups && !j->user)
199    die("usergroup inheritance without username");
200
201  /* We can't recover from failures if we've dropped privileges partially,
202   * so we don't even try. If any of our operations fail, we abort() the
203   * entire process. */
204  if (j->flags.vfs && unshare(CLONE_NEWNS))
205    pdie("unshare");
206
207  if (j->flags.readonly && remount_readonly())
208    pdie("remount");
209
210  if (j->flags.caps) {
211    /* POSIX capabilities are a bit tricky. If we drop our capability to change
212     * uids, our attempt to use setuid() below will fail. Hang on to root caps
213     * across setuid(), then lock securebits. */
214    if (prctl(PR_SET_KEEPCAPS, 1))
215      pdie("prctl(PR_SET_KEEPCAPS)");
216    if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
217      pdie("prctl(PR_SET_SECUREBITS)");
218  }
219
220  if (j->flags.usergroups && initgroups(j->user, j->usergid))
221    pdie("initgroups");
222  else if (!j->flags.usergroups && setgroups(0, NULL))
223    pdie("setgroups");
224
225  if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
226    pdie("setresgid");
227
228  if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
229    pdie("setresuid");
230
231  if (j->flags.caps)
232    drop_caps(j);
233
234  /* seccomp has to come last since it cuts off all the other
235   * privilege-dropping syscalls :) */
236  if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
237    pdie("prctl(PR_SET_SECCOMP)");
238}
239
240static int init_exitstatus = 0;
241
242static void init_term(int __attribute__((unused)) sig) {
243  _exit(init_exitstatus);
244}
245
246static int init(pid_t rootpid) {
247  pid_t pid;
248  int status;
249  signal(SIGTERM, init_term); /* so that we exit with the right status */
250  while ((pid = wait(&status)) > 0) {
251    /* This loop will only end when either there are no processes left inside
252     * our pid namespace or we get a signal. */
253    if (pid == rootpid)
254      init_exitstatus = status;
255  }
256  if (!WIFEXITED(init_exitstatus))
257    _exit(MINIJAIL_ERR_INIT);
258  _exit(WEXITSTATUS(init_exitstatus));
259}
260
261/** @brief Move any commands that need to be done post-exec into an environment
262 *         variable
263 *  @param j Jail to move commands from.
264 *
265 *  Serializes post-exec() commands into a string, removes them from the jail,
266 *  and adds them to the environment; they will be deserialized later (see
267 *  __minijail_preloaded) and executed inside the execve()'d process.
268 */
269static int move_commands_to_env(struct minijail *j) {
270  const int kEnvBufSize = 256;
271  const char *ptrace = j->flags.ptrace ? "ptrace " : "";
272  const char *seccomp = j->flags.seccomp ? "seccomp " : "";
273  char setuid[64] = "";
274  char caps[32] = "";
275  char *newenv;
276  char *oldenv;
277  char *envbuf = malloc(kEnvBufSize);
278  int r;
279
280  if (!envbuf)
281    return -ENOMEM;
282
283  if (j->flags.caps)
284    snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps);
285
286  if (j->flags.uid && j->flags.caps) {
287    snprintf(setuid, sizeof(setuid), "uid=%d ", j->uid);
288    j->flags.uid = 0;
289  }
290
291  j->flags.caps = 0;
292  j->flags.ptrace = 0;
293  j->flags.seccomp = 0;
294
295  r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps);
296  if (!r) {
297    /* No commands generated, so no preload needed :) */
298    free(envbuf);
299    return 0;
300  }
301  if (r == kEnvBufSize) {
302    free(envbuf);
303    return -E2BIG;
304  }
305
306  oldenv = getenv("LD_PRELOAD") ? : "";
307  newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
308  if (!newenv) {
309    free(envbuf);
310    return -ENOMEM;
311  }
312
313  /* Only insert a separating space if we have something to separate... */
314  sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH);
315
316  /* setenv() makes a copy of the string we give it */
317  setenv("LD_PRELOAD", newenv, 1);
318  setenv(kCommandEnvVar, envbuf, 1);
319  free(newenv);
320  free(envbuf);
321  return 0;
322}
323
324int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
325  unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
326  pid_t r;
327  r = move_commands_to_env(j);
328  if (r)
329    return r;
330
331  r = syscall(SYS_clone, pidns | SIGCHLD, NULL);
332  if (r > 0) {
333    j->initpid = r;
334    return 0;
335  }
336  if (r < 0)
337    return r;
338
339  j->flags.pids = 0;
340
341  /* Jail this process and its descendants... */
342  minijail_enter(j);
343
344  if (pidns) {
345    /* pid namespace: this process will become init inside the new namespace, so
346     * fork off a child to actually run the program (we don't want all programs
347     * we might exec to have to know how to be init). */
348    r = fork();
349    if (r < 0)
350      _exit(r);
351    else if (r > 0)
352      init(r);  /* never returns */
353  }
354
355  /* If we aren't pid-namespaced:
356   *   calling process
357   *   -> execve()-ing process
358   * If we are:
359   *   calling process
360   *   -> init()-ing process
361   *      -> execve()-ing process
362   */
363  _exit(execve(filename, argv, environ));
364}
365
366int minijail_kill(struct minijail *j) {
367  int st;
368  if (kill(j->initpid, SIGTERM))
369    return errno;
370  if (waitpid(j->initpid, &st, 0) < 0)
371    return errno;
372  return st;
373}
374
375int minijail_wait(struct minijail *j) {
376  int st;
377  if (waitpid(j->initpid, &st, 0) < 0)
378    return errno;
379  if (!WIFEXITED(st))
380    return MINIJAIL_ERR_JAIL;
381  return WEXITSTATUS(st);
382}
383
/* Releases the storage of a jail description. Only the struct itself is
 * freed; passing NULL is harmless because free(NULL) is a no-op. */
void minijail_destroy(struct minijail *j)
{
  free(j);
}
387
388