1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "sandbox/linux/services/credentials.h"
6
7#include <errno.h>
8#include <limits.h>
9#include <signal.h>
10#include <stddef.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <sys/syscall.h>
14#include <sys/types.h>
15#include <sys/wait.h>
16#include <unistd.h>
17
18#include "base/bind.h"
19#include "base/files/file_path.h"
20#include "base/files/file_util.h"
21#include "base/logging.h"
22#include "base/macros.h"
23#include "base/posix/eintr_wrapper.h"
24#include "base/process/launch.h"
25#include "base/template_util.h"
26#include "build/build_config.h"
27#include "sandbox/linux/services/namespace_utils.h"
28#include "sandbox/linux/services/proc_util.h"
29#include "sandbox/linux/services/syscall_wrappers.h"
30#include "sandbox/linux/services/thread_helpers.h"
31#include "sandbox/linux/system_headers/capability.h"
32#include "sandbox/linux/system_headers/linux_signal.h"
33#include "third_party/valgrind/valgrind.h"
34
35namespace sandbox {
36
37namespace {
38
39bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND; }
40
41// Checks that the set of RES-uids and the set of RES-gids have
42// one element each and return that element in |resuid| and |resgid|
43// respectively. It's ok to pass NULL as one or both of the ids.
44bool GetRESIds(uid_t* resuid, gid_t* resgid) {
45  uid_t ruid, euid, suid;
46  gid_t rgid, egid, sgid;
47  PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);
48  PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);
49  const bool uids_are_equal = (ruid == euid) && (ruid == suid);
50  const bool gids_are_equal = (rgid == egid) && (rgid == sgid);
51  if (!uids_are_equal || !gids_are_equal) return false;
52  if (resuid) *resuid = euid;
53  if (resgid) *resgid = egid;
54  return true;
55}
56
// Exit status used by the short-lived helper processes in this file to signal
// that they completed their work.
constexpr int kExitSuccess = 0;
58
#if defined(__clang__)
// Disable sanitizers that rely on TLS and may write to non-stack memory.
__attribute__((no_sanitize_address))
__attribute__((no_sanitize_thread))
__attribute__((no_sanitize_memory))
#endif
// Entry point handed to clone() by ChrootToSafeEmptyDir(). The argument is
// unused. chroot()s to /proc/self/fdinfo/, moves the working directory to the
// new root and then terminates the calling process with kExitSuccess; it never
// returns to its caller. Any failing step aborts through RAW_CHECK().
int ChrootToSelfFdinfo(void*) {
  // This function can be run from a vforked child, so it should not write to
  // any memory other than the stack or errno. Reads from TLS may be different
  // from in the parent process.
  RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);

  // CWD is essentially an implicit file descriptor, so be careful to not
  // leave it behind.
  RAW_CHECK(chdir("/") == 0);
  // Exit directly instead of returning: the process only existed to perform
  // the chroot above.
  _exit(kExitSuccess);
}
76
77// chroot() to an empty dir that is "safe". To be safe, it must not contain
78// any subdirectory (chroot-ing there would allow a chroot escape) and it must
79// be impossible to create an empty directory there.
80// We achieve this by doing the following:
81// 1. We create a new process sharing file system information.
82// 2. In the child, we chroot to /proc/self/fdinfo/
83// This is already "safe", since fdinfo/ does not contain another directory and
84// one cannot create another directory there.
85// 3. The process dies
86// After (3) happens, the directory is not available anymore in /proc.
87bool ChrootToSafeEmptyDir() {
88  // We need to chroot to a fdinfo that is unique to a process and have that
89  // process die.
90  // 1. We don't want to simply fork() because duplicating the page tables is
91  // slow with a big address space.
92  // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
93  // when we are in a PID namespace, we cannot easily get a handle to the
94  // /proc/tid directory for the thread (since /proc may not be aware of the
95  // PID namespace). With a process, we can just use /proc/self.
96  pid_t pid = -1;
97  char stack_buf[PTHREAD_STACK_MIN];
98#if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
99    defined(ARCH_CPU_MIPS64_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
100  // The stack grows downward.
101  void* stack = stack_buf + sizeof(stack_buf);
102#else
103#error "Unsupported architecture"
104#endif
105
106  int clone_flags = CLONE_FS | LINUX_SIGCHLD;
107  void* tls = nullptr;
108#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)
109  // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
110  // Since clone writes to the new child's TLS before returning, we must set a
111  // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
112  // glibc performs syscalls by calling a function pointer in TLS, so we do not
113  // attempt this optimization.
114  clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;
115
116  char tls_buf[PTHREAD_STACK_MIN] = {0};
117  tls = tls_buf;
118#endif
119
120  pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
121              nullptr);
122  PCHECK(pid != -1);
123
124  int status = -1;
125  PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
126
127  return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
128}
129
130// CHECK() that an attempt to move to a new user namespace raised an expected
131// errno.
132void CheckCloneNewUserErrno(int error) {
133  // EPERM can happen if already in a chroot. EUSERS if too many nested
134  // namespaces are used. EINVAL for kernels that don't support the feature.
135  // Valgrind will ENOSYS unshare().
136  PCHECK(error == EPERM || error == EUSERS || error == EINVAL ||
137         error == ENOSYS);
138}
139
140// Converts a Capability to the corresponding Linux CAP_XXX value.
141int CapabilityToKernelValue(Credentials::Capability cap) {
142  switch (cap) {
143    case Credentials::Capability::SYS_CHROOT:
144      return CAP_SYS_CHROOT;
145    case Credentials::Capability::SYS_ADMIN:
146      return CAP_SYS_ADMIN;
147  }
148
149  LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
150  return 0;
151}
152
153}  // namespace.
154
155// static
156bool Credentials::DropAllCapabilities(int proc_fd) {
157  if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
158    return false;
159  }
160
161  CHECK(!HasAnyCapability());
162  return true;
163}
164
165// static
166bool Credentials::DropAllCapabilities() {
167  base::ScopedFD proc_fd(ProcUtil::OpenProc());
168  return Credentials::DropAllCapabilities(proc_fd.get());
169}
170
171// static
172bool Credentials::DropAllCapabilitiesOnCurrentThread() {
173  return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
174}
175
176// static
177bool Credentials::SetCapabilitiesOnCurrentThread(
178    const std::vector<Capability>& caps) {
179  struct cap_hdr hdr = {};
180  hdr.version = _LINUX_CAPABILITY_VERSION_3;
181  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
182
183  // Initially, cap has no capability flags set. Enable the effective and
184  // permitted flags only for the requested capabilities.
185  for (const Capability cap : caps) {
186    const int cap_num = CapabilityToKernelValue(cap);
187    const size_t index = CAP_TO_INDEX(cap_num);
188    const uint32_t mask = CAP_TO_MASK(cap_num);
189    data[index].effective |= mask;
190    data[index].permitted |= mask;
191  }
192
193  return sys_capset(&hdr, data) == 0;
194}
195
196// static
197bool Credentials::SetCapabilities(int proc_fd,
198                                  const std::vector<Capability>& caps) {
199  DCHECK_LE(0, proc_fd);
200
201#if !defined(THREAD_SANITIZER)
202  // With TSAN, accept to break the security model as it is a testing
203  // configuration.
204  CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
205#endif
206
207  return SetCapabilitiesOnCurrentThread(caps);
208}
209
210bool Credentials::HasAnyCapability() {
211  struct cap_hdr hdr = {};
212  hdr.version = _LINUX_CAPABILITY_VERSION_3;
213  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
214
215  PCHECK(sys_capget(&hdr, data) == 0);
216
217  for (size_t i = 0; i < arraysize(data); ++i) {
218    if (data[i].effective || data[i].permitted || data[i].inheritable) {
219      return true;
220    }
221  }
222
223  return false;
224}
225
226bool Credentials::HasCapability(Capability cap) {
227  struct cap_hdr hdr = {};
228  hdr.version = _LINUX_CAPABILITY_VERSION_3;
229  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
230
231  PCHECK(sys_capget(&hdr, data) == 0);
232
233  const int cap_num = CapabilityToKernelValue(cap);
234  const size_t index = CAP_TO_INDEX(cap_num);
235  const uint32_t mask = CAP_TO_MASK(cap_num);
236
237  return (data[index].effective | data[index].permitted |
238          data[index].inheritable) &
239         mask;
240}
241
242// static
243bool Credentials::CanCreateProcessInNewUserNS() {
244  // Valgrind will let clone(2) pass-through, but doesn't support unshare(),
245  // so always consider UserNS unsupported there.
246  if (IsRunningOnValgrind()) {
247    return false;
248  }
249
250#if defined(THREAD_SANITIZER)
251  // With TSAN, processes will always have threads running and can never
252  // enter a new user namespace with MoveToNewUserNS().
253  return false;
254#endif
255
256  // This is roughly a fork().
257  const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0);
258
259  if (pid == -1) {
260    CheckCloneNewUserErrno(errno);
261    return false;
262  }
263
264  // The parent process could have had threads. In the child, these threads
265  // have disappeared. Make sure to not do anything in the child, as this is a
266  // fragile execution environment.
267  if (pid == 0) {
268    _exit(kExitSuccess);
269  }
270
271  // Always reap the child.
272  int status = -1;
273  PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
274  CHECK(WIFEXITED(status));
275  CHECK_EQ(kExitSuccess, WEXITSTATUS(status));
276
277  // clone(2) succeeded, we can use CLONE_NEWUSER.
278  return true;
279}
280
281bool Credentials::MoveToNewUserNS() {
282  uid_t uid;
283  gid_t gid;
284  if (!GetRESIds(&uid, &gid)) {
285    // If all the uids (or gids) are not equal to each other, the security
286    // model will most likely confuse the caller, abort.
287    DVLOG(1) << "uids or gids differ!";
288    return false;
289  }
290  int ret = sys_unshare(CLONE_NEWUSER);
291  if (ret) {
292    const int unshare_errno = errno;
293    VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
294            << "on this kernel.";
295    CheckCloneNewUserErrno(unshare_errno);
296    return false;
297  }
298
299  if (NamespaceUtils::KernelSupportsDenySetgroups()) {
300    PCHECK(NamespaceUtils::DenySetgroups());
301  }
302
303  // The current {r,e,s}{u,g}id is now an overflow id (c.f.
304  // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
305  DCHECK(GetRESIds(NULL, NULL));
306  const char kGidMapFile[] = "/proc/self/gid_map";
307  const char kUidMapFile[] = "/proc/self/uid_map";
308  PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid));
309  PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid));
310  DCHECK(GetRESIds(NULL, NULL));
311  return true;
312}
313
// Drops access to the file system by chroot()-ing into a safe empty directory
// (see ChrootToSafeEmptyDir()) and then sanity-checking the result.
// |proc_fd| must be non-negative; it is only handed to
// ProcUtil::HasOpenDirectory() (presumably a descriptor for /proc -- confirm
// against proc_util.h).
// Every step is CHECK()ed, so this function either returns true or crashes.
bool Credentials::DropFileSystemAccess(int proc_fd) {
  CHECK_LE(0, proc_fd);

  CHECK(ChrootToSafeEmptyDir());
  // After the chroot, /proc must no longer exist as a directory from this
  // process's point of view.
  CHECK(!base::DirectoryExists(base::FilePath("/proc")));
  // NOTE(review): presumably verifies that no directory file descriptor was
  // left open, since one could be used to escape the chroot -- confirm against
  // ProcUtil::HasOpenDirectory().
  CHECK(!ProcUtil::HasOpenDirectory(proc_fd));
  // We never let this function fail.
  return true;
}
323
324pid_t Credentials::ForkAndDropCapabilitiesInChild() {
325  pid_t pid = fork();
326  if (pid != 0) {
327    return pid;
328  }
329
330  // Since we just forked, we are single threaded.
331  PCHECK(DropAllCapabilitiesOnCurrentThread());
332  return 0;
333}
334
335}  // namespace sandbox.
336