1// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "sandbox/linux/services/credentials.h" 6 7#include <errno.h> 8#include <limits.h> 9#include <signal.h> 10#include <stddef.h> 11#include <stdint.h> 12#include <stdio.h> 13#include <sys/syscall.h> 14#include <sys/types.h> 15#include <sys/wait.h> 16#include <unistd.h> 17 18#include "base/bind.h" 19#include "base/files/file_path.h" 20#include "base/files/file_util.h" 21#include "base/logging.h" 22#include "base/macros.h" 23#include "base/posix/eintr_wrapper.h" 24#include "base/process/launch.h" 25#include "base/template_util.h" 26#include "build/build_config.h" 27#include "sandbox/linux/services/namespace_utils.h" 28#include "sandbox/linux/services/proc_util.h" 29#include "sandbox/linux/services/syscall_wrappers.h" 30#include "sandbox/linux/services/thread_helpers.h" 31#include "sandbox/linux/system_headers/capability.h" 32#include "sandbox/linux/system_headers/linux_signal.h" 33#include "third_party/valgrind/valgrind.h" 34 35namespace sandbox { 36 37namespace { 38 39bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND; } 40 41// Checks that the set of RES-uids and the set of RES-gids have 42// one element each and return that element in |resuid| and |resgid| 43// respectively. It's ok to pass NULL as one or both of the ids. 44bool GetRESIds(uid_t* resuid, gid_t* resgid) { 45 uid_t ruid, euid, suid; 46 gid_t rgid, egid, sgid; 47 PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0); 48 PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0); 49 const bool uids_are_equal = (ruid == euid) && (ruid == suid); 50 const bool gids_are_equal = (rgid == egid) && (rgid == sgid); 51 if (!uids_are_equal || !gids_are_equal) return false; 52 if (resuid) *resuid = euid; 53 if (resgid) *resgid = egid; 54 return true; 55} 56 57const int kExitSuccess = 0; 58 59#if defined(__clang__) 60// Disable sanitizers that rely on TLS and may write to non-stack memory. 61__attribute__((no_sanitize_address)) 62__attribute__((no_sanitize_thread)) 63__attribute__((no_sanitize_memory)) 64#endif 65int ChrootToSelfFdinfo(void*) { 66 // This function can be run from a vforked child, so it should not write to 67 // any memory other than the stack or errno. Reads from TLS may be different 68 // from in the parent process. 69 RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0); 70 71 // CWD is essentially an implicit file descriptor, so be careful to not 72 // leave it behind. 73 RAW_CHECK(chdir("/") == 0); 74 _exit(kExitSuccess); 75} 76 77// chroot() to an empty dir that is "safe". To be safe, it must not contain 78// any subdirectory (chroot-ing there would allow a chroot escape) and it must 79// be impossible to create an empty directory there. 80// We achieve this by doing the following: 81// 1. We create a new process sharing file system information. 82// 2. In the child, we chroot to /proc/self/fdinfo/ 83// This is already "safe", since fdinfo/ does not contain another directory and 84// one cannot create another directory there. 85// 3. The process dies 86// After (3) happens, the directory is not available anymore in /proc. 87bool ChrootToSafeEmptyDir() { 88 // We need to chroot to a fdinfo that is unique to a process and have that 89 // process die. 90 // 1. We don't want to simply fork() because duplicating the page tables is 91 // slow with a big address space. 92 // 2. We do not use a regular thread (that would unshare CLONE_FILES) because 93 // when we are in a PID namespace, we cannot easily get a handle to the 94 // /proc/tid directory for the thread (since /proc may not be aware of the 95 // PID namespace). With a process, we can just use /proc/self. 96 pid_t pid = -1; 97 char stack_buf[PTHREAD_STACK_MIN]; 98#if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \ 99 defined(ARCH_CPU_MIPS64_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY) 100 // The stack grows downward. 101 void* stack = stack_buf + sizeof(stack_buf); 102#else 103#error "Unsupported architecture" 104#endif 105 106 int clone_flags = CLONE_FS | LINUX_SIGCHLD; 107 void* tls = nullptr; 108#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY) 109 // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables. 110 // Since clone writes to the new child's TLS before returning, we must set a 111 // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86, 112 // glibc performs syscalls by calling a function pointer in TLS, so we do not 113 // attempt this optimization. 114 clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS; 115 116 char tls_buf[PTHREAD_STACK_MIN] = {0}; 117 tls = tls_buf; 118#endif 119 120 pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls, 121 nullptr); 122 PCHECK(pid != -1); 123 124 int status = -1; 125 PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid); 126 127 return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess; 128} 129 130// CHECK() that an attempt to move to a new user namespace raised an expected 131// errno. 132void CheckCloneNewUserErrno(int error) { 133 // EPERM can happen if already in a chroot. EUSERS if too many nested 134 // namespaces are used. EINVAL for kernels that don't support the feature. 135 // Valgrind will ENOSYS unshare(). 136 PCHECK(error == EPERM || error == EUSERS || error == EINVAL || 137 error == ENOSYS); 138} 139 140// Converts a Capability to the corresponding Linux CAP_XXX value. 141int CapabilityToKernelValue(Credentials::Capability cap) { 142 switch (cap) { 143 case Credentials::Capability::SYS_CHROOT: 144 return CAP_SYS_CHROOT; 145 case Credentials::Capability::SYS_ADMIN: 146 return CAP_SYS_ADMIN; 147 } 148 149 LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap); 150 return 0; 151} 152 153} // namespace. 154 155// static 156bool Credentials::DropAllCapabilities(int proc_fd) { 157 if (!SetCapabilities(proc_fd, std::vector<Capability>())) { 158 return false; 159 } 160 161 CHECK(!HasAnyCapability()); 162 return true; 163} 164 165// static 166bool Credentials::DropAllCapabilities() { 167 base::ScopedFD proc_fd(ProcUtil::OpenProc()); 168 return Credentials::DropAllCapabilities(proc_fd.get()); 169} 170 171// static 172bool Credentials::DropAllCapabilitiesOnCurrentThread() { 173 return SetCapabilitiesOnCurrentThread(std::vector<Capability>()); 174} 175 176// static 177bool Credentials::SetCapabilitiesOnCurrentThread( 178 const std::vector<Capability>& caps) { 179 struct cap_hdr hdr = {}; 180 hdr.version = _LINUX_CAPABILITY_VERSION_3; 181 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 182 183 // Initially, cap has no capability flags set. Enable the effective and 184 // permitted flags only for the requested capabilities. 185 for (const Capability cap : caps) { 186 const int cap_num = CapabilityToKernelValue(cap); 187 const size_t index = CAP_TO_INDEX(cap_num); 188 const uint32_t mask = CAP_TO_MASK(cap_num); 189 data[index].effective |= mask; 190 data[index].permitted |= mask; 191 } 192 193 return sys_capset(&hdr, data) == 0; 194} 195 196// static 197bool Credentials::SetCapabilities(int proc_fd, 198 const std::vector<Capability>& caps) { 199 DCHECK_LE(0, proc_fd); 200 201#if !defined(THREAD_SANITIZER) 202 // With TSAN, accept to break the security model as it is a testing 203 // configuration. 204 CHECK(ThreadHelpers::IsSingleThreaded(proc_fd)); 205#endif 206 207 return SetCapabilitiesOnCurrentThread(caps); 208} 209 210bool Credentials::HasAnyCapability() { 211 struct cap_hdr hdr = {}; 212 hdr.version = _LINUX_CAPABILITY_VERSION_3; 213 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 214 215 PCHECK(sys_capget(&hdr, data) == 0); 216 217 for (size_t i = 0; i < arraysize(data); ++i) { 218 if (data[i].effective || data[i].permitted || data[i].inheritable) { 219 return true; 220 } 221 } 222 223 return false; 224} 225 226bool Credentials::HasCapability(Capability cap) { 227 struct cap_hdr hdr = {}; 228 hdr.version = _LINUX_CAPABILITY_VERSION_3; 229 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 230 231 PCHECK(sys_capget(&hdr, data) == 0); 232 233 const int cap_num = CapabilityToKernelValue(cap); 234 const size_t index = CAP_TO_INDEX(cap_num); 235 const uint32_t mask = CAP_TO_MASK(cap_num); 236 237 return (data[index].effective | data[index].permitted | 238 data[index].inheritable) & 239 mask; 240} 241 242// static 243bool Credentials::CanCreateProcessInNewUserNS() { 244 // Valgrind will let clone(2) pass-through, but doesn't support unshare(), 245 // so always consider UserNS unsupported there. 246 if (IsRunningOnValgrind()) { 247 return false; 248 } 249 250#if defined(THREAD_SANITIZER) 251 // With TSAN, processes will always have threads running and can never 252 // enter a new user namespace with MoveToNewUserNS(). 253 return false; 254#endif 255 256 // This is roughly a fork(). 257 const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0); 258 259 if (pid == -1) { 260 CheckCloneNewUserErrno(errno); 261 return false; 262 } 263 264 // The parent process could have had threads. In the child, these threads 265 // have disappeared. Make sure to not do anything in the child, as this is a 266 // fragile execution environment. 267 if (pid == 0) { 268 _exit(kExitSuccess); 269 } 270 271 // Always reap the child. 272 int status = -1; 273 PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid); 274 CHECK(WIFEXITED(status)); 275 CHECK_EQ(kExitSuccess, WEXITSTATUS(status)); 276 277 // clone(2) succeeded, we can use CLONE_NEWUSER. 278 return true; 279} 280 281bool Credentials::MoveToNewUserNS() { 282 uid_t uid; 283 gid_t gid; 284 if (!GetRESIds(&uid, &gid)) { 285 // If all the uids (or gids) are not equal to each other, the security 286 // model will most likely confuse the caller, abort. 287 DVLOG(1) << "uids or gids differ!"; 288 return false; 289 } 290 int ret = sys_unshare(CLONE_NEWUSER); 291 if (ret) { 292 const int unshare_errno = errno; 293 VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available " 294 << "on this kernel."; 295 CheckCloneNewUserErrno(unshare_errno); 296 return false; 297 } 298 299 if (NamespaceUtils::KernelSupportsDenySetgroups()) { 300 PCHECK(NamespaceUtils::DenySetgroups()); 301 } 302 303 // The current {r,e,s}{u,g}id is now an overflow id (c.f. 304 // /proc/sys/kernel/overflowuid). Setup the uid and gid maps. 305 DCHECK(GetRESIds(NULL, NULL)); 306 const char kGidMapFile[] = "/proc/self/gid_map"; 307 const char kUidMapFile[] = "/proc/self/uid_map"; 308 PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid)); 309 PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid)); 310 DCHECK(GetRESIds(NULL, NULL)); 311 return true; 312} 313 314bool Credentials::DropFileSystemAccess(int proc_fd) { 315 CHECK_LE(0, proc_fd); 316 317 CHECK(ChrootToSafeEmptyDir()); 318 CHECK(!base::DirectoryExists(base::FilePath("/proc"))); 319 CHECK(!ProcUtil::HasOpenDirectory(proc_fd)); 320 // We never let this function fail. 321 return true; 322} 323 324pid_t Credentials::ForkAndDropCapabilitiesInChild() { 325 pid_t pid = fork(); 326 if (pid != 0) { 327 return pid; 328 } 329 330 // Since we just forked, we are single threaded. 331 PCHECK(DropAllCapabilitiesOnCurrentThread()); 332 return 0; 333} 334 335} // namespace sandbox. 336