1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "sandbox/linux/services/credentials.h"
6 
7 #include <errno.h>
8 #include <limits.h>
9 #include <signal.h>
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <sys/syscall.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <unistd.h>
17 
18 #include "base/bind.h"
19 #include "base/compiler_specific.h"
20 #include "base/files/file_path.h"
21 #include "base/files/file_util.h"
22 #include "base/logging.h"
23 #include "base/macros.h"
24 #include "base/posix/eintr_wrapper.h"
25 #include "base/process/launch.h"
26 #include "base/third_party/valgrind/valgrind.h"
27 #include "build/build_config.h"
28 #include "sandbox/linux/services/namespace_utils.h"
29 #include "sandbox/linux/services/proc_util.h"
30 #include "sandbox/linux/services/syscall_wrappers.h"
31 #include "sandbox/linux/services/thread_helpers.h"
32 #include "sandbox/linux/system_headers/capability.h"
33 #include "sandbox/linux/system_headers/linux_signal.h"
34 
35 namespace sandbox {
36 
37 namespace {
38 
IsRunningOnValgrind()39 bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND; }
40 
41 // Checks that the set of RES-uids and the set of RES-gids have
42 // one element each and return that element in |resuid| and |resgid|
43 // respectively. It's ok to pass NULL as one or both of the ids.
GetRESIds(uid_t * resuid,gid_t * resgid)44 bool GetRESIds(uid_t* resuid, gid_t* resgid) {
45   uid_t ruid, euid, suid;
46   gid_t rgid, egid, sgid;
47   PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);
48   PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);
49   const bool uids_are_equal = (ruid == euid) && (ruid == suid);
50   const bool gids_are_equal = (rgid == egid) && (rgid == sgid);
51   if (!uids_are_equal || !gids_are_equal) return false;
52   if (resuid) *resuid = euid;
53   if (resgid) *resgid = egid;
54   return true;
55 }
56 
// Exit status used by the short-lived helper processes in this file to report
// success to the parent that waits for them.
constexpr int kExitSuccess = 0;
58 
// Entry point of the child created by clone() in ChrootToSafeEmptyDir().
// chroot()s into the child's own /proc/self/fdinfo/ directory, moves the
// current working directory to the new root and terminates with kExitSuccess.
// Never returns to its caller: either a RAW_CHECK fails or _exit() is called.
// The unnamed void* parameter is the (unused) argument slot of the clone()
// callback signature.
#if defined(__clang__)
// Disable sanitizers that rely on TLS and may write to non-stack memory.
__attribute__((no_sanitize_address))
__attribute__((no_sanitize_thread))
__attribute__((no_sanitize_memory))
#endif
int ChrootToSelfFdinfo(void*) {
  // This function can be run from a vforked child, so it should not write to
  // any memory other than the stack or errno. Reads from TLS may be different
  // from in the parent process.
  RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);

  // CWD is essentially an implicit file descriptor, so be careful to not
  // leave it behind: without this chdir() the working directory would still
  // refer to a directory outside of the new root.
  RAW_CHECK(chdir("/") == 0);
  _exit(kExitSuccess);
}
76 
// chroot() to an empty dir that is "safe". To be safe, it must not contain
// any subdirectory (chroot-ing there would allow a chroot escape) and it must
// be impossible to create an empty directory there.
// We achieve this by doing the following:
// 1. We create a new process sharing file system information (CLONE_FS).
// 2. In the child, we chroot to /proc/self/fdinfo/ (see ChrootToSelfFdinfo()).
//    This is already "safe", since fdinfo/ does not contain another directory
//    and one cannot create another directory there.
// 3. The child process dies.
// After (3) happens, the directory is not available anymore in /proc.
//
// Returns true if the child exited normally with status kExitSuccess, false
// otherwise. A failure of clone() or waitpid() themselves is fatal (PCHECK).
bool ChrootToSafeEmptyDir() {
  // We need to chroot to a fdinfo that is unique to a process and have that
  // process die.
  // 1. We don't want to simply fork() because duplicating the page tables is
  // slow with a big address space.
  // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
  // when we are in a PID namespace, we cannot easily get a handle to the
  // /proc/tid directory for the thread (since /proc may not be aware of the
  // PID namespace). With a process, we can just use /proc/self.
  pid_t pid = -1;
  // Backing storage for the child's stack; it lives in this function's frame.
  char stack_buf[PTHREAD_STACK_MIN] ALIGNAS(16);
#if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
    defined(ARCH_CPU_MIPS_FAMILY)
  // The stack grows downward, so clone() is handed the address one past the
  // end of the buffer.
  void* stack = stack_buf + sizeof(stack_buf);
#else
#error "Unsupported architecture"
#endif

  // CLONE_FS is what makes the child's chroot()/chdir() affect this process.
  // LINUX_SIGCHLD is the child's termination signal.
  int clone_flags = CLONE_FS | LINUX_SIGCHLD;
  void* tls = nullptr;
#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)
  // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
  // Since clone writes to the new child's TLS before returning, we must set a
  // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
  // glibc performs syscalls by calling a function pointer in TLS, so we do not
  // attempt this optimization.
  clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;

  // Zero-filled scratch area handed to the child as its TLS block.
  char tls_buf[PTHREAD_STACK_MIN] = {0};
  tls = tls_buf;
#endif

  pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
              nullptr);
  PCHECK(pid != -1);

  // Always reap the child; retry waitpid() if it is interrupted by a signal.
  int status = -1;
  PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);

  return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
}
129 
130 // CHECK() that an attempt to move to a new user namespace raised an expected
131 // errno.
CheckCloneNewUserErrno(int error)132 void CheckCloneNewUserErrno(int error) {
133   // EPERM can happen if already in a chroot. EUSERS if too many nested
134   // namespaces are used. EINVAL for kernels that don't support the feature.
135   // Valgrind will ENOSYS unshare().
136   PCHECK(error == EPERM || error == EUSERS || error == EINVAL ||
137          error == ENOSYS);
138 }
139 
140 // Converts a Capability to the corresponding Linux CAP_XXX value.
CapabilityToKernelValue(Credentials::Capability cap)141 int CapabilityToKernelValue(Credentials::Capability cap) {
142   switch (cap) {
143     case Credentials::Capability::SYS_CHROOT:
144       return CAP_SYS_CHROOT;
145     case Credentials::Capability::SYS_ADMIN:
146       return CAP_SYS_ADMIN;
147   }
148 
149   LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
150   return 0;
151 }
152 
153 }  // namespace.
154 
155 // static
DropAllCapabilities(int proc_fd)156 bool Credentials::DropAllCapabilities(int proc_fd) {
157   if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
158     return false;
159   }
160 
161   CHECK(!HasAnyCapability());
162   return true;
163 }
164 
165 // static
DropAllCapabilities()166 bool Credentials::DropAllCapabilities() {
167   base::ScopedFD proc_fd(ProcUtil::OpenProc());
168   return Credentials::DropAllCapabilities(proc_fd.get());
169 }
170 
171 // static
DropAllCapabilitiesOnCurrentThread()172 bool Credentials::DropAllCapabilitiesOnCurrentThread() {
173   return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
174 }
175 
176 // static
SetCapabilitiesOnCurrentThread(const std::vector<Capability> & caps)177 bool Credentials::SetCapabilitiesOnCurrentThread(
178     const std::vector<Capability>& caps) {
179   struct cap_hdr hdr = {};
180   hdr.version = _LINUX_CAPABILITY_VERSION_3;
181   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
182 
183   // Initially, cap has no capability flags set. Enable the effective and
184   // permitted flags only for the requested capabilities.
185   for (const Capability cap : caps) {
186     const int cap_num = CapabilityToKernelValue(cap);
187     const size_t index = CAP_TO_INDEX(cap_num);
188     const uint32_t mask = CAP_TO_MASK(cap_num);
189     data[index].effective |= mask;
190     data[index].permitted |= mask;
191   }
192 
193   return sys_capset(&hdr, data) == 0;
194 }
195 
196 // static
SetCapabilities(int proc_fd,const std::vector<Capability> & caps)197 bool Credentials::SetCapabilities(int proc_fd,
198                                   const std::vector<Capability>& caps) {
199   DCHECK_LE(0, proc_fd);
200 
201 #if !defined(THREAD_SANITIZER)
202   // With TSAN, accept to break the security model as it is a testing
203   // configuration.
204   CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
205 #endif
206 
207   return SetCapabilitiesOnCurrentThread(caps);
208 }
209 
HasAnyCapability()210 bool Credentials::HasAnyCapability() {
211   struct cap_hdr hdr = {};
212   hdr.version = _LINUX_CAPABILITY_VERSION_3;
213   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
214 
215   PCHECK(sys_capget(&hdr, data) == 0);
216 
217   for (size_t i = 0; i < arraysize(data); ++i) {
218     if (data[i].effective || data[i].permitted || data[i].inheritable) {
219       return true;
220     }
221   }
222 
223   return false;
224 }
225 
HasCapability(Capability cap)226 bool Credentials::HasCapability(Capability cap) {
227   struct cap_hdr hdr = {};
228   hdr.version = _LINUX_CAPABILITY_VERSION_3;
229   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
230 
231   PCHECK(sys_capget(&hdr, data) == 0);
232 
233   const int cap_num = CapabilityToKernelValue(cap);
234   const size_t index = CAP_TO_INDEX(cap_num);
235   const uint32_t mask = CAP_TO_MASK(cap_num);
236 
237   return (data[index].effective | data[index].permitted |
238           data[index].inheritable) &
239          mask;
240 }
241 
242 // static
CanCreateProcessInNewUserNS()243 bool Credentials::CanCreateProcessInNewUserNS() {
244   // Valgrind will let clone(2) pass-through, but doesn't support unshare(),
245   // so always consider UserNS unsupported there.
246   if (IsRunningOnValgrind()) {
247     return false;
248   }
249 
250 #if defined(THREAD_SANITIZER)
251   // With TSAN, processes will always have threads running and can never
252   // enter a new user namespace with MoveToNewUserNS().
253   return false;
254 #endif
255 
256   // This is roughly a fork().
257   const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0);
258 
259   if (pid == -1) {
260     CheckCloneNewUserErrno(errno);
261     return false;
262   }
263 
264   // The parent process could have had threads. In the child, these threads
265   // have disappeared. Make sure to not do anything in the child, as this is a
266   // fragile execution environment.
267   if (pid == 0) {
268     _exit(kExitSuccess);
269   }
270 
271   // Always reap the child.
272   int status = -1;
273   PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
274   CHECK(WIFEXITED(status));
275   CHECK_EQ(kExitSuccess, WEXITSTATUS(status));
276 
277   // clone(2) succeeded, we can use CLONE_NEWUSER.
278   return true;
279 }
280 
MoveToNewUserNS()281 bool Credentials::MoveToNewUserNS() {
282   uid_t uid;
283   gid_t gid;
284   if (!GetRESIds(&uid, &gid)) {
285     // If all the uids (or gids) are not equal to each other, the security
286     // model will most likely confuse the caller, abort.
287     DVLOG(1) << "uids or gids differ!";
288     return false;
289   }
290   int ret = sys_unshare(CLONE_NEWUSER);
291   if (ret) {
292     const int unshare_errno = errno;
293     VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
294             << "on this kernel.";
295     CheckCloneNewUserErrno(unshare_errno);
296     return false;
297   }
298 
299   if (NamespaceUtils::KernelSupportsDenySetgroups()) {
300     PCHECK(NamespaceUtils::DenySetgroups());
301   }
302 
303   // The current {r,e,s}{u,g}id is now an overflow id (c.f.
304   // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
305   DCHECK(GetRESIds(NULL, NULL));
306   const char kGidMapFile[] = "/proc/self/gid_map";
307   const char kUidMapFile[] = "/proc/self/uid_map";
308   PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid));
309   PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid));
310   DCHECK(GetRESIds(NULL, NULL));
311   return true;
312 }
313 
// Removes access to the file system by chroot()ing the process into a safe,
// empty directory (see ChrootToSafeEmptyDir()). |proc_fd| must be a valid
// (non-negative) descriptor; it is passed to ProcUtil::HasOpenDirectory() to
// verify that no directory is left open afterwards. Every step is enforced
// with CHECK(), so the function either returns true or crashes.
bool Credentials::DropFileSystemAccess(int proc_fd) {
  CHECK_LE(0, proc_fd);

  CHECK(ChrootToSafeEmptyDir());
  // Sanity check: /proc must no longer be visible after the chroot().
  CHECK(!base::DirectoryExists(base::FilePath("/proc")));
  // An open directory descriptor would be a way out of the chroot.
  CHECK(!ProcUtil::HasOpenDirectory(proc_fd));
  // We never let this function fail.
  return true;
}
323 
ForkAndDropCapabilitiesInChild()324 pid_t Credentials::ForkAndDropCapabilitiesInChild() {
325   pid_t pid = fork();
326   if (pid != 0) {
327     return pid;
328   }
329 
330   // Since we just forked, we are single threaded.
331   PCHECK(DropAllCapabilitiesOnCurrentThread());
332   return 0;
333 }
334 
335 }  // namespace sandbox.
336