1 /*
2 * Copyright (C) 2023 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "./execute.h"
18
19 #include <linux/securebits.h>
20 #include <linux/uio.h>
21 #include <seccomp_policy.h>
22 #include <sys/capability.h>
23 #include <sys/personality.h>
24 #include <sys/prctl.h>
25 #include <sys/ptrace.h>
26 #include <sys/wait.h>
27 #include <unistd.h>
28
29 #include <iostream>
30 #include <memory>
31
32 #include "./elf-utils.h"
33 #include "./registers.h"
34 #include "./shell-code.h"
35
36 namespace shell_as {
37
38 namespace {
39
40 // Capabilities are implemented as a 64-bit bit-vector. Therefore the maximum
41 // number of capabilities supported by a kernel is 64.
42 constexpr cap_value_t kMaxCapabilities = 64;
43
DropPreExecPrivileges(const shell_as::SecurityContext * context)44 bool DropPreExecPrivileges(const shell_as::SecurityContext* context) {
45 // The ordering here is important:
46 // (1) The platform's seccomp filters disallow setresgiud, so it must come
47 // before the seccomp drop.
48 // (2) Adding seccomp filters must happen before setresuid because setresuid
49 // drops some capabilities which are required for seccomp.
50 if (context->group_id.has_value() &&
51 setresgid(context->group_id.value(), context->group_id.value(),
52 context->group_id.value()) != 0) {
53 std::cerr << "Unable to set group id: " << context->group_id.value()
54 << std::endl;
55 return false;
56 }
57 if (context->supplementary_group_ids.has_value() &&
58 setgroups(context->supplementary_group_ids.value().size(),
59 context->supplementary_group_ids.value().data()) != 0) {
60 std::cerr << "Unable to set supplementary groups." << std::endl;
61 return false;
62 }
63
64 if (context->seccomp_filter.has_value()) {
65 switch (context->seccomp_filter.value()) {
66 case shell_as::kAppFilter:
67 set_app_seccomp_filter();
68 break;
69 case shell_as::kAppZygoteFilter:
70 set_app_zygote_seccomp_filter();
71 break;
72 case shell_as::kSystemFilter:
73 set_system_seccomp_filter();
74 break;
75 }
76 }
77
78 // This must be set prior to setresuid, otherwise that call will drop the
79 // permitted set of capabilities.
80 if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) {
81 std::cerr << "Unable to set keep capabilities." << std::endl;
82 return false;
83 }
84
85 if (context->user_id.has_value() &&
86 setresuid(context->user_id.value(), context->user_id.value(),
87 context->user_id.value()) != 0) {
88 std::cerr << "Unable to set user id: " << context->user_id.value()
89 << std::endl;
90 return false;
91 }
92
93 // Capabilities must be reacquired after setresuid since it still modifies
94 // capabilities, but it leaves the permitted set intact.
95 if (context->capabilities.has_value()) {
96 // The first step is to raise all the capabilities possible in all sets
97 // including the inheritable set. This defines the superset of possible
98 // capabilities that can be passed on after calling execve.
99 //
100 // The reason that all capabilities are raised in the inheritable set is due
101 // to a limitation of libcap. libcap may not contain a capability definition
102 // for all capabilities supported by the kernel. If this occurs, it will
103 // silently ignore requests to raise unknown capabilities via cap_set_flag.
104 //
105 // However, when parsing a cap_t from a text value, libcap will treat "all"
106 // as all possible 64 capability bits as set.
107 cap_t all_capabilities = cap_from_text("all+pie");
108 if (cap_set_proc(all_capabilities) != 0) {
109 std::cerr << "Unable to raise inheritable capability set." << std::endl;
110 cap_free(all_capabilities);
111 return false;
112 }
113 cap_free(all_capabilities);
114
115 // The second step is to raise the /desired/ capability subset in the
116 // ambient capability set. These are the capabilities that will actually be
117 // passed to the process after execve.
118 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) != 0) {
119 std::cerr << "Unable to clear ambient capabilities." << std::endl;
120 return false;
121 }
122 cap_t desired_capabilities = context->capabilities.value();
123 for (cap_value_t cap = 0; cap < kMaxCapabilities; cap++) {
124 // Skip capability values not supported by the kernel.
125 if (!CAP_IS_SUPPORTED(cap)) {
126 continue;
127 }
128 cap_flag_value_t value = CAP_CLEAR;
129 if (cap_get_flag(desired_capabilities, cap, CAP_PERMITTED, &value) == 0 &&
130 value == CAP_SET) {
131 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) != 0) {
132 std::cerr << "Unable to raise capability " << cap
133 << " in the ambient set." << std::endl;
134 return false;
135 }
136 }
137 }
138
139 // The final step is to raise the SECBIT_NOROOT flag. The kernel has special
140 // case logic that treats root calling execve differently than other users.
141 //
142 // By default all bits in the permitted set prior to calling execve will be
143 // raised after calling execve. This would ignore the work above and result
144 // in the process to have all capabilities.
145 //
146 // Setting the SECBIT_NOROOT disables this special casing for root and
147 // causes the kernel to treat it as any other UID.
148 int64_t secure_bits = prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
149 if (secure_bits < 0 ||
150 prctl(PR_SET_SECUREBITS, secure_bits | SECBIT_NOROOT, 0, 0, 0) != 0) {
151 std::cerr << "Unable to raise SECBIT_NOROOT." << std::endl;
152 return false;
153 }
154 }
155 return true;
156 }
157
// Returns the byte stored at |address| in the traced process |process|.
//
// PTRACE_PEEKDATA hands back a whole word, so the word starting at |address|
// is fetched and only its first byte (in memory order) is returned. The ptrace
// result is not checked for errors.
uint8_t ReadChildByte(const pid_t process, const uintptr_t address) {
  const uintptr_t word = ptrace(PTRACE_PEEKDATA, process, address, nullptr);
  const auto* first_byte = reinterpret_cast<const uint8_t*>(&word);
  return *first_byte;
}
162
// Stores |value| at |address| in the traced process |process|.
//
// ptrace only transfers whole words, so the word starting at |address| is
// read, its first byte (in memory order) is replaced, and the word is written
// back. Neither ptrace call is checked for errors.
void WriteChildByte(const pid_t process, const uintptr_t address,
                    const uint8_t value) {
  // Going byte by byte is not the fastest way to write into another process,
  // but it avoids special handling for different word sizes and for buffers
  // whose length is not a multiple of the native word size.
  uintptr_t word = ptrace(PTRACE_PEEKDATA, process, address, nullptr);
  *reinterpret_cast<uint8_t*>(&word) = value;
  ptrace(PTRACE_POKEDATA, process, address, word);
}
172
ReadChildMemory(const pid_t process,uintptr_t process_address,uint8_t * bytes,size_t byte_count)173 void ReadChildMemory(const pid_t process, uintptr_t process_address,
174 uint8_t* bytes, size_t byte_count) {
175 for (; byte_count != 0; byte_count--, bytes++, process_address++) {
176 *bytes = ReadChildByte(process, process_address);
177 }
178 }
179
WriteChildMemory(const pid_t process,uintptr_t process_address,uint8_t const * bytes,size_t byte_count)180 void WriteChildMemory(const pid_t process, uintptr_t process_address,
181 uint8_t const* bytes, size_t byte_count) {
182 for (; byte_count != 0; byte_count--, bytes++, process_address++) {
183 WriteChildByte(process, process_address, *bytes);
184 }
185 }
186
187 // Executes shell code in a target process.
188 //
189 // The following assumptions are made:
190 // * The process is currently being ptraced and that the process has already
191 // stopped.
192 // * The shell code will raise SIGSTOP when it has finished as signal that
193 // control flow should be handed back to the original code.
194 // * The shell code only alters registers and pushes values onto the stack.
195 //
196 // Execution is performed by overwriting the memory under the current
197 // instruction pointer with the shell code. After the shell code signals
198 // completion the original register state and memory are restored.
199 //
200 // If the above assumptions are met, then this function will leave the process
201 // in a stopped state that is equivalent to the original state.
ExecuteShellCode(const pid_t process,const uint8_t * shell_code,const size_t shell_code_size)202 bool ExecuteShellCode(const pid_t process, const uint8_t* shell_code,
203 const size_t shell_code_size) {
204 REGISTER_STRUCT registers;
205 struct iovec registers_iovec;
206 registers_iovec.iov_base = ®isters;
207 registers_iovec.iov_len = sizeof(REGISTER_STRUCT);
208 ptrace(PTRACE_GETREGSET, process, 1, ®isters_iovec);
209
210 std::unique_ptr<uint8_t[]> memory_backup(new uint8_t[shell_code_size]);
211 ReadChildMemory(process, PROGRAM_COUNTER(registers), memory_backup.get(),
212 shell_code_size);
213 WriteChildMemory(process, PROGRAM_COUNTER(registers), shell_code,
214 shell_code_size);
215
216 // Execute the shell code and wait for the signal that it has finished.
217 ptrace(PTRACE_CONT, process, NULL, NULL);
218 int status;
219 waitpid(process, &status, 0);
220 if (status >> 8 != SIGSTOP) {
221 std::cerr << "Failed to execute SELinux shellcode." << std::endl;
222 return false;
223 }
224
225 ptrace(PTRACE_SETREGSET, process, 1, ®isters_iovec);
226 WriteChildMemory(process, PROGRAM_COUNTER(registers), memory_backup.get(),
227 shell_code_size);
228 return true;
229 }
230
SetProgramCounter(const pid_t process_id,uint64_t program_counter)231 bool SetProgramCounter(const pid_t process_id, uint64_t program_counter) {
232 REGISTER_STRUCT registers;
233 struct iovec registers_iovec;
234 registers_iovec.iov_base = ®isters;
235 registers_iovec.iov_len = sizeof(REGISTER_STRUCT);
236 if (ptrace(PTRACE_GETREGSET, process_id, 1, ®isters_iovec) != 0) {
237 return false;
238 }
239 PROGRAM_COUNTER(registers) = program_counter;
240 if ((ptrace(PTRACE_SETREGSET, process_id, 1, ®isters_iovec)) != 0) {
241 return false;
242 }
243 return true;
244 }
245
StepToEntryPoint(const pid_t process_id)246 bool StepToEntryPoint(const pid_t process_id) {
247 bool is_arm_mode;
248 uint64_t entry_address;
249 if (!GetElfEntryPoint(process_id, &entry_address, &is_arm_mode)) {
250 std::cerr << "Not able to determine Elf entry point." << std::endl;
251 return false;
252 }
253 if (is_arm_mode) {
254 // TODO(willcoster): If there is a need to handle ARM mode instructions in
255 // addition to thumb instructions update this with ARM mode shell code.
256 std::cerr << "Attempting to run an ARM-mode binary. "
257 << "shell-as currently only supports thumb-mode. "
258 << "Bug willcoster@ if you run into this error." << std::endl;
259 return false;
260 }
261
262 int expected_signal = 0;
263 size_t trap_code_size = 0;
264 std::unique_ptr<uint8_t[]> trap_code =
265 GetTrapShellCode(&expected_signal, &trap_code_size);
266 std::unique_ptr<uint8_t[]> backup(new uint8_t[trap_code_size]);
267
268 // Set a break point at the entry point declared by the Elf file. When a
269 // statically linked binary is executed this will be the first instruction
270 // executed.
271 //
272 // When a dynamically linked binary is executed, the dynamic linker is
273 // executed first. This brings .so files into memory and resolves shared
274 // symbols. Once this process is finished, it jumps to the entry point
275 // declared in the Elf file.
276 ReadChildMemory(process_id, entry_address, backup.get(), trap_code_size);
277 WriteChildMemory(process_id, entry_address, trap_code.get(), trap_code_size);
278 ptrace(PTRACE_CONT, process_id, NULL, NULL);
279 int status;
280 waitpid(process_id, &status, 0);
281 if (status >> 8 != expected_signal) {
282 std::cerr << "Program exited unexpectedly while stepping to entry point."
283 << std::endl;
284 std::cerr << "Expected status " << expected_signal << " but encountered "
285 << (status >> 8) << std::endl;
286 return false;
287 }
288
289 if (!SetProgramCounter(process_id, entry_address)) {
290 return false;
291 }
292 WriteChildMemory(process_id, entry_address, backup.get(), trap_code_size);
293 return true;
294 }
295
296 } // namespace
297
ExecuteInContext(char * const executable_and_args[],const shell_as::SecurityContext * context)298 bool ExecuteInContext(char* const executable_and_args[],
299 const shell_as::SecurityContext* context) {
300 // Getting an executable running in a lower privileged context is tricky with
301 // SELinux. The recommended approach in the documentation is to use setexeccon
302 // which sets the context on the next execve call.
303 //
304 // However, this doesn't work for unprivileged processes like untrusted apps
305 // in Android because they are not allowed to execute most binaries.
306 //
307 // To work around this, ptrace is used to inject shell code into the new
308 // process just after it has executed an execve syscall. This shell code then
309 // sets the desired SELinux context.
310 pid_t child = fork();
311 if (child == 0) {
312 // Disabling ASLR makes it easier to determine the entry point of the target
313 // executable.
314 personality(ADDR_NO_RANDOMIZE);
315
316 // Drop the privileges that can be dropped before executing the new binary
317 // and exit early if there is an issue.
318 if (!DropPreExecPrivileges(context)) {
319 exit(1);
320 }
321
322 ptrace(PTRACE_TRACEME, 0, NULL, NULL);
323 raise(SIGSTOP); // Wait for the parent process to attach.
324 execv(executable_and_args[0], executable_and_args);
325 } else {
326 // Wait for the child to reach the SIGSTOP line above.
327 int status;
328 waitpid(child, &status, 0);
329 if ((status >> 8) != SIGSTOP) {
330 // If the first status is not SIGSTOP, then the child aborted early
331 // because it was not able to set the user and group IDs.
332 return false;
333 }
334
335 // Break inside the child's execv call.
336 ptrace(PTRACE_SETOPTIONS, child, NULL,
337 PTRACE_O_TRACEEXEC | PTRACE_O_EXITKILL);
338 ptrace(PTRACE_CONT, child, NULL, NULL);
339 waitpid(child, &status, 0);
340 if (status >> 8 != (SIGTRAP | PTRACE_EVENT_EXEC << 8)) {
341 std::cerr << "Failed to execute " << executable_and_args[0] << std::endl;
342 return false;
343 }
344
345 // Allow the dynamic linker to run before dropping to a lower SELinux
346 // context. This is required for executing in some very constrained domains
347 // like mediacodec.
348 //
349 // If the context was dropped before the dynamic linker runs, then when the
350 // linker attempts to read /proc/self/exe to determine dynamic symbol
351 // information, SELinux will kill the binary if the domain is not allowed to
352 // read the binary's executable file.
353 //
354 // This happens for example, when attempting to run any toybox binary (id,
355 // sh, etc) as mediacodec.
356 if (!StepToEntryPoint(child)) {
357 std::cerr << "Something bad happened stepping to the entry point."
358 << std::endl;
359 return false;
360 }
361
362 // Run the SELinux shellcode in the child process before the child can
363 // execute any instructions in the newly loaded executable.
364 if (context->selinux_context.has_value()) {
365 size_t shell_code_size;
366 std::unique_ptr<uint8_t[]> shell_code = GetSELinuxShellCode(
367 context->selinux_context.value(), &shell_code_size);
368 bool success = ExecuteShellCode(child, shell_code.get(), shell_code_size);
369 if (!success) {
370 return false;
371 }
372 }
373
374 // Resume and detach from the child now that the SELinux context has been
375 // updated.
376 ptrace(PTRACE_DETACH, child, NULL, NULL);
377 waitpid(child, nullptr, 0);
378 }
379 return true;
380 }
381
382 } // namespace shell_as
383