1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "host/libs/process_monitor/process_monitor.h"
18
19 #ifdef __linux__
20 #include <sys/prctl.h>
21 #endif
22
23 #include <sys/types.h>
24 #include <sys/wait.h>
25
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include <vector>
39
40 #include <android-base/file.h>
41 #include <android-base/logging.h>
42
43 #include "common/libs/fs/shared_buf.h"
44 #include "common/libs/fs/shared_select.h"
45 #include "common/libs/utils/contains.h"
46 #include "common/libs/utils/result.h"
47 #include "common/libs/utils/subprocess.h"
48 #include "host/libs/command_util/runner/defs.h"
49 #include "host/libs/command_util/util.h"
50 #include "host/libs/config/cuttlefish_config.h"
51 #include "host/libs/config/known_paths.h"
52 #include "host/libs/process_monitor/process_monitor_channel.h"
53
54 namespace cuttlefish {
55
56 namespace {
57
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)58 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
59 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
60 if (WIFEXITED(wstatus)) {
61 LOG(INFO) << "Subprocess " << name << " (" << pid
62 << ") has exited with exit code " << WEXITSTATUS(wstatus);
63 } else if (WIFSIGNALED(wstatus)) {
64 int sig_num = WTERMSIG(wstatus);
65 LOG(ERROR) << "Subprocess " << name << " (" << pid
66 << ") was interrupted by a signal '" << strsignal(sig_num)
67 << "' (" << sig_num << ")";
68 } else {
69 LOG(INFO) << "subprocess " << name << " (" << pid
70 << ") has exited for unknown reasons";
71 }
72 }
73
LogSubprocessExit(const std::string & name,const siginfo_t & infop)74 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
75 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
76 if (infop.si_code == CLD_EXITED) {
77 LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
78 << ") has exited with exit code " << infop.si_status;
79 } else if (infop.si_code == CLD_KILLED) {
80 LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
81 << ") was interrupted by a signal '"
82 << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
83 } else {
84 LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
85 << ") has exited for unknown reasons (code = " << infop.si_code
86 << ", status = " << infop.si_status << ")";
87 }
88 }
89
MonitorLoop(const std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)90 Result<void> MonitorLoop(const std::atomic_bool& running,
91 std::mutex& properties_mutex,
92 const bool restart_subprocesses,
93 std::vector<MonitorEntry>& monitored) {
94 while (running.load()) {
95 int wstatus;
96 pid_t pid = wait(&wstatus);
97 int error_num = errno;
98 CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
99 if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
100 LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
101 << pid;
102 continue;
103 }
104 if (!running.load()) { // Avoid extra restarts near the end
105 break;
106 }
107 auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
108 std::unique_lock lock(properties_mutex);
109 auto it = std::find_if(monitored.begin(), monitored.end(), matches);
110 if (it == monitored.end()) {
111 LogSubprocessExit("(unknown)", pid, wstatus);
112 } else {
113 LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
114 if (restart_subprocesses) {
115 auto options = SubprocessOptions().InGroup(true);
116 // in the future, cmd->Start might not run exec()
117 it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
118 } else {
119 bool is_critical = it->is_critical;
120 monitored.erase(it);
121 if (running.load() && is_critical) {
122 LOG(ERROR) << "Stopping all monitored processes due to unexpected "
123 "exit of critical process";
124 Command stop_cmd(StopCvdBinary());
125 stop_cmd.Start();
126 }
127 }
128 }
129 }
130 return {};
131 }
132
StopSubprocesses(std::vector<MonitorEntry> & monitored)133 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
134 LOG(DEBUG) << "Stopping monitored subprocesses";
135 auto stop = [](const auto& it) {
136 auto stop_result = it.proc->Stop();
137 if (stop_result == StopperResult::kStopFailure) {
138 LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
139 return false;
140 }
141 siginfo_t infop;
142 auto success = it.proc->Wait(&infop, WEXITED);
143 if (success < 0) {
144 LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
145 return false;
146 }
147 if (stop_result == StopperResult::kStopCrash) {
148 LogSubprocessExit(it.cmd->GetShortName(), infop);
149 }
150 return true;
151 };
152 // Processes were started in the order they appear in the vector, stop them in
153 // reverse order for symmetry.
154 size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
155 CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
156 return {};
157 }
158
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,SharedFD child_monitor_socket)159 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
160 std::mutex& properties_mutex,
161 const SharedFD& channel_to_secure_env,
162 const bool is_suspend,
163 SharedFD child_monitor_socket) {
164 std::lock_guard lock(properties_mutex);
165 auto secure_env_itr = std::find_if(
166 monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
167 auto prog_name = android::base::Basename(entry.cmd->Executable());
168 return (prog_name == "secure_env");
169 });
170 if (secure_env_itr != monitor_entries.end()) {
171 CF_EXPECT(channel_to_secure_env->IsOpen(),
172 "channel to secure_env is not open.");
173 run_cvd::ExtendedLauncherAction extended_action;
174 if (is_suspend) {
175 extended_action.mutable_suspend();
176 } else {
177 extended_action.mutable_resume();
178 }
179 CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
180 std::nullopt));
181 }
182
183 for (const auto& entry : monitor_entries) {
184 if (!entry.cmd) {
185 LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
186 continue;
187 }
188 if (!entry.proc) {
189 LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
190 continue;
191 }
192 auto prog_name = android::base::Basename(entry.cmd->Executable());
193 auto process_restart_bin =
194 android::base::Basename(ProcessRestarterBinary());
195 if (prog_name == "log_tee") {
196 // Don't stop log_tee, we want to continue processing logs while
197 // suspended.
198 continue;
199 }
200 if (prog_name == "wmediumd") {
201 // wmediumd should be running while openWRT is saved using the
202 // guest snapshot logic
203 continue;
204 }
205 if (prog_name == "secure_env") {
206 // secure_env was handled above in a customized way
207 continue;
208 }
209
210 if (process_restart_bin == prog_name) {
211 if (is_suspend) {
212 CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
213 } else {
214 CF_EXPECT(entry.proc->SendSignal(SIGCONT));
215 }
216 continue;
217 }
218 if (is_suspend) {
219 CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
220 } else {
221 CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
222 }
223 }
224 using process_monitor_impl::ChildToParentResponse;
225 using process_monitor_impl::ChildToParentResponseType;
226 ChildToParentResponse response(ChildToParentResponseType::kSuccess);
227 CF_EXPECT(response.Write(child_monitor_socket));
228 return {};
229 }
230
231 } // namespace
232
StartSubprocesses(ProcessMonitor::Properties & properties)233 Result<void> ProcessMonitor::StartSubprocesses(
234 ProcessMonitor::Properties& properties) {
235 LOG(DEBUG) << "Starting monitored subprocesses";
236 for (auto& monitored : properties.entries_) {
237 LOG(INFO) << monitored.cmd->GetShortName();
238 auto options = SubprocessOptions().InGroup(true);
239 std::string short_name = monitored.cmd->GetShortName();
240 auto last_slash = short_name.find_last_of('/');
241 if (last_slash != std::string::npos) {
242 short_name = short_name.substr(last_slash + 1);
243 }
244 if (Contains(properties_.strace_commands_, short_name)) {
245 options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
246 }
247 if (properties.sandbox_processes_ && monitored.can_sandbox) {
248 options.SandboxArguments({
249 HostBinaryPath("process_sandboxer"),
250 "--log_dir=" + properties.strace_log_dir_,
251 "--host_artifacts_path=" + DefaultHostArtifactsPath(""),
252 });
253 }
254 monitored.proc.reset(
255 new Subprocess(monitored.cmd->Start(std::move(options))));
256 CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
257 }
258 return {};
259 }
260
ReadMonitorSocketLoop(std::atomic_bool & running)261 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
262 LOG(DEBUG) << "Waiting for a `stop` message from the parent";
263 while (running.load()) {
264 using process_monitor_impl::ParentToChildMessage;
265 auto message = CF_EXPECT(ParentToChildMessage::Read(child_monitor_socket_));
266 if (message.Stop()) {
267 running.store(false);
268 // Wake up the wait() loop by giving it an exited child process
269 if (fork() == 0) {
270 std::exit(0);
271 }
272 // will break the for-loop as running is now false
273 continue;
274 }
275 using process_monitor_impl::ParentToChildMessageType;
276 if (message.Type() == ParentToChildMessageType::kHostSuspend) {
277 CF_EXPECT(SuspendHostProcessesImpl());
278 continue;
279 }
280 if (message.Type() == ParentToChildMessageType::kHostResume) {
281 CF_EXPECT(ResumeHostProcessesImpl());
282 continue;
283 }
284 }
285 return {};
286 }
287
SuspendHostProcessesImpl()288 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
289 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
290 channel_to_secure_env_, /* is_suspend */ true,
291 child_monitor_socket_),
292 "Failed suspend");
293 return {};
294 }
295
ResumeHostProcessesImpl()296 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
297 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
298 channel_to_secure_env_, /* is_suspend */ false,
299 child_monitor_socket_),
300 "Failed resume");
301 return {};
302 }
303
RestartSubprocesses(bool r)304 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
305 bool r) & {
306 restart_subprocesses_ = r;
307 return *this;
308 }
309
RestartSubprocesses(bool r)310 ProcessMonitor::Properties ProcessMonitor::Properties::RestartSubprocesses(
311 bool r) && {
312 return std::move(RestartSubprocesses(r));
313 }
314
AddCommand(MonitorCommand cmd)315 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
316 MonitorCommand cmd) & {
317 auto& entry = entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
318 entry.can_sandbox = cmd.can_sandbox;
319 return *this;
320 }
321
AddCommand(MonitorCommand cmd)322 ProcessMonitor::Properties ProcessMonitor::Properties::AddCommand(
323 MonitorCommand cmd) && {
324 return std::move(AddCommand(std::move(cmd)));
325 }
326
StraceCommands(std::set<std::string> strace)327 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
328 std::set<std::string> strace) & {
329 strace_commands_ = std::move(strace);
330 return *this;
331 }
StraceCommands(std::set<std::string> strace)332 ProcessMonitor::Properties ProcessMonitor::Properties::StraceCommands(
333 std::set<std::string> strace) && {
334 return std::move(StraceCommands(std::move(strace)));
335 }
336
StraceLogDir(std::string log_dir)337 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
338 std::string log_dir) & {
339 strace_log_dir_ = std::move(log_dir);
340 return *this;
341 }
StraceLogDir(std::string log_dir)342 ProcessMonitor::Properties ProcessMonitor::Properties::StraceLogDir(
343 std::string log_dir) && {
344 return std::move(StraceLogDir(std::move(log_dir)));
345 }
346
SandboxProcesses(bool r)347 ProcessMonitor::Properties& ProcessMonitor::Properties::SandboxProcesses(
348 bool r) & {
349 sandbox_processes_ = r;
350 return *this;
351 }
SandboxProcesses(bool r)352 ProcessMonitor::Properties ProcessMonitor::Properties::SandboxProcesses(
353 bool r) && {
354 return std::move(SandboxProcesses(r));
355 }
356
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)357 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
358 const SharedFD& secure_env_fd)
359 : properties_(std::move(properties)),
360 channel_to_secure_env_(secure_env_fd),
361 monitor_(-1) {}
362
StopMonitoredProcesses()363 Result<void> ProcessMonitor::StopMonitoredProcesses() {
364 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
365 CF_EXPECT(parent_monitor_socket_->IsOpen(),
366 "The monitor socket is already closed");
367 using process_monitor_impl::ParentToChildMessage;
368 using process_monitor_impl::ParentToChildMessageType;
369 ParentToChildMessage message(ParentToChildMessageType::kStop);
370 CF_EXPECT(message.Write(parent_monitor_socket_));
371
372 pid_t last_monitor = monitor_;
373 monitor_ = -1;
374 parent_monitor_socket_->Close();
375 int wstatus;
376 CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
377 "Failed to wait for monitor process");
378 CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
379 CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
380 CF_EXPECT(WEXITSTATUS(wstatus) == 0,
381 "Monitor process exited with code " << WEXITSTATUS(wstatus));
382 return {};
383 }
384
SuspendMonitoredProcesses()385 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
386 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
387 CF_EXPECT(parent_monitor_socket_->IsOpen(),
388 "The monitor socket is already closed");
389 using process_monitor_impl::ParentToChildMessage;
390 using process_monitor_impl::ParentToChildMessageType;
391 ParentToChildMessage message(ParentToChildMessageType::kHostSuspend);
392 CF_EXPECT(message.Write(parent_monitor_socket_));
393 using process_monitor_impl::ChildToParentResponse;
394 auto response =
395 CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
396 CF_EXPECT(response.Success(),
397 "On kHostSuspend, the child run_cvd returned kFailure.");
398 return {};
399 }
400
ResumeMonitoredProcesses()401 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
402 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
403 CF_EXPECT(parent_monitor_socket_->IsOpen(),
404 "The monitor socket is already closed");
405 using process_monitor_impl::ParentToChildMessage;
406 using process_monitor_impl::ParentToChildMessageType;
407 ParentToChildMessage message(ParentToChildMessageType::kHostResume);
408 CF_EXPECT(message.Write(parent_monitor_socket_));
409 using process_monitor_impl::ChildToParentResponse;
410 auto response =
411 CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
412 CF_EXPECT(response.Success(),
413 "On kHostResume, the child run_cvd returned kFailure.");
414 return {};
415 }
416
StartAndMonitorProcesses()417 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
418 CF_EXPECT(monitor_ == -1, "The monitor process was already started");
419 CF_EXPECT(!parent_monitor_socket_->IsOpen(),
420 "Parent monitor socket was already opened");
421 SharedFD parent_sock;
422 SharedFD child_sock;
423 SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
424 monitor_ = fork();
425 if (monitor_ == 0) {
426 child_monitor_socket_ = std::move(child_sock);
427 parent_sock->Close();
428 auto monitor_result = MonitorRoutine();
429 if (!monitor_result.ok()) {
430 LOG(ERROR) << "Monitoring processes failed:\n"
431 << monitor_result.error().FormatForEnv();
432 }
433 std::exit(monitor_result.ok() ? 0 : 1);
434 } else {
435 parent_monitor_socket_ = std::move(parent_sock);
436 child_sock->Close();
437 return {};
438 }
439 }
440
MonitorRoutine()441 Result<void> ProcessMonitor::MonitorRoutine() {
442 #ifdef __linux__
443 // Make this process a subreaper to reliably catch subprocess exits.
444 // See https://man7.org/linux/man-pages/man2/prctl.2.html
445 prctl(PR_SET_CHILD_SUBREAPER, 1);
446 prctl(PR_SET_PDEATHSIG, SIGHUP); // Die when parent dies
447 #endif
448
449 LOG(DEBUG) << "Monitoring subprocesses";
450 CF_EXPECT(StartSubprocesses(properties_));
451
452 std::atomic_bool running(true);
453
454 auto read_monitor_socket_loop =
455 [this](std::atomic_bool& running) -> Result<void> {
456 CF_EXPECT(this->ReadMonitorSocketLoop(running));
457 return {};
458 };
459 auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
460 std::ref(running));
461
462 CF_EXPECT(MonitorLoop(running, properties_mutex_,
463 properties_.restart_subprocesses_,
464 properties_.entries_));
465 CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
466
467 CF_EXPECT(StopSubprocesses(properties_.entries_));
468 LOG(DEBUG) << "Done monitoring subprocesses";
469 return {};
470 }
471
472 } // namespace cuttlefish
473