1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "host/libs/process_monitor/process_monitor.h"
18 
19 #ifdef __linux__
20 #include <sys/prctl.h>
21 #endif
22 
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <signal.h>
29 #include <stdio.h>
30 
31 #include <algorithm>
32 #include <atomic>
33 #include <cstdint>
34 #include <future>
35 #include <memory>
36 #include <string>
37 #include <thread>
38 #include <vector>
39 
40 #include <android-base/file.h>
41 #include <android-base/logging.h>
42 
43 #include "common/libs/fs/shared_buf.h"
44 #include "common/libs/fs/shared_select.h"
45 #include "common/libs/utils/contains.h"
46 #include "common/libs/utils/result.h"
47 #include "common/libs/utils/subprocess.h"
48 #include "host/libs/command_util/runner/defs.h"
49 #include "host/libs/command_util/util.h"
50 #include "host/libs/config/cuttlefish_config.h"
51 #include "host/libs/config/known_paths.h"
52 #include "host/libs/process_monitor/process_monitor_channel.h"
53 
54 namespace cuttlefish {
55 
56 namespace {
57 
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)58 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
59   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
60   if (WIFEXITED(wstatus)) {
61     LOG(INFO) << "Subprocess " << name << " (" << pid
62               << ") has exited with exit code " << WEXITSTATUS(wstatus);
63   } else if (WIFSIGNALED(wstatus)) {
64     int sig_num = WTERMSIG(wstatus);
65     LOG(ERROR) << "Subprocess " << name << " (" << pid
66                << ") was interrupted by a signal '" << strsignal(sig_num)
67                << "' (" << sig_num << ")";
68   } else {
69     LOG(INFO) << "subprocess " << name << " (" << pid
70               << ") has exited for unknown reasons";
71   }
72 }
73 
LogSubprocessExit(const std::string & name,const siginfo_t & infop)74 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
75   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
76   if (infop.si_code == CLD_EXITED) {
77     LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
78               << ") has exited with exit code " << infop.si_status;
79   } else if (infop.si_code == CLD_KILLED) {
80     LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
81                << ") was interrupted by a signal '"
82                << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
83   } else {
84     LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
85               << ") has exited for unknown reasons (code = " << infop.si_code
86               << ", status = " << infop.si_status << ")";
87   }
88 }
89 
MonitorLoop(const std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)90 Result<void> MonitorLoop(const std::atomic_bool& running,
91                          std::mutex& properties_mutex,
92                          const bool restart_subprocesses,
93                          std::vector<MonitorEntry>& monitored) {
94   while (running.load()) {
95     int wstatus;
96     pid_t pid = wait(&wstatus);
97     int error_num = errno;
98     CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
99     if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
100       LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
101                  << pid;
102       continue;
103     }
104     if (!running.load()) {  // Avoid extra restarts near the end
105       break;
106     }
107     auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
108     std::unique_lock lock(properties_mutex);
109     auto it = std::find_if(monitored.begin(), monitored.end(), matches);
110     if (it == monitored.end()) {
111       LogSubprocessExit("(unknown)", pid, wstatus);
112     } else {
113       LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
114       if (restart_subprocesses) {
115         auto options = SubprocessOptions().InGroup(true);
116         // in the future, cmd->Start might not run exec()
117         it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
118       } else {
119         bool is_critical = it->is_critical;
120         monitored.erase(it);
121         if (running.load() && is_critical) {
122           LOG(ERROR) << "Stopping all monitored processes due to unexpected "
123                         "exit of critical process";
124           Command stop_cmd(StopCvdBinary());
125           stop_cmd.Start();
126         }
127       }
128     }
129   }
130   return {};
131 }
132 
StopSubprocesses(std::vector<MonitorEntry> & monitored)133 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
134   LOG(DEBUG) << "Stopping monitored subprocesses";
135   auto stop = [](const auto& it) {
136     auto stop_result = it.proc->Stop();
137     if (stop_result == StopperResult::kStopFailure) {
138       LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
139       return false;
140     }
141     siginfo_t infop;
142     auto success = it.proc->Wait(&infop, WEXITED);
143     if (success < 0) {
144       LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
145       return false;
146     }
147     if (stop_result == StopperResult::kStopCrash) {
148       LogSubprocessExit(it.cmd->GetShortName(), infop);
149     }
150     return true;
151   };
152   // Processes were started in the order they appear in the vector, stop them in
153   // reverse order for symmetry.
154   size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
155   CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
156   return {};
157 }
158 
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,SharedFD child_monitor_socket)159 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
160                                std::mutex& properties_mutex,
161                                const SharedFD& channel_to_secure_env,
162                                const bool is_suspend,
163                                SharedFD child_monitor_socket) {
164   std::lock_guard lock(properties_mutex);
165   auto secure_env_itr = std::find_if(
166       monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
167         auto prog_name = android::base::Basename(entry.cmd->Executable());
168         return (prog_name == "secure_env");
169       });
170   if (secure_env_itr != monitor_entries.end()) {
171     CF_EXPECT(channel_to_secure_env->IsOpen(),
172               "channel to secure_env is not open.");
173     run_cvd::ExtendedLauncherAction extended_action;
174     if (is_suspend) {
175       extended_action.mutable_suspend();
176     } else {
177       extended_action.mutable_resume();
178     }
179     CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
180                                 std::nullopt));
181   }
182 
183   for (const auto& entry : monitor_entries) {
184     if (!entry.cmd) {
185       LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
186       continue;
187     }
188     if (!entry.proc) {
189       LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
190       continue;
191     }
192     auto prog_name = android::base::Basename(entry.cmd->Executable());
193     auto process_restart_bin =
194         android::base::Basename(ProcessRestarterBinary());
195     if (prog_name == "log_tee") {
196       // Don't stop log_tee, we want to continue processing logs while
197       // suspended.
198       continue;
199     }
200     if (prog_name == "wmediumd") {
201       // wmediumd should be running while openWRT is saved using the
202       // guest snapshot logic
203       continue;
204     }
205     if (prog_name == "secure_env") {
206       // secure_env was handled above in a customized way
207       continue;
208     }
209 
210     if (process_restart_bin == prog_name) {
211       if (is_suspend) {
212         CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
213       } else {
214         CF_EXPECT(entry.proc->SendSignal(SIGCONT));
215       }
216       continue;
217     }
218     if (is_suspend) {
219       CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
220     } else {
221       CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
222     }
223   }
224   using process_monitor_impl::ChildToParentResponse;
225   using process_monitor_impl::ChildToParentResponseType;
226   ChildToParentResponse response(ChildToParentResponseType::kSuccess);
227   CF_EXPECT(response.Write(child_monitor_socket));
228   return {};
229 }
230 
231 }  // namespace
232 
StartSubprocesses(ProcessMonitor::Properties & properties)233 Result<void> ProcessMonitor::StartSubprocesses(
234     ProcessMonitor::Properties& properties) {
235   LOG(DEBUG) << "Starting monitored subprocesses";
236   for (auto& monitored : properties.entries_) {
237     LOG(INFO) << monitored.cmd->GetShortName();
238     auto options = SubprocessOptions().InGroup(true);
239     std::string short_name = monitored.cmd->GetShortName();
240     auto last_slash = short_name.find_last_of('/');
241     if (last_slash != std::string::npos) {
242       short_name = short_name.substr(last_slash + 1);
243     }
244     if (Contains(properties_.strace_commands_, short_name)) {
245       options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
246     }
247     if (properties.sandbox_processes_ && monitored.can_sandbox) {
248       options.SandboxArguments({
249           HostBinaryPath("process_sandboxer"),
250           "--log_dir=" + properties.strace_log_dir_,
251           "--host_artifacts_path=" + DefaultHostArtifactsPath(""),
252       });
253     }
254     monitored.proc.reset(
255         new Subprocess(monitored.cmd->Start(std::move(options))));
256     CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
257   }
258   return {};
259 }
260 
ReadMonitorSocketLoop(std::atomic_bool & running)261 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
262   LOG(DEBUG) << "Waiting for a `stop` message from the parent";
263   while (running.load()) {
264     using process_monitor_impl::ParentToChildMessage;
265     auto message = CF_EXPECT(ParentToChildMessage::Read(child_monitor_socket_));
266     if (message.Stop()) {
267       running.store(false);
268       // Wake up the wait() loop by giving it an exited child process
269       if (fork() == 0) {
270         std::exit(0);
271       }
272       // will break the for-loop as running is now false
273       continue;
274     }
275     using process_monitor_impl::ParentToChildMessageType;
276     if (message.Type() == ParentToChildMessageType::kHostSuspend) {
277       CF_EXPECT(SuspendHostProcessesImpl());
278       continue;
279     }
280     if (message.Type() == ParentToChildMessageType::kHostResume) {
281       CF_EXPECT(ResumeHostProcessesImpl());
282       continue;
283     }
284   }
285   return {};
286 }
287 
SuspendHostProcessesImpl()288 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
289   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
290                               channel_to_secure_env_, /* is_suspend */ true,
291                               child_monitor_socket_),
292             "Failed suspend");
293   return {};
294 }
295 
ResumeHostProcessesImpl()296 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
297   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
298                               channel_to_secure_env_, /* is_suspend */ false,
299                               child_monitor_socket_),
300             "Failed resume");
301   return {};
302 }
303 
RestartSubprocesses(bool r)304 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
305     bool r) & {
306   restart_subprocesses_ = r;
307   return *this;
308 }
309 
RestartSubprocesses(bool r)310 ProcessMonitor::Properties ProcessMonitor::Properties::RestartSubprocesses(
311     bool r) && {
312   return std::move(RestartSubprocesses(r));
313 }
314 
AddCommand(MonitorCommand cmd)315 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
316     MonitorCommand cmd) & {
317   auto& entry = entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
318   entry.can_sandbox = cmd.can_sandbox;
319   return *this;
320 }
321 
AddCommand(MonitorCommand cmd)322 ProcessMonitor::Properties ProcessMonitor::Properties::AddCommand(
323     MonitorCommand cmd) && {
324   return std::move(AddCommand(std::move(cmd)));
325 }
326 
StraceCommands(std::set<std::string> strace)327 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
328     std::set<std::string> strace) & {
329   strace_commands_ = std::move(strace);
330   return *this;
331 }
StraceCommands(std::set<std::string> strace)332 ProcessMonitor::Properties ProcessMonitor::Properties::StraceCommands(
333     std::set<std::string> strace) && {
334   return std::move(StraceCommands(std::move(strace)));
335 }
336 
StraceLogDir(std::string log_dir)337 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
338     std::string log_dir) & {
339   strace_log_dir_ = std::move(log_dir);
340   return *this;
341 }
StraceLogDir(std::string log_dir)342 ProcessMonitor::Properties ProcessMonitor::Properties::StraceLogDir(
343     std::string log_dir) && {
344   return std::move(StraceLogDir(std::move(log_dir)));
345 }
346 
SandboxProcesses(bool r)347 ProcessMonitor::Properties& ProcessMonitor::Properties::SandboxProcesses(
348     bool r) & {
349   sandbox_processes_ = r;
350   return *this;
351 }
SandboxProcesses(bool r)352 ProcessMonitor::Properties ProcessMonitor::Properties::SandboxProcesses(
353     bool r) && {
354   return std::move(SandboxProcesses(r));
355 }
356 
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)357 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
358                                const SharedFD& secure_env_fd)
359     : properties_(std::move(properties)),
360       channel_to_secure_env_(secure_env_fd),
361       monitor_(-1) {}
362 
StopMonitoredProcesses()363 Result<void> ProcessMonitor::StopMonitoredProcesses() {
364   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
365   CF_EXPECT(parent_monitor_socket_->IsOpen(),
366             "The monitor socket is already closed");
367   using process_monitor_impl::ParentToChildMessage;
368   using process_monitor_impl::ParentToChildMessageType;
369   ParentToChildMessage message(ParentToChildMessageType::kStop);
370   CF_EXPECT(message.Write(parent_monitor_socket_));
371 
372   pid_t last_monitor = monitor_;
373   monitor_ = -1;
374   parent_monitor_socket_->Close();
375   int wstatus;
376   CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
377             "Failed to wait for monitor process");
378   CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
379   CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
380   CF_EXPECT(WEXITSTATUS(wstatus) == 0,
381             "Monitor process exited with code " << WEXITSTATUS(wstatus));
382   return {};
383 }
384 
SuspendMonitoredProcesses()385 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
386   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
387   CF_EXPECT(parent_monitor_socket_->IsOpen(),
388             "The monitor socket is already closed");
389   using process_monitor_impl::ParentToChildMessage;
390   using process_monitor_impl::ParentToChildMessageType;
391   ParentToChildMessage message(ParentToChildMessageType::kHostSuspend);
392   CF_EXPECT(message.Write(parent_monitor_socket_));
393   using process_monitor_impl::ChildToParentResponse;
394   auto response =
395       CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
396   CF_EXPECT(response.Success(),
397             "On kHostSuspend, the child run_cvd returned kFailure.");
398   return {};
399 }
400 
ResumeMonitoredProcesses()401 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
402   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
403   CF_EXPECT(parent_monitor_socket_->IsOpen(),
404             "The monitor socket is already closed");
405   using process_monitor_impl::ParentToChildMessage;
406   using process_monitor_impl::ParentToChildMessageType;
407   ParentToChildMessage message(ParentToChildMessageType::kHostResume);
408   CF_EXPECT(message.Write(parent_monitor_socket_));
409   using process_monitor_impl::ChildToParentResponse;
410   auto response =
411       CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
412   CF_EXPECT(response.Success(),
413             "On kHostResume, the child run_cvd returned kFailure.");
414   return {};
415 }
416 
StartAndMonitorProcesses()417 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
418   CF_EXPECT(monitor_ == -1, "The monitor process was already started");
419   CF_EXPECT(!parent_monitor_socket_->IsOpen(),
420             "Parent monitor socket was already opened");
421   SharedFD parent_sock;
422   SharedFD child_sock;
423   SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
424   monitor_ = fork();
425   if (monitor_ == 0) {
426     child_monitor_socket_ = std::move(child_sock);
427     parent_sock->Close();
428     auto monitor_result = MonitorRoutine();
429     if (!monitor_result.ok()) {
430       LOG(ERROR) << "Monitoring processes failed:\n"
431                  << monitor_result.error().FormatForEnv();
432     }
433     std::exit(monitor_result.ok() ? 0 : 1);
434   } else {
435     parent_monitor_socket_ = std::move(parent_sock);
436     child_sock->Close();
437     return {};
438   }
439 }
440 
MonitorRoutine()441 Result<void> ProcessMonitor::MonitorRoutine() {
442 #ifdef __linux__
443   // Make this process a subreaper to reliably catch subprocess exits.
444   // See https://man7.org/linux/man-pages/man2/prctl.2.html
445   prctl(PR_SET_CHILD_SUBREAPER, 1);
446   prctl(PR_SET_PDEATHSIG, SIGHUP);  // Die when parent dies
447 #endif
448 
449   LOG(DEBUG) << "Monitoring subprocesses";
450   CF_EXPECT(StartSubprocesses(properties_));
451 
452   std::atomic_bool running(true);
453 
454   auto read_monitor_socket_loop =
455       [this](std::atomic_bool& running) -> Result<void> {
456     CF_EXPECT(this->ReadMonitorSocketLoop(running));
457     return {};
458   };
459   auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
460                                  std::ref(running));
461 
462   CF_EXPECT(MonitorLoop(running, properties_mutex_,
463                         properties_.restart_subprocesses_,
464                         properties_.entries_));
465   CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
466 
467   CF_EXPECT(StopSubprocesses(properties_.entries_));
468   LOG(DEBUG) << "Done monitoring subprocesses";
469   return {};
470 }
471 
472 }  // namespace cuttlefish
473