1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "aemu/base/HealthMonitor.h"
17 
18 #include <map>
19 
20 #include "aemu/base/system/System.h"
21 #include "aemu/base/testing/TestClock.h"
22 #include "host-common/logging.h"
23 #include "host-common/GfxstreamFatalError.h"
24 
25 namespace emugl {
26 
27 using android::base::AutoLock;
28 using android::base::MetricEventHang;
29 using android::base::MetricEventUnHang;
30 using android::base::TestClock;
31 using std::chrono::duration_cast;
32 using emugl::ABORT_REASON_OTHER;
33 using emugl::FatalError;
34 
35 template <class... Ts>
36 struct MonitoredEventVisitor : Ts... {
37     using Ts::operator()...;
38 };
39 template <class... Ts>
40 MonitoredEventVisitor(Ts...) -> MonitoredEventVisitor<Ts...>;
41 
42 template <class Clock>
HealthMonitor(MetricsLogger & metricsLogger,uint64_t heartbeatInterval)43 HealthMonitor<Clock>::HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval)
44     : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mLogger(metricsLogger) {
45     start();
46 }
47 
48 template <class Clock>
~HealthMonitor()49 HealthMonitor<Clock>::~HealthMonitor() {
50     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
51     {
52         AutoLock lock(mLock);
53         mEventQueue.push(std::move(event));
54     }
55     poll();
56     wait();
57 }
58 
59 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)60 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
61     std::unique_ptr<EventHangMetadata> metadata,
62     std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
63     uint64_t timeout, std::optional<Id> parentId) {
64     auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
65     if (timeout < intervalMs) {
66         WARN("Timeout value %d is too low (heartbeat is every %d). Increasing to %d", timeout,
67              intervalMs, intervalMs * 2);
68         timeout = intervalMs * 2;
69     }
70 
71     AutoLock lock(mLock);
72     auto id = mNextId++;
73     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
74         .id = id,
75         .metadata = std::move(metadata),
76         .timeOccurred = Clock::now(),
77         .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
78         .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
79         .parentId = parentId});
80     mEventQueue.push(std::move(event));
81     return id;
82 }
83 
84 template <class Clock>
touchMonitoredTask(Id id)85 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
86     auto event = std::make_unique<MonitoredEvent>(
87         typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
88     AutoLock lock(mLock);
89     mEventQueue.push(std::move(event));
90 }
91 
92 template <class Clock>
stopMonitoringTask(Id id)93 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
94     auto event = std::make_unique<MonitoredEvent>(
95         typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
96     AutoLock lock(mLock);
97     mEventQueue.push(std::move(event));
98 }
99 
100 template <class Clock>
poll()101 std::future<void> HealthMonitor<Clock>::poll() {
102     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
103     std::future<void> ret =
104         std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
105 
106     AutoLock lock(mLock);
107     mEventQueue.push(std::move(event));
108     mCv.signalAndUnlock(&lock);
109     return ret;
110 }
111 
112 // Thread's main loop
113 template <class Clock>
main()114 intptr_t HealthMonitor<Clock>::main() {
115     bool keepMonitoring = true;
116     std::queue<std::unique_ptr<MonitoredEvent>> events;
117 
118     while (keepMonitoring) {
119         std::vector<std::promise<void>> pollPromises;
120         std::unordered_set<Id> tasksToRemove;
121         int newHungTasks = mHungTasks;
122         {
123             AutoLock lock(mLock);
124             if (mEventQueue.empty()) {
125                 mCv.timedWait(
126                     &mLock,
127                     android::base::getUnixTimeUs() +
128                         std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
129             }
130             mEventQueue.swap(events);
131         }
132 
133         Timestamp now = Clock::now();
134         while (!events.empty()) {
135             auto event(std::move(events.front()));
136             events.pop();
137 
138             std::visit(MonitoredEventVisitor{
139                            [](std::monostate& event) {
140                                ERR("MonitoredEvent type not found");
141                                GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) <<
142                                    "MonitoredEvent type not found";
143                            },
144                            [this, &events](typename MonitoredEventType::Start& event) {
145                                auto it = mMonitoredTasks.find(event.id);
146                                if (it != mMonitoredTasks.end()) {
147                                    ERR("Registered multiple start events for task %d", event.id);
148                                    return;
149                                }
150                                if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
151                                                          mMonitoredTasks.end()) {
152                                    WARN("Requested parent task %d does not exist.",
153                                         event.parentId.value());
154                                    event.parentId = std::nullopt;
155                                }
156                                it = mMonitoredTasks
157                                         .emplace(event.id,
158                                                  std::move(MonitoredTask{
159                                                      .id = event.id,
160                                                      .timeoutTimestamp = event.timeOccurred +
161                                                                          event.timeoutThreshold,
162                                                      .timeoutThreshold = event.timeoutThreshold,
163                                                      .hungTimestamp = std::nullopt,
164                                                      .metadata = std::move(event.metadata),
165                                                      .onHangAnnotationsCallback =
166                                                          std::move(event.onHangAnnotationsCallback),
167                                                      .parentId = event.parentId}))
168                                         .first;
169                                updateTaskParent(events, it->second, event.timeOccurred);
170                            },
171                            [this, &events](typename MonitoredEventType::Touch& event) {
172                                auto it = mMonitoredTasks.find(event.id);
173                                if (it == mMonitoredTasks.end()) {
174                                    ERR("HealthMonitor has no task in progress for id %d", event.id);
175                                    return;
176                                }
177 
178                                auto& task = it->second;
179                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
180                                updateTaskParent(events, task, event.timeOccurred);
181                            },
182                            [this, &tasksToRemove,
183                             &events](typename MonitoredEventType::Stop& event) {
184                                auto it = mMonitoredTasks.find(event.id);
185                                if (it == mMonitoredTasks.end()) {
186                                    ERR("HealthMonitor has no task in progress for id %d", event.id);
187                                    return;
188                                }
189 
190                                auto& task = it->second;
191                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
192                                updateTaskParent(events, task, event.timeOccurred);
193 
194                                // Mark it for deletion, but retain it until the end of
195                                // the health check concurrent tasks hung
196                                tasksToRemove.insert(event.id);
197                            },
198                            [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
199                                keepMonitoring = false;
200                            },
201                            [&pollPromises](typename MonitoredEventType::Poll& event) {
202                                pollPromises.push_back(std::move(event.complete));
203                            }},
204                        *event);
205         }
206 
207         // Sort by what times out first. Identical timestamps are possible
208         std::multimap<Timestamp, uint64_t> sortedTasks;
209         for (auto& [_, task] : mMonitoredTasks) {
210             sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
211         }
212 
213         for (auto& [_, task_id] : sortedTasks) {
214             auto& task = mMonitoredTasks[task_id];
215             if (task.timeoutTimestamp < now) {
216                 // Newly hung task
217                 if (!task.hungTimestamp.has_value()) {
218                     // Copy over additional annotations captured at hangTime
219                     if (task.onHangAnnotationsCallback) {
220                         auto newAnnotations = (*task.onHangAnnotationsCallback)();
221                         task.metadata->mergeAnnotations(std::move(newAnnotations));
222                     }
223                     mLogger.logMetricEvent(MetricEventHang{.taskId = task.id,
224                                                            .metadata = task.metadata.get(),
225                                                            .otherHungTasks = newHungTasks});
226                     task.hungTimestamp = task.timeoutTimestamp;
227                     newHungTasks++;
228                 }
229             } else {
230                 // Task resumes
231                 if (task.hungTimestamp.has_value()) {
232                     newHungTasks--;
233                     auto hangTime = duration_cast<std::chrono::milliseconds>(
234                                         task.timeoutTimestamp -
235                                         (task.hungTimestamp.value() + task.timeoutThreshold))
236                                         .count();
237                     mLogger.logMetricEvent(MetricEventUnHang{.taskId = task.id,
238                                                              .metadata = task.metadata.get(),
239                                                              .hung_ms = hangTime,
240                                                              .otherHungTasks = newHungTasks});
241                     task.hungTimestamp = std::nullopt;
242                 }
243             }
244             if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
245                 mMonitoredTasks.erase(task_id);
246             }
247         }
248 
249         if (mHungTasks != newHungTasks) {
250             ERR("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
251                 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
252             mHungTasks = newHungTasks;
253         }
254 
255         for (auto& complete : pollPromises) {
256             complete.set_value();
257         }
258     }
259 
260     return 0;
261 }
262 
263 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)264 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
265                                             const MonitoredTask& task, Timestamp eventTime) {
266     std::optional<Id> parentId = task.parentId;
267     if (parentId) {
268         auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
269             .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
270         events.push(std::move(event));
271     }
272 }
273 
CreateHealthMonitor(MetricsLogger & metricsLogger,uint64_t heartbeatInterval)274 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(MetricsLogger& metricsLogger,
275                                                      uint64_t heartbeatInterval) {
276 #if ENABLE_HEALTH_MONITOR
277     INFO("HealthMonitor enabled.");
278     return std::make_unique<HealthMonitor<>>(metricsLogger, heartbeatInterval);
279 #else
280     INFO("HealthMonitor disabled.");
281     return nullptr;
282 #endif
283 }
284 
285 template class HealthMonitor<steady_clock>;
286 template class HealthMonitor<TestClock>;
287 
288 }  // namespace emugl
289