1 /*
2 * Copyright (C) 2022 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "aemu/base/HealthMonitor.h"
17
18 #include <map>
19
20 #include "aemu/base/system/System.h"
21 #include "aemu/base/testing/TestClock.h"
22 #include "host-common/logging.h"
23 #include "host-common/GfxstreamFatalError.h"
24
25 namespace emugl {
26
27 using android::base::AutoLock;
28 using android::base::MetricEventHang;
29 using android::base::MetricEventUnHang;
30 using android::base::TestClock;
31 using std::chrono::duration_cast;
32 using emugl::ABORT_REASON_OTHER;
33 using emugl::FatalError;
34
35 template <class... Ts>
36 struct MonitoredEventVisitor : Ts... {
37 using Ts::operator()...;
38 };
39 template <class... Ts>
40 MonitoredEventVisitor(Ts...) -> MonitoredEventVisitor<Ts...>;
41
42 template <class Clock>
HealthMonitor(MetricsLogger & metricsLogger,uint64_t heartbeatInterval)43 HealthMonitor<Clock>::HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval)
44 : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mLogger(metricsLogger) {
45 start();
46 }
47
48 template <class Clock>
~HealthMonitor()49 HealthMonitor<Clock>::~HealthMonitor() {
50 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
51 {
52 AutoLock lock(mLock);
53 mEventQueue.push(std::move(event));
54 }
55 poll();
56 wait();
57 }
58
59 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)60 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
61 std::unique_ptr<EventHangMetadata> metadata,
62 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
63 uint64_t timeout, std::optional<Id> parentId) {
64 auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
65 if (timeout < intervalMs) {
66 WARN("Timeout value %d is too low (heartbeat is every %d). Increasing to %d", timeout,
67 intervalMs, intervalMs * 2);
68 timeout = intervalMs * 2;
69 }
70
71 AutoLock lock(mLock);
72 auto id = mNextId++;
73 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
74 .id = id,
75 .metadata = std::move(metadata),
76 .timeOccurred = Clock::now(),
77 .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
78 .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
79 .parentId = parentId});
80 mEventQueue.push(std::move(event));
81 return id;
82 }
83
84 template <class Clock>
touchMonitoredTask(Id id)85 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
86 auto event = std::make_unique<MonitoredEvent>(
87 typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
88 AutoLock lock(mLock);
89 mEventQueue.push(std::move(event));
90 }
91
92 template <class Clock>
stopMonitoringTask(Id id)93 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
94 auto event = std::make_unique<MonitoredEvent>(
95 typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
96 AutoLock lock(mLock);
97 mEventQueue.push(std::move(event));
98 }
99
100 template <class Clock>
poll()101 std::future<void> HealthMonitor<Clock>::poll() {
102 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
103 std::future<void> ret =
104 std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
105
106 AutoLock lock(mLock);
107 mEventQueue.push(std::move(event));
108 mCv.signalAndUnlock(&lock);
109 return ret;
110 }
111
112 // Thread's main loop
113 template <class Clock>
main()114 intptr_t HealthMonitor<Clock>::main() {
115 bool keepMonitoring = true;
116 std::queue<std::unique_ptr<MonitoredEvent>> events;
117
118 while (keepMonitoring) {
119 std::vector<std::promise<void>> pollPromises;
120 std::unordered_set<Id> tasksToRemove;
121 int newHungTasks = mHungTasks;
122 {
123 AutoLock lock(mLock);
124 if (mEventQueue.empty()) {
125 mCv.timedWait(
126 &mLock,
127 android::base::getUnixTimeUs() +
128 std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
129 }
130 mEventQueue.swap(events);
131 }
132
133 Timestamp now = Clock::now();
134 while (!events.empty()) {
135 auto event(std::move(events.front()));
136 events.pop();
137
138 std::visit(MonitoredEventVisitor{
139 [](std::monostate& event) {
140 ERR("MonitoredEvent type not found");
141 GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) <<
142 "MonitoredEvent type not found";
143 },
144 [this, &events](typename MonitoredEventType::Start& event) {
145 auto it = mMonitoredTasks.find(event.id);
146 if (it != mMonitoredTasks.end()) {
147 ERR("Registered multiple start events for task %d", event.id);
148 return;
149 }
150 if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
151 mMonitoredTasks.end()) {
152 WARN("Requested parent task %d does not exist.",
153 event.parentId.value());
154 event.parentId = std::nullopt;
155 }
156 it = mMonitoredTasks
157 .emplace(event.id,
158 std::move(MonitoredTask{
159 .id = event.id,
160 .timeoutTimestamp = event.timeOccurred +
161 event.timeoutThreshold,
162 .timeoutThreshold = event.timeoutThreshold,
163 .hungTimestamp = std::nullopt,
164 .metadata = std::move(event.metadata),
165 .onHangAnnotationsCallback =
166 std::move(event.onHangAnnotationsCallback),
167 .parentId = event.parentId}))
168 .first;
169 updateTaskParent(events, it->second, event.timeOccurred);
170 },
171 [this, &events](typename MonitoredEventType::Touch& event) {
172 auto it = mMonitoredTasks.find(event.id);
173 if (it == mMonitoredTasks.end()) {
174 ERR("HealthMonitor has no task in progress for id %d", event.id);
175 return;
176 }
177
178 auto& task = it->second;
179 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
180 updateTaskParent(events, task, event.timeOccurred);
181 },
182 [this, &tasksToRemove,
183 &events](typename MonitoredEventType::Stop& event) {
184 auto it = mMonitoredTasks.find(event.id);
185 if (it == mMonitoredTasks.end()) {
186 ERR("HealthMonitor has no task in progress for id %d", event.id);
187 return;
188 }
189
190 auto& task = it->second;
191 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
192 updateTaskParent(events, task, event.timeOccurred);
193
194 // Mark it for deletion, but retain it until the end of
195 // the health check concurrent tasks hung
196 tasksToRemove.insert(event.id);
197 },
198 [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
199 keepMonitoring = false;
200 },
201 [&pollPromises](typename MonitoredEventType::Poll& event) {
202 pollPromises.push_back(std::move(event.complete));
203 }},
204 *event);
205 }
206
207 // Sort by what times out first. Identical timestamps are possible
208 std::multimap<Timestamp, uint64_t> sortedTasks;
209 for (auto& [_, task] : mMonitoredTasks) {
210 sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
211 }
212
213 for (auto& [_, task_id] : sortedTasks) {
214 auto& task = mMonitoredTasks[task_id];
215 if (task.timeoutTimestamp < now) {
216 // Newly hung task
217 if (!task.hungTimestamp.has_value()) {
218 // Copy over additional annotations captured at hangTime
219 if (task.onHangAnnotationsCallback) {
220 auto newAnnotations = (*task.onHangAnnotationsCallback)();
221 task.metadata->mergeAnnotations(std::move(newAnnotations));
222 }
223 mLogger.logMetricEvent(MetricEventHang{.taskId = task.id,
224 .metadata = task.metadata.get(),
225 .otherHungTasks = newHungTasks});
226 task.hungTimestamp = task.timeoutTimestamp;
227 newHungTasks++;
228 }
229 } else {
230 // Task resumes
231 if (task.hungTimestamp.has_value()) {
232 newHungTasks--;
233 auto hangTime = duration_cast<std::chrono::milliseconds>(
234 task.timeoutTimestamp -
235 (task.hungTimestamp.value() + task.timeoutThreshold))
236 .count();
237 mLogger.logMetricEvent(MetricEventUnHang{.taskId = task.id,
238 .metadata = task.metadata.get(),
239 .hung_ms = hangTime,
240 .otherHungTasks = newHungTasks});
241 task.hungTimestamp = std::nullopt;
242 }
243 }
244 if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
245 mMonitoredTasks.erase(task_id);
246 }
247 }
248
249 if (mHungTasks != newHungTasks) {
250 ERR("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
251 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
252 mHungTasks = newHungTasks;
253 }
254
255 for (auto& complete : pollPromises) {
256 complete.set_value();
257 }
258 }
259
260 return 0;
261 }
262
263 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)264 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
265 const MonitoredTask& task, Timestamp eventTime) {
266 std::optional<Id> parentId = task.parentId;
267 if (parentId) {
268 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
269 .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
270 events.push(std::move(event));
271 }
272 }
273
CreateHealthMonitor(MetricsLogger & metricsLogger,uint64_t heartbeatInterval)274 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(MetricsLogger& metricsLogger,
275 uint64_t heartbeatInterval) {
276 #if ENABLE_HEALTH_MONITOR
277 INFO("HealthMonitor enabled.");
278 return std::make_unique<HealthMonitor<>>(metricsLogger, heartbeatInterval);
279 #else
280 INFO("HealthMonitor disabled.");
281 return nullptr;
282 #endif
283 }
284
285 template class HealthMonitor<steady_clock>;
286 template class HealthMonitor<TestClock>;
287
288 } // namespace emugl
289