1 /*
2 * Copyright (C) 2022 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "aemu/base/AndroidHealthMonitor.h"
17
18 #include <map>
19 #include <sys/time.h>
20
21 namespace gfxstream {
22 namespace guest {
23
24 using gfxstream::guest::AutoLock;
25 using std::chrono::duration_cast;
26
// Overload set assembled from several callables. Inheriting every handler's
// call operator lets std::visit pick the handler whose parameter type matches
// the active alternative of the visited variant.
template <typename... Handlers>
struct MonitoredEventVisitor : Handlers... {
    using Handlers::operator()...;
};
// Deduction guide: infer the handler types from the constructor arguments so
// callers can write MonitoredEventVisitor{lambda1, lambda2, ...}.
template <typename... Handlers>
MonitoredEventVisitor(Handlers...) -> MonitoredEventVisitor<Handlers...>;
33
34 template <class Clock>
HealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)35 HealthMonitor<Clock>::HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval)
36 : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mConsumer(consumer) {
37 start();
38 }
39
40 template <class Clock>
~HealthMonitor()41 HealthMonitor<Clock>::~HealthMonitor() {
42 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
43 {
44 AutoLock lock(mLock);
45 mEventQueue.push(std::move(event));
46 }
47 poll();
48 wait();
49 }
50
51 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)52 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
53 std::unique_ptr<EventHangMetadata> metadata,
54 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
55 uint64_t timeout, std::optional<Id> parentId) {
56 auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
57 if (timeout < intervalMs) {
58 ALOGW("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
59 (unsigned long long)timeout, (unsigned long long) intervalMs,
60 (unsigned long long)intervalMs * 2);
61 timeout = intervalMs * 2;
62 }
63
64 AutoLock lock(mLock);
65 auto id = mNextId++;
66 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
67 .id = id,
68 .metadata = std::move(metadata),
69 .timeOccurred = Clock::now(),
70 .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
71 .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
72 .parentId = parentId});
73 mEventQueue.push(std::move(event));
74 return id;
75 }
76
77 template <class Clock>
touchMonitoredTask(Id id)78 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
79 auto event = std::make_unique<MonitoredEvent>(
80 typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
81 AutoLock lock(mLock);
82 mEventQueue.push(std::move(event));
83 }
84
85 template <class Clock>
stopMonitoringTask(Id id)86 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
87 auto event = std::make_unique<MonitoredEvent>(
88 typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
89 AutoLock lock(mLock);
90 mEventQueue.push(std::move(event));
91 }
92
93 template <class Clock>
poll()94 std::future<void> HealthMonitor<Clock>::poll() {
95 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
96 std::future<void> ret =
97 std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
98
99 AutoLock lock(mLock);
100 mEventQueue.push(std::move(event));
101 mCv.signalAndUnlock(&lock);
102 return ret;
103 }
104
105 // Thread's main loop
106 template <class Clock>
main()107 intptr_t HealthMonitor<Clock>::main() {
108 bool keepMonitoring = true;
109 std::queue<std::unique_ptr<MonitoredEvent>> events;
110
111 while (keepMonitoring) {
112 std::vector<std::promise<void>> pollPromises;
113 std::unordered_set<Id> tasksToRemove;
114 int newHungTasks = mHungTasks;
115 {
116 AutoLock lock(mLock);
117 struct timeval currentTime;
118 gettimeofday(¤tTime, 0);
119 if (mEventQueue.empty()) {
120 mCv.timedWait(
121 &mLock,
122 currentTime.tv_sec * 1000000LL + currentTime.tv_usec +
123 std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
124 }
125 mEventQueue.swap(events);
126 }
127
128 Timestamp now = Clock::now();
129 while (!events.empty()) {
130 auto event(std::move(events.front()));
131 events.pop();
132
133 std::visit(MonitoredEventVisitor{
134 [](std::monostate&) {
135 ALOGE("MonitoredEvent type not found");
136 abort();
137 },
138 [this, &events](typename MonitoredEventType::Start& event) {
139 auto it = mMonitoredTasks.find(event.id);
140 if (it != mMonitoredTasks.end()) {
141 ALOGE("Registered multiple start events for task %llu",
142 (unsigned long long)event.id);
143 return;
144 }
145 if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
146 mMonitoredTasks.end()) {
147 ALOGW("Requested parent task %llu does not exist.",
148 (unsigned long long)event.parentId.value());
149 event.parentId = std::nullopt;
150 }
151 it = mMonitoredTasks
152 .emplace(event.id,
153 std::move(MonitoredTask{
154 .id = event.id,
155 .timeoutTimestamp = event.timeOccurred +
156 event.timeoutThreshold,
157 .timeoutThreshold = event.timeoutThreshold,
158 .hungTimestamp = std::nullopt,
159 .metadata = std::move(event.metadata),
160 .onHangAnnotationsCallback =
161 std::move(event.onHangAnnotationsCallback),
162 .parentId = event.parentId}))
163 .first;
164 updateTaskParent(events, it->second, event.timeOccurred);
165 },
166 [this, &events](typename MonitoredEventType::Touch& event) {
167 auto it = mMonitoredTasks.find(event.id);
168 if (it == mMonitoredTasks.end()) {
169 ALOGE("HealthMonitor has no task in progress for id %llu",
170 (unsigned long long)event.id);
171 return;
172 }
173
174 auto& task = it->second;
175 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
176 updateTaskParent(events, task, event.timeOccurred);
177 },
178 [this, &tasksToRemove,
179 &events](typename MonitoredEventType::Stop& event) {
180 auto it = mMonitoredTasks.find(event.id);
181 if (it == mMonitoredTasks.end()) {
182 ALOGE("HealthMonitor has no task in progress for id %llu",
183 (unsigned long long)event.id);
184 return;
185 }
186
187 auto& task = it->second;
188 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
189 updateTaskParent(events, task, event.timeOccurred);
190
191 // Mark it for deletion, but retain it until the end of
192 // the health check concurrent tasks hung
193 tasksToRemove.insert(event.id);
194 },
195 [&keepMonitoring](typename MonitoredEventType::EndMonitoring&) {
196 keepMonitoring = false;
197 },
198 [&pollPromises](typename MonitoredEventType::Poll& event) {
199 pollPromises.push_back(std::move(event.complete));
200 }},
201 *event);
202 }
203
204 // Sort by what times out first. Identical timestamps are possible
205 std::multimap<Timestamp, uint64_t> sortedTasks;
206 for (auto& [_, task] : mMonitoredTasks) {
207 sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
208 }
209
210 for (auto& [_, task_id] : sortedTasks) {
211 auto& task = mMonitoredTasks[task_id];
212 if (task.timeoutTimestamp < now) {
213 // Newly hung task
214 if (!task.hungTimestamp.has_value()) {
215 // Copy over additional annotations captured at hangTime
216 if (task.onHangAnnotationsCallback) {
217 auto newAnnotations = (*task.onHangAnnotationsCallback)();
218 task.metadata->mergeAnnotations(std::move(newAnnotations));
219 }
220 mConsumer.consumeHangEvent(task.id, task.metadata.get(), newHungTasks);
221 task.hungTimestamp = task.timeoutTimestamp;
222 newHungTasks++;
223 }
224 } else {
225 // Task resumes
226 if (task.hungTimestamp.has_value()) {
227 auto hangTime = duration_cast<std::chrono::milliseconds>(
228 task.timeoutTimestamp -
229 (task.hungTimestamp.value() + task.timeoutThreshold))
230 .count();
231 mConsumer.consumeUnHangEvent(task.id, task.metadata.get(), hangTime);
232 task.hungTimestamp = std::nullopt;
233 newHungTasks--;
234 }
235 }
236 if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
237 mMonitoredTasks.erase(task_id);
238 }
239 }
240
241 if (mHungTasks != newHungTasks) {
242 ALOGE("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
243 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
244 mHungTasks = newHungTasks;
245 }
246
247 for (auto& complete : pollPromises) {
248 complete.set_value();
249 }
250 }
251
252 return 0;
253 }
254
255 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)256 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
257 const MonitoredTask& task, Timestamp eventTime) {
258 std::optional<Id> parentId = task.parentId;
259 if (parentId) {
260 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
261 .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
262 events.push(std::move(event));
263 }
264 }
265
CreateHealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)266 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(HealthMonitorConsumer& consumer,
267 uint64_t heartbeatInterval) {
268 #ifdef ENABLE_ANDROID_HEALTH_MONITOR
269 ALOGI("HealthMonitor enabled. Returning monitor.");
270 return std::make_unique<HealthMonitor<>>(consumer, heartbeatInterval);
271 #else
272 (void)consumer;
273 (void)heartbeatInterval;
274 ALOGV("HealthMonitor disabled. Returning nullptr");
275 return nullptr;
276 #endif
277 }
278
279 template class HealthMonitor<steady_clock>;
280
281 } // namespace guest
282 } // namespace gfxstream
283