1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "aemu/base/AndroidHealthMonitor.h"
17 
18 #include <map>
19 #include <sys/time.h>
20 
21 namespace gfxstream {
22 namespace guest {
23 
24 using gfxstream::guest::AutoLock;
25 using std::chrono::duration_cast;
26 
// Overload set for std::visit: derives from every callable it is built from
// and pulls in all of their call operators, so the handler whose parameter
// matches the active variant alternative is the one invoked.
template <class... Handlers>
struct MonitoredEventVisitor : Handlers... {
    using Handlers::operator()...;
};
// Deduction guide so that MonitoredEventVisitor{f, g, ...} deduces the handler
// types from its initializers.
template <class... Handlers>
MonitoredEventVisitor(Handlers...) -> MonitoredEventVisitor<Handlers...>;
33 
34 template <class Clock>
HealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)35 HealthMonitor<Clock>::HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval)
36     : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mConsumer(consumer) {
37     start();
38 }
39 
40 template <class Clock>
~HealthMonitor()41 HealthMonitor<Clock>::~HealthMonitor() {
42     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
43     {
44         AutoLock lock(mLock);
45         mEventQueue.push(std::move(event));
46     }
47     poll();
48     wait();
49 }
50 
51 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)52 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
53     std::unique_ptr<EventHangMetadata> metadata,
54     std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
55     uint64_t timeout, std::optional<Id> parentId) {
56     auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
57     if (timeout < intervalMs) {
58         ALOGW("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
59               (unsigned long long)timeout, (unsigned long long) intervalMs,
60               (unsigned long long)intervalMs * 2);
61         timeout = intervalMs * 2;
62     }
63 
64     AutoLock lock(mLock);
65     auto id = mNextId++;
66     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
67         .id = id,
68         .metadata = std::move(metadata),
69         .timeOccurred = Clock::now(),
70         .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
71         .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
72         .parentId = parentId});
73     mEventQueue.push(std::move(event));
74     return id;
75 }
76 
77 template <class Clock>
touchMonitoredTask(Id id)78 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
79     auto event = std::make_unique<MonitoredEvent>(
80         typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
81     AutoLock lock(mLock);
82     mEventQueue.push(std::move(event));
83 }
84 
85 template <class Clock>
stopMonitoringTask(Id id)86 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
87     auto event = std::make_unique<MonitoredEvent>(
88         typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
89     AutoLock lock(mLock);
90     mEventQueue.push(std::move(event));
91 }
92 
93 template <class Clock>
poll()94 std::future<void> HealthMonitor<Clock>::poll() {
95     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
96     std::future<void> ret =
97         std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
98 
99     AutoLock lock(mLock);
100     mEventQueue.push(std::move(event));
101     mCv.signalAndUnlock(&lock);
102     return ret;
103 }
104 
105 // Thread's main loop
106 template <class Clock>
main()107 intptr_t HealthMonitor<Clock>::main() {
108     bool keepMonitoring = true;
109     std::queue<std::unique_ptr<MonitoredEvent>> events;
110 
111     while (keepMonitoring) {
112         std::vector<std::promise<void>> pollPromises;
113         std::unordered_set<Id> tasksToRemove;
114         int newHungTasks = mHungTasks;
115         {
116             AutoLock lock(mLock);
117             struct timeval currentTime;
118             gettimeofday(&currentTime, 0);
119             if (mEventQueue.empty()) {
120                 mCv.timedWait(
121                     &mLock,
122                     currentTime.tv_sec * 1000000LL + currentTime.tv_usec +
123                         std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
124             }
125             mEventQueue.swap(events);
126         }
127 
128         Timestamp now = Clock::now();
129         while (!events.empty()) {
130             auto event(std::move(events.front()));
131             events.pop();
132 
133             std::visit(MonitoredEventVisitor{
134                            [](std::monostate&) {
135                                ALOGE("MonitoredEvent type not found");
136                                abort();
137                            },
138                            [this, &events](typename MonitoredEventType::Start& event) {
139                                auto it = mMonitoredTasks.find(event.id);
140                                if (it != mMonitoredTasks.end()) {
141                                    ALOGE("Registered multiple start events for task %llu",
142                                          (unsigned long long)event.id);
143                                    return;
144                                }
145                                if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
146                                                          mMonitoredTasks.end()) {
147                                    ALOGW("Requested parent task %llu does not exist.",
148                                          (unsigned long long)event.parentId.value());
149                                    event.parentId = std::nullopt;
150                                }
151                                it = mMonitoredTasks
152                                         .emplace(event.id,
153                                                  std::move(MonitoredTask{
154                                                      .id = event.id,
155                                                      .timeoutTimestamp = event.timeOccurred +
156                                                                          event.timeoutThreshold,
157                                                      .timeoutThreshold = event.timeoutThreshold,
158                                                      .hungTimestamp = std::nullopt,
159                                                      .metadata = std::move(event.metadata),
160                                                      .onHangAnnotationsCallback =
161                                                          std::move(event.onHangAnnotationsCallback),
162                                                      .parentId = event.parentId}))
163                                         .first;
164                                updateTaskParent(events, it->second, event.timeOccurred);
165                            },
166                            [this, &events](typename MonitoredEventType::Touch& event) {
167                                auto it = mMonitoredTasks.find(event.id);
168                                if (it == mMonitoredTasks.end()) {
169                                    ALOGE("HealthMonitor has no task in progress for id %llu",
170                                          (unsigned long long)event.id);
171                                    return;
172                                }
173 
174                                auto& task = it->second;
175                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
176                                updateTaskParent(events, task, event.timeOccurred);
177                            },
178                            [this, &tasksToRemove,
179                             &events](typename MonitoredEventType::Stop& event) {
180                                auto it = mMonitoredTasks.find(event.id);
181                                if (it == mMonitoredTasks.end()) {
182                                    ALOGE("HealthMonitor has no task in progress for id %llu",
183                                          (unsigned long long)event.id);
184                                    return;
185                                }
186 
187                                auto& task = it->second;
188                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
189                                updateTaskParent(events, task, event.timeOccurred);
190 
191                                // Mark it for deletion, but retain it until the end of
192                                // the health check concurrent tasks hung
193                                tasksToRemove.insert(event.id);
194                            },
195                            [&keepMonitoring](typename MonitoredEventType::EndMonitoring&) {
196                                keepMonitoring = false;
197                            },
198                            [&pollPromises](typename MonitoredEventType::Poll& event) {
199                                pollPromises.push_back(std::move(event.complete));
200                            }},
201                        *event);
202         }
203 
204         // Sort by what times out first. Identical timestamps are possible
205         std::multimap<Timestamp, uint64_t> sortedTasks;
206         for (auto& [_, task] : mMonitoredTasks) {
207             sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
208         }
209 
210         for (auto& [_, task_id] : sortedTasks) {
211             auto& task = mMonitoredTasks[task_id];
212             if (task.timeoutTimestamp < now) {
213                 // Newly hung task
214                 if (!task.hungTimestamp.has_value()) {
215                     // Copy over additional annotations captured at hangTime
216                     if (task.onHangAnnotationsCallback) {
217                         auto newAnnotations = (*task.onHangAnnotationsCallback)();
218                         task.metadata->mergeAnnotations(std::move(newAnnotations));
219                     }
220                     mConsumer.consumeHangEvent(task.id, task.metadata.get(), newHungTasks);
221                     task.hungTimestamp = task.timeoutTimestamp;
222                     newHungTasks++;
223                 }
224             } else {
225                 // Task resumes
226                 if (task.hungTimestamp.has_value()) {
227                     auto hangTime = duration_cast<std::chrono::milliseconds>(
228                                         task.timeoutTimestamp -
229                                         (task.hungTimestamp.value() + task.timeoutThreshold))
230                                         .count();
231                     mConsumer.consumeUnHangEvent(task.id, task.metadata.get(), hangTime);
232                     task.hungTimestamp = std::nullopt;
233                     newHungTasks--;
234                 }
235             }
236             if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
237                 mMonitoredTasks.erase(task_id);
238             }
239         }
240 
241         if (mHungTasks != newHungTasks) {
242             ALOGE("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
243                 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
244             mHungTasks = newHungTasks;
245         }
246 
247         for (auto& complete : pollPromises) {
248             complete.set_value();
249         }
250     }
251 
252     return 0;
253 }
254 
255 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)256 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
257                                             const MonitoredTask& task, Timestamp eventTime) {
258     std::optional<Id> parentId = task.parentId;
259     if (parentId) {
260         auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
261             .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
262         events.push(std::move(event));
263     }
264 }
265 
CreateHealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)266 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(HealthMonitorConsumer& consumer,
267                                                      uint64_t heartbeatInterval) {
268 #ifdef ENABLE_ANDROID_HEALTH_MONITOR
269     ALOGI("HealthMonitor enabled. Returning monitor.");
270     return std::make_unique<HealthMonitor<>>(consumer, heartbeatInterval);
271 #else
272     (void)consumer;
273     (void)heartbeatInterval;
274     ALOGV("HealthMonitor disabled. Returning nullptr");
275     return nullptr;
276 #endif
277 }
278 
279 template class HealthMonitor<steady_clock>;
280 
281 } // namespace guest
282 } // namespace gfxstream
283