/*
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "aemu/base/HealthMonitor.h"

#include <map>

#include "aemu/base/system/System.h"
#include "aemu/base/testing/TestClock.h"
#include "host-common/logging.h"
#include "host-common/GfxstreamFatalError.h"

namespace emugl {

using android::base::AutoLock;
using android::base::MetricEventHang;
using android::base::MetricEventUnHang;
using android::base::TestClock;
using std::chrono::duration_cast;
using emugl::ABORT_REASON_OTHER;
using emugl::FatalError;

// Overload set for std::visit: inherits the call operator of every callable
// it is constructed from, so one lambda per event alternative can be listed.
template <class... Ts>
struct MonitoredEventVisitor : Ts... {
    using Ts::operator()...;
};
// Deduction guide so the visitor can be brace-initialised directly from lambdas.
template <class... Ts>
MonitoredEventVisitor(Ts...) -> MonitoredEventVisitor<Ts...>;

// Stores the heartbeat interval (given in milliseconds) and the metrics logger,
// then calls start().
// NOTE(review): start() is declared outside this file; presumably it launches
// the thread that runs main() - confirm against the class declaration.
template <class Clock>
HealthMonitor<Clock>::HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval)
    : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mLogger(metricsLogger) {
    start();
}

// Enqueues an EndMonitoring event, wakes the monitor loop through poll(), and
// then calls wait().
// NOTE(review): wait() is declared outside this file; presumably it blocks
// until main() returns - confirm.
template <class Clock>
HealthMonitor<Clock>::~HealthMonitor() {
    auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
    {
        AutoLock lock(mLock);
        mEventQueue.push(std::move(event));
    }
    // poll() signals mCv, so the loop does not sit out the rest of its timed wait.
    poll();
    wait();
}

// Queues a Start event for a new task and returns the id assigned to it.
//
// `metadata`                  moved into the Start event; later handed to the
//                             metrics logger when the task hangs/resumes.
// `onHangAnnotationsCallback` optional callable invoked by main() the first
//                             time the task is found to be hung; its result is
//                             merged into `metadata`.
// `timeout`                   milliseconds the task may go without a touch. If
//                             it is below the heartbeat interval it is raised
//                             to twice the interval and a warning is logged.
// `parentId`                  optional id of another task; main() touches the
//                             parent whenever this task starts, is touched, or
//                             stops.
template <class Clock>
typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
    std::unique_ptr<android::base::EventHangMetadata> metadata,
    std::optional<
        std::function<std::unique_ptr<android::base::EventHangMetadata::HangAnnotations>()>>
        onHangAnnotationsCallback,
    uint64_t timeout, std::optional<Id> parentId) {
    const uint64_t intervalMs =
        static_cast<uint64_t>(duration_cast<std::chrono::milliseconds>(mInterval).count());
    if (timeout < intervalMs) {
        // 64-bit values must not be fed to %d; print them as unsigned long long.
        WARN("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
             static_cast<unsigned long long>(timeout),
             static_cast<unsigned long long>(intervalMs),
             static_cast<unsigned long long>(intervalMs * 2));
        timeout = intervalMs * 2;
    }

    // The id counter and the queue are both updated under mLock.
    AutoLock lock(mLock);
    auto id = mNextId++;
    auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
        .id = id,
        .metadata = std::move(metadata),
        .timeOccurred = Clock::now(),
        .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
        .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
        .parentId = parentId});
    mEventQueue.push(std::move(event));
    return id;
}

// Queues a Touch event stamped with the current time for task `id`.
// The event is built before mLock is taken; the monitor thread is not signalled.
template <class Clock>
void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
    auto event = std::make_unique<MonitoredEvent>(
        typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
    AutoLock lock(mLock);
    mEventQueue.push(std::move(event));
}

// Queues a Stop event stamped with the current time for task `id`.
// The event is built before mLock is taken; the monitor thread is not signalled.
template <class Clock>
void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
    auto event = std::make_unique<MonitoredEvent>(
        typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
    AutoLock lock(mLock);
    mEventQueue.push(std::move(event));
}

// Queues a Poll event and signals mCv. The returned future becomes ready when
// main() fulfils the event's promise, which it does at the end of the pass
// that consumed the event (after the health check).
template <class Clock>
std::future<void> HealthMonitor<Clock>::poll() {
    auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
    // Grab the future before ownership of the event moves into the queue.
    std::future<void> ret =
        std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();

    AutoLock lock(mLock);
    mEventQueue.push(std::move(event));
    mCv.signalAndUnlock(&lock);
    return ret;
}

// Thread's main loop.
//
// Each pass: (1) takes the pending events (waiting up to one heartbeat
// interval if the queue is empty), (2) applies them to mMonitoredTasks,
// (3) walks all tasks in order of timeout and logs hang / unhang metric
// events, (4) erases stopped tasks, and (5) fulfils any Poll promises.
// The loop ends after the pass that processes an EndMonitoring event.
// Always returns 0.
template <class Clock>
intptr_t HealthMonitor<Clock>::main() {
    bool keepMonitoring = true;
    std::queue<std::unique_ptr<MonitoredEvent>> events;

    while (keepMonitoring) {
        std::vector<std::promise<void>> pollPromises;
        std::unordered_set<Id> tasksToRemove;
        int newHungTasks = mHungTasks;
        {
            AutoLock lock(mLock);
            if (mEventQueue.empty()) {
                mCv.timedWait(
                    &mLock,
                    android::base::getUnixTimeUs() +
                        std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
            }
            // Take the whole batch so events are processed without holding the lock.
            mEventQueue.swap(events);
        }

        Timestamp now = Clock::now();
        while (!events.empty()) {
            auto event(std::move(events.front()));
            events.pop();

            std::visit(
                MonitoredEventVisitor{
                    [](std::monostate&) {
                        ERR("MonitoredEvent type not found");
                        GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
                            << "MonitoredEvent type not found";
                    },
                    [this, &events](typename MonitoredEventType::Start& event) {
                        auto it = mMonitoredTasks.find(event.id);
                        if (it != mMonitoredTasks.end()) {
                            ERR("Registered multiple start events for task %llu",
                                static_cast<unsigned long long>(event.id));
                            return;
                        }
                        // An unknown parent is dropped rather than rejected.
                        if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
                                                  mMonitoredTasks.end()) {
                            WARN("Requested parent task %llu does not exist.",
                                 static_cast<unsigned long long>(event.parentId.value()));
                            event.parentId = std::nullopt;
                        }
                        it = mMonitoredTasks
                                 .emplace(event.id,
                                          MonitoredTask{
                                              .id = event.id,
                                              .timeoutTimestamp =
                                                  event.timeOccurred + event.timeoutThreshold,
                                              .timeoutThreshold = event.timeoutThreshold,
                                              .hungTimestamp = std::nullopt,
                                              .metadata = std::move(event.metadata),
                                              .onHangAnnotationsCallback =
                                                  std::move(event.onHangAnnotationsCallback),
                                              .parentId = event.parentId})
                                 .first;
                        updateTaskParent(events, it->second, event.timeOccurred);
                    },
                    [this, &events](typename MonitoredEventType::Touch& event) {
                        auto it = mMonitoredTasks.find(event.id);
                        if (it == mMonitoredTasks.end()) {
                            ERR("HealthMonitor has no task in progress for id %llu",
                                static_cast<unsigned long long>(event.id));
                            return;
                        }

                        auto& task = it->second;
                        task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
                        updateTaskParent(events, task, event.timeOccurred);
                    },
                    [this, &tasksToRemove,
                     &events](typename MonitoredEventType::Stop& event) {
                        auto it = mMonitoredTasks.find(event.id);
                        if (it == mMonitoredTasks.end()) {
                            ERR("HealthMonitor has no task in progress for id %llu",
                                static_cast<unsigned long long>(event.id));
                            return;
                        }

                        auto& task = it->second;
                        task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
                        updateTaskParent(events, task, event.timeOccurred);

                        // Only mark the task for deletion here: it must stay in
                        // the map until the health check below has run, so a
                        // hung task that is stopped still gets its unhang event.
                        tasksToRemove.insert(event.id);
                    },
                    [&keepMonitoring](typename MonitoredEventType::EndMonitoring&) {
                        keepMonitoring = false;
                    },
                    [&pollPromises](typename MonitoredEventType::Poll& event) {
                        pollPromises.push_back(std::move(event.complete));
                    }},
                *event);
        }

        // Sort by what times out first. Identical timestamps are possible,
        // hence a multimap.
        std::multimap<Timestamp, Id> sortedTasks;
        for (auto& [_, task] : mMonitoredTasks) {
            sortedTasks.emplace(task.timeoutTimestamp, task.id);
        }

        for (auto& [_, task_id] : sortedTasks) {
            auto& task = mMonitoredTasks[task_id];
            if (task.timeoutTimestamp < now) {
                // Newly hung task
                if (!task.hungTimestamp.has_value()) {
                    // Copy over additional annotations captured at hang time
                    if (task.onHangAnnotationsCallback) {
                        auto newAnnotations = (*task.onHangAnnotationsCallback)();
                        task.metadata->mergeAnnotations(std::move(newAnnotations));
                    }
                    mLogger.logMetricEvent(MetricEventHang{.taskId = task.id,
                                                           .metadata = task.metadata.get(),
                                                           .otherHungTasks = newHungTasks});
                    task.hungTimestamp = task.timeoutTimestamp;
                    newHungTasks++;
                }
            } else {
                // Task resumes
                if (task.hungTimestamp.has_value()) {
                    newHungTasks--;
                    // timeoutTimestamp is (last event time + threshold), so this
                    // is the time between the original deadline and the event
                    // that revived the task.
                    auto hangTime = duration_cast<std::chrono::milliseconds>(
                                        task.timeoutTimestamp -
                                        (task.hungTimestamp.value() + task.timeoutThreshold))
                                        .count();
                    mLogger.logMetricEvent(MetricEventUnHang{.taskId = task.id,
                                                             .metadata = task.metadata.get(),
                                                             .hung_ms = hangTime,
                                                             .otherHungTasks = newHungTasks});
                    task.hungTimestamp = std::nullopt;
                }
            }
            // `task` must not be used past this point if the entry is erased.
            if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
                mMonitoredTasks.erase(task_id);
            }
        }

        if (mHungTasks != newHungTasks) {
            ERR("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
                mHungTasks < newHungTasks ? "increased" : "decreased", mHungTasks,
                newHungTasks);
            mHungTasks = newHungTasks;
        }

        // Release pollers only after the health check for this pass is done.
        for (auto& complete : pollPromises) {
            complete.set_value();
        }
    }

    return 0;
}

// If `task` has a parent, appends a Touch event for that parent to `events`
// (the batch main() is currently draining), stamped kTimeEpsilon after
// `eventTime`. Does nothing when the task has no parent.
template <class Clock>
void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
                                            const MonitoredTask& task, Timestamp eventTime) {
    std::optional<Id> parentId = task.parentId;
    if (parentId) {
        auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
            .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
        events.push(std::move(event));
    }
}

// Returns a HealthMonitor using the default clock when ENABLE_HEALTH_MONITOR
// is set at build time; otherwise logs that monitoring is disabled and
// returns nullptr.
std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(MetricsLogger& metricsLogger,
                                                     uint64_t heartbeatInterval) {
#if ENABLE_HEALTH_MONITOR
    INFO("HealthMonitor enabled.");
    return std::make_unique<HealthMonitor<>>(metricsLogger, heartbeatInterval);
#else
    INFO("HealthMonitor disabled.");
    return nullptr;
#endif
}

// Explicit instantiations for the clocks this translation unit supports.
// NOTE(review): the first is assumed to be the default clock of HealthMonitor<>;
// confirm against the class declaration.
template class HealthMonitor<std::chrono::steady_clock>;
template class HealthMonitor<TestClock>;

}  // namespace emugl