1 /**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false // STOPSHIP if true.
19 #define ATRACE_TAG ATRACE_TAG_SYSTEM_SERVER
20
21 #include "WatchdogProcessService.h"
22
23 #include "PackageInfoResolver.h"
24 #include "ServiceManager.h"
25 #include "UidProcStatsCollector.h"
26 #include "WatchdogServiceHelper.h"
27
28 #include <aidl/android/hardware/automotive/vehicle/BnVehicle.h>
29 #include <aidl/android/hardware/automotive/vehicle/ProcessTerminationReason.h>
30 #include <android-base/file.h>
31 #include <android-base/macros.h>
32 #include <android-base/properties.h>
33 #include <android-base/stringprintf.h>
34 #include <android-base/strings.h>
35 #include <android/util/ProtoOutputStream.h>
36 #include <binder/IPCThreadState.h>
37 #include <hidl/HidlTransportSupport.h>
38 #include <utils/SystemClock.h>
39 #include <utils/Trace.h>
40
41 #include <IVhalClient.h>
42 #include <VehicleHalTypes.h>
43 #include <inttypes.h>
44
45 #include <utility>
46
47 #include <carwatchdog_daemon_dump.proto.h>
48 #include <health_check_client_info.proto.h>
49 #include <performance_stats.proto.h>
50
51 namespace android {
52 namespace automotive {
53 namespace watchdog {
54
55 using ::aidl::android::automotive::watchdog::ICarWatchdogClient;
56 using ::aidl::android::automotive::watchdog::TimeoutLength;
57 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogMonitor;
58 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogServiceForSystem;
59 using ::aidl::android::automotive::watchdog::internal::ProcessIdentifier;
60 using ::aidl::android::hardware::automotive::vehicle::BnVehicle;
61 using ::aidl::android::hardware::automotive::vehicle::ProcessTerminationReason;
62 using ::aidl::android::hardware::automotive::vehicle::StatusCode;
63 using ::aidl::android::hardware::automotive::vehicle::SubscribeOptions;
64 using ::aidl::android::hardware::automotive::vehicle::VehiclePropConfig;
65 using ::aidl::android::hardware::automotive::vehicle::VehicleProperty;
66 using ::aidl::android::hardware::automotive::vehicle::VehiclePropertyStatus;
67 using ::aidl::android::hardware::automotive::vehicle::VehiclePropValue;
68 using ::android::sp;
69 using ::android::String16;
70 using ::android::base::Error;
71 using ::android::base::GetIntProperty;
72 using ::android::base::GetProperty;
73 using ::android::base::ReadFileToString;
74 using ::android::base::Result;
75 using ::android::base::StringAppendF;
76 using ::android::base::StringPrintf;
77 using ::android::base::Trim;
78 using ::android::base::WriteStringToFd;
79 using ::android::binder::Status;
80 using ::android::frameworks::automotive::vhal::HalPropError;
81 using ::android::frameworks::automotive::vhal::IHalPropValue;
82 using ::android::frameworks::automotive::vhal::ISubscriptionClient;
83 using ::android::frameworks::automotive::vhal::IVhalClient;
84 using ::android::hardware::hidl_vec;
85 using ::android::hardware::interfacesEqual;
86 using ::android::hardware::Return;
87 using ::android::hidl::base::V1_0::IBase;
88 using ::android::hidl::manager::V1_0::IServiceManager;
89 using ::android::util::ProtoOutputStream;
90 using ::ndk::ScopedAIBinder_DeathRecipient;
91 using ::ndk::ScopedAStatus;
92 using ::ndk::SpAIBinder;
93
94 namespace {
95
// Every client health check timeout length handled by this service. The constructor creates one
// registered-client map and one pinged-client map per entry.
const std::vector<TimeoutLength> kTimeouts = {TimeoutLength::TIMEOUT_CRITICAL,
                                              TimeoutLength::TIMEOUT_MODERATE,
                                              TimeoutLength::TIMEOUT_NORMAL};

// TimeoutLength values double as looper message IDs, so the remaining message IDs are numbered
// starting right after TimeoutLength::TIMEOUT_NORMAL.
// NOTE(review): the handler for MSG_VHAL_WATCHDOG_ALIVE is not visible in this part of the file.
const int32_t MSG_VHAL_WATCHDOG_ALIVE = static_cast<int>(TimeoutLength::TIMEOUT_NORMAL) + 1;
// Posted (delayed) by setEnabled() to schedule the VHAL heartbeat check.
const int32_t MSG_VHAL_HEALTH_CHECK = MSG_VHAL_WATCHDOG_ALIVE + 1;
// Posted by registerCarWatchdogService() to (re)start caching of the VHAL process identifier.
const int32_t MSG_CACHE_VHAL_PROCESS_IDENTIFIER = MSG_VHAL_HEALTH_CHECK + 1;

// VHAL is expected to send a heartbeat every 3s. Car watchdog checks whether a heartbeat has
// arrived from VHAL within the last 3s, allowing a 1s margin.
// If {@code ro.carwatchdog.vhal_healthcheck.interval} is set, car watchdog checks VHAL health at
// the given interval. The lower bound of the interval is 3s.
constexpr int32_t kDefaultVhalCheckIntervalSec = 3;
// Margin added on top of the VHAL health check window before the check message is delivered.
constexpr std::chrono::milliseconds kHealthCheckDelayMillis = 1s;
// Upper bound compared against mTotalVhalPidCachingAttempts when reporting the caching state.
constexpr int32_t kMaxVhalPidCachingAttempts = 2;
// Retry delay handed to the main constructor by the production (single-argument) constructor.
constexpr std::chrono::nanoseconds kDefaultVhalPidCachingRetryDelayNs = 30s;
// Timeout length under which the car watchdog service itself is registered as a client.
constexpr TimeoutLength kCarWatchdogServiceTimeoutDelay = TimeoutLength::TIMEOUT_CRITICAL;
// Default used when reading kPropertyClientCheckInterval; signals that the property is not set.
constexpr int32_t kMissingIntPropertyValue = -1;

// System property holding the VHAL health check interval, in seconds.
constexpr const char kPropertyVhalCheckInterval[] = "ro.carwatchdog.vhal_healthcheck.interval";
// System property overriding the client health check interval, in seconds.
constexpr const char kPropertyClientCheckInterval[] = "ro.carwatchdog.client_healthcheck.interval";
// Service name used in log and dump output.
constexpr const char kServiceName[] = "WatchdogProcessService";
// Interface name matched against HIDL service manager entries to locate the VHAL process.
constexpr const char kHidlVhalInterfaceName[] = "android.hardware.automotive.vehicle@2.0::IVehicle";

// Default HIDL service manager getter used by the production constructor.
const std::function<sp<IServiceManager>()> kDefaultTryGetHidlServiceManager =
        []() -> sp<IServiceManager> { return IServiceManager::tryGetService(/*getStub=*/false); };

// Error codes attached to the Result returned by the internal client registration.
// toScopedAStatus() treats ERR_DUPLICATE_REGISTRATION as success and reports every other code as
// EX_ILLEGAL_STATE.
enum RegistrationError {
    ERR_ILLEGAL_STATE = 0,
    ERR_DUPLICATE_REGISTRATION,
};
129
toScopedAStatus(Result<void> resultWithRegistrationError)130 ScopedAStatus toScopedAStatus(Result<void> resultWithRegistrationError) {
131 if (resultWithRegistrationError.ok()) {
132 return ScopedAStatus::ok();
133 }
134 if (resultWithRegistrationError.error().code() ==
135 RegistrationError::ERR_DUPLICATE_REGISTRATION) {
136 return ScopedAStatus::ok();
137 }
138 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
139 resultWithRegistrationError.error()
140 .message()
141 .c_str());
142 }
143
toPidString(const std::vector<ProcessIdentifier> & processIdentifiers)144 std::string toPidString(const std::vector<ProcessIdentifier>& processIdentifiers) {
145 size_t size = processIdentifiers.size();
146 if (size == 0) {
147 return "";
148 }
149 std::string buffer;
150 StringAppendF(&buffer, "%d", processIdentifiers[0].pid);
151 for (size_t i = 1; i < size; i++) {
152 StringAppendF(&buffer, ", %d", processIdentifiers[i].pid);
153 }
154 return buffer;
155 }
156
isSystemShuttingDown()157 bool isSystemShuttingDown() {
158 std::string sysPowerCtl;
159 std::istringstream tokenStream(GetProperty("sys.powerctl", ""));
160 std::getline(tokenStream, sysPowerCtl, ',');
161 return sysPowerCtl == "reboot" || sysPowerCtl == "shutdown";
162 }
163
getStartTimeForPid(pid_t pid)164 int64_t getStartTimeForPid(pid_t pid) {
165 auto pidStat = UidProcStatsCollector::readStatFileForPid(pid);
166 if (!pidStat.ok()) {
167 return elapsedRealtime();
168 }
169 return pidStat->startTimeMillis;
170 }
171
onBinderDied(void * cookie)172 void onBinderDied(void* cookie) {
173 const auto& thiz = ServiceManager::getInstance()->getWatchdogProcessService();
174 if (thiz == nullptr) {
175 return;
176 }
177 thiz->handleBinderDeath(cookie);
178 }
queryHidlServiceManagerForVhalPid(const sp<IServiceManager> & hidlServiceManager)179 Result<pid_t> queryHidlServiceManagerForVhalPid(const sp<IServiceManager>& hidlServiceManager) {
180 pid_t pid = -1;
181 Return<void> ret = hidlServiceManager->debugDump([&](auto& hals) {
182 for (const auto& info : hals) {
183 if (info.pid == static_cast<int>(IServiceManager::PidConstant::NO_PID)) {
184 continue;
185 }
186 if (info.interfaceName == kHidlVhalInterfaceName) {
187 pid = info.pid;
188 return;
189 }
190 }
191 });
192
193 if (!ret.isOk()) {
194 return Error() << "Failed to get VHAL process id from HIDL service manager";
195 }
196 if (pid == -1) {
197 return Error() << "No VHAL service registered to HIDL service manager";
198 }
199 return pid;
200 }
201
toProtoHealthCheckTimeout(TimeoutLength timeoutLength)202 int toProtoHealthCheckTimeout(TimeoutLength timeoutLength) {
203 switch (timeoutLength) {
204 case TimeoutLength::TIMEOUT_CRITICAL:
205 return HealthCheckClientInfo::CRITICAL;
206 case TimeoutLength::TIMEOUT_MODERATE:
207 return HealthCheckClientInfo::MODERATE;
208 case TimeoutLength::TIMEOUT_NORMAL:
209 return HealthCheckClientInfo::NORMAL;
210 default:
211 return HealthCheckClientInfo::HEALTH_CHECK_TIMEOUT_UNSPECIFIED;
212 }
213 }
214
215 } // namespace
216
timeoutToString(TimeoutLength timeout)217 std::string timeoutToString(TimeoutLength timeout) {
218 switch (timeout) {
219 case TimeoutLength::TIMEOUT_CRITICAL:
220 return "TIMEOUT_CRITICAL";
221 case TimeoutLength::TIMEOUT_MODERATE:
222 return "TIMEOUT_MODERATE";
223 case TimeoutLength::TIMEOUT_NORMAL:
224 return "TIMEOUT_NORMAL";
225 default:
226 return "UNKNOWN TIMEOUT";
227 }
228 }
229
// Production constructor. Delegates to the fully parameterized constructor with the default
// VHAL client factory, the default HIDL service manager getter, the file-local
// getStartTimeForPid() helper, the default VHAL pid caching retry delay and a newly created
// AIBinderDeathRegistrationWrapper.
WatchdogProcessService::WatchdogProcessService(const sp<Looper>& handlerLooper) :
      WatchdogProcessService(IVhalClient::tryCreate, kDefaultTryGetHidlServiceManager,
                             getStartTimeForPid, kDefaultVhalPidCachingRetryDelayNs, handlerLooper,
                             sp<AIBinderDeathRegistrationWrapper>::make()) {}
234
WatchdogProcessService(const std::function<std::shared_ptr<IVhalClient> ()> & tryCreateVhalClientFunc,const std::function<sp<IServiceManager> ()> & tryGetHidlServiceManagerFunc,const std::function<int64_t (pid_t)> & getStartTimeForPidFunc,const std::chrono::nanoseconds & vhalPidCachingRetryDelayNs,const sp<Looper> & handlerLooper,const sp<AIBinderDeathRegistrationWrapperInterface> & deathRegistrationWrapper)235 WatchdogProcessService::WatchdogProcessService(
236 const std::function<std::shared_ptr<IVhalClient>()>& tryCreateVhalClientFunc,
237 const std::function<sp<IServiceManager>()>& tryGetHidlServiceManagerFunc,
238 const std::function<int64_t(pid_t)>& getStartTimeForPidFunc,
239 const std::chrono::nanoseconds& vhalPidCachingRetryDelayNs, const sp<Looper>& handlerLooper,
240 const sp<AIBinderDeathRegistrationWrapperInterface>& deathRegistrationWrapper) :
241 kTryCreateVhalClientFunc(tryCreateVhalClientFunc),
242 kTryGetHidlServiceManagerFunc(tryGetHidlServiceManagerFunc),
243 kGetStartTimeForPidFunc(getStartTimeForPidFunc),
244 kVhalPidCachingRetryDelayNs(vhalPidCachingRetryDelayNs),
245 mHandlerLooper(handlerLooper),
246 mClientBinderDeathRecipient(
247 ScopedAIBinder_DeathRecipient(AIBinder_DeathRecipient_new(onBinderDied))),
248 mLastSessionId(0),
249 mServiceStarted(false),
250 mDeathRegistrationWrapper(deathRegistrationWrapper),
251 mIsEnabled(true),
252 mVhalService(nullptr),
253 mTotalVhalPidCachingAttempts(0) {
254 mVhalBinderDiedCallback =
255 std::make_shared<IVhalClient::OnBinderDiedCallbackFunc>([this] { handleVhalDeath(); });
256 for (const auto& timeout : kTimeouts) {
257 mClientsByTimeout.insert(std::make_pair(timeout, ClientInfoMap()));
258 mPingedClients.insert(std::make_pair(timeout, PingedClientMap()));
259 }
260
261 int32_t vhalHealthCheckIntervalSec =
262 GetIntProperty(kPropertyVhalCheckInterval, kDefaultVhalCheckIntervalSec);
263 vhalHealthCheckIntervalSec = std::max(vhalHealthCheckIntervalSec, kDefaultVhalCheckIntervalSec);
264 mVhalHealthCheckWindowMillis = std::chrono::seconds(vhalHealthCheckIntervalSec);
265
266 int32_t clientHealthCheckIntervalSec =
267 GetIntProperty(kPropertyClientCheckInterval, kMissingIntPropertyValue);
268 // Overridden timeout value must be greater than or equal to the maximum possible timeout value.
269 // Otherwise, clients will be pinged more frequently than the guaranteed timeout duration.
270 if (clientHealthCheckIntervalSec != kMissingIntPropertyValue) {
271 int32_t normalSec = std::chrono::duration_cast<std::chrono::seconds>(
272 getTimeoutDurationNs(TimeoutLength::TIMEOUT_NORMAL))
273 .count();
274 mOverriddenClientHealthCheckWindowNs = std::optional<std::chrono::seconds>{
275 std::max(clientHealthCheckIntervalSec, normalSec)};
276 }
277 }
278
// Calls terminate() on destruction. terminate() returns immediately when the service is not
// started.
WatchdogProcessService::~WatchdogProcessService() {
    terminate();
}
282
registerClient(const std::shared_ptr<ICarWatchdogClient> & client,TimeoutLength timeout)283 ScopedAStatus WatchdogProcessService::registerClient(
284 const std::shared_ptr<ICarWatchdogClient>& client, TimeoutLength timeout) {
285 ATRACE_CALL();
286 if (client == nullptr) {
287 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
288 "Must provide non-null client");
289 }
290 pid_t callingPid = IPCThreadState::self()->getCallingPid();
291 uid_t callingUid = IPCThreadState::self()->getCallingUid();
292 userid_t callingUserId = multiuser_get_user_id(callingUid);
293
294 ClientInfo clientInfo(client, callingPid, callingUserId, kGetStartTimeForPidFunc(callingPid),
295 *this);
296 return toScopedAStatus(registerClient(clientInfo, timeout));
297 }
298
unregisterClient(const std::shared_ptr<ICarWatchdogClient> & client)299 ScopedAStatus WatchdogProcessService::unregisterClient(
300 const std::shared_ptr<ICarWatchdogClient>& client) {
301 if (client == nullptr) {
302 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
303 "Must provide non-null client");
304 }
305 Mutex::Autolock lock(mMutex);
306 return unregisterClientLocked(kTimeouts, client->asBinder(), ClientType::Regular);
307 }
308
registerCarWatchdogService(const SpAIBinder & binder,const sp<WatchdogServiceHelperInterface> & helper)309 ScopedAStatus WatchdogProcessService::registerCarWatchdogService(
310 const SpAIBinder& binder, const sp<WatchdogServiceHelperInterface>& helper) {
311 ATRACE_CALL();
312 pid_t callingPid = IPCThreadState::self()->getCallingPid();
313 uid_t callingUid = IPCThreadState::self()->getCallingUid();
314 userid_t callingUserId = multiuser_get_user_id(callingUid);
315
316 if (helper == nullptr) {
317 return ScopedAStatus::
318 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
319 "Watchdog service helper instance is null");
320 }
321 ClientInfo clientInfo(helper, binder, callingPid, callingUserId,
322 kGetStartTimeForPidFunc(callingPid), *this);
323 if (auto result = registerClient(clientInfo, kCarWatchdogServiceTimeoutDelay); !result.ok()) {
324 return toScopedAStatus(result);
325 }
326 Mutex::Autolock lock(mMutex);
327 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0 &&
328 mVhalService != nullptr && mVhalService->isAidlVhal() &&
329 !mVhalProcessIdentifier.has_value()) {
330 // When CarService is restarted in the middle handling the AIDL VHAL pid fetch request,
331 // the request will fail. Restart the caching process only when the AIDL VHAL pid is
332 // missing.
333 mTotalVhalPidCachingAttempts = 0;
334 mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
335 }
336 return ScopedAStatus::ok();
337 }
338
unregisterCarWatchdogService(const SpAIBinder & binder)339 void WatchdogProcessService::unregisterCarWatchdogService(const SpAIBinder& binder) {
340 Mutex::Autolock lock(mMutex);
341
342 std::vector<TimeoutLength> timeouts = {TimeoutLength::TIMEOUT_CRITICAL};
343 unregisterClientLocked(timeouts, binder, ClientType::Service);
344 }
345
registerMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)346 ScopedAStatus WatchdogProcessService::registerMonitor(
347 const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
348 ATRACE_CALL();
349 if (monitor == nullptr) {
350 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
351 "Must provide non-null monitor");
352 }
353 const auto binder = monitor->asBinder();
354 {
355 Mutex::Autolock lock(mMutex);
356 if (mMonitor != nullptr) {
357 if (mMonitor->asBinder() == binder) {
358 return ScopedAStatus::ok();
359 }
360 AIBinder* aiBinder = mMonitor->asBinder().get();
361 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
362 static_cast<void*>(aiBinder));
363 }
364 mMonitor = monitor;
365 }
366
367 AIBinder* aiBinder = binder.get();
368 auto status =
369 mDeathRegistrationWrapper->linkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
370 static_cast<void*>(aiBinder));
371 if (!status.isOk()) {
372 {
373 Mutex::Autolock lock(mMutex);
374 if (mMonitor != nullptr && mMonitor->asBinder() == binder) {
375 mMonitor.reset();
376 }
377 }
378 ALOGW("Failed to register the monitor as it is dead.");
379 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
380 "The monitor is dead.");
381 }
382 if (DEBUG) {
383 ALOGD("Car watchdog monitor is registered");
384 }
385 return ScopedAStatus::ok();
386 }
387
unregisterMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)388 ScopedAStatus WatchdogProcessService::unregisterMonitor(
389 const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
390 if (monitor == nullptr) {
391 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
392 "Must provide non-null monitor");
393 }
394 const auto binder = monitor->asBinder();
395 Mutex::Autolock lock(mMutex);
396 if (mMonitor == nullptr || mMonitor->asBinder() != binder) {
397 ALOGW("Failed to unregister the monitor as it has not been registered.");
398 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
399 "The monitor has not been registered.");
400 }
401 AIBinder* aiBinder = binder.get();
402 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
403 static_cast<void*>(aiBinder));
404 mMonitor.reset();
405 if (DEBUG) {
406 ALOGD("Car watchdog monitor is unregistered");
407 }
408 return ScopedAStatus::ok();
409 }
410
tellClientAlive(const std::shared_ptr<ICarWatchdogClient> & client,int32_t sessionId)411 ScopedAStatus WatchdogProcessService::tellClientAlive(
412 const std::shared_ptr<ICarWatchdogClient>& client, int32_t sessionId) {
413 if (client == nullptr) {
414 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
415 "Must provide non-null client");
416 }
417 Mutex::Autolock lock(mMutex);
418 return tellClientAliveLocked(client->asBinder(), sessionId);
419 }
420
tellCarWatchdogServiceAlive(const std::shared_ptr<ICarWatchdogServiceForSystem> & service,const std::vector<ProcessIdentifier> & clientsNotResponding,int32_t sessionId)421 ScopedAStatus WatchdogProcessService::tellCarWatchdogServiceAlive(
422 const std::shared_ptr<ICarWatchdogServiceForSystem>& service,
423 const std::vector<ProcessIdentifier>& clientsNotResponding, int32_t sessionId) {
424 if (service == nullptr) {
425 return ScopedAStatus::
426 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
427 "Must provide non-null car watchdog service");
428 }
429 ScopedAStatus status;
430 {
431 Mutex::Autolock lock(mMutex);
432 if (DEBUG) {
433 if (clientsNotResponding.size() > 0) {
434 ALOGD("CarWatchdogService(session: %d) responded with non-responding clients: %s",
435 sessionId, toPidString(clientsNotResponding).c_str());
436 }
437 }
438 status = tellClientAliveLocked(service->asBinder(), sessionId);
439 }
440 if (status.isOk()) {
441 dumpAndKillAllProcesses(clientsNotResponding, /*reportToVhal=*/true);
442 }
443 return status;
444 }
445
tellDumpFinished(const std::shared_ptr<ICarWatchdogMonitor> & monitor,const ProcessIdentifier & processIdentifier)446 ScopedAStatus WatchdogProcessService::tellDumpFinished(
447 const std::shared_ptr<ICarWatchdogMonitor>& monitor,
448 const ProcessIdentifier& processIdentifier) {
449 Mutex::Autolock lock(mMutex);
450 if (mMonitor == nullptr || monitor == nullptr || mMonitor->asBinder() != monitor->asBinder()) {
451 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
452 "The monitor is not registered or an "
453 "invalid monitor is given");
454 }
455 ALOGI("Process(pid: %d) has been dumped and killed", processIdentifier.pid);
456 return ScopedAStatus::ok();
457 }
458
setEnabled(bool isEnabled)459 void WatchdogProcessService::setEnabled(bool isEnabled) {
460 Mutex::Autolock lock(mMutex);
461 if (mIsEnabled == isEnabled) {
462 return;
463 }
464 ALOGI("%s is %s", kServiceName, isEnabled ? "enabled" : "disabled");
465 mIsEnabled = isEnabled;
466 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
467 if (!mIsEnabled) {
468 return;
469 }
470 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
471 mVhalHeartBeat.eventTime = uptimeMillis();
472 std::chrono::nanoseconds intervalNs =
473 mVhalHealthCheckWindowMillis + kHealthCheckDelayMillis;
474 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
475 Message(MSG_VHAL_HEALTH_CHECK));
476 }
477 for (const auto& timeout : kTimeouts) {
478 mHandlerLooper->removeMessages(mMessageHandler, static_cast<int>(timeout));
479 startHealthCheckingLocked(timeout);
480 }
481 }
482
onUserStateChange(userid_t userId,bool isStarted)483 void WatchdogProcessService::onUserStateChange(userid_t userId, bool isStarted) {
484 std::string buffer;
485 Mutex::Autolock lock(mMutex);
486 if (isStarted) {
487 mStoppedUserIds.erase(userId);
488 } else {
489 mStoppedUserIds.insert(userId);
490 }
491 }
492
// Writes a human readable snapshot of the service state to the given file descriptor: enabled
// state, registered clients, monitor registration, shutdown state, stopped users and the VHAL
// health check state. The whole dump is produced while holding the service lock.
void WatchdogProcessService::onDump(int fd) {
    Mutex::Autolock lock(mMutex);
    const char* indent = " ";
    const char* doubleIndent = " ";
    const auto boolToString = [](bool value) { return value ? "true" : "false"; };

    WriteStringToFd("CAR WATCHDOG PROCESS SERVICE\n", fd);
    WriteStringToFd(StringPrintf("%s%s enabled: %s\n", indent, kServiceName,
                                 boolToString(mIsEnabled)),
                    fd);

    WriteStringToFd(StringPrintf("%sRegistered clients\n", indent), fd);
    // Clients are numbered continuously across all timeout lengths, starting at 1.
    int clientNumber = 1;
    for (const auto& timeout : kTimeouts) {
        for (auto& [_, clientInfo] : mClientsByTimeout[timeout]) {
            WriteStringToFd(StringPrintf("%sClient #%d: %s\n", doubleIndent, clientNumber,
                                         clientInfo.toString().c_str()),
                            fd);
            ++clientNumber;
        }
    }

    WriteStringToFd(StringPrintf("%sMonitor registered: %s\n", indent,
                                 boolToString(mMonitor != nullptr)),
                    fd);
    WriteStringToFd(StringPrintf("%sisSystemShuttingDown: %s\n", indent,
                                 boolToString(isSystemShuttingDown())),
                    fd);

    std::string stoppedUsers;
    for (const auto& userId : mStoppedUserIds) {
        if (stoppedUsers.empty()) {
            stoppedUsers = StringPrintf("%d", userId);
        } else {
            StringAppendF(&stoppedUsers, ", %d", userId);
        }
    }
    if (stoppedUsers.empty()) {
        stoppedUsers = "none";
    }
    WriteStringToFd(StringPrintf("%sStopped users: %s\n", indent, stoppedUsers.c_str()), fd);

    if (mVhalService == nullptr) {
        WriteStringToFd(StringPrintf("%sVHAL client is not connected", indent), fd);
        return;
    }
    if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) != 0) {
        WriteStringToFd(StringPrintf("%sVHAL client is connected but the heartbeat property is not "
                                     "supported",
                                     indent),
                        fd);
        return;
    }

    const int64_t systemUptime = uptimeMillis();
    WriteStringToFd(StringPrintf("%sVHAL health check is supported:\n"
                                 "%s\tVHAL health check interval: %lld millis\n"
                                 "%s\tVHAL heartbeat was updated %" PRIi64 " millis ago",
                                 indent, indent, mVhalHealthCheckWindowMillis.count(), indent,
                                 systemUptime - mVhalHeartBeat.eventTime),
                    fd);

    const std::string vhalType = mVhalService->isAidlVhal() ? "AIDL" : "HIDL";
    if (mVhalProcessIdentifier.has_value()) {
        WriteStringToFd(StringPrintf("%s%s VHAL process identifier (PID = %d, Start time "
                                     "millis = %" PRIi64 ")",
                                     indent, vhalType.c_str(), mVhalProcessIdentifier->pid,
                                     mVhalProcessIdentifier->startTimeMillis),
                        fd);
        return;
    }
    if (mTotalVhalPidCachingAttempts < kMaxVhalPidCachingAttempts) {
        WriteStringToFd(StringPrintf("%sStill fetching %s VHAL process identifier. "
                                     "Total attempts made = %d, Remaining attempts = %d",
                                     indent, vhalType.c_str(), mTotalVhalPidCachingAttempts,
                                     kMaxVhalPidCachingAttempts - mTotalVhalPidCachingAttempts),
                        fd);
        return;
    }
    WriteStringToFd(StringPrintf("%sFailed to fetch %s VHAL process identifier. "
                                 "Cannot terminate VHAL when VHAL becomes unresponsive",
                                 indent, vhalType.c_str()),
                    fd);
}
567
onDumpProto(ProtoOutputStream & outProto)568 void WatchdogProcessService::onDumpProto(ProtoOutputStream& outProto) {
569 Mutex::Autolock lock(mMutex);
570
571 uint64_t healthCheckServiceDumpToken =
572 outProto.start(CarWatchdogDaemonDump::HEALTH_CHECK_SERVICE_DUMP);
573
574 outProto.write(HealthCheckServiceDump::IS_ENABLED, mIsEnabled);
575 outProto.write(HealthCheckServiceDump::IS_MONITOR_REGISTERED, mMonitor != nullptr);
576 outProto.write(HealthCheckServiceDump::IS_SYSTEM_SHUT_DOWN_IN_PROGRESS, isSystemShuttingDown());
577
578 for (const auto& userId : mStoppedUserIds) {
579 outProto.write(HealthCheckServiceDump::STOPPED_USERS, static_cast<int>(userId));
580 }
581 auto criticalHealthCheckWindowMillis =
582 std::chrono::duration_cast<std::chrono::milliseconds>(
583 getTimeoutDurationNs(TimeoutLength::TIMEOUT_CRITICAL))
584 .count();
585 auto moderateHealthCheckWindowMillis =
586 std::chrono::duration_cast<std::chrono::milliseconds>(
587 getTimeoutDurationNs(TimeoutLength::TIMEOUT_MODERATE))
588 .count();
589 auto normalHealthCheckWindowMillis =
590 std::chrono::duration_cast<std::chrono::milliseconds>(
591 getTimeoutDurationNs(TimeoutLength::TIMEOUT_NORMAL))
592 .count();
593 outProto.write(HealthCheckServiceDump::CRITICAL_HEALTH_CHECK_WINDOW_MILLIS,
594 criticalHealthCheckWindowMillis);
595 outProto.write(HealthCheckServiceDump::MODERATE_HEALTH_CHECK_WINDOW_MILLIS,
596 moderateHealthCheckWindowMillis);
597 outProto.write(HealthCheckServiceDump::NORMAL_HEALTH_CHECK_WINDOW_MILLIS,
598 normalHealthCheckWindowMillis);
599
600 // Vhal Health Check Info
601 uint64_t vHalHealthCheckInfoToken =
602 outProto.start(HealthCheckServiceDump::VHAL_HEALTH_CHECK_INFO);
603 outProto.write(VhalHealthCheckInfo::IS_ENABLED, mVhalService != nullptr);
604 outProto.write(VhalHealthCheckInfo::HEALTH_CHECK_WINDOW_MILLIS,
605 mVhalHealthCheckWindowMillis.count());
606 outProto.write(VhalHealthCheckInfo::LAST_HEARTBEAT_UPDATE_AGO_MILLIS,
607 uptimeMillis() - mVhalHeartBeat.eventTime);
608 int pidCachingProgressState = VhalHealthCheckInfo::FAILURE;
609 if (mVhalProcessIdentifier.has_value()) {
610 pidCachingProgressState = VhalHealthCheckInfo::SUCCESS;
611 } else if (mTotalVhalPidCachingAttempts < kMaxVhalPidCachingAttempts) {
612 pidCachingProgressState = VhalHealthCheckInfo::IN_PROGRESS;
613 }
614 outProto.write(VhalHealthCheckInfo::PID_CACHING_PROGRESS_STATE, pidCachingProgressState);
615 outProto.write(VhalHealthCheckInfo::PID,
616 mVhalProcessIdentifier.has_value() ? mVhalProcessIdentifier->pid : -1);
617 outProto.write(VhalHealthCheckInfo::START_TIME_MILLIS,
618 mVhalProcessIdentifier.has_value() ? mVhalProcessIdentifier->startTimeMillis
619 : -1);
620
621 outProto.end(vHalHealthCheckInfoToken);
622
623 // Health Check Client Info
624 for (const auto& timeout : kTimeouts) {
625 const ClientInfoMap& clients = mClientsByTimeout[timeout];
626 for (auto it = clients.begin(); it != clients.end(); it++) {
627 uint64_t healthCheckClientInfoToken =
628 outProto.start(HealthCheckServiceDump::REGISTERED_CLIENT_INFOS);
629 const ClientInfo clientInfo = it->second;
630 outProto.write(HealthCheckClientInfo::PID, clientInfo.kPid);
631
632 uint64_t userPackageInfoToken =
633 outProto.start(HealthCheckClientInfo::USER_PACKAGE_INFO);
634 outProto.write(UserPackageInfo::USER_ID, static_cast<int>(clientInfo.kUserId));
635 outProto.write(UserPackageInfo::PACKAGE_NAME, clientInfo.packageName);
636 outProto.end(userPackageInfoToken);
637
638 outProto.write(HealthCheckClientInfo::CLIENT_TYPE, toProtoClientType(clientInfo.kType));
639 outProto.write(HealthCheckClientInfo::START_TIME_MILLIS, clientInfo.kStartTimeMillis);
640 outProto.write(HealthCheckClientInfo::HEALTH_CHECK_TIMEOUT,
641 toProtoHealthCheckTimeout(timeout));
642 outProto.end(healthCheckClientInfoToken);
643 }
644 }
645
646 outProto.end(healthCheckServiceDumpToken);
647 }
648
// Runs one health check round for the timeout length encoded in the given message ID.
//
// Steps: drop pending messages with the same ID, bail out when the service is disabled, dump and
// kill the clients that did not answer the previous round, assign a new session ID to every
// client whose user is not stopped, ping those clients outside of the lock, and schedule the
// next round when at least one client was selected for pinging.
void WatchdogProcessService::doHealthCheck(int what) {
    mHandlerLooper->removeMessages(mMessageHandler, what);
    {
        Mutex::Autolock lock(mMutex);
        if (!mIsEnabled) {
            return;
        }
    }
    const auto timeout = static_cast<TimeoutLength>(what);
    dumpAndKillClientsIfNotResponding(timeout);

    /* Generates a temporary/local vector containing clients.
     * Using a local copy may send unnecessary ping messages to clients after they are unregistered.
     * Clients should be able to handle them.
     */
    std::vector<ClientInfo> clientsToPing;
    PingedClientMap* pingedClients = nullptr;
    {
        Mutex::Autolock lock(mMutex);
        pingedClients = &mPingedClients[timeout];
        pingedClients->clear();
        for (auto& clientEntry : mClientsByTimeout[timeout]) {
            ClientInfo& registeredClient = clientEntry.second;
            if (mStoppedUserIds.count(registeredClient.kUserId) != 0) {
                continue;
            }
            const int newSessionId = getNewSessionId();
            registeredClient.sessionId = newSessionId;
            clientsToPing.push_back(registeredClient);
            pingedClients->insert(std::make_pair(newSessionId, registeredClient));
        }
    }

    for (const auto& client : clientsToPing) {
        auto pingStatus = client.checkIfAlive(timeout);
        if (pingStatus.isOk()) {
            continue;
        }
        if (DEBUG) {
            ALOGW("Failed to send a ping message to client(pid: %d): %s", client.kPid,
                  pingStatus.getMessage());
        }
        // The ping never reached the client, so no response is expected for this session.
        Mutex::Autolock lock(mMutex);
        pingedClients->erase(client.sessionId);
    }
    // Though the size of pingedClients is a more specific measure, clientsToPing is used as a
    // conservative approach.
    if (!clientsToPing.empty()) {
        const auto timeoutDurationNs = getTimeoutDurationNs(timeout);
        mHandlerLooper->sendMessageDelayed(timeoutDurationNs.count(), mMessageHandler,
                                           Message(what));
    }
}
697
start()698 Result<void> WatchdogProcessService::start() {
699 if (mServiceStarted) {
700 return Error(INVALID_OPERATION) << "Cannot start process monitoring more than once";
701 }
702 auto thiz = sp<WatchdogProcessService>::fromExisting(this);
703 mMessageHandler = sp<MessageHandlerImpl>::make(thiz);
704 mPropertyChangeListener = std::make_shared<PropertyChangeListener>(thiz);
705 mServiceStarted = true;
706 reportWatchdogAliveToVhal();
707 return {};
708 }
709
terminate()710 void WatchdogProcessService::terminate() {
711 std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
712 {
713 Mutex::Autolock lock(mMutex);
714 if (!mServiceStarted) {
715 return;
716 }
717 for (auto& [_, clients] : mClientsByTimeout) {
718 for (auto& [_, client] : clients) {
719 client.unlinkToDeath(mClientBinderDeathRecipient.get());
720 }
721 clients.clear();
722 }
723 mClientsByTimeout.clear();
724 if (mMonitor != nullptr) {
725 AIBinder* aiBinder = mMonitor->asBinder().get();
726 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
727 static_cast<void*>(aiBinder));
728 mMonitor.reset();
729 }
730 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
731 mServiceStarted = false;
732 if (mVhalService == nullptr) {
733 return;
734 }
735 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
736 propertySubscriptionClient =
737 mVhalService->getSubscriptionClient(mPropertyChangeListener);
738 }
739 mVhalService->removeOnBinderDiedCallback(mVhalBinderDiedCallback);
740 resetVhalInfoLocked();
741 }
742 if (propertySubscriptionClient != nullptr) {
743 std::vector<int32_t> propIds = {static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)};
744 auto result = propertySubscriptionClient->unsubscribe(propIds);
745 if (!result.ok()) {
746 ALOGW("Failed to unsubscribe from VHAL_HEARTBEAT.");
747 }
748 }
749 }
750
registerClient(const ClientInfo & clientInfo,TimeoutLength timeout)751 Result<void> WatchdogProcessService::registerClient(const ClientInfo& clientInfo,
752 TimeoutLength timeout) {
753 ATRACE_CALL();
754 uintptr_t cookieId = reinterpret_cast<uintptr_t>(clientInfo.getAIBinder());
755 {
756 Mutex::Autolock lock(mMutex);
757 if (findClientAndProcessLocked(kTimeouts, clientInfo.getAIBinder(), nullptr)) {
758 return Error(RegistrationError::ERR_DUPLICATE_REGISTRATION)
759 << "Failed to register (" << clientInfo.toString()
760 << ") as it is already registered";
761 }
762
763 ClientInfoMap& clients = mClientsByTimeout[timeout];
764 clients.insert(std::make_pair(cookieId, clientInfo));
765 }
766 if (auto status = clientInfo.linkToDeath(mClientBinderDeathRecipient.get()); !status.isOk()) {
767 Mutex::Autolock lock(mMutex);
768 if (auto it = mClientsByTimeout.find(timeout); it != mClientsByTimeout.end()) {
769 if (const auto& clientIt = it->second.find(cookieId); clientIt != it->second.end()) {
770 it->second.erase(clientIt);
771 }
772 }
773 return Error(RegistrationError::ERR_ILLEGAL_STATE)
774 << "Failed to register (" << clientInfo.toString() << ") as it is dead";
775 }
776 if (DEBUG) {
777 ALOGD("Car watchdog client (%s, timeout = %d) is registered", clientInfo.toString().c_str(),
778 timeout);
779 }
780 Mutex::Autolock lock(mMutex);
781 // If the client array becomes non-empty, start health checking.
782 if (mClientsByTimeout[timeout].size() == 1) {
783 startHealthCheckingLocked(timeout);
784 ALOGI("Starting health checking for timeout = %d", timeout);
785 }
786 uid_t callingUid = IPCThreadState::self()->getCallingUid();
787
788 // Lazy initialization of PackageInfoResolver.
789 if (mPackageInfoResolver == nullptr) {
790 mPackageInfoResolver = PackageInfoResolver::getInstance();
791 }
792 mPackageInfoResolver
793 ->asyncFetchPackageNamesForUids({callingUid},
794 [&](std::unordered_map<uid_t, std::string>
795 packageNames) {
796 ClientInfoMap& clients =
797 this->mClientsByTimeout[timeout];
798 auto client = clients.find(cookieId);
799 // The client could have been unregistered by
800 // the time that the packageName is updated.
801 if (client != clients.end()) {
802 if (packageNames.find(callingUid) !=
803 packageNames.end()) {
804 client->second.packageName =
805 packageNames[callingUid];
806 } else {
807 ALOGW("Failed to resolve packageName "
808 "for calling uid: %i.",
809 callingUid);
810 }
811 }
812 });
813 return {};
814 }
815
unregisterClientLocked(const std::vector<TimeoutLength> & timeouts,const SpAIBinder & binder,ClientType clientType)816 ScopedAStatus WatchdogProcessService::unregisterClientLocked(
817 const std::vector<TimeoutLength>& timeouts, const SpAIBinder& binder,
818 ClientType clientType) {
819 const char* clientName = clientType == ClientType::Regular ? "client" : "service";
820 bool result = findClientAndProcessLocked(timeouts, binder.get(),
821 [&](ClientInfoMap& clients,
822 ClientInfoMap::const_iterator it) {
823 it->second.unlinkToDeath(
824 mClientBinderDeathRecipient.get());
825 clients.erase(it);
826 });
827 if (!result) {
828 std::string errorStr =
829 StringPrintf("The car watchdog %s has not been registered", clientName);
830 const char* errorCause = errorStr.c_str();
831 ALOGW("Failed to unregister the car watchdog %s: %s", clientName, errorCause);
832 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT, errorCause);
833 }
834 if (DEBUG) {
835 ALOGD("Car watchdog %s is unregistered", clientName);
836 }
837 return ScopedAStatus::ok();
838 }
839
tellClientAliveLocked(const SpAIBinder & binder,int32_t sessionId)840 ScopedAStatus WatchdogProcessService::tellClientAliveLocked(const SpAIBinder& binder,
841 int32_t sessionId) {
842 for (const auto& timeout : kTimeouts) {
843 PingedClientMap& clients = mPingedClients[timeout];
844 PingedClientMap::const_iterator it = clients.find(sessionId);
845 if (it == clients.cend() || it->second.getAIBinder() != binder.get()) {
846 continue;
847 }
848 clients.erase(it);
849 return ScopedAStatus::ok();
850 }
851 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
852 "The client is not registered or the "
853 "session ID is not found");
854 }
855
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,AIBinder * aiBinder,const Processor & processor)856 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
857 AIBinder* aiBinder,
858 const Processor& processor) {
859 return findClientAndProcessLocked(timeouts, reinterpret_cast<uintptr_t>(aiBinder), processor);
860 }
861
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,uintptr_t binderPtrId,const Processor & processor)862 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
863 uintptr_t binderPtrId,
864 const Processor& processor) {
865 for (const auto& timeout : timeouts) {
866 auto clientsByIdIt = mClientsByTimeout.find(timeout);
867 if (clientsByIdIt == mClientsByTimeout.end()) {
868 continue;
869 }
870 auto it = clientsByIdIt->second.find(binderPtrId);
871 if (it == clientsByIdIt->second.end()) {
872 continue;
873 }
874 if (processor != nullptr) {
875 processor(clientsByIdIt->second, it);
876 }
877 return true;
878 }
879
880 return false;
881 }
882
startHealthCheckingLocked(TimeoutLength timeout)883 Result<void> WatchdogProcessService::startHealthCheckingLocked(TimeoutLength timeout) {
884 PingedClientMap& clients = mPingedClients[timeout];
885 clients.clear();
886 int what = static_cast<int>(timeout);
887 auto durationNs = getTimeoutDurationNs(timeout);
888 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
889 return {};
890 }
891
dumpAndKillClientsIfNotResponding(TimeoutLength timeout)892 Result<void> WatchdogProcessService::dumpAndKillClientsIfNotResponding(TimeoutLength timeout) {
893 std::vector<ProcessIdentifier> processIdentifiers;
894 std::vector<const ClientInfo*> clientsToNotify;
895 {
896 Mutex::Autolock lock(mMutex);
897 PingedClientMap& clients = mPingedClients[timeout];
898 for (PingedClientMap::const_iterator it = clients.cbegin(); it != clients.cend(); it++) {
899 pid_t pid = -1;
900 userid_t userId = -1;
901 uint64_t startTimeMillis = 0;
902 std::vector<TimeoutLength> timeouts = {timeout};
903 findClientAndProcessLocked(timeouts, it->second.getAIBinder(),
904 [&](ClientInfoMap& cachedClients,
905 ClientInfoMap::const_iterator cachedClientsIt) {
906 auto clientInfo = cachedClientsIt->second;
907 pid = clientInfo.kPid;
908 startTimeMillis = clientInfo.kStartTimeMillis;
909 userId = clientInfo.kUserId;
910 clientInfo.unlinkToDeath(
911 mClientBinderDeathRecipient.get());
912 cachedClients.erase(cachedClientsIt);
913 });
914 if (pid != -1 && mStoppedUserIds.count(userId) == 0) {
915 clientsToNotify.emplace_back(&it->second);
916 ProcessIdentifier processIdentifier;
917 processIdentifier.pid = pid;
918 processIdentifier.startTimeMillis = startTimeMillis;
919 processIdentifiers.push_back(processIdentifier);
920 }
921 }
922 }
923 for (const ClientInfo*& clientInfo : clientsToNotify) {
924 clientInfo->prepareProcessTermination();
925 }
926 return dumpAndKillAllProcesses(processIdentifiers, /*reportToVhal=*/true);
927 }
928
dumpAndKillAllProcesses(const std::vector<ProcessIdentifier> & processesNotResponding,bool reportToVhal)929 Result<void> WatchdogProcessService::dumpAndKillAllProcesses(
930 const std::vector<ProcessIdentifier>& processesNotResponding, bool reportToVhal) {
931 size_t size = processesNotResponding.size();
932 if (size == 0) {
933 return {};
934 }
935 std::string pidString = toPidString(processesNotResponding);
936 std::shared_ptr<ICarWatchdogMonitor> monitor;
937 {
938 Mutex::Autolock lock(mMutex);
939 if (mMonitor == nullptr) {
940 std::string errorMsg =
941 StringPrintf("Failed to dump and kill processes(pid = %s): Monitor is not set",
942 pidString.c_str());
943 ALOGW("%s", errorMsg.c_str());
944 return Error() << errorMsg;
945 }
946 monitor = mMonitor;
947 }
948 if (isSystemShuttingDown()) {
949 ALOGI("Skip dumping and killing processes(%s): The system is shutting down",
950 pidString.c_str());
951 return {};
952 }
953 if (reportToVhal) {
954 reportTerminatedProcessToVhal(processesNotResponding);
955 }
956 monitor->onClientsNotResponding(processesNotResponding);
957 if (DEBUG) {
958 ALOGD("Dumping and killing processes is requested: %s", pidString.c_str());
959 }
960 return {};
961 }
962
963 // Handle when car watchdog clients die.
handleBinderDeath(void * cookie)964 void WatchdogProcessService::handleBinderDeath(void* cookie) {
965 uintptr_t cookieId = reinterpret_cast<uintptr_t>(cookie);
966
967 // The same binder death recipient is used for both monitor and client deaths. So, check both
968 // the monitor and all the clients until a match is found.
969 Mutex::Autolock lock(mMutex);
970 if (mMonitor != nullptr) {
971 if (AIBinder* aiBinder = mMonitor->asBinder().get();
972 reinterpret_cast<uintptr_t>(aiBinder) == cookieId) {
973 mMonitor.reset();
974 ALOGW("The monitor has died.");
975 return;
976 }
977 }
978
979 findClientAndProcessLocked(kTimeouts, cookieId,
980 [&](ClientInfoMap& clients, ClientInfoMap::const_iterator it) {
981 ALOGW("Client(pid: %d) died", it->second.kPid);
982 clients.erase(it);
983 });
984 }
985
986 // Handle when VHAL dies.
handleVhalDeath()987 void WatchdogProcessService::handleVhalDeath() {
988 Mutex::Autolock lock(mMutex);
989 ALOGW("VHAL has died.");
990 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
991 // Destroying mVHalService would remove all onBinderDied callbacks.
992 resetVhalInfoLocked();
993 }
994
reportWatchdogAliveToVhal()995 void WatchdogProcessService::reportWatchdogAliveToVhal() {
996 ATRACE_CALL();
997 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_ALIVE) > 0) {
998 ALOGW("VHAL doesn't support WATCHDOG_ALIVE. Car watchdog will not update WATCHDOG_ALIVE.");
999 return;
1000 }
1001 int64_t systemUptime = uptimeMillis();
1002 VehiclePropValue propValue{
1003 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE),
1004 .value.int64Values = {systemUptime},
1005 };
1006 const auto& ret = updateVhal(propValue);
1007 if (!ret.ok()) {
1008 ALOGW("Failed to update WATCHDOG_ALIVE VHAL property. Will try again in 3s, error: %s",
1009 ret.error().message().c_str());
1010 }
1011 // Update VHAL with the interval of TIMEOUT_CRITICAL(3s).
1012 auto durationNs = getTimeoutDurationNs(TimeoutLength::TIMEOUT_CRITICAL);
1013 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_WATCHDOG_ALIVE);
1014 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler,
1015 Message(MSG_VHAL_WATCHDOG_ALIVE));
1016 }
1017
reportTerminatedProcessToVhal(const std::vector<ProcessIdentifier> & processesNotResponding)1018 void WatchdogProcessService::reportTerminatedProcessToVhal(
1019 const std::vector<ProcessIdentifier>& processesNotResponding) {
1020 ATRACE_CALL();
1021 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_TERMINATED_PROCESS) > 0) {
1022 ALOGW("VHAL doesn't support WATCHDOG_TERMINATED_PROCESS. Terminated process is not "
1023 "reported to VHAL.");
1024 return;
1025 }
1026 for (auto&& processIdentifier : processesNotResponding) {
1027 const auto& retCmdLine = readProcCmdLine(processIdentifier.pid);
1028 if (!retCmdLine.ok()) {
1029 ALOGW("Failed to get process command line for pid(%d): %s", processIdentifier.pid,
1030 retCmdLine.error().message().c_str());
1031 continue;
1032 }
1033 std::string procCmdLine = retCmdLine.value();
1034 VehiclePropValue propValue{
1035 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS),
1036 .value.int32Values = {static_cast<int32_t>(
1037 ProcessTerminationReason::NOT_RESPONDING)},
1038 .value.stringValue = procCmdLine,
1039 };
1040 const auto& retUpdate = updateVhal(propValue);
1041 if (!retUpdate.ok()) {
1042 ALOGW("Failed to update WATCHDOG_TERMINATED_PROCESS VHAL property(command line: %s)",
1043 procCmdLine.c_str());
1044 }
1045 }
1046 }
1047
updateVhal(const VehiclePropValue & value)1048 Result<void> WatchdogProcessService::updateVhal(const VehiclePropValue& value) {
1049 ATRACE_CALL();
1050 const auto& connectRet = connectToVhal();
1051 if (!connectRet.ok()) {
1052 std::string errorMsg = "VHAL is not connected: " + connectRet.error().message();
1053 ALOGW("%s", errorMsg.c_str());
1054 return Error() << errorMsg;
1055 }
1056 int32_t propId = value.prop;
1057 std::shared_ptr<IVhalClient> vhalService;
1058 {
1059 Mutex::Autolock lock(mMutex);
1060 if (mNotSupportedVhalProperties.count(static_cast<VehicleProperty>(propId)) > 0) {
1061 std::string errorMsg = StringPrintf("VHAL doesn't support property(id: %d)", propId);
1062 ALOGW("%s", errorMsg.c_str());
1063 return Error() << errorMsg;
1064 }
1065 vhalService = mVhalService;
1066 }
1067
1068 auto halPropValue = vhalService->createHalPropValue(propId);
1069 halPropValue->setInt32Values(value.value.int32Values);
1070 halPropValue->setInt64Values(value.value.int64Values);
1071 halPropValue->setStringValue(value.value.stringValue);
1072 if (auto result = vhalService->setValueSync(*halPropValue); !result.ok()) {
1073 return Error() << "Failed to set propValue(" << propId
1074 << ") to VHAL, error: " << result.error().message();
1075 }
1076
1077 return {};
1078 }
1079
readProcCmdLine(int32_t pid)1080 Result<std::string> WatchdogProcessService::readProcCmdLine(int32_t pid) {
1081 std::string cmdLinePath = StringPrintf("/proc/%d/cmdline", pid);
1082 std::string procCmdLine;
1083 if (ReadFileToString(cmdLinePath, &procCmdLine)) {
1084 std::replace(procCmdLine.begin(), procCmdLine.end(), '\0', ' ');
1085 procCmdLine = Trim(procCmdLine);
1086 return procCmdLine;
1087 }
1088 return Error() << "Failed to read " << cmdLinePath;
1089 }
1090
connectToVhal()1091 Result<void> WatchdogProcessService::connectToVhal() {
1092 {
1093 Mutex::Autolock lock(mMutex);
1094 if (mVhalService != nullptr) {
1095 return {};
1096 }
1097 mVhalService = kTryCreateVhalClientFunc();
1098 if (mVhalService == nullptr) {
1099 return Error() << "Failed to connect to VHAL.";
1100 }
1101 mVhalService->addOnBinderDiedCallback(mVhalBinderDiedCallback);
1102 }
1103 queryVhalProperties();
1104 subscribeToVhalHeartBeat();
1105 ALOGI("Successfully connected to VHAL.");
1106 return {};
1107 }
1108
queryVhalProperties()1109 void WatchdogProcessService::queryVhalProperties() {
1110 std::shared_ptr<IVhalClient> vhalService;
1111 {
1112 Mutex::Autolock lock(mMutex);
1113 vhalService = mVhalService;
1114 }
1115 std::unordered_set<VehicleProperty> notSupportedProperties;
1116 std::vector<VehicleProperty> propIds = {VehicleProperty::WATCHDOG_ALIVE,
1117 VehicleProperty::WATCHDOG_TERMINATED_PROCESS,
1118 VehicleProperty::VHAL_HEARTBEAT};
1119 for (const auto& propId : propIds) {
1120 if (auto result = vhalService->getPropConfigs({static_cast<int32_t>(propId)});
1121 !result.ok()) {
1122 notSupportedProperties.insert(propId);
1123 }
1124 }
1125 {
1126 Mutex::Autolock lock(mMutex);
1127 mNotSupportedVhalProperties = std::move(notSupportedProperties);
1128 }
1129 }
1130
subscribeToVhalHeartBeat()1131 void WatchdogProcessService::subscribeToVhalHeartBeat() {
1132 std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
1133 {
1134 Mutex::Autolock lock(mMutex);
1135 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) > 0) {
1136 ALOGW("VHAL doesn't support VHAL_HEARTBEAT. Checking VHAL health is disabled.");
1137 return;
1138 }
1139
1140 mVhalHeartBeat = {
1141 .eventTime = 0,
1142 .value = 0,
1143 };
1144 propertySubscriptionClient = mVhalService->getSubscriptionClient(mPropertyChangeListener);
1145 }
1146 std::vector<SubscribeOptions> options = {
1147 {.propId = static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT), .areaIds = {}},
1148 };
1149 if (auto result = propertySubscriptionClient->subscribe(options); !result.ok()) {
1150 ALOGW("Failed to subscribe to VHAL_HEARTBEAT. Checking VHAL health is disabled. '%s'",
1151 result.error().message().c_str());
1152 return;
1153 }
1154 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMillis + kHealthCheckDelayMillis;
1155 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
1156 Message(MSG_VHAL_HEALTH_CHECK));
1157 // VHAL process identifier is required only when terminating the VHAL process. VHAL process is
1158 // terminated only when the VHAL is unhealthy. However, caching the process identifier as soon
1159 // as connecting to VHAL guarantees the correct PID is cached. Because the VHAL pid is queried
1160 // from the service manager, the caching should be performed outside the class level lock. So,
1161 // handle the caching in the handler thread after successfully subscribing to the VHAL_HEARTBEAT
1162 // property.
1163 mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1164 return;
1165 }
1166
getWatchdogServiceHelperLocked()1167 const sp<WatchdogServiceHelperInterface> WatchdogProcessService::getWatchdogServiceHelperLocked() {
1168 ClientInfoMap& clients = mClientsByTimeout[kCarWatchdogServiceTimeoutDelay];
1169 for (const auto& [_, clientInfo] : clients) {
1170 if (clientInfo.kType == ClientType::Service) {
1171 return clientInfo.kWatchdogServiceHelper;
1172 }
1173 }
1174 return nullptr;
1175 }
1176
cacheVhalProcessIdentifier()1177 void WatchdogProcessService::cacheVhalProcessIdentifier() {
1178 // Ensure only one MSG_CACHE_VHAL_PROCESS_IDENTIFIER is present on the looper at any given time.
1179 // Duplicate messages could be posted when the CarService restarts during the caching attempts.
1180 // When duplicate messages are present, the following retry delay won't have any effect.
1181 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1182 bool isAidlVhal;
1183 sp<WatchdogServiceHelperInterface> serviceHelper;
1184 {
1185 Mutex::Autolock lock(mMutex);
1186 if (mVhalService == nullptr || mVhalProcessIdentifier.has_value()) {
1187 return;
1188 }
1189 isAidlVhal = mVhalService->isAidlVhal();
1190 serviceHelper = getWatchdogServiceHelperLocked();
1191 // WatchdogServiceHelper is available only when the CarWatchdogService
1192 // is connected. So, if the WatchdogServiceHelper is not available,
1193 // postpone requesting the AIDL VHAL process identifier from
1194 // CarWatchdogService until the daemon is connected with the service.
1195 if (isAidlVhal && serviceHelper == nullptr) {
1196 if (DEBUG) {
1197 ALOGE("Skipping requesting AIDL VHAL pid from CarWatchdogService until the service "
1198 "is connected");
1199 }
1200 return;
1201 }
1202 if (mTotalVhalPidCachingAttempts >= kMaxVhalPidCachingAttempts) {
1203 ALOGE("Failed to cache VHAL process identifier. Total attempts made to cache: %d",
1204 mTotalVhalPidCachingAttempts);
1205 return;
1206 }
1207 mTotalVhalPidCachingAttempts++;
1208 }
1209 const auto retryCaching = [&](const std::string& logMessage) {
1210 ALOGW("%s. Retrying caching VHAL pid in %lld ms", logMessage.c_str(),
1211 kVhalPidCachingRetryDelayNs.count() / (1'000'000));
1212 mHandlerLooper->sendMessageDelayed(kVhalPidCachingRetryDelayNs.count(), mMessageHandler,
1213 Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1214 };
1215 if (isAidlVhal) {
1216 if (const auto status = serviceHelper->requestAidlVhalPid(); !status.isOk()) {
1217 retryCaching(StringPrintf("Failed to request AIDL VHAL pid from CarWatchdogService: %s",
1218 status.getMessage()));
1219 return;
1220 }
1221 // CarWatchdogService responds with the PID via an asynchronous callback. When
1222 // CarWatchdogService cannot respond with the PID, the daemon must retry caching the PID but
1223 // this needs to happen asynchronously. So, post a retry message to ensure that the AIDL
1224 // VHAL PID is returned by the CarWatchdogService within the retry timeout.
1225 retryCaching("Requested AIDL VHAL pid from CarWatchdogService");
1226 return;
1227 }
1228 Result<pid_t> result;
1229 sp<IServiceManager> hidlServiceManager = kTryGetHidlServiceManagerFunc();
1230 if (hidlServiceManager == nullptr) {
1231 retryCaching("Failed to get HIDL service manager");
1232 return;
1233 }
1234 if (result = queryHidlServiceManagerForVhalPid(hidlServiceManager); !result.ok()) {
1235 retryCaching(result.error().message());
1236 return;
1237 }
1238 cacheVhalProcessIdentifierForPid(*result);
1239 }
1240
onAidlVhalPidFetched(pid_t pid)1241 void WatchdogProcessService::onAidlVhalPidFetched(pid_t pid) {
1242 {
1243 Mutex::Autolock lock(mMutex);
1244 if (mVhalService == nullptr || !mVhalService->isAidlVhal()) {
1245 return;
1246 }
1247 }
1248 cacheVhalProcessIdentifierForPid(pid);
1249 }
1250
cacheVhalProcessIdentifierForPid(int32_t pid)1251 void WatchdogProcessService::cacheVhalProcessIdentifierForPid(int32_t pid) {
1252 if (pid < 0) {
1253 ALOGE("Ignoring request to cache invalid VHAL pid (%d)", pid);
1254 return;
1255 }
1256 ProcessIdentifier processIdentifier;
1257 processIdentifier.pid = pid;
1258 processIdentifier.startTimeMillis = kGetStartTimeForPidFunc(pid);
1259
1260 Mutex::Autolock lock(mMutex);
1261 mVhalProcessIdentifier = processIdentifier;
1262 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1263 }
1264
getNewSessionId()1265 int32_t WatchdogProcessService::getNewSessionId() {
1266 // Make sure that session id is always positive number.
1267 if (++mLastSessionId <= 0) {
1268 mLastSessionId = 1;
1269 }
1270 return mLastSessionId;
1271 }
1272
updateVhalHeartBeat(int64_t value)1273 void WatchdogProcessService::updateVhalHeartBeat(int64_t value) {
1274 bool wrongHeartBeat;
1275 {
1276 Mutex::Autolock lock(mMutex);
1277 if (!mIsEnabled) {
1278 return;
1279 }
1280 wrongHeartBeat = value <= mVhalHeartBeat.value;
1281 mVhalHeartBeat.eventTime = uptimeMillis();
1282 mVhalHeartBeat.value = value;
1283 }
1284 if (wrongHeartBeat) {
1285 ALOGW("VHAL updated heart beat with a wrong value. Terminating VHAL...");
1286 terminateVhal();
1287 return;
1288 }
1289 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMillis + kHealthCheckDelayMillis;
1290 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
1291 Message(MSG_VHAL_HEALTH_CHECK));
1292 }
1293
checkVhalHealth()1294 void WatchdogProcessService::checkVhalHealth() {
1295 int64_t lastEventTime;
1296 int64_t currentUptime = uptimeMillis();
1297 {
1298 Mutex::Autolock lock(mMutex);
1299 if (mVhalService == nullptr || !mIsEnabled) {
1300 return;
1301 }
1302 lastEventTime = mVhalHeartBeat.eventTime;
1303 }
1304 if (currentUptime > lastEventTime + mVhalHealthCheckWindowMillis.count()) {
1305 ALOGW("VHAL failed to update heart beat within timeout. Terminating VHAL...");
1306 terminateVhal();
1307 }
1308 }
1309
resetVhalInfoLocked()1310 void WatchdogProcessService::resetVhalInfoLocked() {
1311 mVhalService.reset();
1312 mVhalProcessIdentifier.reset();
1313 mTotalVhalPidCachingAttempts = 0;
1314 // Stop any pending caching attempts when the VHAL info is reset.
1315 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1316 }
1317
terminateVhal()1318 void WatchdogProcessService::terminateVhal() {
1319 std::optional<ProcessIdentifier> processIdentifier;
1320 {
1321 Mutex::Autolock lock(mMutex);
1322 processIdentifier = mVhalProcessIdentifier;
1323 resetVhalInfoLocked();
1324 if (!processIdentifier.has_value()) {
1325 ALOGE("Failed to terminate VHAL: failed to fetch VHAL PID");
1326 return;
1327 }
1328 }
1329 dumpAndKillAllProcesses(std::vector<ProcessIdentifier>(1, *processIdentifier),
1330 /*reportToVhal=*/false);
1331 }
1332
getTimeoutDurationNs(const TimeoutLength & timeout)1333 std::chrono::nanoseconds WatchdogProcessService::getTimeoutDurationNs(
1334 const TimeoutLength& timeout) {
1335 // When a default timeout has been overridden by the |kPropertyClientCheckInterval| read-only
1336 // property override the timeout value for all timeout lengths.
1337 if (mOverriddenClientHealthCheckWindowNs.has_value()) {
1338 return mOverriddenClientHealthCheckWindowNs.value();
1339 }
1340 switch (timeout) {
1341 case TimeoutLength::TIMEOUT_CRITICAL:
1342 return 3s; // 3s and no buffer time.
1343 case TimeoutLength::TIMEOUT_MODERATE:
1344 return 6s; // 5s + 1s as buffer time.
1345 case TimeoutLength::TIMEOUT_NORMAL:
1346 return 12s; // 10s + 2s as buffer time.
1347 }
1348 }
1349
toString() const1350 std::string WatchdogProcessService::ClientInfo::toString() const {
1351 std::string buffer;
1352 StringAppendF(&buffer, "pid = %d, userId = %d, type = %s", kPid, kUserId,
1353 kType == ClientType::Regular ? "regular" : "watchdog service");
1354 return buffer;
1355 }
1356
getAIBinder() const1357 AIBinder* WatchdogProcessService::ClientInfo::getAIBinder() const {
1358 if (kType == ClientType::Regular) {
1359 return kClient->asBinder().get();
1360 }
1361 return kWatchdogServiceBinder.get();
1362 }
1363
linkToDeath(AIBinder_DeathRecipient * recipient) const1364 ScopedAStatus WatchdogProcessService::ClientInfo::linkToDeath(
1365 AIBinder_DeathRecipient* recipient) const {
1366 if (kType == ClientType::Regular) {
1367 AIBinder* aiBinder = getAIBinder();
1368 return kService.mDeathRegistrationWrapper->linkToDeath(aiBinder, recipient,
1369 static_cast<void*>(aiBinder));
1370 }
1371 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1372 // skip this step.
1373 return ScopedAStatus::ok();
1374 }
1375
unlinkToDeath(AIBinder_DeathRecipient * recipient) const1376 ScopedAStatus WatchdogProcessService::ClientInfo::unlinkToDeath(
1377 AIBinder_DeathRecipient* recipient) const {
1378 if (kType == ClientType::Regular) {
1379 AIBinder* aiBinder = getAIBinder();
1380 return kService.mDeathRegistrationWrapper->unlinkToDeath(aiBinder, recipient,
1381 static_cast<void*>(aiBinder));
1382 }
1383 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1384 // skip this step.
1385 return ScopedAStatus::ok();
1386 }
1387
checkIfAlive(TimeoutLength timeout) const1388 ScopedAStatus WatchdogProcessService::ClientInfo::checkIfAlive(TimeoutLength timeout) const {
1389 ATRACE_NAME(StringPrintf("checkIfAlive - %s", timeoutToString(timeout).c_str()).c_str());
1390 if (kType == ClientType::Regular) {
1391 return kClient->checkIfAlive(sessionId, timeout);
1392 }
1393 return kWatchdogServiceHelper->checkIfAlive(kWatchdogServiceBinder, sessionId, timeout);
1394 }
1395
prepareProcessTermination() const1396 ScopedAStatus WatchdogProcessService::ClientInfo::prepareProcessTermination() const {
1397 ATRACE_CALL();
1398 if (kType == ClientType::Regular) {
1399 return kClient->prepareProcessTermination();
1400 }
1401 return kWatchdogServiceHelper->prepareProcessTermination(kWatchdogServiceBinder);
1402 }
1403
onPropertyEvent(const std::vector<std::unique_ptr<IHalPropValue>> & propValues)1404 void WatchdogProcessService::PropertyChangeListener::onPropertyEvent(
1405 const std::vector<std::unique_ptr<IHalPropValue>>& propValues) {
1406 for (const auto& value : propValues) {
1407 if (value->getPropId() == static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)) {
1408 if (value->getInt64Values().size() < 1) {
1409 ALOGE("Invalid VHAL_HEARTBEAT value, empty value");
1410 } else {
1411 kService->updateVhalHeartBeat(value->getInt64Values()[0]);
1412 }
1413 break;
1414 }
1415 }
1416 }
1417
onPropertySetError(const std::vector<HalPropError> & errors)1418 void WatchdogProcessService::PropertyChangeListener::onPropertySetError(
1419 const std::vector<HalPropError>& errors) {
1420 for (const auto& error : errors) {
1421 if (error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE) &&
1422 error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS)) {
1423 continue;
1424 }
1425 ALOGE("failed to set VHAL property, prop ID: %d, status: %d", error.propId,
1426 static_cast<int32_t>(error.status));
1427 }
1428 }
1429
toProtoClientType(ClientType clientType)1430 int WatchdogProcessService::toProtoClientType(ClientType clientType) {
1431 switch (clientType) {
1432 case ClientType::Regular:
1433 return HealthCheckClientInfo::REGULAR;
1434 case ClientType::Service:
1435 return HealthCheckClientInfo::CAR_WATCHDOG_SERVICE;
1436 default:
1437 return HealthCheckClientInfo::CLIENT_TYPE_UNSPECIFIED;
1438 }
1439 }
1440
handleMessage(const Message & message)1441 void WatchdogProcessService::MessageHandlerImpl::handleMessage(const Message& message) {
1442 switch (message.what) {
1443 case static_cast<int>(TimeoutLength::TIMEOUT_CRITICAL):
1444 case static_cast<int>(TimeoutLength::TIMEOUT_MODERATE):
1445 case static_cast<int>(TimeoutLength::TIMEOUT_NORMAL):
1446 kService->doHealthCheck(message.what);
1447 break;
1448 case MSG_VHAL_WATCHDOG_ALIVE:
1449 kService->reportWatchdogAliveToVhal();
1450 break;
1451 case MSG_VHAL_HEALTH_CHECK:
1452 kService->checkVhalHealth();
1453 break;
1454 case MSG_CACHE_VHAL_PROCESS_IDENTIFIER:
1455 kService->cacheVhalProcessIdentifier();
1456 break;
1457 default:
1458 ALOGW("Unknown message: %d", message.what);
1459 }
1460 }
1461
1462 } // namespace watchdog
1463 } // namespace automotive
1464 } // namespace android
1465