1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout; 20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout; 21 22 import android.annotation.NonNull; 23 import android.annotation.Nullable; 24 import android.app.IActivityController; 25 import android.content.BroadcastReceiver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.database.ContentObserver; 30 import android.hidl.manager.V1_0.IServiceManager; 31 import android.net.Uri; 32 import android.os.Binder; 33 import android.os.Build; 34 import android.os.Debug; 35 import android.os.FileUtils; 36 import android.os.Handler; 37 import android.os.IPowerManager; 38 import android.os.Looper; 39 import android.os.Process; 40 import android.os.RemoteException; 41 import android.os.ServiceDebugInfo; 42 import android.os.ServiceManager; 43 import android.os.SystemClock; 44 import android.os.SystemProperties; 45 import android.os.UserHandle; 46 import android.provider.Settings; 47 import android.sysprop.WatchdogProperties; 48 import android.util.Dumpable; 49 import android.util.EventLog; 50 import android.util.Log; 51 import android.util.Slog; 52 import android.util.SparseBooleanArray; 53 54 import com.android.internal.os.BackgroundThread; 55 import com.android.internal.os.ProcessCpuTracker; 56 import com.android.internal.os.ZygoteConnectionConstants; 57 import com.android.internal.util.FrameworkStatsLog; 58 import com.android.server.am.ActivityManagerService; 59 import com.android.server.am.StackTracesDumpHelper; 60 import com.android.server.am.TraceErrorLogger; 61 import com.android.server.criticalevents.CriticalEventLog; 62 import com.android.server.wm.SurfaceAnimationThread; 63 64 import java.io.BufferedReader; 65 import java.io.File; 66 import java.io.FileNotFoundException; 67 import java.io.FileReader; 68 import java.io.FileWriter; 69 import java.io.IOException; 70 import java.io.PrintWriter; 71 import java.io.StringWriter; 72 import java.time.Clock; 73 import java.util.ArrayList; 74 import java.util.Arrays; 75 import java.util.Collections; 76 import java.util.HashSet; 77 import java.util.List; 78 import java.util.Optional; 79 import java.util.UUID; 80 import java.util.concurrent.CompletableFuture; 81 import java.util.concurrent.TimeUnit; 82 83 /** 84 * This class calls its monitor every minute. Killing this process if they don't return 85 **/ 86 public class Watchdog implements Dumpable { 87 static final String TAG = "Watchdog"; 88 89 /** Debug flag. */ 90 public static final boolean DEBUG = false; 91 92 // Set this to true to use debug default values. 93 private static final boolean DB = false; 94 95 // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with 96 // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications 97 // can trigger the watchdog. 98 // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped 99 // applications may not work with a debug build. CTS will fail. 100 private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000; 101 102 // This ratio is used to compute the pre-watchdog timeout (2 means that the pre-watchdog timeout 103 // will be half the full timeout). 104 // 105 // The pre-watchdog event is similar to a full watchdog except it does not crash system server. 106 private static final int PRE_WATCHDOG_TIMEOUT_RATIO = 4; 107 108 // These are temporally ordered: larger values as lateness increases 109 static final int COMPLETED = 0; 110 static final int WAITING = 1; 111 static final int WAITED_UNTIL_PRE_WATCHDOG = 2; 112 static final int OVERDUE = 3; 113 114 // Track watchdog timeout history and break the crash loop if there is. 115 private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt"; 116 private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count"; 117 private static final String PROP_FATAL_LOOP_WINDOWS_SECS = 118 "framework_watchdog.fatal_window.second"; 119 120 // Which native processes to dump into dropbox's stack traces 121 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 122 "/system/bin/audioserver", 123 "/system/bin/cameraserver", 124 "/system/bin/drmserver", 125 "/system/bin/keystore2", 126 "/system/bin/mediadrmserver", 127 "/system/bin/mediaserver", 128 "/system/bin/netd", 129 "/system/bin/sdcard", 130 "/system/bin/servicemanager", 131 "/system/bin/surfaceflinger", 132 "/system/bin/vold", 133 "media.extractor", // system/bin/mediaextractor 134 "media.metrics", // system/bin/mediametrics 135 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 136 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec 137 "media.transcoding", // Media transcoding service 138 "com.android.bluetooth", // Bluetooth service 139 "/apex/com.android.art/bin/artd", // ART daemon 140 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon 141 }; 142 143 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 144 "android.hardware.audio@4.0::IDevicesFactory", 145 "android.hardware.audio@5.0::IDevicesFactory", 146 "android.hardware.audio@6.0::IDevicesFactory", 147 "android.hardware.audio@7.0::IDevicesFactory", 148 "android.hardware.biometrics.face@1.0::IBiometricsFace", 149 "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint", 150 "android.hardware.bluetooth@1.0::IBluetoothHci", 151 "android.hardware.camera.provider@2.4::ICameraProvider", 152 "android.hardware.gnss@1.0::IGnss", 153 "android.hardware.graphics.allocator@2.0::IAllocator", 154 "android.hardware.graphics.allocator@4.0::IAllocator", 155 "android.hardware.graphics.composer@2.1::IComposer", 156 "android.hardware.health@2.0::IHealth", 157 "android.hardware.light@2.0::ILight", 158 "android.hardware.media.c2@1.0::IComponentStore", 159 "android.hardware.media.omx@1.0::IOmx", 160 "android.hardware.media.omx@1.0::IOmxStore", 161 "android.hardware.neuralnetworks@1.0::IDevice", 162 "android.hardware.power@1.0::IPower", 163 "android.hardware.power.stats@1.0::IPowerStats", 164 "android.hardware.sensors@1.0::ISensors", 165 "android.hardware.sensors@2.0::ISensors", 166 "android.hardware.sensors@2.1::ISensors", 167 "android.hardware.vibrator@1.0::IVibrator", 168 "android.hardware.vr@1.0::IVr", 169 "android.system.suspend@1.0::ISystemSuspend" 170 ); 171 172 public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] { 173 "android.hardware.audio.core.IModule/", 174 "android.hardware.audio.core.IConfig/", 175 "android.hardware.audio.effect.IFactory/", 176 "android.hardware.biometrics.face.IFace/", 177 "android.hardware.biometrics.fingerprint.IFingerprint/", 178 "android.hardware.bluetooth.IBluetoothHci/", 179 "android.hardware.camera.provider.ICameraProvider/", 180 "android.hardware.drm.IDrmFactory/", 181 "android.hardware.gnss.IGnss/", 182 "android.hardware.graphics.allocator.IAllocator/", 183 "android.hardware.graphics.composer3.IComposer/", 184 "android.hardware.health.IHealth/", 185 "android.hardware.input.processor.IInputProcessor/", 186 "android.hardware.light.ILights/", 187 "android.hardware.neuralnetworks.IDevice/", 188 "android.hardware.power.IPower/", 189 "android.hardware.power.stats.IPowerStats/", 190 "android.hardware.sensors.ISensors/", 191 "android.hardware.vibrator.IVibrator/", 192 "android.hardware.vibrator.IVibratorManager/", 193 "android.system.suspend.ISystemSuspend/", 194 }; 195 196 private static Watchdog sWatchdog; 197 198 private final Thread mThread; 199 200 private final Object mLock = new Object(); 201 202 /* This handler will be used to post message back onto the main thread */ 203 private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>(); 204 private final HandlerChecker mMonitorChecker; 205 private ActivityManagerService mActivity; 206 private IActivityController mController; 207 private boolean mAllowRestart = true; 208 // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings 209 // once the settings provider is initialized. 210 private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT; 211 private final List<Integer> mInterestingJavaPids = new ArrayList<>(); 212 private final TraceErrorLogger mTraceErrorLogger; 213 214 /** Holds a checker and its timeout. */ 215 static final class HandlerCheckerAndTimeout { 216 private final HandlerChecker mHandler; 217 private final Optional<Long> mCustomTimeoutMillis; 218 HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)219 private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) { 220 this.mHandler = checker; 221 this.mCustomTimeoutMillis = timeoutMillis; 222 } 223 checker()224 HandlerChecker checker() { 225 return mHandler; 226 } 227 228 /** Returns the timeout. */ customTimeoutMillis()229 Optional<Long> customTimeoutMillis() { 230 return mCustomTimeoutMillis; 231 } 232 233 /** 234 * Creates a checker with the default timeout. The timeout will use the default value which 235 * is configurable server-side. 236 */ withDefaultTimeout(HandlerChecker checker)237 static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) { 238 return new HandlerCheckerAndTimeout(checker, Optional.empty()); 239 } 240 241 /** 242 * Creates a checker with a custom timeout. The timeout overrides the default value and will 243 * always be used. 244 */ withCustomTimeout( HandlerChecker checker, long timeoutMillis)245 static HandlerCheckerAndTimeout withCustomTimeout( 246 HandlerChecker checker, long timeoutMillis) { 247 return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis)); 248 } 249 } 250 251 /** Used for checking status of handle threads and scheduling monitor callbacks. */ 252 public static class HandlerChecker implements Runnable { 253 private final Handler mHandler; 254 private final String mName; 255 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 256 private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>(); 257 private long mWaitMaxMillis; 258 private boolean mCompleted; 259 private Monitor mCurrentMonitor; 260 private long mStartTimeMillis; 261 private int mPauseCount; 262 private long mPauseEndTimeMillis; 263 private Clock mClock; 264 private Object mLock; 265 HandlerChecker(Handler handler, String name, Object lock, Clock clock)266 HandlerChecker(Handler handler, String name, Object lock, Clock clock) { 267 mHandler = handler; 268 mName = name; 269 mLock = lock; 270 mCompleted = true; 271 mClock = clock; 272 } 273 HandlerChecker(Handler handler, String name, Object lock)274 HandlerChecker(Handler handler, String name, Object lock) { 275 this(handler, name, lock, SystemClock.uptimeClock()); 276 } 277 addMonitorLocked(Monitor monitor)278 void addMonitorLocked(Monitor monitor) { 279 // We don't want to update mMonitors when the Handler is in the middle of checking 280 // all monitors. We will update mMonitors on the next schedule if it is safe 281 mMonitorQueue.add(monitor); 282 } 283 284 /** 285 * Schedules a run on the handler thread. 286 * 287 * @param handlerCheckerTimeoutMillis the timeout to use for this run 288 */ scheduleCheckLocked(long handlerCheckerTimeoutMillis)289 public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) { 290 mWaitMaxMillis = handlerCheckerTimeoutMillis; 291 292 if (mCompleted) { 293 // Safe to update monitors in queue, Handler is not in the middle of work 294 mMonitors.addAll(mMonitorQueue); 295 mMonitorQueue.clear(); 296 } 297 298 long nowMillis = mClock.millis(); 299 boolean isPaused = mPauseCount > 0 || mPauseEndTimeMillis > nowMillis; 300 if ((mMonitors.size() == 0 && isHandlerPolling()) || isPaused) { 301 // Don't schedule until after resume OR 302 // If the target looper has recently been polling, then 303 // there is no reason to enqueue our checker on it since that 304 // is as good as it not being deadlocked. This avoid having 305 // to do a context switch to check the thread. Note that we 306 // only do this if we have no monitors since those would need to 307 // be executed at this point. 308 mCompleted = true; 309 return; 310 } 311 if (!mCompleted) { 312 // we already have a check in flight, so no need 313 return; 314 } 315 316 mCompleted = false; 317 mCurrentMonitor = null; 318 mStartTimeMillis = nowMillis; 319 mPauseEndTimeMillis = 0; 320 mHandler.postAtFrontOfQueue(this); 321 } 322 isHandlerPolling()323 boolean isHandlerPolling() { 324 return mHandler.getLooper().getQueue().isPolling(); 325 } 326 getCompletionStateLocked()327 public int getCompletionStateLocked() { 328 if (mCompleted) { 329 return COMPLETED; 330 } else { 331 long latency = mClock.millis() - mStartTimeMillis; 332 if (latency < mWaitMaxMillis / PRE_WATCHDOG_TIMEOUT_RATIO) { 333 return WAITING; 334 } else if (latency < mWaitMaxMillis) { 335 return WAITED_UNTIL_PRE_WATCHDOG; 336 } 337 } 338 return OVERDUE; 339 } 340 getThread()341 public Thread getThread() { 342 return mHandler.getLooper().getThread(); 343 } 344 getName()345 public String getName() { 346 return mName; 347 } 348 describeBlockedStateLocked()349 String describeBlockedStateLocked() { 350 final String prefix; 351 if (mCurrentMonitor == null) { 352 prefix = "Blocked in handler"; 353 } else { 354 prefix = "Blocked in monitor " + mCurrentMonitor.getClass().getName(); 355 } 356 long latencySeconds = (mClock.millis() - mStartTimeMillis) / 1000; 357 return prefix + " on " + mName + " (" + getThread().getName() + ")" 358 + " for " + latencySeconds + "s"; 359 } 360 361 @Override run()362 public void run() { 363 // Once we get here, we ensure that mMonitors does not change even if we call 364 // #addMonitorLocked because we first add the new monitors to mMonitorQueue and 365 // move them to mMonitors on the next schedule when mCompleted is true, at which 366 // point we have completed execution of this method. 367 final int size = mMonitors.size(); 368 for (int i = 0 ; i < size ; i++) { 369 synchronized (mLock) { 370 mCurrentMonitor = mMonitors.get(i); 371 } 372 mCurrentMonitor.monitor(); 373 } 374 375 synchronized (mLock) { 376 mCompleted = true; 377 mCurrentMonitor = null; 378 } 379 } 380 381 /** 382 * Pauses the checks for the given time. 383 * 384 * <p>The current run will be ignored and another run will be scheduled after 385 * the given time. 386 */ pauseForLocked(int pauseMillis, String reason)387 public void pauseForLocked(int pauseMillis, String reason) { 388 mPauseEndTimeMillis = mClock.millis() + pauseMillis; 389 // Mark as completed, because there's a chance we called this after the watchog 390 // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we 391 // want to ensure the next call to #getCompletionStateLocked for this checker returns 392 // 'COMPLETED' 393 mCompleted = true; 394 Slog.i(TAG, "Pausing of HandlerChecker: " + mName + " for reason: " 395 + reason + ". Pause end time: " + mPauseEndTimeMillis); 396 } 397 398 /** Pause the HandlerChecker. */ pauseLocked(String reason)399 public void pauseLocked(String reason) { 400 mPauseCount++; 401 // Mark as completed, because there's a chance we called this after the watchog 402 // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we 403 // want to ensure the next call to #getCompletionStateLocked for this checker returns 404 // 'COMPLETED' 405 mCompleted = true; 406 Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: " 407 + reason + ". Pause count: " + mPauseCount); 408 } 409 410 /** Resume the HandlerChecker from the last {@link #pauseLocked}. */ resumeLocked(String reason)411 public void resumeLocked(String reason) { 412 if (mPauseCount > 0) { 413 mPauseCount--; 414 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: " 415 + reason + ". Pause count: " + mPauseCount); 416 } else { 417 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName); 418 } 419 } 420 421 @Override toString()422 public String toString() { 423 return "CheckerHandler for " + mName; 424 } 425 } 426 427 final class RebootRequestReceiver extends BroadcastReceiver { 428 @Override onReceive(Context c, Intent intent)429 public void onReceive(Context c, Intent intent) { 430 if (intent.getIntExtra("nowait", 0) != 0) { 431 rebootSystem("Received ACTION_REBOOT broadcast"); 432 return; 433 } 434 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 435 } 436 } 437 438 /** Monitor for checking the availability of binder threads. The monitor will block until 439 * there is a binder thread available to process in coming IPCs to make sure other processes 440 * can still communicate with the service. 441 */ 442 private static final class BinderThreadMonitor implements Watchdog.Monitor { 443 @Override monitor()444 public void monitor() { 445 Binder.blockUntilThreadAvailable(); 446 } 447 } 448 449 public interface Monitor { monitor()450 void monitor(); 451 } 452 getInstance()453 public static Watchdog getInstance() { 454 if (sWatchdog == null) { 455 sWatchdog = new Watchdog(); 456 } 457 458 return sWatchdog; 459 } 460 Watchdog()461 private Watchdog() { 462 mThread = new Thread(this::run, "watchdog"); 463 464 // Initialize handler checkers for each common thread we want to check. Note 465 // that we are not currently checking the background thread, since it can 466 // potentially hold longer running operations with no guarantees about the timeliness 467 // of operations there. 468 // 469 // Use a custom thread to check monitors to avoid lock contention from impacted other 470 // threads. 471 ServiceThread t = new ServiceThread("watchdog.monitor", 472 android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/); 473 t.start(); 474 mMonitorChecker = new HandlerChecker(new Handler(t.getLooper()), "monitor thread", mLock); 475 mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker)); 476 477 mHandlerCheckers.add( 478 withDefaultTimeout( 479 new HandlerChecker(FgThread.getHandler(), "foreground thread", mLock))); 480 // Add checker for main thread. We only do a quick check since there 481 // can be UI running on the thread. 482 mHandlerCheckers.add( 483 withDefaultTimeout( 484 new HandlerChecker( 485 new Handler(Looper.getMainLooper()), "main thread", mLock))); 486 // Add checker for shared UI thread. 487 mHandlerCheckers.add( 488 withDefaultTimeout(new HandlerChecker(UiThread.getHandler(), "ui thread", mLock))); 489 // And also check IO thread. 490 mHandlerCheckers.add( 491 withDefaultTimeout(new HandlerChecker(IoThread.getHandler(), "i/o thread", mLock))); 492 // And the display thread. 493 mHandlerCheckers.add( 494 withDefaultTimeout( 495 new HandlerChecker(DisplayThread.getHandler(), "display thread", mLock))); 496 // And the animation thread. 497 mHandlerCheckers.add( 498 withDefaultTimeout( 499 new HandlerChecker( 500 AnimationThread.getHandler(), "animation thread", mLock))); 501 // And the surface animation thread. 502 mHandlerCheckers.add( 503 withDefaultTimeout( 504 new HandlerChecker( 505 SurfaceAnimationThread.getHandler(), 506 "surface animation thread", 507 mLock))); 508 // Initialize monitor for Binder threads. 509 addMonitor(new BinderThreadMonitor()); 510 511 mInterestingJavaPids.add(Process.myPid()); 512 513 // See the notes on DEFAULT_TIMEOUT. 514 assert DB || Build.IS_USERDEBUG || 515 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; 516 517 mTraceErrorLogger = new TraceErrorLogger(); 518 } 519 520 /** 521 * Called by SystemServer to cause the internal thread to begin execution. 522 */ start()523 public void start() { 524 mThread.start(); 525 } 526 527 /** 528 * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot. 529 * Should be called during boot after the ActivityManagerService is up and registered 530 * as a system service so it can handle registration of a {@link BroadcastReceiver}. 531 */ init(Context context, ActivityManagerService activity)532 public void init(Context context, ActivityManagerService activity) { 533 mActivity = activity; 534 context.registerReceiver(new RebootRequestReceiver(), 535 new IntentFilter(Intent.ACTION_REBOOT), 536 android.Manifest.permission.REBOOT, null); 537 } 538 539 private static class SettingsObserver extends ContentObserver { 540 private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS); 541 private final Context mContext; 542 private final Watchdog mWatchdog; 543 SettingsObserver(Context context, Watchdog watchdog)544 SettingsObserver(Context context, Watchdog watchdog) { 545 super(BackgroundThread.getHandler()); 546 mContext = context; 547 mWatchdog = watchdog; 548 // Always kick once to ensure that we match current state 549 onChange(); 550 } 551 552 @Override onChange(boolean selfChange, Uri uri, int userId)553 public void onChange(boolean selfChange, Uri uri, int userId) { 554 if (mUri.equals(uri)) { 555 onChange(); 556 } 557 } 558 onChange()559 public void onChange() { 560 try { 561 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong( 562 mContext.getContentResolver(), 563 Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT)); 564 } catch (RuntimeException e) { 565 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e); 566 } 567 } 568 } 569 570 /** 571 * Register an observer to listen to settings. 572 * 573 * It needs to be called after the settings service is initialized. 574 */ registerSettingsObserver(Context context)575 public void registerSettingsObserver(Context context) { 576 context.getContentResolver().registerContentObserver( 577 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS), 578 false, 579 new SettingsObserver(context, this), 580 UserHandle.USER_SYSTEM); 581 } 582 583 /** 584 * Updates watchdog timeout values. 585 */ updateWatchdogTimeout(long timeoutMillis)586 void updateWatchdogTimeout(long timeoutMillis) { 587 // See the notes on DEFAULT_TIMEOUT. 588 if (!DB && !Build.IS_USERDEBUG 589 && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) { 590 timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1; 591 } 592 mWatchdogTimeoutMillis = timeoutMillis; 593 Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis"); 594 } 595 isInterestingJavaProcess(String processName)596 private static boolean isInterestingJavaProcess(String processName) { 597 return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName) 598 || processName.equals("com.android.phone"); 599 } 600 601 /** 602 * Notifies the watchdog when a Java process with {@code pid} is started. 603 * This process may have its stack trace dumped during an ANR. 604 */ processStarted(String processName, int pid)605 public void processStarted(String processName, int pid) { 606 if (isInterestingJavaProcess(processName)) { 607 Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid); 608 synchronized (mLock) { 609 mInterestingJavaPids.add(pid); 610 } 611 } 612 } 613 614 /** 615 * Notifies the watchdog when a Java process with {@code pid} dies. 616 */ processDied(String processName, int pid)617 public void processDied(String processName, int pid) { 618 if (isInterestingJavaProcess(processName)) { 619 Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid); 620 synchronized (mLock) { 621 mInterestingJavaPids.remove(Integer.valueOf(pid)); 622 } 623 } 624 } 625 setActivityController(IActivityController controller)626 public void setActivityController(IActivityController controller) { 627 synchronized (mLock) { 628 mController = controller; 629 } 630 } 631 setAllowRestart(boolean allowRestart)632 public void setAllowRestart(boolean allowRestart) { 633 synchronized (mLock) { 634 mAllowRestart = allowRestart; 635 } 636 } 637 addMonitor(Monitor monitor)638 public void addMonitor(Monitor monitor) { 639 synchronized (mLock) { 640 mMonitorChecker.addMonitorLocked(monitor); 641 } 642 } 643 addThread(Handler thread)644 public void addThread(Handler thread) { 645 synchronized (mLock) { 646 final String name = thread.getLooper().getThread().getName(); 647 mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name, mLock))); 648 } 649 } 650 addThread(Handler thread, long timeoutMillis)651 public void addThread(Handler thread, long timeoutMillis) { 652 synchronized (mLock) { 653 final String name = thread.getLooper().getThread().getName(); 654 mHandlerCheckers.add( 655 withCustomTimeout(new HandlerChecker(thread, name, mLock), timeoutMillis)); 656 } 657 } 658 659 /** 660 * Pauses the checks of the watchdog for this thread. This is useful 661 * to run a slow operation on one of the monitored thread. 662 * 663 * <p>After the given time, the timeout will go back to the default value. 664 * <p>This method does not require resume to be called. 665 */ pauseWatchingCurrentThreadFor(int pauseMillis, String reason)666 public void pauseWatchingCurrentThreadFor(int pauseMillis, String reason) { 667 synchronized (mLock) { 668 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 669 HandlerChecker checker = hc.checker(); 670 if (Thread.currentThread().equals(checker.getThread())) { 671 checker.pauseForLocked(pauseMillis, reason); 672 } 673 } 674 } 675 } 676 677 /** 678 * Pauses the checks of the watchdog for the monitor thread for the given time 679 * 680 * <p>Similar to {@link pauseWatchingCurrentThreadFor} but used for monitors added 681 * through {@link #addMonitor} 682 * <p>This method does not require resume to be called. 683 */ pauseWatchingMonitorsFor(int pauseMillis, String reason)684 public void pauseWatchingMonitorsFor(int pauseMillis, String reason) { 685 mMonitorChecker.pauseForLocked(pauseMillis, reason); 686 } 687 688 /** 689 * Pauses Watchdog action for the currently running thread. Useful before executing long running 690 * operations that could falsely trigger the watchdog. Each call to this will require a matching 691 * call to {@link #resumeWatchingCurrentThread}. 692 * 693 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 694 * 695 * <p>If the Watchdog is already paused for the current thread, this call adds 696 * adds another pause and will require an additional {@link #resumeCurrentThread} to resume. 697 * 698 * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all 699 * pauses have been resumed. Prefer to use #pauseWatchingCurrentThreadFor. 700 */ pauseWatchingCurrentThread(String reason)701 public void pauseWatchingCurrentThread(String reason) { 702 synchronized (mLock) { 703 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 704 HandlerChecker checker = hc.checker(); 705 if (Thread.currentThread().equals(checker.getThread())) { 706 checker.pauseLocked(reason); 707 } 708 } 709 } 710 } 711 712 /** 713 * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running 714 * thread. 715 * 716 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 717 * 718 * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf. 719 * 720 * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise, 721 * the Watchdog action for the current thread remains paused until resume is called at least 722 * as many times as the calls to pause. 723 */ resumeWatchingCurrentThread(String reason)724 public void resumeWatchingCurrentThread(String reason) { 725 synchronized (mLock) { 726 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 727 HandlerChecker checker = hc.checker(); 728 if (Thread.currentThread().equals(checker.getThread())) { 729 checker.resumeLocked(reason); 730 } 731 } 732 } 733 } 734 735 /** 736 * Perform a full reboot of the system. 737 */ rebootSystem(String reason)738 void rebootSystem(String reason) { 739 Slog.i(TAG, "Rebooting system because: " + reason); 740 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 741 try { 742 pms.reboot(false, reason, false); 743 } catch (RemoteException ex) { 744 } 745 } 746 evaluateCheckerCompletionLocked()747 private int evaluateCheckerCompletionLocked() { 748 int state = COMPLETED; 749 for (int i=0; i<mHandlerCheckers.size(); i++) { 750 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 751 state = Math.max(state, hc.getCompletionStateLocked()); 752 } 753 return state; 754 } 755 getCheckersWithStateLocked(int completionState)756 private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) { 757 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 758 for (int i=0; i<mHandlerCheckers.size(); i++) { 759 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 760 if (hc.getCompletionStateLocked() == completionState) { 761 checkers.add(hc); 762 } 763 } 764 return checkers; 765 } 766 describeCheckersLocked(List<HandlerChecker> checkers)767 private String describeCheckersLocked(List<HandlerChecker> checkers) { 768 StringBuilder builder = new StringBuilder(128); 769 for (int i=0; i<checkers.size(); i++) { 770 if (builder.length() > 0) { 771 builder.append(", "); 772 } 773 builder.append(checkers.get(i).describeBlockedStateLocked()); 774 } 775 return builder.toString(); 776 } 777 addInterestingHidlPids(HashSet<Integer> pids)778 private static void addInterestingHidlPids(HashSet<Integer> pids) { 779 try { 780 IServiceManager serviceManager = IServiceManager.getService(); 781 ArrayList<IServiceManager.InstanceDebugInfo> dump = 782 serviceManager.debugDump(); 783 for (IServiceManager.InstanceDebugInfo info : dump) { 784 if (info.pid == IServiceManager.PidConstant.NO_PID) { 785 continue; 786 } 787 788 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 789 continue; 790 } 791 792 pids.add(info.pid); 793 } 794 } catch (RemoteException e) { 795 Log.w(TAG, e); 796 } 797 } 798 addInterestingAidlPids(HashSet<Integer> pids)799 private static void addInterestingAidlPids(HashSet<Integer> pids) { 800 ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo(); 801 if (infos == null) return; 802 803 for (ServiceDebugInfo info : infos) { 804 for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) { 805 if (info.name.startsWith(prefix)) { 806 pids.add(info.debugPid); 807 } 808 } 809 } 810 } 811 getInterestingNativePids()812 static ArrayList<Integer> getInterestingNativePids() { 813 HashSet<Integer> pids = new HashSet<>(); 814 addInterestingAidlPids(pids); 815 addInterestingHidlPids(pids); 816 817 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 818 if (nativePids != null) { 819 for (int i : nativePids) { 820 pids.add(i); 821 } 822 } 823 824 return new ArrayList<Integer>(pids); 825 } 826 run()827 private void run() { 828 boolean waitedHalf = false; 829 830 while (true) { 831 List<HandlerChecker> blockedCheckers = Collections.emptyList(); 832 String subject = ""; 833 boolean allowRestart = true; 834 int debuggerWasConnected = 0; 835 boolean doWaitedPreDump = false; 836 // The value of mWatchdogTimeoutMillis might change while we are executing the loop. 837 // We store the current value to use a consistent value for all handlers. 838 final long watchdogTimeoutMillis = mWatchdogTimeoutMillis; 839 final long checkIntervalMillis = watchdogTimeoutMillis / PRE_WATCHDOG_TIMEOUT_RATIO; 840 final ArrayList<Integer> pids; 841 synchronized (mLock) { 842 long timeout = checkIntervalMillis; 843 // Make sure we (re)spin the checkers that have become idle within 844 // this wait-and-check interval 845 for (int i=0; i<mHandlerCheckers.size(); i++) { 846 HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i); 847 // We pick the watchdog to apply every time we reschedule the checkers. The 848 // default timeout might have changed since the last run. 849 hc.checker().scheduleCheckLocked(hc.customTimeoutMillis() 850 .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER)); 851 } 852 853 if (debuggerWasConnected > 0) { 854 debuggerWasConnected--; 855 } 856 857 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 858 // wait while asleep. If the device is asleep then the thing that we are waiting 859 // to timeout on is asleep as well and won't have a chance to run, causing a false 860 // positive on when to kill things. 861 long start = SystemClock.uptimeMillis(); 862 while (timeout > 0) { 863 if (Debug.isDebuggerConnected()) { 864 debuggerWasConnected = 2; 865 } 866 try { 867 mLock.wait(timeout); 868 // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting 869 } catch (InterruptedException e) { 870 Log.wtf(TAG, e); 871 } 872 if (Debug.isDebuggerConnected()) { 873 debuggerWasConnected = 2; 874 } 875 timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start); 876 } 877 878 final int waitState = evaluateCheckerCompletionLocked(); 879 if (waitState == COMPLETED) { 880 // The monitors have returned; reset 881 waitedHalf = false; 882 continue; 883 } else if (waitState == WAITING) { 884 // still waiting but within their configured intervals; back off and recheck 885 continue; 886 } else if (waitState == WAITED_UNTIL_PRE_WATCHDOG) { 887 if (!waitedHalf) { 888 Slog.i(TAG, "WAITED_UNTIL_PRE_WATCHDOG"); 889 waitedHalf = true; 890 // We've waited until the pre-watchdog, but we'd need to do the stack trace 891 // dump w/o the lock. 892 blockedCheckers = getCheckersWithStateLocked(WAITED_UNTIL_PRE_WATCHDOG); 893 subject = describeCheckersLocked(blockedCheckers); 894 pids = new ArrayList<>(mInterestingJavaPids); 895 doWaitedPreDump = true; 896 } else { 897 continue; 898 } 899 } else { 900 // something is overdue! 901 blockedCheckers = getCheckersWithStateLocked(OVERDUE); 902 subject = describeCheckersLocked(blockedCheckers); 903 allowRestart = mAllowRestart; 904 pids = new ArrayList<>(mInterestingJavaPids); 905 } 906 } // END synchronized (mLock) 907 908 // If we got here, that means that the system is most likely hung. 909 // 910 // First collect stack traces from all threads of the system process. 911 // 912 // Then, if we reached the full timeout, kill this process so that the system will 913 // restart. If we reached pre-watchdog timeout, just log some information and continue. 914 logWatchog(doWaitedPreDump, subject, pids); 915 916 if (doWaitedPreDump) { 917 // We have waited for only pre-watchdog timeout, we continue to wait for the 918 // duration of the full timeout before killing the process. 919 continue; 920 } 921 922 IActivityController controller; 923 synchronized (mLock) { 924 controller = mController; 925 } 926 if (controller != null) { 927 Slog.i(TAG, "Reporting stuck state to activity controller"); 928 try { 929 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 930 // 1 = keep waiting, -1 = kill system 931 int res = controller.systemNotResponding(subject); 932 if (res >= 0) { 933 Slog.i(TAG, "Activity controller requested to coninue to wait"); 934 waitedHalf = false; 935 continue; 936 } 937 } catch (RemoteException e) { 938 } 939 } 940 941 // Only kill the process if the debugger is not attached. 942 if (Debug.isDebuggerConnected()) { 943 debuggerWasConnected = 2; 944 } 945 if (debuggerWasConnected >= 2) { 946 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 947 } else if (debuggerWasConnected > 0) { 948 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 949 } else if (!allowRestart) { 950 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 951 } else { 952 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 953 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); 954 Slog.w(TAG, "*** GOODBYE!"); 955 if (!Build.IS_USER && isCrashLoopFound() 956 && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) { 957 breakCrashLoop(); 958 } 959 Process.killProcess(Process.myPid()); 960 System.exit(10); 961 } 962 963 waitedHalf = false; 964 } 965 } 966 logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids)967 private void logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids) { 968 // Get critical event log before logging the pre-watchdog so that it doesn't 969 // occur in the log. 970 String criticalEvents = 971 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile(); 972 final UUID errorId = mTraceErrorLogger.generateErrorId(); 973 if (mTraceErrorLogger.isAddErrorIdEnabled()) { 974 mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(), 975 errorId); 976 mTraceErrorLogger.addSubjectToTrace(subject, errorId); 977 } 978 979 final String dropboxTag; 980 if (preWatchdog) { 981 dropboxTag = "pre_watchdog"; 982 CriticalEventLog.getInstance().logHalfWatchdog(subject); 983 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED); 984 } else { 985 dropboxTag = "watchdog"; 986 CriticalEventLog.getInstance().logWatchdog(subject, errorId); 987 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 988 // Log the atom as early as possible since it is used as a mechanism to trigger 989 // Perfetto. Ideally, the Perfetto trace capture should happen as close to the 990 // point in time when the Watchdog happens as possible. 991 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject); 992 } 993 994 long anrTime = SystemClock.uptimeMillis(); 995 StringBuilder report = new StringBuilder(); 996 report.append(ResourcePressureUtil.currentPsiState()); 997 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false); 998 StringWriter tracesFileException = new StringWriter(); 999 final File stack = StackTracesDumpHelper.dumpStackTraces( 1000 pids, processCpuTracker, new SparseBooleanArray(), 1001 CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException, 1002 subject, criticalEvents, Runnable::run, /* latencyTracker= */null); 1003 // Give some extra time to make sure the stack traces get written. 1004 // The system's been hanging for a whlie, another second or two won't hurt much. 1005 SystemClock.sleep(5000); 1006 processCpuTracker.update(); 1007 report.append(processCpuTracker.printCurrentState(anrTime, 10)); 1008 report.append(tracesFileException.getBuffer()); 1009 1010 if (!preWatchdog) { 1011 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the 1012 // kernel log 1013 doSysRq('w'); 1014 doSysRq('l'); 1015 } 1016 1017 // Try to add the error to the dropbox, but assuming that the ActivityManager 1018 // itself may be deadlocked. (which has happened, causing this statement to 1019 // deadlock and the watchdog as a whole to be ineffective) 1020 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 1021 public void run() { 1022 // If a watched thread hangs before init() is called, we don't have a 1023 // valid mActivity. So we can't log the error to dropbox. 1024 if (mActivity != null) { 1025 mActivity.addErrorToDropBox( 1026 dropboxTag, null, "system_server", null, null, null, 1027 null, report.toString(), stack, null, null, null, 1028 errorId, null); 1029 } 1030 } 1031 }; 1032 dropboxThread.start(); 1033 try { 1034 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 1035 } catch (InterruptedException ignored) { } 1036 } 1037 doSysRq(char c)1038 private void doSysRq(char c) { 1039 try { 1040 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 1041 sysrq_trigger.write(c); 1042 sysrq_trigger.close(); 1043 } catch (IOException e) { 1044 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 1045 } 1046 } 1047 resetTimeoutHistory()1048 private void resetTimeoutHistory() { 1049 writeTimeoutHistory(new ArrayList<String>()); 1050 } 1051 writeTimeoutHistory(Iterable<String> crashHistory)1052 private void writeTimeoutHistory(Iterable<String> crashHistory) { 1053 String data = String.join(",", crashHistory); 1054 1055 try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) { 1056 writer.write(SystemProperties.get("ro.boottime.zygote")); 1057 writer.write(":"); 1058 writer.write(data); 1059 } catch (IOException e) { 1060 Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e); 1061 } 1062 } 1063 readTimeoutHistory()1064 private String[] readTimeoutHistory() { 1065 final String[] emptyStringArray = {}; 1066 1067 try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) { 1068 String line = reader.readLine(); 1069 if (line == null) { 1070 return emptyStringArray; 1071 } 1072 1073 String[] data = line.trim().split(":"); 1074 String boottime = data.length >= 1 ? data[0] : ""; 1075 String history = data.length >= 2 ? data[1] : ""; 1076 if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) { 1077 return history.split(","); 1078 } else { 1079 return emptyStringArray; 1080 } 1081 } catch (FileNotFoundException e) { 1082 return emptyStringArray; 1083 } catch (IOException e) { 1084 Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e); 1085 return emptyStringArray; 1086 } 1087 } 1088 hasActiveUsbConnection()1089 private boolean hasActiveUsbConnection() { 1090 try { 1091 final String state = FileUtils.readTextFile( 1092 new File("/sys/class/android_usb/android0/state"), 1093 128 /*max*/, null /*ellipsis*/).trim(); 1094 if ("CONFIGURED".equals(state)) { 1095 return true; 1096 } 1097 } catch (IOException e) { 1098 Slog.w(TAG, "Failed to determine if device was on USB", e); 1099 } 1100 return false; 1101 } 1102 isCrashLoopFound()1103 private boolean isCrashLoopFound() { 1104 int fatalCount = WatchdogProperties.fatal_count().orElse(0); 1105 long fatalWindowMs = TimeUnit.SECONDS.toMillis( 1106 WatchdogProperties.fatal_window_seconds().orElse(0)); 1107 if (fatalCount == 0 || fatalWindowMs == 0) { 1108 if (fatalCount != fatalWindowMs) { 1109 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together", 1110 PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS)); 1111 } 1112 return false; 1113 } 1114 1115 // new-history = [last (fatalCount - 1) items in old-history] + [nowMs]. 1116 long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep. 1117 String[] rawCrashHistory = readTimeoutHistory(); 1118 ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange( 1119 rawCrashHistory, 1120 Math.max(0, rawCrashHistory.length - fatalCount - 1), 1121 rawCrashHistory.length))); 1122 // Something wrong here. 1123 crashHistory.add(String.valueOf(nowMs)); 1124 writeTimeoutHistory(crashHistory); 1125 1126 // Returns false if the device has an active USB connection. 1127 if (hasActiveUsbConnection()) { 1128 return false; 1129 } 1130 1131 long firstCrashMs; 1132 try { 1133 firstCrashMs = Long.parseLong(crashHistory.get(0)); 1134 } catch (NumberFormatException t) { 1135 Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t); 1136 resetTimeoutHistory(); 1137 return false; 1138 } 1139 return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs; 1140 } 1141 breakCrashLoop()1142 private void breakCrashLoop() { 1143 try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) { 1144 kmsg.append("Fatal reset to escape the system_server crashing loop\n"); 1145 } catch (IOException e) { 1146 Slog.w(TAG, "Failed to append to kmsg", e); 1147 } 1148 doSysRq('c'); 1149 } 1150 1151 @Override dump(@onNull PrintWriter pw, @Nullable String[] args)1152 public void dump(@NonNull PrintWriter pw, @Nullable String[] args) { 1153 pw.print("WatchdogTimeoutMillis="); 1154 pw.println(mWatchdogTimeoutMillis); 1155 } 1156 } 1157