/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.server;

import android.app.IActivityController;
import android.content.BroadcastReceiver;
import android.content.Context;
import android.content.Intent;
import android.content.IntentFilter;
import android.hidl.manager.V1_0.IServiceManager;
import android.os.Binder;
import android.os.Build;
import android.os.Debug;
import android.os.Handler;
import android.os.IPowerManager;
import android.os.Looper;
import android.os.Process;
import android.os.RemoteException;
import android.os.ServiceManager;
import android.os.SystemClock;
import android.system.ErrnoException;
import android.system.Os;
import android.system.OsConstants;
import android.system.StructRlimit;
import android.util.EventLog;
import android.util.Log;
import android.util.Slog;
import android.util.SparseArray;

import com.android.internal.os.ProcessCpuTracker;
import com.android.internal.os.ZygoteConnectionConstants;
import com.android.internal.util.FrameworkStatsLog;
import com.android.server.am.ActivityManagerService;
import com.android.server.wm.SurfaceAnimationThread;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path; 57 import java.nio.file.Paths; 58 import java.util.ArrayList; 59 import java.util.Arrays; 60 import java.util.Collections; 61 import java.util.HashSet; 62 import java.util.List; 63 64 /** This class calls its monitor every minute. Killing this process if they don't return **/ 65 public class Watchdog extends Thread { 66 static final String TAG = "Watchdog"; 67 68 /** Debug flag. */ 69 public static final boolean DEBUG = false; 70 71 // Set this to true to use debug default values. 72 private static final boolean DB = false; 73 74 // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with 75 // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications 76 // can trigger the watchdog. 77 // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped 78 // applications may not work with a debug build. CTS will fail. 79 private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000; 80 private static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 81 82 // These are temporally ordered: larger values as lateness increases 83 private static final int COMPLETED = 0; 84 private static final int WAITING = 1; 85 private static final int WAITED_HALF = 2; 86 private static final int OVERDUE = 3; 87 88 // Which native processes to dump into dropbox's stack traces 89 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 90 "/system/bin/audioserver", 91 "/system/bin/cameraserver", 92 "/system/bin/drmserver", 93 "/system/bin/mediadrmserver", 94 "/system/bin/mediaserver", 95 "/system/bin/netd", 96 "/system/bin/sdcard", 97 "/system/bin/surfaceflinger", 98 "/system/bin/vold", 99 "media.extractor", // system/bin/mediaextractor 100 "media.metrics", // system/bin/mediametrics 101 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 102 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec 103 "com.android.bluetooth", // 
Bluetooth service 104 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon 105 }; 106 107 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 108 "android.hardware.audio@2.0::IDevicesFactory", 109 "android.hardware.audio@4.0::IDevicesFactory", 110 "android.hardware.audio@5.0::IDevicesFactory", 111 "android.hardware.audio@6.0::IDevicesFactory", 112 "android.hardware.biometrics.face@1.0::IBiometricsFace", 113 "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint", 114 "android.hardware.bluetooth@1.0::IBluetoothHci", 115 "android.hardware.camera.provider@2.4::ICameraProvider", 116 "android.hardware.gnss@1.0::IGnss", 117 "android.hardware.graphics.allocator@2.0::IAllocator", 118 "android.hardware.graphics.composer@2.1::IComposer", 119 "android.hardware.health@2.0::IHealth", 120 "android.hardware.light@2.0::ILight", 121 "android.hardware.media.c2@1.0::IComponentStore", 122 "android.hardware.media.omx@1.0::IOmx", 123 "android.hardware.media.omx@1.0::IOmxStore", 124 "android.hardware.neuralnetworks@1.0::IDevice", 125 "android.hardware.power.stats@1.0::IPowerStats", 126 "android.hardware.sensors@1.0::ISensors", 127 "android.hardware.sensors@2.0::ISensors", 128 "android.hardware.sensors@2.1::ISensors", 129 "android.hardware.vr@1.0::IVr", 130 "android.system.suspend@1.0::ISystemSuspend" 131 ); 132 133 private static Watchdog sWatchdog; 134 135 /* This handler will be used to post message back onto the main thread */ 136 private final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); 137 private final HandlerChecker mMonitorChecker; 138 private ActivityManagerService mActivity; 139 140 private IActivityController mController; 141 private boolean mAllowRestart = true; 142 private final OpenFdMonitor mOpenFdMonitor; 143 private final List<Integer> mInterestingJavaPids = new ArrayList<>(); 144 145 /** 146 * Used for checking status of handle threads and scheduling monitor callbacks. 
147 */ 148 public final class HandlerChecker implements Runnable { 149 private final Handler mHandler; 150 private final String mName; 151 private final long mWaitMax; 152 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 153 private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>(); 154 private boolean mCompleted; 155 private Monitor mCurrentMonitor; 156 private long mStartTime; 157 private int mPauseCount; 158 HandlerChecker(Handler handler, String name, long waitMaxMillis)159 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 160 mHandler = handler; 161 mName = name; 162 mWaitMax = waitMaxMillis; 163 mCompleted = true; 164 } 165 addMonitorLocked(Monitor monitor)166 void addMonitorLocked(Monitor monitor) { 167 // We don't want to update mMonitors when the Handler is in the middle of checking 168 // all monitors. We will update mMonitors on the next schedule if it is safe 169 mMonitorQueue.add(monitor); 170 } 171 scheduleCheckLocked()172 public void scheduleCheckLocked() { 173 if (mCompleted) { 174 // Safe to update monitors in queue, Handler is not in the middle of work 175 mMonitors.addAll(mMonitorQueue); 176 mMonitorQueue.clear(); 177 } 178 if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) 179 || (mPauseCount > 0)) { 180 // Don't schedule until after resume OR 181 // If the target looper has recently been polling, then 182 // there is no reason to enqueue our checker on it since that 183 // is as good as it not being deadlocked. This avoid having 184 // to do a context switch to check the thread. Note that we 185 // only do this if we have no monitors since those would need to 186 // be executed at this point. 
187 mCompleted = true; 188 return; 189 } 190 if (!mCompleted) { 191 // we already have a check in flight, so no need 192 return; 193 } 194 195 mCompleted = false; 196 mCurrentMonitor = null; 197 mStartTime = SystemClock.uptimeMillis(); 198 mHandler.postAtFrontOfQueue(this); 199 } 200 isOverdueLocked()201 boolean isOverdueLocked() { 202 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 203 } 204 getCompletionStateLocked()205 public int getCompletionStateLocked() { 206 if (mCompleted) { 207 return COMPLETED; 208 } else { 209 long latency = SystemClock.uptimeMillis() - mStartTime; 210 if (latency < mWaitMax/2) { 211 return WAITING; 212 } else if (latency < mWaitMax) { 213 return WAITED_HALF; 214 } 215 } 216 return OVERDUE; 217 } 218 getThread()219 public Thread getThread() { 220 return mHandler.getLooper().getThread(); 221 } 222 getName()223 public String getName() { 224 return mName; 225 } 226 describeBlockedStateLocked()227 String describeBlockedStateLocked() { 228 if (mCurrentMonitor == null) { 229 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 230 } else { 231 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 232 + " on " + mName + " (" + getThread().getName() + ")"; 233 } 234 } 235 236 @Override run()237 public void run() { 238 // Once we get here, we ensure that mMonitors does not change even if we call 239 // #addMonitorLocked because we first add the new monitors to mMonitorQueue and 240 // move them to mMonitors on the next schedule when mCompleted is true, at which 241 // point we have completed execution of this method. 242 final int size = mMonitors.size(); 243 for (int i = 0 ; i < size ; i++) { 244 synchronized (Watchdog.this) { 245 mCurrentMonitor = mMonitors.get(i); 246 } 247 mCurrentMonitor.monitor(); 248 } 249 250 synchronized (Watchdog.this) { 251 mCompleted = true; 252 mCurrentMonitor = null; 253 } 254 } 255 256 /** Pause the HandlerChecker. 
*/ pauseLocked(String reason)257 public void pauseLocked(String reason) { 258 mPauseCount++; 259 // Mark as completed, because there's a chance we called this after the watchog 260 // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure 261 // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED' 262 mCompleted = true; 263 Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: " 264 + reason + ". Pause count: " + mPauseCount); 265 } 266 267 /** Resume the HandlerChecker from the last {@link #pauseLocked}. */ resumeLocked(String reason)268 public void resumeLocked(String reason) { 269 if (mPauseCount > 0) { 270 mPauseCount--; 271 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: " 272 + reason + ". Pause count: " + mPauseCount); 273 } else { 274 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName); 275 } 276 } 277 } 278 279 final class RebootRequestReceiver extends BroadcastReceiver { 280 @Override onReceive(Context c, Intent intent)281 public void onReceive(Context c, Intent intent) { 282 if (intent.getIntExtra("nowait", 0) != 0) { 283 rebootSystem("Received ACTION_REBOOT broadcast"); 284 return; 285 } 286 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 287 } 288 } 289 290 /** Monitor for checking the availability of binder threads. The monitor will block until 291 * there is a binder thread available to process in coming IPCs to make sure other processes 292 * can still communicate with the service. 
293 */ 294 private static final class BinderThreadMonitor implements Watchdog.Monitor { 295 @Override monitor()296 public void monitor() { 297 Binder.blockUntilThreadAvailable(); 298 } 299 } 300 301 public interface Monitor { monitor()302 void monitor(); 303 } 304 getInstance()305 public static Watchdog getInstance() { 306 if (sWatchdog == null) { 307 sWatchdog = new Watchdog(); 308 } 309 310 return sWatchdog; 311 } 312 Watchdog()313 private Watchdog() { 314 super("watchdog"); 315 // Initialize handler checkers for each common thread we want to check. Note 316 // that we are not currently checking the background thread, since it can 317 // potentially hold longer running operations with no guarantees about the timeliness 318 // of operations there. 319 320 // The shared foreground thread is the main checker. It is where we 321 // will also dispatch monitor checks and do other work. 322 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 323 "foreground thread", DEFAULT_TIMEOUT); 324 mHandlerCheckers.add(mMonitorChecker); 325 // Add checker for main thread. We only do a quick check since there 326 // can be UI running on the thread. 327 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 328 "main thread", DEFAULT_TIMEOUT)); 329 // Add checker for shared UI thread. 330 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 331 "ui thread", DEFAULT_TIMEOUT)); 332 // And also check IO thread. 333 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 334 "i/o thread", DEFAULT_TIMEOUT)); 335 // And the display thread. 336 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 337 "display thread", DEFAULT_TIMEOUT)); 338 // And the animation thread. 339 mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(), 340 "animation thread", DEFAULT_TIMEOUT)); 341 // And the surface animation thread. 
342 mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(), 343 "surface animation thread", DEFAULT_TIMEOUT)); 344 345 // Initialize monitor for Binder threads. 346 addMonitor(new BinderThreadMonitor()); 347 348 mOpenFdMonitor = OpenFdMonitor.create(); 349 350 mInterestingJavaPids.add(Process.myPid()); 351 352 // See the notes on DEFAULT_TIMEOUT. 353 assert DB || 354 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; 355 } 356 357 /** 358 * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot. 359 * Should be called during boot after the ActivityManagerService is up and registered 360 * as a system service so it can handle registration of a {@link BroadcastReceiver}. 361 */ init(Context context, ActivityManagerService activity)362 public void init(Context context, ActivityManagerService activity) { 363 mActivity = activity; 364 context.registerReceiver(new RebootRequestReceiver(), 365 new IntentFilter(Intent.ACTION_REBOOT), 366 android.Manifest.permission.REBOOT, null); 367 } 368 isInterestingJavaProcess(String processName)369 private static boolean isInterestingJavaProcess(String processName) { 370 return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName) 371 || processName.equals("com.android.phone"); 372 } 373 374 /** 375 * Notifies the watchdog when a Java process with {@code pid} is started. 376 * This process may have its stack trace dumped during an ANR. 377 */ processStarted(String processName, int pid)378 public void processStarted(String processName, int pid) { 379 if (isInterestingJavaProcess(processName)) { 380 Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid); 381 synchronized (this) { 382 mInterestingJavaPids.add(pid); 383 } 384 } 385 } 386 387 /** 388 * Notifies the watchdog when a Java process with {@code pid} dies. 
389 */ processDied(String processName, int pid)390 public void processDied(String processName, int pid) { 391 if (isInterestingJavaProcess(processName)) { 392 Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid); 393 synchronized (this) { 394 mInterestingJavaPids.remove(Integer.valueOf(pid)); 395 } 396 } 397 } 398 setActivityController(IActivityController controller)399 public void setActivityController(IActivityController controller) { 400 synchronized (this) { 401 mController = controller; 402 } 403 } 404 setAllowRestart(boolean allowRestart)405 public void setAllowRestart(boolean allowRestart) { 406 synchronized (this) { 407 mAllowRestart = allowRestart; 408 } 409 } 410 addMonitor(Monitor monitor)411 public void addMonitor(Monitor monitor) { 412 synchronized (this) { 413 mMonitorChecker.addMonitorLocked(monitor); 414 } 415 } 416 addThread(Handler thread)417 public void addThread(Handler thread) { 418 addThread(thread, DEFAULT_TIMEOUT); 419 } 420 addThread(Handler thread, long timeoutMillis)421 public void addThread(Handler thread, long timeoutMillis) { 422 synchronized (this) { 423 final String name = thread.getLooper().getThread().getName(); 424 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 425 } 426 } 427 428 /** 429 * Pauses Watchdog action for the currently running thread. Useful before executing long running 430 * operations that could falsely trigger the watchdog. Each call to this will require a matching 431 * call to {@link #resumeWatchingCurrentThread}. 432 * 433 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 434 * 435 * <p>If the Watchdog is already paused for the current thread, this call adds 436 * adds another pause and will require an additional {@link #resumeCurrentThread} to resume. 437 * 438 * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all 439 * pauses have been resumed. 
440 */ pauseWatchingCurrentThread(String reason)441 public void pauseWatchingCurrentThread(String reason) { 442 synchronized (this) { 443 for (HandlerChecker hc : mHandlerCheckers) { 444 if (Thread.currentThread().equals(hc.getThread())) { 445 hc.pauseLocked(reason); 446 } 447 } 448 } 449 } 450 451 /** 452 * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running 453 * thread. 454 * 455 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 456 * 457 * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf. 458 * 459 * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise, 460 * the Watchdog action for the current thread remains paused until resume is called at least 461 * as many times as the calls to pause. 462 */ resumeWatchingCurrentThread(String reason)463 public void resumeWatchingCurrentThread(String reason) { 464 synchronized (this) { 465 for (HandlerChecker hc : mHandlerCheckers) { 466 if (Thread.currentThread().equals(hc.getThread())) { 467 hc.resumeLocked(reason); 468 } 469 } 470 } 471 } 472 473 /** 474 * Perform a full reboot of the system. 
475 */ rebootSystem(String reason)476 void rebootSystem(String reason) { 477 Slog.i(TAG, "Rebooting system because: " + reason); 478 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 479 try { 480 pms.reboot(false, reason, false); 481 } catch (RemoteException ex) { 482 } 483 } 484 evaluateCheckerCompletionLocked()485 private int evaluateCheckerCompletionLocked() { 486 int state = COMPLETED; 487 for (int i=0; i<mHandlerCheckers.size(); i++) { 488 HandlerChecker hc = mHandlerCheckers.get(i); 489 state = Math.max(state, hc.getCompletionStateLocked()); 490 } 491 return state; 492 } 493 getBlockedCheckersLocked()494 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 495 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 496 for (int i=0; i<mHandlerCheckers.size(); i++) { 497 HandlerChecker hc = mHandlerCheckers.get(i); 498 if (hc.isOverdueLocked()) { 499 checkers.add(hc); 500 } 501 } 502 return checkers; 503 } 504 describeCheckersLocked(List<HandlerChecker> checkers)505 private String describeCheckersLocked(List<HandlerChecker> checkers) { 506 StringBuilder builder = new StringBuilder(128); 507 for (int i=0; i<checkers.size(); i++) { 508 if (builder.length() > 0) { 509 builder.append(", "); 510 } 511 builder.append(checkers.get(i).describeBlockedStateLocked()); 512 } 513 return builder.toString(); 514 } 515 getInterestingHalPids()516 private static ArrayList<Integer> getInterestingHalPids() { 517 try { 518 IServiceManager serviceManager = IServiceManager.getService(); 519 ArrayList<IServiceManager.InstanceDebugInfo> dump = 520 serviceManager.debugDump(); 521 HashSet<Integer> pids = new HashSet<>(); 522 for (IServiceManager.InstanceDebugInfo info : dump) { 523 if (info.pid == IServiceManager.PidConstant.NO_PID) { 524 continue; 525 } 526 527 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 528 continue; 529 } 530 531 pids.add(info.pid); 532 } 533 return new ArrayList<Integer>(pids); 534 } 
catch (RemoteException e) { 535 return new ArrayList<Integer>(); 536 } 537 } 538 getInterestingNativePids()539 static ArrayList<Integer> getInterestingNativePids() { 540 ArrayList<Integer> pids = getInterestingHalPids(); 541 542 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 543 if (nativePids != null) { 544 pids.ensureCapacity(pids.size() + nativePids.length); 545 for (int i : nativePids) { 546 pids.add(i); 547 } 548 } 549 550 return pids; 551 } 552 553 @Override run()554 public void run() { 555 boolean waitedHalf = false; 556 while (true) { 557 final List<HandlerChecker> blockedCheckers; 558 final String subject; 559 final boolean allowRestart; 560 int debuggerWasConnected = 0; 561 synchronized (this) { 562 long timeout = CHECK_INTERVAL; 563 // Make sure we (re)spin the checkers that have become idle within 564 // this wait-and-check interval 565 for (int i=0; i<mHandlerCheckers.size(); i++) { 566 HandlerChecker hc = mHandlerCheckers.get(i); 567 hc.scheduleCheckLocked(); 568 } 569 570 if (debuggerWasConnected > 0) { 571 debuggerWasConnected--; 572 } 573 574 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 575 // wait while asleep. If the device is asleep then the thing that we are waiting 576 // to timeout on is asleep as well and won't have a chance to run, causing a false 577 // positive on when to kill things. 
578 long start = SystemClock.uptimeMillis(); 579 while (timeout > 0) { 580 if (Debug.isDebuggerConnected()) { 581 debuggerWasConnected = 2; 582 } 583 try { 584 wait(timeout); 585 // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting 586 } catch (InterruptedException e) { 587 Log.wtf(TAG, e); 588 } 589 if (Debug.isDebuggerConnected()) { 590 debuggerWasConnected = 2; 591 } 592 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 593 } 594 595 boolean fdLimitTriggered = false; 596 if (mOpenFdMonitor != null) { 597 fdLimitTriggered = mOpenFdMonitor.monitor(); 598 } 599 600 if (!fdLimitTriggered) { 601 final int waitState = evaluateCheckerCompletionLocked(); 602 if (waitState == COMPLETED) { 603 // The monitors have returned; reset 604 waitedHalf = false; 605 continue; 606 } else if (waitState == WAITING) { 607 // still waiting but within their configured intervals; back off and recheck 608 continue; 609 } else if (waitState == WAITED_HALF) { 610 if (!waitedHalf) { 611 Slog.i(TAG, "WAITED_HALF"); 612 // We've waited half the deadlock-detection interval. Pull a stack 613 // trace and wait another half. 614 ArrayList<Integer> pids = new ArrayList<>(mInterestingJavaPids); 615 ActivityManagerService.dumpStackTraces(pids, null, null, 616 getInterestingNativePids(), null); 617 waitedHalf = true; 618 } 619 continue; 620 } 621 622 // something is overdue! 623 blockedCheckers = getBlockedCheckersLocked(); 624 subject = describeCheckersLocked(blockedCheckers); 625 } else { 626 blockedCheckers = Collections.emptyList(); 627 subject = "Open FD high water mark reached"; 628 } 629 allowRestart = mAllowRestart; 630 } 631 632 // If we got here, that means that the system is most likely hung. 633 // First collect stack traces from all threads of the system process. 634 // Then kill this process so that the system will restart. 
635 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 636 637 ArrayList<Integer> pids = new ArrayList<>(mInterestingJavaPids); 638 639 long anrTime = SystemClock.uptimeMillis(); 640 StringBuilder report = new StringBuilder(); 641 report.append(MemoryPressureUtil.currentPsiState()); 642 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false); 643 StringWriter tracesFileException = new StringWriter(); 644 final File stack = ActivityManagerService.dumpStackTraces( 645 pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(), 646 tracesFileException); 647 648 // Give some extra time to make sure the stack traces get written. 649 // The system's been hanging for a minute, another second or two won't hurt much. 650 SystemClock.sleep(5000); 651 652 processCpuTracker.update(); 653 report.append(processCpuTracker.printCurrentState(anrTime)); 654 report.append(tracesFileException.getBuffer()); 655 656 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 657 doSysRq('w'); 658 doSysRq('l'); 659 660 // Try to add the error to the dropbox, but assuming that the ActivityManager 661 // itself may be deadlocked. (which has happened, causing this statement to 662 // deadlock and the watchdog as a whole to be ineffective) 663 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 664 public void run() { 665 // If a watched thread hangs before init() is called, we don't have a 666 // valid mActivity. So we can't log the error to dropbox. 667 if (mActivity != null) { 668 mActivity.addErrorToDropBox( 669 "watchdog", null, "system_server", null, null, null, 670 subject, report.toString(), stack, null); 671 } 672 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, 673 subject); 674 } 675 }; 676 dropboxThread.start(); 677 try { 678 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 
679 } catch (InterruptedException ignored) {} 680 681 IActivityController controller; 682 synchronized (this) { 683 controller = mController; 684 } 685 if (controller != null) { 686 Slog.i(TAG, "Reporting stuck state to activity controller"); 687 try { 688 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 689 // 1 = keep waiting, -1 = kill system 690 int res = controller.systemNotResponding(subject); 691 if (res >= 0) { 692 Slog.i(TAG, "Activity controller requested to coninue to wait"); 693 waitedHalf = false; 694 continue; 695 } 696 } catch (RemoteException e) { 697 } 698 } 699 700 // Only kill the process if the debugger is not attached. 701 if (Debug.isDebuggerConnected()) { 702 debuggerWasConnected = 2; 703 } 704 if (debuggerWasConnected >= 2) { 705 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 706 } else if (debuggerWasConnected > 0) { 707 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 708 } else if (!allowRestart) { 709 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 710 } else { 711 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 712 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); 713 Slog.w(TAG, "*** GOODBYE!"); 714 Process.killProcess(Process.myPid()); 715 System.exit(10); 716 } 717 718 waitedHalf = false; 719 } 720 } 721 doSysRq(char c)722 private void doSysRq(char c) { 723 try { 724 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 725 sysrq_trigger.write(c); 726 sysrq_trigger.close(); 727 } catch (IOException e) { 728 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 729 } 730 } 731 732 public static final class OpenFdMonitor { 733 /** 734 * Number of FDs below the soft limit that we trigger a runtime restart at. This was 735 * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number 736 * of FDs in reserve to complete a dump. 
737 */ 738 private static final int FD_HIGH_WATER_MARK = 12; 739 740 private final File mDumpDir; 741 private final File mFdHighWaterMark; 742 create()743 public static OpenFdMonitor create() { 744 // Only run the FD monitor on debuggable builds (such as userdebug and eng builds). 745 if (!Build.IS_DEBUGGABLE) { 746 return null; 747 } 748 749 final StructRlimit rlimit; 750 try { 751 rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE); 752 } catch (ErrnoException errno) { 753 Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno); 754 return null; 755 } 756 757 // The assumption we're making here is that FD numbers are allocated (more or less) 758 // sequentially, which is currently (and historically) true since open is currently 759 // specified to always return the lowest-numbered non-open file descriptor for the 760 // current process. 761 // 762 // We do this to avoid having to enumerate the contents of /proc/self/fd in order to 763 // count the number of descriptors open in the process. 764 final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK)); 765 return new OpenFdMonitor(new File("/data/anr"), fdThreshold); 766 } 767 OpenFdMonitor(File dumpDir, File fdThreshold)768 OpenFdMonitor(File dumpDir, File fdThreshold) { 769 mDumpDir = dumpDir; 770 mFdHighWaterMark = fdThreshold; 771 } 772 773 /** 774 * Dumps open file descriptors and their full paths to a temporary file in {@code mDumpDir}. 775 */ dumpOpenDescriptors()776 private void dumpOpenDescriptors() { 777 // We cannot exec lsof to get more info about open file descriptors because a newly 778 // forked process will not have the permissions to readlink. Instead list all open 779 // descriptors from /proc/pid/fd and resolve them. 
780 List<String> dumpInfo = new ArrayList<>(); 781 String fdDirPath = String.format("/proc/%d/fd/", Process.myPid()); 782 File[] fds = new File(fdDirPath).listFiles(); 783 if (fds == null) { 784 dumpInfo.add("Unable to list " + fdDirPath); 785 } else { 786 for (File f : fds) { 787 String fdSymLink = f.getAbsolutePath(); 788 String resolvedPath = ""; 789 try { 790 resolvedPath = Os.readlink(fdSymLink); 791 } catch (ErrnoException ex) { 792 resolvedPath = ex.getMessage(); 793 } 794 dumpInfo.add(fdSymLink + "\t" + resolvedPath); 795 } 796 } 797 798 // Dump the fds & paths to a temp file. 799 try { 800 File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir); 801 Path out = Paths.get(dumpFile.getAbsolutePath()); 802 Files.write(out, dumpInfo, StandardCharsets.UTF_8); 803 } catch (IOException ex) { 804 Slog.w(TAG, "Unable to write open descriptors to file: " + ex); 805 } 806 } 807 808 /** 809 * @return {@code true} if the high water mark was breached and a dump was written, 810 * {@code false} otherwise. 811 */ monitor()812 public boolean monitor() { 813 if (mFdHighWaterMark.exists()) { 814 dumpOpenDescriptors(); 815 return true; 816 } 817 818 return false; 819 } 820 } 821 } 822