1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import android.app.IActivityController; 20 import android.os.Binder; 21 import android.os.RemoteException; 22 import com.android.server.am.ActivityManagerService; 23 24 import android.content.BroadcastReceiver; 25 import android.content.ContentResolver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.hidl.manager.V1_0.IServiceManager; 30 import android.os.Debug; 31 import android.os.Handler; 32 import android.os.IPowerManager; 33 import android.os.Looper; 34 import android.os.Process; 35 import android.os.ServiceManager; 36 import android.os.SystemClock; 37 import android.os.SystemProperties; 38 import android.util.EventLog; 39 import android.util.Log; 40 import android.util.Slog; 41 42 import java.io.File; 43 import java.io.FileWriter; 44 import java.io.IOException; 45 import java.util.ArrayList; 46 import java.util.Arrays; 47 import java.util.HashSet; 48 import java.util.List; 49 50 /** This class calls its monitor every minute. Killing this process if they don't return **/ 51 public class Watchdog extends Thread { 52 static final String TAG = "Watchdog"; 53 54 // Set this to true to use debug default values. 55 static final boolean DB = false; 56 57 // Set this to true to have the watchdog record kernel thread stacks when it fires 58 static final boolean RECORD_KERNEL_THREADS = true; 59 60 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 61 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 62 63 // These are temporally ordered: larger values as lateness increases 64 static final int COMPLETED = 0; 65 static final int WAITING = 1; 66 static final int WAITED_HALF = 2; 67 static final int OVERDUE = 3; 68 69 // Which native processes to dump into dropbox's stack traces 70 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 71 "/system/bin/audioserver", 72 "/system/bin/cameraserver", 73 "/system/bin/drmserver", 74 "/system/bin/mediadrmserver", 75 "/system/bin/mediaserver", 76 "/system/bin/sdcard", 77 "/system/bin/surfaceflinger", 78 "media.extractor", // system/bin/mediaextractor 79 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 80 "com.android.bluetooth", // Bluetooth service 81 }; 82 83 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 84 "android.hardware.audio@2.0::IDevicesFactory", 85 "android.hardware.bluetooth@1.0::IBluetoothHci", 86 "android.hardware.camera.provider@2.4::ICameraProvider", 87 "android.hardware.graphics.composer@2.1::IComposer", 88 "android.hardware.vr@1.0::IVr", 89 "android.hardware.media.omx@1.0::IOmx" 90 ); 91 92 static Watchdog sWatchdog; 93 94 /* This handler will be used to post message back onto the main thread */ 95 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); 96 final HandlerChecker mMonitorChecker; 97 ContentResolver mResolver; 98 ActivityManagerService mActivity; 99 100 int mPhonePid; 101 IActivityController mController; 102 boolean mAllowRestart = true; 103 104 /** 105 * Used for checking status of handle threads and scheduling monitor callbacks. 106 */ 107 public final class HandlerChecker implements Runnable { 108 private final Handler mHandler; 109 private final String mName; 110 private final long mWaitMax; 111 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 112 private boolean mCompleted; 113 private Monitor mCurrentMonitor; 114 private long mStartTime; 115 HandlerChecker(Handler handler, String name, long waitMaxMillis)116 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 117 mHandler = handler; 118 mName = name; 119 mWaitMax = waitMaxMillis; 120 mCompleted = true; 121 } 122 addMonitor(Monitor monitor)123 public void addMonitor(Monitor monitor) { 124 mMonitors.add(monitor); 125 } 126 scheduleCheckLocked()127 public void scheduleCheckLocked() { 128 if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { 129 // If the target looper has recently been polling, then 130 // there is no reason to enqueue our checker on it since that 131 // is as good as it not being deadlocked. This avoid having 132 // to do a context switch to check the thread. Note that we 133 // only do this if mCheckReboot is false and we have no 134 // monitors, since those would need to be executed at this point. 135 mCompleted = true; 136 return; 137 } 138 139 if (!mCompleted) { 140 // we already have a check in flight, so no need 141 return; 142 } 143 144 mCompleted = false; 145 mCurrentMonitor = null; 146 mStartTime = SystemClock.uptimeMillis(); 147 mHandler.postAtFrontOfQueue(this); 148 } 149 isOverdueLocked()150 public boolean isOverdueLocked() { 151 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 152 } 153 getCompletionStateLocked()154 public int getCompletionStateLocked() { 155 if (mCompleted) { 156 return COMPLETED; 157 } else { 158 long latency = SystemClock.uptimeMillis() - mStartTime; 159 if (latency < mWaitMax/2) { 160 return WAITING; 161 } else if (latency < mWaitMax) { 162 return WAITED_HALF; 163 } 164 } 165 return OVERDUE; 166 } 167 getThread()168 public Thread getThread() { 169 return mHandler.getLooper().getThread(); 170 } 171 getName()172 public String getName() { 173 return mName; 174 } 175 describeBlockedStateLocked()176 public String describeBlockedStateLocked() { 177 if (mCurrentMonitor == null) { 178 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 179 } else { 180 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 181 + " on " + mName + " (" + getThread().getName() + ")"; 182 } 183 } 184 185 @Override run()186 public void run() { 187 final int size = mMonitors.size(); 188 for (int i = 0 ; i < size ; i++) { 189 synchronized (Watchdog.this) { 190 mCurrentMonitor = mMonitors.get(i); 191 } 192 mCurrentMonitor.monitor(); 193 } 194 195 synchronized (Watchdog.this) { 196 mCompleted = true; 197 mCurrentMonitor = null; 198 } 199 } 200 } 201 202 final class RebootRequestReceiver extends BroadcastReceiver { 203 @Override onReceive(Context c, Intent intent)204 public void onReceive(Context c, Intent intent) { 205 if (intent.getIntExtra("nowait", 0) != 0) { 206 rebootSystem("Received ACTION_REBOOT broadcast"); 207 return; 208 } 209 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 210 } 211 } 212 213 /** Monitor for checking the availability of binder threads. The monitor will block until 214 * there is a binder thread available to process in coming IPCs to make sure other processes 215 * can still communicate with the service. 216 */ 217 private static final class BinderThreadMonitor implements Watchdog.Monitor { 218 @Override monitor()219 public void monitor() { 220 Binder.blockUntilThreadAvailable(); 221 } 222 } 223 224 public interface Monitor { monitor()225 void monitor(); 226 } 227 getInstance()228 public static Watchdog getInstance() { 229 if (sWatchdog == null) { 230 sWatchdog = new Watchdog(); 231 } 232 233 return sWatchdog; 234 } 235 Watchdog()236 private Watchdog() { 237 super("watchdog"); 238 // Initialize handler checkers for each common thread we want to check. Note 239 // that we are not currently checking the background thread, since it can 240 // potentially hold longer running operations with no guarantees about the timeliness 241 // of operations there. 242 243 // The shared foreground thread is the main checker. It is where we 244 // will also dispatch monitor checks and do other work. 245 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 246 "foreground thread", DEFAULT_TIMEOUT); 247 mHandlerCheckers.add(mMonitorChecker); 248 // Add checker for main thread. We only do a quick check since there 249 // can be UI running on the thread. 250 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 251 "main thread", DEFAULT_TIMEOUT)); 252 // Add checker for shared UI thread. 253 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 254 "ui thread", DEFAULT_TIMEOUT)); 255 // And also check IO thread. 256 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 257 "i/o thread", DEFAULT_TIMEOUT)); 258 // And the display thread. 259 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 260 "display thread", DEFAULT_TIMEOUT)); 261 262 // Initialize monitor for Binder threads. 263 addMonitor(new BinderThreadMonitor()); 264 } 265 init(Context context, ActivityManagerService activity)266 public void init(Context context, ActivityManagerService activity) { 267 mResolver = context.getContentResolver(); 268 mActivity = activity; 269 270 context.registerReceiver(new RebootRequestReceiver(), 271 new IntentFilter(Intent.ACTION_REBOOT), 272 android.Manifest.permission.REBOOT, null); 273 } 274 processStarted(String name, int pid)275 public void processStarted(String name, int pid) { 276 synchronized (this) { 277 if ("com.android.phone".equals(name)) { 278 mPhonePid = pid; 279 } 280 } 281 } 282 setActivityController(IActivityController controller)283 public void setActivityController(IActivityController controller) { 284 synchronized (this) { 285 mController = controller; 286 } 287 } 288 setAllowRestart(boolean allowRestart)289 public void setAllowRestart(boolean allowRestart) { 290 synchronized (this) { 291 mAllowRestart = allowRestart; 292 } 293 } 294 addMonitor(Monitor monitor)295 public void addMonitor(Monitor monitor) { 296 synchronized (this) { 297 if (isAlive()) { 298 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 299 } 300 mMonitorChecker.addMonitor(monitor); 301 } 302 } 303 addThread(Handler thread)304 public void addThread(Handler thread) { 305 addThread(thread, DEFAULT_TIMEOUT); 306 } 307 addThread(Handler thread, long timeoutMillis)308 public void addThread(Handler thread, long timeoutMillis) { 309 synchronized (this) { 310 if (isAlive()) { 311 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 312 } 313 final String name = thread.getLooper().getThread().getName(); 314 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 315 } 316 } 317 318 /** 319 * Perform a full reboot of the system. 320 */ rebootSystem(String reason)321 void rebootSystem(String reason) { 322 Slog.i(TAG, "Rebooting system because: " + reason); 323 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 324 try { 325 pms.reboot(false, reason, false); 326 } catch (RemoteException ex) { 327 } 328 } 329 evaluateCheckerCompletionLocked()330 private int evaluateCheckerCompletionLocked() { 331 int state = COMPLETED; 332 for (int i=0; i<mHandlerCheckers.size(); i++) { 333 HandlerChecker hc = mHandlerCheckers.get(i); 334 state = Math.max(state, hc.getCompletionStateLocked()); 335 } 336 return state; 337 } 338 getBlockedCheckersLocked()339 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 340 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 341 for (int i=0; i<mHandlerCheckers.size(); i++) { 342 HandlerChecker hc = mHandlerCheckers.get(i); 343 if (hc.isOverdueLocked()) { 344 checkers.add(hc); 345 } 346 } 347 return checkers; 348 } 349 describeCheckersLocked(ArrayList<HandlerChecker> checkers)350 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 351 StringBuilder builder = new StringBuilder(128); 352 for (int i=0; i<checkers.size(); i++) { 353 if (builder.length() > 0) { 354 builder.append(", "); 355 } 356 builder.append(checkers.get(i).describeBlockedStateLocked()); 357 } 358 return builder.toString(); 359 } 360 getInterestingHalPids()361 private ArrayList<Integer> getInterestingHalPids() { 362 try { 363 IServiceManager serviceManager = IServiceManager.getService(); 364 ArrayList<IServiceManager.InstanceDebugInfo> dump = 365 serviceManager.debugDump(); 366 HashSet<Integer> pids = new HashSet<>(); 367 for (IServiceManager.InstanceDebugInfo info : dump) { 368 if (info.pid == IServiceManager.PidConstant.NO_PID) { 369 continue; 370 } 371 372 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 373 continue; 374 } 375 376 pids.add(info.pid); 377 } 378 return new ArrayList<Integer>(pids); 379 } catch (RemoteException e) { 380 return new ArrayList<Integer>(); 381 } 382 } 383 getInterestingNativePids()384 private ArrayList<Integer> getInterestingNativePids() { 385 ArrayList<Integer> pids = getInterestingHalPids(); 386 387 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 388 if (nativePids != null) { 389 pids.ensureCapacity(pids.size() + nativePids.length); 390 for (int i : nativePids) { 391 pids.add(i); 392 } 393 } 394 395 return pids; 396 } 397 398 @Override run()399 public void run() { 400 boolean waitedHalf = false; 401 while (true) { 402 final ArrayList<HandlerChecker> blockedCheckers; 403 final String subject; 404 final boolean allowRestart; 405 int debuggerWasConnected = 0; 406 synchronized (this) { 407 long timeout = CHECK_INTERVAL; 408 // Make sure we (re)spin the checkers that have become idle within 409 // this wait-and-check interval 410 for (int i=0; i<mHandlerCheckers.size(); i++) { 411 HandlerChecker hc = mHandlerCheckers.get(i); 412 hc.scheduleCheckLocked(); 413 } 414 415 if (debuggerWasConnected > 0) { 416 debuggerWasConnected--; 417 } 418 419 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 420 // wait while asleep. If the device is asleep then the thing that we are waiting 421 // to timeout on is asleep as well and won't have a chance to run, causing a false 422 // positive on when to kill things. 423 long start = SystemClock.uptimeMillis(); 424 while (timeout > 0) { 425 if (Debug.isDebuggerConnected()) { 426 debuggerWasConnected = 2; 427 } 428 try { 429 wait(timeout); 430 } catch (InterruptedException e) { 431 Log.wtf(TAG, e); 432 } 433 if (Debug.isDebuggerConnected()) { 434 debuggerWasConnected = 2; 435 } 436 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 437 } 438 439 final int waitState = evaluateCheckerCompletionLocked(); 440 if (waitState == COMPLETED) { 441 // The monitors have returned; reset 442 waitedHalf = false; 443 continue; 444 } else if (waitState == WAITING) { 445 // still waiting but within their configured intervals; back off and recheck 446 continue; 447 } else if (waitState == WAITED_HALF) { 448 if (!waitedHalf) { 449 // We've waited half the deadlock-detection interval. Pull a stack 450 // trace and wait another half. 451 ArrayList<Integer> pids = new ArrayList<Integer>(); 452 pids.add(Process.myPid()); 453 ActivityManagerService.dumpStackTraces(true, pids, null, null, 454 getInterestingNativePids()); 455 waitedHalf = true; 456 } 457 continue; 458 } 459 460 // something is overdue! 461 blockedCheckers = getBlockedCheckersLocked(); 462 subject = describeCheckersLocked(blockedCheckers); 463 allowRestart = mAllowRestart; 464 } 465 466 // If we got here, that means that the system is most likely hung. 467 // First collect stack traces from all threads of the system process. 468 // Then kill this process so that the system will restart. 469 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 470 471 ArrayList<Integer> pids = new ArrayList<>(); 472 pids.add(Process.myPid()); 473 if (mPhonePid > 0) pids.add(mPhonePid); 474 // Pass !waitedHalf so that just in case we somehow wind up here without having 475 // dumped the halfway stacks, we properly re-initialize the trace file. 476 final File stack = ActivityManagerService.dumpStackTraces( 477 !waitedHalf, pids, null, null, getInterestingNativePids()); 478 479 // Give some extra time to make sure the stack traces get written. 480 // The system's been hanging for a minute, another second or two won't hurt much. 481 SystemClock.sleep(2000); 482 483 // Pull our own kernel thread stacks as well if we're configured for that 484 if (RECORD_KERNEL_THREADS) { 485 dumpKernelStackTraces(); 486 } 487 488 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 489 doSysRq('w'); 490 doSysRq('l'); 491 492 // Try to add the error to the dropbox, but assuming that the ActivityManager 493 // itself may be deadlocked. (which has happened, causing this statement to 494 // deadlock and the watchdog as a whole to be ineffective) 495 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 496 public void run() { 497 mActivity.addErrorToDropBox( 498 "watchdog", null, "system_server", null, null, 499 subject, null, stack, null); 500 } 501 }; 502 dropboxThread.start(); 503 try { 504 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 505 } catch (InterruptedException ignored) {} 506 507 IActivityController controller; 508 synchronized (this) { 509 controller = mController; 510 } 511 if (controller != null) { 512 Slog.i(TAG, "Reporting stuck state to activity controller"); 513 try { 514 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 515 // 1 = keep waiting, -1 = kill system 516 int res = controller.systemNotResponding(subject); 517 if (res >= 0) { 518 Slog.i(TAG, "Activity controller requested to coninue to wait"); 519 waitedHalf = false; 520 continue; 521 } 522 } catch (RemoteException e) { 523 } 524 } 525 526 // Only kill the process if the debugger is not attached. 527 if (Debug.isDebuggerConnected()) { 528 debuggerWasConnected = 2; 529 } 530 if (debuggerWasConnected >= 2) { 531 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 532 } else if (debuggerWasConnected > 0) { 533 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 534 } else if (!allowRestart) { 535 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 536 } else { 537 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 538 for (int i=0; i<blockedCheckers.size(); i++) { 539 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 540 StackTraceElement[] stackTrace 541 = blockedCheckers.get(i).getThread().getStackTrace(); 542 for (StackTraceElement element: stackTrace) { 543 Slog.w(TAG, " at " + element); 544 } 545 } 546 Slog.w(TAG, "*** GOODBYE!"); 547 Process.killProcess(Process.myPid()); 548 System.exit(10); 549 } 550 551 waitedHalf = false; 552 } 553 } 554 doSysRq(char c)555 private void doSysRq(char c) { 556 try { 557 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 558 sysrq_trigger.write(c); 559 sysrq_trigger.close(); 560 } catch (IOException e) { 561 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 562 } 563 } 564 dumpKernelStackTraces()565 private File dumpKernelStackTraces() { 566 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 567 if (tracesPath == null || tracesPath.length() == 0) { 568 return null; 569 } 570 571 native_dumpKernelStacks(tracesPath); 572 return new File(tracesPath); 573 } 574 native_dumpKernelStacks(String tracesPath)575 private native void native_dumpKernelStacks(String tracesPath); 576 } 577