1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import android.app.IActivityController; 20 import android.os.Binder; 21 import android.os.RemoteException; 22 import com.android.server.am.ActivityManagerService; 23 24 import android.content.BroadcastReceiver; 25 import android.content.ContentResolver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.os.Debug; 30 import android.os.Handler; 31 import android.os.IPowerManager; 32 import android.os.Looper; 33 import android.os.Process; 34 import android.os.ServiceManager; 35 import android.os.SystemClock; 36 import android.os.SystemProperties; 37 import android.util.EventLog; 38 import android.util.Log; 39 import android.util.Slog; 40 41 import java.io.File; 42 import java.io.FileWriter; 43 import java.io.IOException; 44 import java.util.ArrayList; 45 46 /** This class calls its monitor every minute. Killing this process if they don't return **/ 47 public class Watchdog extends Thread { 48 static final String TAG = "Watchdog"; 49 50 // Set this to true to use debug default values. 51 static final boolean DB = false; 52 53 // Set this to true to have the watchdog record kernel thread stacks when it fires 54 static final boolean RECORD_KERNEL_THREADS = true; 55 56 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 57 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 58 59 // These are temporally ordered: larger values as lateness increases 60 static final int COMPLETED = 0; 61 static final int WAITING = 1; 62 static final int WAITED_HALF = 2; 63 static final int OVERDUE = 3; 64 65 // Which native processes to dump into dropbox's stack traces 66 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 67 "/system/bin/mediaserver", 68 "/system/bin/sdcard", 69 "/system/bin/surfaceflinger" 70 }; 71 72 static Watchdog sWatchdog; 73 74 /* This handler will be used to post message back onto the main thread */ 75 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); 76 final HandlerChecker mMonitorChecker; 77 ContentResolver mResolver; 78 ActivityManagerService mActivity; 79 80 int mPhonePid; 81 IActivityController mController; 82 boolean mAllowRestart = true; 83 84 /** 85 * Used for checking status of handle threads and scheduling monitor callbacks. 86 */ 87 public final class HandlerChecker implements Runnable { 88 private final Handler mHandler; 89 private final String mName; 90 private final long mWaitMax; 91 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 92 private boolean mCompleted; 93 private Monitor mCurrentMonitor; 94 private long mStartTime; 95 HandlerChecker(Handler handler, String name, long waitMaxMillis)96 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 97 mHandler = handler; 98 mName = name; 99 mWaitMax = waitMaxMillis; 100 mCompleted = true; 101 } 102 addMonitor(Monitor monitor)103 public void addMonitor(Monitor monitor) { 104 mMonitors.add(monitor); 105 } 106 scheduleCheckLocked()107 public void scheduleCheckLocked() { 108 if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { 109 // If the target looper has recently been polling, then 110 // there is no reason to enqueue our checker on it since that 111 // is as good as it not being deadlocked. This avoid having 112 // to do a context switch to check the thread. Note that we 113 // only do this if mCheckReboot is false and we have no 114 // monitors, since those would need to be executed at this point. 115 mCompleted = true; 116 return; 117 } 118 119 if (!mCompleted) { 120 // we already have a check in flight, so no need 121 return; 122 } 123 124 mCompleted = false; 125 mCurrentMonitor = null; 126 mStartTime = SystemClock.uptimeMillis(); 127 mHandler.postAtFrontOfQueue(this); 128 } 129 isOverdueLocked()130 public boolean isOverdueLocked() { 131 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 132 } 133 getCompletionStateLocked()134 public int getCompletionStateLocked() { 135 if (mCompleted) { 136 return COMPLETED; 137 } else { 138 long latency = SystemClock.uptimeMillis() - mStartTime; 139 if (latency < mWaitMax/2) { 140 return WAITING; 141 } else if (latency < mWaitMax) { 142 return WAITED_HALF; 143 } 144 } 145 return OVERDUE; 146 } 147 getThread()148 public Thread getThread() { 149 return mHandler.getLooper().getThread(); 150 } 151 getName()152 public String getName() { 153 return mName; 154 } 155 describeBlockedStateLocked()156 public String describeBlockedStateLocked() { 157 if (mCurrentMonitor == null) { 158 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 159 } else { 160 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 161 + " on " + mName + " (" + getThread().getName() + ")"; 162 } 163 } 164 165 @Override run()166 public void run() { 167 final int size = mMonitors.size(); 168 for (int i = 0 ; i < size ; i++) { 169 synchronized (Watchdog.this) { 170 mCurrentMonitor = mMonitors.get(i); 171 } 172 mCurrentMonitor.monitor(); 173 } 174 175 synchronized (Watchdog.this) { 176 mCompleted = true; 177 mCurrentMonitor = null; 178 } 179 } 180 } 181 182 final class RebootRequestReceiver extends BroadcastReceiver { 183 @Override onReceive(Context c, Intent intent)184 public void onReceive(Context c, Intent intent) { 185 if (intent.getIntExtra("nowait", 0) != 0) { 186 rebootSystem("Received ACTION_REBOOT broadcast"); 187 return; 188 } 189 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 190 } 191 } 192 193 /** Monitor for checking the availability of binder threads. The monitor will block until 194 * there is a binder thread available to process in coming IPCs to make sure other processes 195 * can still communicate with the service. 196 */ 197 private static final class BinderThreadMonitor implements Watchdog.Monitor { 198 @Override monitor()199 public void monitor() { 200 Binder.blockUntilThreadAvailable(); 201 } 202 } 203 204 public interface Monitor { monitor()205 void monitor(); 206 } 207 getInstance()208 public static Watchdog getInstance() { 209 if (sWatchdog == null) { 210 sWatchdog = new Watchdog(); 211 } 212 213 return sWatchdog; 214 } 215 Watchdog()216 private Watchdog() { 217 super("watchdog"); 218 // Initialize handler checkers for each common thread we want to check. Note 219 // that we are not currently checking the background thread, since it can 220 // potentially hold longer running operations with no guarantees about the timeliness 221 // of operations there. 222 223 // The shared foreground thread is the main checker. It is where we 224 // will also dispatch monitor checks and do other work. 225 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 226 "foreground thread", DEFAULT_TIMEOUT); 227 mHandlerCheckers.add(mMonitorChecker); 228 // Add checker for main thread. We only do a quick check since there 229 // can be UI running on the thread. 230 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 231 "main thread", DEFAULT_TIMEOUT)); 232 // Add checker for shared UI thread. 233 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 234 "ui thread", DEFAULT_TIMEOUT)); 235 // And also check IO thread. 236 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 237 "i/o thread", DEFAULT_TIMEOUT)); 238 // And the display thread. 239 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 240 "display thread", DEFAULT_TIMEOUT)); 241 242 // Initialize monitor for Binder threads. 243 addMonitor(new BinderThreadMonitor()); 244 } 245 init(Context context, ActivityManagerService activity)246 public void init(Context context, ActivityManagerService activity) { 247 mResolver = context.getContentResolver(); 248 mActivity = activity; 249 250 context.registerReceiver(new RebootRequestReceiver(), 251 new IntentFilter(Intent.ACTION_REBOOT), 252 android.Manifest.permission.REBOOT, null); 253 } 254 processStarted(String name, int pid)255 public void processStarted(String name, int pid) { 256 synchronized (this) { 257 if ("com.android.phone".equals(name)) { 258 mPhonePid = pid; 259 } 260 } 261 } 262 setActivityController(IActivityController controller)263 public void setActivityController(IActivityController controller) { 264 synchronized (this) { 265 mController = controller; 266 } 267 } 268 setAllowRestart(boolean allowRestart)269 public void setAllowRestart(boolean allowRestart) { 270 synchronized (this) { 271 mAllowRestart = allowRestart; 272 } 273 } 274 addMonitor(Monitor monitor)275 public void addMonitor(Monitor monitor) { 276 synchronized (this) { 277 if (isAlive()) { 278 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 279 } 280 mMonitorChecker.addMonitor(monitor); 281 } 282 } 283 addThread(Handler thread)284 public void addThread(Handler thread) { 285 addThread(thread, DEFAULT_TIMEOUT); 286 } 287 addThread(Handler thread, long timeoutMillis)288 public void addThread(Handler thread, long timeoutMillis) { 289 synchronized (this) { 290 if (isAlive()) { 291 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 292 } 293 final String name = thread.getLooper().getThread().getName(); 294 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 295 } 296 } 297 298 /** 299 * Perform a full reboot of the system. 300 */ rebootSystem(String reason)301 void rebootSystem(String reason) { 302 Slog.i(TAG, "Rebooting system because: " + reason); 303 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 304 try { 305 pms.reboot(false, reason, false); 306 } catch (RemoteException ex) { 307 } 308 } 309 evaluateCheckerCompletionLocked()310 private int evaluateCheckerCompletionLocked() { 311 int state = COMPLETED; 312 for (int i=0; i<mHandlerCheckers.size(); i++) { 313 HandlerChecker hc = mHandlerCheckers.get(i); 314 state = Math.max(state, hc.getCompletionStateLocked()); 315 } 316 return state; 317 } 318 getBlockedCheckersLocked()319 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 320 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 321 for (int i=0; i<mHandlerCheckers.size(); i++) { 322 HandlerChecker hc = mHandlerCheckers.get(i); 323 if (hc.isOverdueLocked()) { 324 checkers.add(hc); 325 } 326 } 327 return checkers; 328 } 329 describeCheckersLocked(ArrayList<HandlerChecker> checkers)330 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 331 StringBuilder builder = new StringBuilder(128); 332 for (int i=0; i<checkers.size(); i++) { 333 if (builder.length() > 0) { 334 builder.append(", "); 335 } 336 builder.append(checkers.get(i).describeBlockedStateLocked()); 337 } 338 return builder.toString(); 339 } 340 341 @Override run()342 public void run() { 343 boolean waitedHalf = false; 344 while (true) { 345 final ArrayList<HandlerChecker> blockedCheckers; 346 final String subject; 347 final boolean allowRestart; 348 int debuggerWasConnected = 0; 349 synchronized (this) { 350 long timeout = CHECK_INTERVAL; 351 // Make sure we (re)spin the checkers that have become idle within 352 // this wait-and-check interval 353 for (int i=0; i<mHandlerCheckers.size(); i++) { 354 HandlerChecker hc = mHandlerCheckers.get(i); 355 hc.scheduleCheckLocked(); 356 } 357 358 if (debuggerWasConnected > 0) { 359 debuggerWasConnected--; 360 } 361 362 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 363 // wait while asleep. If the device is asleep then the thing that we are waiting 364 // to timeout on is asleep as well and won't have a chance to run, causing a false 365 // positive on when to kill things. 366 long start = SystemClock.uptimeMillis(); 367 while (timeout > 0) { 368 if (Debug.isDebuggerConnected()) { 369 debuggerWasConnected = 2; 370 } 371 try { 372 wait(timeout); 373 } catch (InterruptedException e) { 374 Log.wtf(TAG, e); 375 } 376 if (Debug.isDebuggerConnected()) { 377 debuggerWasConnected = 2; 378 } 379 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 380 } 381 382 final int waitState = evaluateCheckerCompletionLocked(); 383 if (waitState == COMPLETED) { 384 // The monitors have returned; reset 385 waitedHalf = false; 386 continue; 387 } else if (waitState == WAITING) { 388 // still waiting but within their configured intervals; back off and recheck 389 continue; 390 } else if (waitState == WAITED_HALF) { 391 if (!waitedHalf) { 392 // We've waited half the deadlock-detection interval. Pull a stack 393 // trace and wait another half. 394 ArrayList<Integer> pids = new ArrayList<Integer>(); 395 pids.add(Process.myPid()); 396 ActivityManagerService.dumpStackTraces(true, pids, null, null, 397 NATIVE_STACKS_OF_INTEREST); 398 waitedHalf = true; 399 } 400 continue; 401 } 402 403 // something is overdue! 404 blockedCheckers = getBlockedCheckersLocked(); 405 subject = describeCheckersLocked(blockedCheckers); 406 allowRestart = mAllowRestart; 407 } 408 409 // If we got here, that means that the system is most likely hung. 410 // First collect stack traces from all threads of the system process. 411 // Then kill this process so that the system will restart. 412 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 413 414 ArrayList<Integer> pids = new ArrayList<Integer>(); 415 pids.add(Process.myPid()); 416 if (mPhonePid > 0) pids.add(mPhonePid); 417 // Pass !waitedHalf so that just in case we somehow wind up here without having 418 // dumped the halfway stacks, we properly re-initialize the trace file. 419 final File stack = ActivityManagerService.dumpStackTraces( 420 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 421 422 // Give some extra time to make sure the stack traces get written. 423 // The system's been hanging for a minute, another second or two won't hurt much. 424 SystemClock.sleep(2000); 425 426 // Pull our own kernel thread stacks as well if we're configured for that 427 if (RECORD_KERNEL_THREADS) { 428 dumpKernelStackTraces(); 429 } 430 431 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 432 doSysRq('w'); 433 doSysRq('l'); 434 435 // Try to add the error to the dropbox, but assuming that the ActivityManager 436 // itself may be deadlocked. (which has happened, causing this statement to 437 // deadlock and the watchdog as a whole to be ineffective) 438 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 439 public void run() { 440 mActivity.addErrorToDropBox( 441 "watchdog", null, "system_server", null, null, 442 subject, null, stack, null); 443 } 444 }; 445 dropboxThread.start(); 446 try { 447 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 448 } catch (InterruptedException ignored) {} 449 450 IActivityController controller; 451 synchronized (this) { 452 controller = mController; 453 } 454 if (controller != null) { 455 Slog.i(TAG, "Reporting stuck state to activity controller"); 456 try { 457 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 458 // 1 = keep waiting, -1 = kill system 459 int res = controller.systemNotResponding(subject); 460 if (res >= 0) { 461 Slog.i(TAG, "Activity controller requested to coninue to wait"); 462 waitedHalf = false; 463 continue; 464 } 465 } catch (RemoteException e) { 466 } 467 } 468 469 // Only kill the process if the debugger is not attached. 470 if (Debug.isDebuggerConnected()) { 471 debuggerWasConnected = 2; 472 } 473 if (debuggerWasConnected >= 2) { 474 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 475 } else if (debuggerWasConnected > 0) { 476 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 477 } else if (!allowRestart) { 478 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 479 } else { 480 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 481 for (int i=0; i<blockedCheckers.size(); i++) { 482 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 483 StackTraceElement[] stackTrace 484 = blockedCheckers.get(i).getThread().getStackTrace(); 485 for (StackTraceElement element: stackTrace) { 486 Slog.w(TAG, " at " + element); 487 } 488 } 489 Slog.w(TAG, "*** GOODBYE!"); 490 Process.killProcess(Process.myPid()); 491 System.exit(10); 492 } 493 494 waitedHalf = false; 495 } 496 } 497 doSysRq(char c)498 private void doSysRq(char c) { 499 try { 500 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 501 sysrq_trigger.write(c); 502 sysrq_trigger.close(); 503 } catch (IOException e) { 504 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 505 } 506 } 507 dumpKernelStackTraces()508 private File dumpKernelStackTraces() { 509 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 510 if (tracesPath == null || tracesPath.length() == 0) { 511 return null; 512 } 513 514 native_dumpKernelStacks(tracesPath); 515 return new File(tracesPath); 516 } 517 native_dumpKernelStacks(String tracesPath)518 private native void native_dumpKernelStacks(String tracesPath); 519 } 520