1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import android.app.IActivityController; 20 import android.os.Binder; 21 import android.os.RemoteException; 22 import com.android.server.am.ActivityManagerService; 23 24 import android.content.BroadcastReceiver; 25 import android.content.ContentResolver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.os.Debug; 30 import android.os.Handler; 31 import android.os.IPowerManager; 32 import android.os.Looper; 33 import android.os.Process; 34 import android.os.ServiceManager; 35 import android.os.SystemClock; 36 import android.os.SystemProperties; 37 import android.util.EventLog; 38 import android.util.Log; 39 import android.util.Slog; 40 41 import java.io.File; 42 import java.io.FileWriter; 43 import java.io.IOException; 44 import java.util.ArrayList; 45 46 /** This class calls its monitor every minute. Killing this process if they don't return **/ 47 public class Watchdog extends Thread { 48 static final String TAG = "Watchdog"; 49 static final boolean localLOGV = false || false; 50 51 // Set this to true to use debug default values. 52 static final boolean DB = false; 53 54 // Set this to true to have the watchdog record kernel thread stacks when it fires 55 static final boolean RECORD_KERNEL_THREADS = true; 56 57 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 58 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 59 60 // These are temporally ordered: larger values as lateness increases 61 static final int COMPLETED = 0; 62 static final int WAITING = 1; 63 static final int WAITED_HALF = 2; 64 static final int OVERDUE = 3; 65 66 // Which native processes to dump into dropbox's stack traces 67 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 68 "/system/bin/mediaserver", 69 "/system/bin/sdcard", 70 "/system/bin/surfaceflinger" 71 }; 72 73 static Watchdog sWatchdog; 74 75 /* This handler will be used to post message back onto the main thread */ 76 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>(); 77 final HandlerChecker mMonitorChecker; 78 ContentResolver mResolver; 79 ActivityManagerService mActivity; 80 81 int mPhonePid; 82 IActivityController mController; 83 boolean mAllowRestart = true; 84 85 /** 86 * Used for checking status of handle threads and scheduling monitor callbacks. 87 */ 88 public final class HandlerChecker implements Runnable { 89 private final Handler mHandler; 90 private final String mName; 91 private final long mWaitMax; 92 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 93 private boolean mCompleted; 94 private Monitor mCurrentMonitor; 95 private long mStartTime; 96 HandlerChecker(Handler handler, String name, long waitMaxMillis)97 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 98 mHandler = handler; 99 mName = name; 100 mWaitMax = waitMaxMillis; 101 mCompleted = true; 102 } 103 addMonitor(Monitor monitor)104 public void addMonitor(Monitor monitor) { 105 mMonitors.add(monitor); 106 } 107 scheduleCheckLocked()108 public void scheduleCheckLocked() { 109 if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) { 110 // If the target looper is or just recently was idling, then 111 // there is no reason to enqueue our checker on it since that 112 // is as good as it not being deadlocked. This avoid having 113 // to do a context switch to check the thread. Note that we 114 // only do this if mCheckReboot is false and we have no 115 // monitors, since those would need to be executed at this point. 116 mCompleted = true; 117 return; 118 } 119 120 if (!mCompleted) { 121 // we already have a check in flight, so no need 122 return; 123 } 124 125 mCompleted = false; 126 mCurrentMonitor = null; 127 mStartTime = SystemClock.uptimeMillis(); 128 mHandler.postAtFrontOfQueue(this); 129 } 130 isOverdueLocked()131 public boolean isOverdueLocked() { 132 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 133 } 134 getCompletionStateLocked()135 public int getCompletionStateLocked() { 136 if (mCompleted) { 137 return COMPLETED; 138 } else { 139 long latency = SystemClock.uptimeMillis() - mStartTime; 140 if (latency < mWaitMax/2) { 141 return WAITING; 142 } else if (latency < mWaitMax) { 143 return WAITED_HALF; 144 } 145 } 146 return OVERDUE; 147 } 148 getThread()149 public Thread getThread() { 150 return mHandler.getLooper().getThread(); 151 } 152 getName()153 public String getName() { 154 return mName; 155 } 156 describeBlockedStateLocked()157 public String describeBlockedStateLocked() { 158 if (mCurrentMonitor == null) { 159 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 160 } else { 161 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 162 + " on " + mName + " (" + getThread().getName() + ")"; 163 } 164 } 165 166 @Override run()167 public void run() { 168 final int size = mMonitors.size(); 169 for (int i = 0 ; i < size ; i++) { 170 synchronized (Watchdog.this) { 171 mCurrentMonitor = mMonitors.get(i); 172 } 173 mCurrentMonitor.monitor(); 174 } 175 176 synchronized (Watchdog.this) { 177 mCompleted = true; 178 mCurrentMonitor = null; 179 } 180 } 181 } 182 183 final class RebootRequestReceiver extends BroadcastReceiver { 184 @Override onReceive(Context c, Intent intent)185 public void onReceive(Context c, Intent intent) { 186 if (intent.getIntExtra("nowait", 0) != 0) { 187 rebootSystem("Received ACTION_REBOOT broadcast"); 188 return; 189 } 190 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 191 } 192 } 193 194 public interface Monitor { monitor()195 void monitor(); 196 } 197 getInstance()198 public static Watchdog getInstance() { 199 if (sWatchdog == null) { 200 sWatchdog = new Watchdog(); 201 } 202 203 return sWatchdog; 204 } 205 Watchdog()206 private Watchdog() { 207 super("watchdog"); 208 // Initialize handler checkers for each common thread we want to check. Note 209 // that we are not currently checking the background thread, since it can 210 // potentially hold longer running operations with no guarantees about the timeliness 211 // of operations there. 212 213 // The shared foreground thread is the main checker. It is where we 214 // will also dispatch monitor checks and do other work. 215 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 216 "foreground thread", DEFAULT_TIMEOUT); 217 mHandlerCheckers.add(mMonitorChecker); 218 // Add checker for main thread. We only do a quick check since there 219 // can be UI running on the thread. 220 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 221 "main thread", DEFAULT_TIMEOUT)); 222 // Add checker for shared UI thread. 223 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 224 "ui thread", DEFAULT_TIMEOUT)); 225 // And also check IO thread. 226 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 227 "i/o thread", DEFAULT_TIMEOUT)); 228 // And the display thread. 229 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 230 "display thread", DEFAULT_TIMEOUT)); 231 } 232 init(Context context, ActivityManagerService activity)233 public void init(Context context, ActivityManagerService activity) { 234 mResolver = context.getContentResolver(); 235 mActivity = activity; 236 237 context.registerReceiver(new RebootRequestReceiver(), 238 new IntentFilter(Intent.ACTION_REBOOT), 239 android.Manifest.permission.REBOOT, null); 240 } 241 processStarted(String name, int pid)242 public void processStarted(String name, int pid) { 243 synchronized (this) { 244 if ("com.android.phone".equals(name)) { 245 mPhonePid = pid; 246 } 247 } 248 } 249 setActivityController(IActivityController controller)250 public void setActivityController(IActivityController controller) { 251 synchronized (this) { 252 mController = controller; 253 } 254 } 255 setAllowRestart(boolean allowRestart)256 public void setAllowRestart(boolean allowRestart) { 257 synchronized (this) { 258 mAllowRestart = allowRestart; 259 } 260 } 261 addMonitor(Monitor monitor)262 public void addMonitor(Monitor monitor) { 263 synchronized (this) { 264 if (isAlive()) { 265 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 266 } 267 mMonitorChecker.addMonitor(monitor); 268 } 269 } 270 addThread(Handler thread)271 public void addThread(Handler thread) { 272 addThread(thread, DEFAULT_TIMEOUT); 273 } 274 addThread(Handler thread, long timeoutMillis)275 public void addThread(Handler thread, long timeoutMillis) { 276 synchronized (this) { 277 if (isAlive()) { 278 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 279 } 280 final String name = thread.getLooper().getThread().getName(); 281 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 282 } 283 } 284 285 /** 286 * Perform a full reboot of the system. 287 */ rebootSystem(String reason)288 void rebootSystem(String reason) { 289 Slog.i(TAG, "Rebooting system because: " + reason); 290 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 291 try { 292 pms.reboot(false, reason, false); 293 } catch (RemoteException ex) { 294 } 295 } 296 evaluateCheckerCompletionLocked()297 private int evaluateCheckerCompletionLocked() { 298 int state = COMPLETED; 299 for (int i=0; i<mHandlerCheckers.size(); i++) { 300 HandlerChecker hc = mHandlerCheckers.get(i); 301 state = Math.max(state, hc.getCompletionStateLocked()); 302 } 303 return state; 304 } 305 getBlockedCheckersLocked()306 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 307 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 308 for (int i=0; i<mHandlerCheckers.size(); i++) { 309 HandlerChecker hc = mHandlerCheckers.get(i); 310 if (hc.isOverdueLocked()) { 311 checkers.add(hc); 312 } 313 } 314 return checkers; 315 } 316 describeCheckersLocked(ArrayList<HandlerChecker> checkers)317 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 318 StringBuilder builder = new StringBuilder(128); 319 for (int i=0; i<checkers.size(); i++) { 320 if (builder.length() > 0) { 321 builder.append(", "); 322 } 323 builder.append(checkers.get(i).describeBlockedStateLocked()); 324 } 325 return builder.toString(); 326 } 327 328 @Override run()329 public void run() { 330 boolean waitedHalf = false; 331 while (true) { 332 final ArrayList<HandlerChecker> blockedCheckers; 333 final String subject; 334 final boolean allowRestart; 335 int debuggerWasConnected = 0; 336 synchronized (this) { 337 long timeout = CHECK_INTERVAL; 338 // Make sure we (re)spin the checkers that have become idle within 339 // this wait-and-check interval 340 for (int i=0; i<mHandlerCheckers.size(); i++) { 341 HandlerChecker hc = mHandlerCheckers.get(i); 342 hc.scheduleCheckLocked(); 343 } 344 345 if (debuggerWasConnected > 0) { 346 debuggerWasConnected--; 347 } 348 349 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 350 // wait while asleep. If the device is asleep then the thing that we are waiting 351 // to timeout on is asleep as well and won't have a chance to run, causing a false 352 // positive on when to kill things. 353 long start = SystemClock.uptimeMillis(); 354 while (timeout > 0) { 355 if (Debug.isDebuggerConnected()) { 356 debuggerWasConnected = 2; 357 } 358 try { 359 wait(timeout); 360 } catch (InterruptedException e) { 361 Log.wtf(TAG, e); 362 } 363 if (Debug.isDebuggerConnected()) { 364 debuggerWasConnected = 2; 365 } 366 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 367 } 368 369 final int waitState = evaluateCheckerCompletionLocked(); 370 if (waitState == COMPLETED) { 371 // The monitors have returned; reset 372 waitedHalf = false; 373 continue; 374 } else if (waitState == WAITING) { 375 // still waiting but within their configured intervals; back off and recheck 376 continue; 377 } else if (waitState == WAITED_HALF) { 378 if (!waitedHalf) { 379 // We've waited half the deadlock-detection interval. Pull a stack 380 // trace and wait another half. 381 ArrayList<Integer> pids = new ArrayList<Integer>(); 382 pids.add(Process.myPid()); 383 ActivityManagerService.dumpStackTraces(true, pids, null, null, 384 NATIVE_STACKS_OF_INTEREST); 385 waitedHalf = true; 386 } 387 continue; 388 } 389 390 // something is overdue! 391 blockedCheckers = getBlockedCheckersLocked(); 392 subject = describeCheckersLocked(blockedCheckers); 393 allowRestart = mAllowRestart; 394 } 395 396 // If we got here, that means that the system is most likely hung. 397 // First collect stack traces from all threads of the system process. 398 // Then kill this process so that the system will restart. 399 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 400 401 ArrayList<Integer> pids = new ArrayList<Integer>(); 402 pids.add(Process.myPid()); 403 if (mPhonePid > 0) pids.add(mPhonePid); 404 // Pass !waitedHalf so that just in case we somehow wind up here without having 405 // dumped the halfway stacks, we properly re-initialize the trace file. 406 final File stack = ActivityManagerService.dumpStackTraces( 407 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 408 409 // Give some extra time to make sure the stack traces get written. 410 // The system's been hanging for a minute, another second or two won't hurt much. 411 SystemClock.sleep(2000); 412 413 // Pull our own kernel thread stacks as well if we're configured for that 414 if (RECORD_KERNEL_THREADS) { 415 dumpKernelStackTraces(); 416 } 417 418 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 419 doSysRq('w'); 420 doSysRq('l'); 421 422 // Try to add the error to the dropbox, but assuming that the ActivityManager 423 // itself may be deadlocked. (which has happened, causing this statement to 424 // deadlock and the watchdog as a whole to be ineffective) 425 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 426 public void run() { 427 mActivity.addErrorToDropBox( 428 "watchdog", null, "system_server", null, null, 429 subject, null, stack, null); 430 } 431 }; 432 dropboxThread.start(); 433 try { 434 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 435 } catch (InterruptedException ignored) {} 436 437 IActivityController controller; 438 synchronized (this) { 439 controller = mController; 440 } 441 if (controller != null) { 442 Slog.i(TAG, "Reporting stuck state to activity controller"); 443 try { 444 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 445 // 1 = keep waiting, -1 = kill system 446 int res = controller.systemNotResponding(subject); 447 if (res >= 0) { 448 Slog.i(TAG, "Activity controller requested to coninue to wait"); 449 waitedHalf = false; 450 continue; 451 } 452 } catch (RemoteException e) { 453 } 454 } 455 456 // Only kill the process if the debugger is not attached. 457 if (Debug.isDebuggerConnected()) { 458 debuggerWasConnected = 2; 459 } 460 if (debuggerWasConnected >= 2) { 461 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 462 } else if (debuggerWasConnected > 0) { 463 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 464 } else if (!allowRestart) { 465 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 466 } else { 467 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 468 for (int i=0; i<blockedCheckers.size(); i++) { 469 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 470 StackTraceElement[] stackTrace 471 = blockedCheckers.get(i).getThread().getStackTrace(); 472 for (StackTraceElement element: stackTrace) { 473 Slog.w(TAG, " at " + element); 474 } 475 } 476 Slog.w(TAG, "*** GOODBYE!"); 477 Process.killProcess(Process.myPid()); 478 System.exit(10); 479 } 480 481 waitedHalf = false; 482 } 483 } 484 doSysRq(char c)485 private void doSysRq(char c) { 486 try { 487 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 488 sysrq_trigger.write(c); 489 sysrq_trigger.close(); 490 } catch (IOException e) { 491 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 492 } 493 } 494 dumpKernelStackTraces()495 private File dumpKernelStackTraces() { 496 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 497 if (tracesPath == null || tracesPath.length() == 0) { 498 return null; 499 } 500 501 native_dumpKernelStacks(tracesPath); 502 return new File(tracesPath); 503 } 504 native_dumpKernelStacks(String tracesPath)505 private native void native_dumpKernelStacks(String tracesPath); 506 } 507