1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.os.Binder;
21 import android.os.RemoteException;
22 import com.android.server.am.ActivityManagerService;
23 
24 import android.content.BroadcastReceiver;
25 import android.content.ContentResolver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.os.Debug;
30 import android.os.Handler;
31 import android.os.IPowerManager;
32 import android.os.Looper;
33 import android.os.Process;
34 import android.os.ServiceManager;
35 import android.os.SystemClock;
36 import android.os.SystemProperties;
37 import android.util.EventLog;
38 import android.util.Log;
39 import android.util.Slog;
40 
41 import java.io.File;
42 import java.io.FileWriter;
43 import java.io.IOException;
44 import java.util.ArrayList;
45 
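/**
 * Periodically schedules checks on a set of handler threads and their monitors.  If any of
 * them fails to respond within its timeout, the watchdog collects diagnostics and kills this
 * process.
 */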
47 public class Watchdog extends Thread {
    // Tag used for every log line written by this class.
    static final String TAG = "Watchdog";

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Default per-checker timeout in milliseconds: 10 seconds when DB is set, else 60 seconds.
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
    // How long the watchdog thread waits between evaluations: half of the default timeout.
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    static final int COMPLETED = 0;
    static final int WAITING = 1;
    static final int WAITED_HALF = 2;
    static final int OVERDUE = 3;

    // Which native processes to dump into dropbox's stack traces
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger"
    };

    // Singleton instance, created on first use by getInstance().
    static Watchdog sWatchdog;

    /* Every handler checker that is scheduled and evaluated on each pass of run(). */
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    /* The checker that additionally runs the callbacks registered through addMonitor(). */
    final HandlerChecker mMonitorChecker;
    // Both assigned in init().
    ContentResolver mResolver;
    ActivityManagerService mActivity;

    // Pid recorded by processStarted() for "com.android.phone"; added to the stack dump when > 0.
    int mPhonePid;
    // Optional controller that run() consults before killing the process.
    IActivityController mController;
    // When false, run() logs the hang instead of killing the process.
    boolean mAllowRestart = true;
83 
84     /**
85      * Used for checking status of handle threads and scheduling monitor callbacks.
86      */
87     public final class HandlerChecker implements Runnable {
88         private final Handler mHandler;
89         private final String mName;
90         private final long mWaitMax;
91         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
92         private boolean mCompleted;
93         private Monitor mCurrentMonitor;
94         private long mStartTime;
95 
HandlerChecker(Handler handler, String name, long waitMaxMillis)96         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
97             mHandler = handler;
98             mName = name;
99             mWaitMax = waitMaxMillis;
100             mCompleted = true;
101         }
102 
addMonitor(Monitor monitor)103         public void addMonitor(Monitor monitor) {
104             mMonitors.add(monitor);
105         }
106 
scheduleCheckLocked()107         public void scheduleCheckLocked() {
108             if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
109                 // If the target looper has recently been polling, then
110                 // there is no reason to enqueue our checker on it since that
111                 // is as good as it not being deadlocked.  This avoid having
112                 // to do a context switch to check the thread.  Note that we
113                 // only do this if mCheckReboot is false and we have no
114                 // monitors, since those would need to be executed at this point.
115                 mCompleted = true;
116                 return;
117             }
118 
119             if (!mCompleted) {
120                 // we already have a check in flight, so no need
121                 return;
122             }
123 
124             mCompleted = false;
125             mCurrentMonitor = null;
126             mStartTime = SystemClock.uptimeMillis();
127             mHandler.postAtFrontOfQueue(this);
128         }
129 
isOverdueLocked()130         public boolean isOverdueLocked() {
131             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
132         }
133 
getCompletionStateLocked()134         public int getCompletionStateLocked() {
135             if (mCompleted) {
136                 return COMPLETED;
137             } else {
138                 long latency = SystemClock.uptimeMillis() - mStartTime;
139                 if (latency < mWaitMax/2) {
140                     return WAITING;
141                 } else if (latency < mWaitMax) {
142                     return WAITED_HALF;
143                 }
144             }
145             return OVERDUE;
146         }
147 
getThread()148         public Thread getThread() {
149             return mHandler.getLooper().getThread();
150         }
151 
getName()152         public String getName() {
153             return mName;
154         }
155 
describeBlockedStateLocked()156         public String describeBlockedStateLocked() {
157             if (mCurrentMonitor == null) {
158                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
159             } else {
160                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
161                         + " on " + mName + " (" + getThread().getName() + ")";
162             }
163         }
164 
165         @Override
run()166         public void run() {
167             final int size = mMonitors.size();
168             for (int i = 0 ; i < size ; i++) {
169                 synchronized (Watchdog.this) {
170                     mCurrentMonitor = mMonitors.get(i);
171                 }
172                 mCurrentMonitor.monitor();
173             }
174 
175             synchronized (Watchdog.this) {
176                 mCompleted = true;
177                 mCurrentMonitor = null;
178             }
179         }
180     }
181 
182     final class RebootRequestReceiver extends BroadcastReceiver {
183         @Override
onReceive(Context c, Intent intent)184         public void onReceive(Context c, Intent intent) {
185             if (intent.getIntExtra("nowait", 0) != 0) {
186                 rebootSystem("Received ACTION_REBOOT broadcast");
187                 return;
188             }
189             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
190         }
191     }
192 
    /**
     * Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs to make sure other processes
     * can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            // Delegates entirely to Binder; this returns only once that call returns.
            Binder.blockUntilThreadAvailable();
        }
    }
203 
    /**
     * Callback registered through {@code addMonitor} and invoked by the monitor checker
     * each time it runs on its handler thread.
     */
    public interface Monitor {
        /**
         * Invoked during a watchdog check.  If this call does not return before the
         * checker's timeout, the checker is reported as blocked in this monitor.
         */
        void monitor();
    }
207 
getInstance()208     public static Watchdog getInstance() {
209         if (sWatchdog == null) {
210             sWatchdog = new Watchdog();
211         }
212 
213         return sWatchdog;
214     }
215 
Watchdog()216     private Watchdog() {
217         super("watchdog");
218         // Initialize handler checkers for each common thread we want to check.  Note
219         // that we are not currently checking the background thread, since it can
220         // potentially hold longer running operations with no guarantees about the timeliness
221         // of operations there.
222 
223         // The shared foreground thread is the main checker.  It is where we
224         // will also dispatch monitor checks and do other work.
225         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
226                 "foreground thread", DEFAULT_TIMEOUT);
227         mHandlerCheckers.add(mMonitorChecker);
228         // Add checker for main thread.  We only do a quick check since there
229         // can be UI running on the thread.
230         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
231                 "main thread", DEFAULT_TIMEOUT));
232         // Add checker for shared UI thread.
233         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
234                 "ui thread", DEFAULT_TIMEOUT));
235         // And also check IO thread.
236         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
237                 "i/o thread", DEFAULT_TIMEOUT));
238         // And the display thread.
239         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
240                 "display thread", DEFAULT_TIMEOUT));
241 
242         // Initialize monitor for Binder threads.
243         addMonitor(new BinderThreadMonitor());
244     }
245 
init(Context context, ActivityManagerService activity)246     public void init(Context context, ActivityManagerService activity) {
247         mResolver = context.getContentResolver();
248         mActivity = activity;
249 
250         context.registerReceiver(new RebootRequestReceiver(),
251                 new IntentFilter(Intent.ACTION_REBOOT),
252                 android.Manifest.permission.REBOOT, null);
253     }
254 
processStarted(String name, int pid)255     public void processStarted(String name, int pid) {
256         synchronized (this) {
257             if ("com.android.phone".equals(name)) {
258                 mPhonePid = pid;
259             }
260         }
261     }
262 
    /**
     * Sets the controller that {@code run()} notifies, and whose answer it honours, before
     * killing the process.
     *
     * @param controller the controller to consult, or null for none
     */
    public void setActivityController(IActivityController controller) {
        synchronized (this) {
            mController = controller;
        }
    }
268 
    /**
     * Controls whether the watchdog may kill the process when it detects a hang.
     *
     * @param allowRestart false to make {@code run()} log the hang instead of killing
     */
    public void setAllowRestart(boolean allowRestart) {
        synchronized (this) {
            mAllowRestart = allowRestart;
        }
    }
274 
addMonitor(Monitor monitor)275     public void addMonitor(Monitor monitor) {
276         synchronized (this) {
277             if (isAlive()) {
278                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
279             }
280             mMonitorChecker.addMonitor(monitor);
281         }
282     }
283 
    /**
     * Adds a handler thread to be watched using {@code DEFAULT_TIMEOUT}.
     *
     * @param thread handler of the thread to watch
     */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }
287 
addThread(Handler thread, long timeoutMillis)288     public void addThread(Handler thread, long timeoutMillis) {
289         synchronized (this) {
290             if (isAlive()) {
291                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
292             }
293             final String name = thread.getLooper().getThread().getName();
294             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
295         }
296     }
297 
298     /**
299      * Perform a full reboot of the system.
300      */
rebootSystem(String reason)301     void rebootSystem(String reason) {
302         Slog.i(TAG, "Rebooting system because: " + reason);
303         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
304         try {
305             pms.reboot(false, reason, false);
306         } catch (RemoteException ex) {
307         }
308     }
309 
evaluateCheckerCompletionLocked()310     private int evaluateCheckerCompletionLocked() {
311         int state = COMPLETED;
312         for (int i=0; i<mHandlerCheckers.size(); i++) {
313             HandlerChecker hc = mHandlerCheckers.get(i);
314             state = Math.max(state, hc.getCompletionStateLocked());
315         }
316         return state;
317     }
318 
getBlockedCheckersLocked()319     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
320         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
321         for (int i=0; i<mHandlerCheckers.size(); i++) {
322             HandlerChecker hc = mHandlerCheckers.get(i);
323             if (hc.isOverdueLocked()) {
324                 checkers.add(hc);
325             }
326         }
327         return checkers;
328     }
329 
describeCheckersLocked(ArrayList<HandlerChecker> checkers)330     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
331         StringBuilder builder = new StringBuilder(128);
332         for (int i=0; i<checkers.size(); i++) {
333             if (builder.length() > 0) {
334                 builder.append(", ");
335             }
336             builder.append(checkers.get(i).describeBlockedStateLocked());
337         }
338         return builder.toString();
339     }
340 
341     @Override
run()342     public void run() {
343         boolean waitedHalf = false;
344         while (true) {
345             final ArrayList<HandlerChecker> blockedCheckers;
346             final String subject;
347             final boolean allowRestart;
348             int debuggerWasConnected = 0;
349             synchronized (this) {
350                 long timeout = CHECK_INTERVAL;
351                 // Make sure we (re)spin the checkers that have become idle within
352                 // this wait-and-check interval
353                 for (int i=0; i<mHandlerCheckers.size(); i++) {
354                     HandlerChecker hc = mHandlerCheckers.get(i);
355                     hc.scheduleCheckLocked();
356                 }
357 
358                 if (debuggerWasConnected > 0) {
359                     debuggerWasConnected--;
360                 }
361 
362                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
363                 // wait while asleep. If the device is asleep then the thing that we are waiting
364                 // to timeout on is asleep as well and won't have a chance to run, causing a false
365                 // positive on when to kill things.
366                 long start = SystemClock.uptimeMillis();
367                 while (timeout > 0) {
368                     if (Debug.isDebuggerConnected()) {
369                         debuggerWasConnected = 2;
370                     }
371                     try {
372                         wait(timeout);
373                     } catch (InterruptedException e) {
374                         Log.wtf(TAG, e);
375                     }
376                     if (Debug.isDebuggerConnected()) {
377                         debuggerWasConnected = 2;
378                     }
379                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
380                 }
381 
382                 final int waitState = evaluateCheckerCompletionLocked();
383                 if (waitState == COMPLETED) {
384                     // The monitors have returned; reset
385                     waitedHalf = false;
386                     continue;
387                 } else if (waitState == WAITING) {
388                     // still waiting but within their configured intervals; back off and recheck
389                     continue;
390                 } else if (waitState == WAITED_HALF) {
391                     if (!waitedHalf) {
392                         // We've waited half the deadlock-detection interval.  Pull a stack
393                         // trace and wait another half.
394                         ArrayList<Integer> pids = new ArrayList<Integer>();
395                         pids.add(Process.myPid());
396                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
397                                 NATIVE_STACKS_OF_INTEREST);
398                         waitedHalf = true;
399                     }
400                     continue;
401                 }
402 
403                 // something is overdue!
404                 blockedCheckers = getBlockedCheckersLocked();
405                 subject = describeCheckersLocked(blockedCheckers);
406                 allowRestart = mAllowRestart;
407             }
408 
409             // If we got here, that means that the system is most likely hung.
410             // First collect stack traces from all threads of the system process.
411             // Then kill this process so that the system will restart.
412             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
413 
414             ArrayList<Integer> pids = new ArrayList<Integer>();
415             pids.add(Process.myPid());
416             if (mPhonePid > 0) pids.add(mPhonePid);
417             // Pass !waitedHalf so that just in case we somehow wind up here without having
418             // dumped the halfway stacks, we properly re-initialize the trace file.
419             final File stack = ActivityManagerService.dumpStackTraces(
420                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
421 
422             // Give some extra time to make sure the stack traces get written.
423             // The system's been hanging for a minute, another second or two won't hurt much.
424             SystemClock.sleep(2000);
425 
426             // Pull our own kernel thread stacks as well if we're configured for that
427             if (RECORD_KERNEL_THREADS) {
428                 dumpKernelStackTraces();
429             }
430 
431             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
432             doSysRq('w');
433             doSysRq('l');
434 
435             // Try to add the error to the dropbox, but assuming that the ActivityManager
436             // itself may be deadlocked.  (which has happened, causing this statement to
437             // deadlock and the watchdog as a whole to be ineffective)
438             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
439                     public void run() {
440                         mActivity.addErrorToDropBox(
441                                 "watchdog", null, "system_server", null, null,
442                                 subject, null, stack, null);
443                     }
444                 };
445             dropboxThread.start();
446             try {
447                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
448             } catch (InterruptedException ignored) {}
449 
450             IActivityController controller;
451             synchronized (this) {
452                 controller = mController;
453             }
454             if (controller != null) {
455                 Slog.i(TAG, "Reporting stuck state to activity controller");
456                 try {
457                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
458                     // 1 = keep waiting, -1 = kill system
459                     int res = controller.systemNotResponding(subject);
460                     if (res >= 0) {
461                         Slog.i(TAG, "Activity controller requested to coninue to wait");
462                         waitedHalf = false;
463                         continue;
464                     }
465                 } catch (RemoteException e) {
466                 }
467             }
468 
469             // Only kill the process if the debugger is not attached.
470             if (Debug.isDebuggerConnected()) {
471                 debuggerWasConnected = 2;
472             }
473             if (debuggerWasConnected >= 2) {
474                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
475             } else if (debuggerWasConnected > 0) {
476                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
477             } else if (!allowRestart) {
478                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
479             } else {
480                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
481                 for (int i=0; i<blockedCheckers.size(); i++) {
482                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
483                     StackTraceElement[] stackTrace
484                             = blockedCheckers.get(i).getThread().getStackTrace();
485                     for (StackTraceElement element: stackTrace) {
486                         Slog.w(TAG, "    at " + element);
487                     }
488                 }
489                 Slog.w(TAG, "*** GOODBYE!");
490                 Process.killProcess(Process.myPid());
491                 System.exit(10);
492             }
493 
494             waitedHalf = false;
495         }
496     }
497 
doSysRq(char c)498     private void doSysRq(char c) {
499         try {
500             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
501             sysrq_trigger.write(c);
502             sysrq_trigger.close();
503         } catch (IOException e) {
504             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
505         }
506     }
507 
dumpKernelStackTraces()508     private File dumpKernelStackTraces() {
509         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
510         if (tracesPath == null || tracesPath.length() == 0) {
511             return null;
512         }
513 
514         native_dumpKernelStacks(tracesPath);
515         return new File(tracesPath);
516     }
517 
    /**
     * Native hook called by {@code dumpKernelStackTraces()} with the traces file path.
     * The implementation lives in native code that is not part of this file.
     *
     * @param tracesPath path of the traces file
     */
    private native void native_dumpKernelStacks(String tracesPath);
519 }
520