1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.os.Binder;
21 import android.os.RemoteException;
22 import com.android.server.am.ActivityManagerService;
23 
24 import android.content.BroadcastReceiver;
25 import android.content.ContentResolver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.os.Debug;
30 import android.os.Handler;
31 import android.os.IPowerManager;
32 import android.os.Looper;
33 import android.os.Process;
34 import android.os.ServiceManager;
35 import android.os.SystemClock;
36 import android.os.SystemProperties;
37 import android.util.EventLog;
38 import android.util.Log;
39 import android.util.Slog;
40 
41 import java.io.File;
42 import java.io.FileWriter;
43 import java.io.IOException;
44 import java.util.ArrayList;
45 
46 /** Periodically checks that the monitored handler threads and monitors still respond, and kills this process if one of them stays blocked longer than its timeout. */
47 public class Watchdog extends Thread {
48     static final String TAG = "Watchdog";
49     static final boolean localLOGV = false || false;
50 
51     // Set this to true to use debug default values.
52     static final boolean DB = false;
53 
54     // Set this to true to have the watchdog record kernel thread stacks when it fires
55     static final boolean RECORD_KERNEL_THREADS = true;
56 
57     static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
58     static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
59 
60     // These are temporally ordered: larger values as lateness increases
61     static final int COMPLETED = 0;
62     static final int WAITING = 1;
63     static final int WAITED_HALF = 2;
64     static final int OVERDUE = 3;
65 
66     // Which native processes to dump into dropbox's stack traces
67     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
68         "/system/bin/mediaserver",
69         "/system/bin/sdcard",
70         "/system/bin/surfaceflinger"
71     };
72 
73     static Watchdog sWatchdog;
74 
75     /* This handler will be used to post message back onto the main thread */
76     final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
77     final HandlerChecker mMonitorChecker;
78     ContentResolver mResolver;
79     ActivityManagerService mActivity;
80 
81     int mPhonePid;
82     IActivityController mController;
83     boolean mAllowRestart = true;
84 
85     /**
86      * Used for checking status of handle threads and scheduling monitor callbacks.
87      */
88     public final class HandlerChecker implements Runnable {
89         private final Handler mHandler;
90         private final String mName;
91         private final long mWaitMax;
92         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
93         private boolean mCompleted;
94         private Monitor mCurrentMonitor;
95         private long mStartTime;
96 
HandlerChecker(Handler handler, String name, long waitMaxMillis)97         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
98             mHandler = handler;
99             mName = name;
100             mWaitMax = waitMaxMillis;
101             mCompleted = true;
102         }
103 
addMonitor(Monitor monitor)104         public void addMonitor(Monitor monitor) {
105             mMonitors.add(monitor);
106         }
107 
scheduleCheckLocked()108         public void scheduleCheckLocked() {
109             if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
110                 // If the target looper is or just recently was idling, then
111                 // there is no reason to enqueue our checker on it since that
112                 // is as good as it not being deadlocked.  This avoid having
113                 // to do a context switch to check the thread.  Note that we
114                 // only do this if mCheckReboot is false and we have no
115                 // monitors, since those would need to be executed at this point.
116                 mCompleted = true;
117                 return;
118             }
119 
120             if (!mCompleted) {
121                 // we already have a check in flight, so no need
122                 return;
123             }
124 
125             mCompleted = false;
126             mCurrentMonitor = null;
127             mStartTime = SystemClock.uptimeMillis();
128             mHandler.postAtFrontOfQueue(this);
129         }
130 
isOverdueLocked()131         public boolean isOverdueLocked() {
132             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
133         }
134 
getCompletionStateLocked()135         public int getCompletionStateLocked() {
136             if (mCompleted) {
137                 return COMPLETED;
138             } else {
139                 long latency = SystemClock.uptimeMillis() - mStartTime;
140                 if (latency < mWaitMax/2) {
141                     return WAITING;
142                 } else if (latency < mWaitMax) {
143                     return WAITED_HALF;
144                 }
145             }
146             return OVERDUE;
147         }
148 
getThread()149         public Thread getThread() {
150             return mHandler.getLooper().getThread();
151         }
152 
getName()153         public String getName() {
154             return mName;
155         }
156 
describeBlockedStateLocked()157         public String describeBlockedStateLocked() {
158             if (mCurrentMonitor == null) {
159                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
160             } else {
161                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
162                         + " on " + mName + " (" + getThread().getName() + ")";
163             }
164         }
165 
166         @Override
run()167         public void run() {
168             final int size = mMonitors.size();
169             for (int i = 0 ; i < size ; i++) {
170                 synchronized (Watchdog.this) {
171                     mCurrentMonitor = mMonitors.get(i);
172                 }
173                 mCurrentMonitor.monitor();
174             }
175 
176             synchronized (Watchdog.this) {
177                 mCompleted = true;
178                 mCurrentMonitor = null;
179             }
180         }
181     }
182 
183     final class RebootRequestReceiver extends BroadcastReceiver {
184         @Override
onReceive(Context c, Intent intent)185         public void onReceive(Context c, Intent intent) {
186             if (intent.getIntExtra("nowait", 0) != 0) {
187                 rebootSystem("Received ACTION_REBOOT broadcast");
188                 return;
189             }
190             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
191         }
192     }
193 
194     public interface Monitor {
monitor()195         void monitor();
196     }
197 
getInstance()198     public static Watchdog getInstance() {
199         if (sWatchdog == null) {
200             sWatchdog = new Watchdog();
201         }
202 
203         return sWatchdog;
204     }
205 
Watchdog()206     private Watchdog() {
207         super("watchdog");
208         // Initialize handler checkers for each common thread we want to check.  Note
209         // that we are not currently checking the background thread, since it can
210         // potentially hold longer running operations with no guarantees about the timeliness
211         // of operations there.
212 
213         // The shared foreground thread is the main checker.  It is where we
214         // will also dispatch monitor checks and do other work.
215         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
216                 "foreground thread", DEFAULT_TIMEOUT);
217         mHandlerCheckers.add(mMonitorChecker);
218         // Add checker for main thread.  We only do a quick check since there
219         // can be UI running on the thread.
220         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
221                 "main thread", DEFAULT_TIMEOUT));
222         // Add checker for shared UI thread.
223         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
224                 "ui thread", DEFAULT_TIMEOUT));
225         // And also check IO thread.
226         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
227                 "i/o thread", DEFAULT_TIMEOUT));
228         // And the display thread.
229         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
230                 "display thread", DEFAULT_TIMEOUT));
231     }
232 
init(Context context, ActivityManagerService activity)233     public void init(Context context, ActivityManagerService activity) {
234         mResolver = context.getContentResolver();
235         mActivity = activity;
236 
237         context.registerReceiver(new RebootRequestReceiver(),
238                 new IntentFilter(Intent.ACTION_REBOOT),
239                 android.Manifest.permission.REBOOT, null);
240     }
241 
processStarted(String name, int pid)242     public void processStarted(String name, int pid) {
243         synchronized (this) {
244             if ("com.android.phone".equals(name)) {
245                 mPhonePid = pid;
246             }
247         }
248     }
249 
setActivityController(IActivityController controller)250     public void setActivityController(IActivityController controller) {
251         synchronized (this) {
252             mController = controller;
253         }
254     }
255 
setAllowRestart(boolean allowRestart)256     public void setAllowRestart(boolean allowRestart) {
257         synchronized (this) {
258             mAllowRestart = allowRestart;
259         }
260     }
261 
addMonitor(Monitor monitor)262     public void addMonitor(Monitor monitor) {
263         synchronized (this) {
264             if (isAlive()) {
265                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
266             }
267             mMonitorChecker.addMonitor(monitor);
268         }
269     }
270 
addThread(Handler thread)271     public void addThread(Handler thread) {
272         addThread(thread, DEFAULT_TIMEOUT);
273     }
274 
addThread(Handler thread, long timeoutMillis)275     public void addThread(Handler thread, long timeoutMillis) {
276         synchronized (this) {
277             if (isAlive()) {
278                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
279             }
280             final String name = thread.getLooper().getThread().getName();
281             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
282         }
283     }
284 
285     /**
286      * Perform a full reboot of the system.
287      */
rebootSystem(String reason)288     void rebootSystem(String reason) {
289         Slog.i(TAG, "Rebooting system because: " + reason);
290         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
291         try {
292             pms.reboot(false, reason, false);
293         } catch (RemoteException ex) {
294         }
295     }
296 
evaluateCheckerCompletionLocked()297     private int evaluateCheckerCompletionLocked() {
298         int state = COMPLETED;
299         for (int i=0; i<mHandlerCheckers.size(); i++) {
300             HandlerChecker hc = mHandlerCheckers.get(i);
301             state = Math.max(state, hc.getCompletionStateLocked());
302         }
303         return state;
304     }
305 
getBlockedCheckersLocked()306     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
307         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
308         for (int i=0; i<mHandlerCheckers.size(); i++) {
309             HandlerChecker hc = mHandlerCheckers.get(i);
310             if (hc.isOverdueLocked()) {
311                 checkers.add(hc);
312             }
313         }
314         return checkers;
315     }
316 
describeCheckersLocked(ArrayList<HandlerChecker> checkers)317     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
318         StringBuilder builder = new StringBuilder(128);
319         for (int i=0; i<checkers.size(); i++) {
320             if (builder.length() > 0) {
321                 builder.append(", ");
322             }
323             builder.append(checkers.get(i).describeBlockedStateLocked());
324         }
325         return builder.toString();
326     }
327 
328     @Override
run()329     public void run() {
330         boolean waitedHalf = false;
331         while (true) {
332             final ArrayList<HandlerChecker> blockedCheckers;
333             final String subject;
334             final boolean allowRestart;
335             int debuggerWasConnected = 0;
336             synchronized (this) {
337                 long timeout = CHECK_INTERVAL;
338                 // Make sure we (re)spin the checkers that have become idle within
339                 // this wait-and-check interval
340                 for (int i=0; i<mHandlerCheckers.size(); i++) {
341                     HandlerChecker hc = mHandlerCheckers.get(i);
342                     hc.scheduleCheckLocked();
343                 }
344 
345                 if (debuggerWasConnected > 0) {
346                     debuggerWasConnected--;
347                 }
348 
349                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
350                 // wait while asleep. If the device is asleep then the thing that we are waiting
351                 // to timeout on is asleep as well and won't have a chance to run, causing a false
352                 // positive on when to kill things.
353                 long start = SystemClock.uptimeMillis();
354                 while (timeout > 0) {
355                     if (Debug.isDebuggerConnected()) {
356                         debuggerWasConnected = 2;
357                     }
358                     try {
359                         wait(timeout);
360                     } catch (InterruptedException e) {
361                         Log.wtf(TAG, e);
362                     }
363                     if (Debug.isDebuggerConnected()) {
364                         debuggerWasConnected = 2;
365                     }
366                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
367                 }
368 
369                 final int waitState = evaluateCheckerCompletionLocked();
370                 if (waitState == COMPLETED) {
371                     // The monitors have returned; reset
372                     waitedHalf = false;
373                     continue;
374                 } else if (waitState == WAITING) {
375                     // still waiting but within their configured intervals; back off and recheck
376                     continue;
377                 } else if (waitState == WAITED_HALF) {
378                     if (!waitedHalf) {
379                         // We've waited half the deadlock-detection interval.  Pull a stack
380                         // trace and wait another half.
381                         ArrayList<Integer> pids = new ArrayList<Integer>();
382                         pids.add(Process.myPid());
383                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
384                                 NATIVE_STACKS_OF_INTEREST);
385                         waitedHalf = true;
386                     }
387                     continue;
388                 }
389 
390                 // something is overdue!
391                 blockedCheckers = getBlockedCheckersLocked();
392                 subject = describeCheckersLocked(blockedCheckers);
393                 allowRestart = mAllowRestart;
394             }
395 
396             // If we got here, that means that the system is most likely hung.
397             // First collect stack traces from all threads of the system process.
398             // Then kill this process so that the system will restart.
399             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
400 
401             ArrayList<Integer> pids = new ArrayList<Integer>();
402             pids.add(Process.myPid());
403             if (mPhonePid > 0) pids.add(mPhonePid);
404             // Pass !waitedHalf so that just in case we somehow wind up here without having
405             // dumped the halfway stacks, we properly re-initialize the trace file.
406             final File stack = ActivityManagerService.dumpStackTraces(
407                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
408 
409             // Give some extra time to make sure the stack traces get written.
410             // The system's been hanging for a minute, another second or two won't hurt much.
411             SystemClock.sleep(2000);
412 
413             // Pull our own kernel thread stacks as well if we're configured for that
414             if (RECORD_KERNEL_THREADS) {
415                 dumpKernelStackTraces();
416             }
417 
418             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
419             doSysRq('w');
420             doSysRq('l');
421 
422             // Try to add the error to the dropbox, but assuming that the ActivityManager
423             // itself may be deadlocked.  (which has happened, causing this statement to
424             // deadlock and the watchdog as a whole to be ineffective)
425             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
426                     public void run() {
427                         mActivity.addErrorToDropBox(
428                                 "watchdog", null, "system_server", null, null,
429                                 subject, null, stack, null);
430                     }
431                 };
432             dropboxThread.start();
433             try {
434                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
435             } catch (InterruptedException ignored) {}
436 
437             IActivityController controller;
438             synchronized (this) {
439                 controller = mController;
440             }
441             if (controller != null) {
442                 Slog.i(TAG, "Reporting stuck state to activity controller");
443                 try {
444                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
445                     // 1 = keep waiting, -1 = kill system
446                     int res = controller.systemNotResponding(subject);
447                     if (res >= 0) {
448                         Slog.i(TAG, "Activity controller requested to coninue to wait");
449                         waitedHalf = false;
450                         continue;
451                     }
452                 } catch (RemoteException e) {
453                 }
454             }
455 
456             // Only kill the process if the debugger is not attached.
457             if (Debug.isDebuggerConnected()) {
458                 debuggerWasConnected = 2;
459             }
460             if (debuggerWasConnected >= 2) {
461                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
462             } else if (debuggerWasConnected > 0) {
463                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
464             } else if (!allowRestart) {
465                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
466             } else {
467                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
468                 for (int i=0; i<blockedCheckers.size(); i++) {
469                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
470                     StackTraceElement[] stackTrace
471                             = blockedCheckers.get(i).getThread().getStackTrace();
472                     for (StackTraceElement element: stackTrace) {
473                         Slog.w(TAG, "    at " + element);
474                     }
475                 }
476                 Slog.w(TAG, "*** GOODBYE!");
477                 Process.killProcess(Process.myPid());
478                 System.exit(10);
479             }
480 
481             waitedHalf = false;
482         }
483     }
484 
doSysRq(char c)485     private void doSysRq(char c) {
486         try {
487             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
488             sysrq_trigger.write(c);
489             sysrq_trigger.close();
490         } catch (IOException e) {
491             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
492         }
493     }
494 
dumpKernelStackTraces()495     private File dumpKernelStackTraces() {
496         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
497         if (tracesPath == null || tracesPath.length() == 0) {
498             return null;
499         }
500 
501         native_dumpKernelStacks(tracesPath);
502         return new File(tracesPath);
503     }
504 
native_dumpKernelStacks(String tracesPath)505     private native void native_dumpKernelStacks(String tracesPath);
506 }
507