1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.os.Binder;
21 import android.os.RemoteException;
22 import com.android.server.am.ActivityManagerService;
23 
24 import android.content.BroadcastReceiver;
25 import android.content.ContentResolver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.hidl.manager.V1_0.IServiceManager;
30 import android.os.Debug;
31 import android.os.Handler;
32 import android.os.IPowerManager;
33 import android.os.Looper;
34 import android.os.Process;
35 import android.os.ServiceManager;
36 import android.os.SystemClock;
37 import android.os.SystemProperties;
38 import android.util.EventLog;
39 import android.util.Log;
40 import android.util.Slog;
41 
42 import java.io.File;
43 import java.io.FileWriter;
44 import java.io.IOException;
45 import java.util.ArrayList;
46 import java.util.Arrays;
47 import java.util.HashSet;
48 import java.util.List;
49 
/**
 * This class checks its monitors and handler threads at a regular interval, killing this
 * process if they do not return in time.
 */
51 public class Watchdog extends Thread {
52     static final String TAG = "Watchdog";
53 
54     // Set this to true to use debug default values.
55     static final boolean DB = false;
56 
57     // Set this to true to have the watchdog record kernel thread stacks when it fires
58     static final boolean RECORD_KERNEL_THREADS = true;
59 
60     static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
61     static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
62 
63     // These are temporally ordered: larger values as lateness increases
64     static final int COMPLETED = 0;
65     static final int WAITING = 1;
66     static final int WAITED_HALF = 2;
67     static final int OVERDUE = 3;
68 
69     // Which native processes to dump into dropbox's stack traces
70     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
71         "/system/bin/audioserver",
72         "/system/bin/cameraserver",
73         "/system/bin/drmserver",
74         "/system/bin/mediadrmserver",
75         "/system/bin/mediaserver",
76         "/system/bin/sdcard",
77         "/system/bin/surfaceflinger",
78         "media.extractor", // system/bin/mediaextractor
79         "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
80         "com.android.bluetooth",  // Bluetooth service
81     };
82 
83     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
84         "android.hardware.audio@2.0::IDevicesFactory",
85         "android.hardware.bluetooth@1.0::IBluetoothHci",
86         "android.hardware.camera.provider@2.4::ICameraProvider",
87         "android.hardware.graphics.composer@2.1::IComposer",
88         "android.hardware.vr@1.0::IVr",
89         "android.hardware.media.omx@1.0::IOmx"
90     );
91 
92     static Watchdog sWatchdog;
93 
94     /* This handler will be used to post message back onto the main thread */
95     final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
96     final HandlerChecker mMonitorChecker;
97     ContentResolver mResolver;
98     ActivityManagerService mActivity;
99 
100     int mPhonePid;
101     IActivityController mController;
102     boolean mAllowRestart = true;
103 
104     /**
105      * Used for checking status of handle threads and scheduling monitor callbacks.
106      */
107     public final class HandlerChecker implements Runnable {
108         private final Handler mHandler;
109         private final String mName;
110         private final long mWaitMax;
111         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
112         private boolean mCompleted;
113         private Monitor mCurrentMonitor;
114         private long mStartTime;
115 
HandlerChecker(Handler handler, String name, long waitMaxMillis)116         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
117             mHandler = handler;
118             mName = name;
119             mWaitMax = waitMaxMillis;
120             mCompleted = true;
121         }
122 
addMonitor(Monitor monitor)123         public void addMonitor(Monitor monitor) {
124             mMonitors.add(monitor);
125         }
126 
scheduleCheckLocked()127         public void scheduleCheckLocked() {
128             if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
129                 // If the target looper has recently been polling, then
130                 // there is no reason to enqueue our checker on it since that
131                 // is as good as it not being deadlocked.  This avoid having
132                 // to do a context switch to check the thread.  Note that we
133                 // only do this if mCheckReboot is false and we have no
134                 // monitors, since those would need to be executed at this point.
135                 mCompleted = true;
136                 return;
137             }
138 
139             if (!mCompleted) {
140                 // we already have a check in flight, so no need
141                 return;
142             }
143 
144             mCompleted = false;
145             mCurrentMonitor = null;
146             mStartTime = SystemClock.uptimeMillis();
147             mHandler.postAtFrontOfQueue(this);
148         }
149 
isOverdueLocked()150         public boolean isOverdueLocked() {
151             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
152         }
153 
getCompletionStateLocked()154         public int getCompletionStateLocked() {
155             if (mCompleted) {
156                 return COMPLETED;
157             } else {
158                 long latency = SystemClock.uptimeMillis() - mStartTime;
159                 if (latency < mWaitMax/2) {
160                     return WAITING;
161                 } else if (latency < mWaitMax) {
162                     return WAITED_HALF;
163                 }
164             }
165             return OVERDUE;
166         }
167 
getThread()168         public Thread getThread() {
169             return mHandler.getLooper().getThread();
170         }
171 
getName()172         public String getName() {
173             return mName;
174         }
175 
describeBlockedStateLocked()176         public String describeBlockedStateLocked() {
177             if (mCurrentMonitor == null) {
178                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
179             } else {
180                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
181                         + " on " + mName + " (" + getThread().getName() + ")";
182             }
183         }
184 
185         @Override
run()186         public void run() {
187             final int size = mMonitors.size();
188             for (int i = 0 ; i < size ; i++) {
189                 synchronized (Watchdog.this) {
190                     mCurrentMonitor = mMonitors.get(i);
191                 }
192                 mCurrentMonitor.monitor();
193             }
194 
195             synchronized (Watchdog.this) {
196                 mCompleted = true;
197                 mCurrentMonitor = null;
198             }
199         }
200     }
201 
202     final class RebootRequestReceiver extends BroadcastReceiver {
203         @Override
onReceive(Context c, Intent intent)204         public void onReceive(Context c, Intent intent) {
205             if (intent.getIntExtra("nowait", 0) != 0) {
206                 rebootSystem("Received ACTION_REBOOT broadcast");
207                 return;
208             }
209             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
210         }
211     }
212 
213     /** Monitor for checking the availability of binder threads. The monitor will block until
214      * there is a binder thread available to process in coming IPCs to make sure other processes
215      * can still communicate with the service.
216      */
217     private static final class BinderThreadMonitor implements Watchdog.Monitor {
218         @Override
monitor()219         public void monitor() {
220             Binder.blockUntilThreadAvailable();
221         }
222     }
223 
/**
 * Callback registered through {@code addMonitor()}. The watchdog invokes it from the
 * foreground-thread checker; if the call does not return before the checker's timeout
 * expires, the checker is reported as blocked in this monitor.
 */
public interface Monitor {
    /** Performs the liveness check; a call that never returns signals a hang. */
    void monitor();
}
227 
getInstance()228     public static Watchdog getInstance() {
229         if (sWatchdog == null) {
230             sWatchdog = new Watchdog();
231         }
232 
233         return sWatchdog;
234     }
235 
Watchdog()236     private Watchdog() {
237         super("watchdog");
238         // Initialize handler checkers for each common thread we want to check.  Note
239         // that we are not currently checking the background thread, since it can
240         // potentially hold longer running operations with no guarantees about the timeliness
241         // of operations there.
242 
243         // The shared foreground thread is the main checker.  It is where we
244         // will also dispatch monitor checks and do other work.
245         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
246                 "foreground thread", DEFAULT_TIMEOUT);
247         mHandlerCheckers.add(mMonitorChecker);
248         // Add checker for main thread.  We only do a quick check since there
249         // can be UI running on the thread.
250         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
251                 "main thread", DEFAULT_TIMEOUT));
252         // Add checker for shared UI thread.
253         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
254                 "ui thread", DEFAULT_TIMEOUT));
255         // And also check IO thread.
256         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
257                 "i/o thread", DEFAULT_TIMEOUT));
258         // And the display thread.
259         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
260                 "display thread", DEFAULT_TIMEOUT));
261 
262         // Initialize monitor for Binder threads.
263         addMonitor(new BinderThreadMonitor());
264     }
265 
init(Context context, ActivityManagerService activity)266     public void init(Context context, ActivityManagerService activity) {
267         mResolver = context.getContentResolver();
268         mActivity = activity;
269 
270         context.registerReceiver(new RebootRequestReceiver(),
271                 new IntentFilter(Intent.ACTION_REBOOT),
272                 android.Manifest.permission.REBOOT, null);
273     }
274 
processStarted(String name, int pid)275     public void processStarted(String name, int pid) {
276         synchronized (this) {
277             if ("com.android.phone".equals(name)) {
278                 mPhonePid = pid;
279             }
280         }
281     }
282 
setActivityController(IActivityController controller)283     public void setActivityController(IActivityController controller) {
284         synchronized (this) {
285             mController = controller;
286         }
287     }
288 
setAllowRestart(boolean allowRestart)289     public void setAllowRestart(boolean allowRestart) {
290         synchronized (this) {
291             mAllowRestart = allowRestart;
292         }
293     }
294 
addMonitor(Monitor monitor)295     public void addMonitor(Monitor monitor) {
296         synchronized (this) {
297             if (isAlive()) {
298                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
299             }
300             mMonitorChecker.addMonitor(monitor);
301         }
302     }
303 
addThread(Handler thread)304     public void addThread(Handler thread) {
305         addThread(thread, DEFAULT_TIMEOUT);
306     }
307 
addThread(Handler thread, long timeoutMillis)308     public void addThread(Handler thread, long timeoutMillis) {
309         synchronized (this) {
310             if (isAlive()) {
311                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
312             }
313             final String name = thread.getLooper().getThread().getName();
314             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
315         }
316     }
317 
318     /**
319      * Perform a full reboot of the system.
320      */
rebootSystem(String reason)321     void rebootSystem(String reason) {
322         Slog.i(TAG, "Rebooting system because: " + reason);
323         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
324         try {
325             pms.reboot(false, reason, false);
326         } catch (RemoteException ex) {
327         }
328     }
329 
evaluateCheckerCompletionLocked()330     private int evaluateCheckerCompletionLocked() {
331         int state = COMPLETED;
332         for (int i=0; i<mHandlerCheckers.size(); i++) {
333             HandlerChecker hc = mHandlerCheckers.get(i);
334             state = Math.max(state, hc.getCompletionStateLocked());
335         }
336         return state;
337     }
338 
getBlockedCheckersLocked()339     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
340         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
341         for (int i=0; i<mHandlerCheckers.size(); i++) {
342             HandlerChecker hc = mHandlerCheckers.get(i);
343             if (hc.isOverdueLocked()) {
344                 checkers.add(hc);
345             }
346         }
347         return checkers;
348     }
349 
describeCheckersLocked(ArrayList<HandlerChecker> checkers)350     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
351         StringBuilder builder = new StringBuilder(128);
352         for (int i=0; i<checkers.size(); i++) {
353             if (builder.length() > 0) {
354                 builder.append(", ");
355             }
356             builder.append(checkers.get(i).describeBlockedStateLocked());
357         }
358         return builder.toString();
359     }
360 
getInterestingHalPids()361     private ArrayList<Integer> getInterestingHalPids() {
362         try {
363             IServiceManager serviceManager = IServiceManager.getService();
364             ArrayList<IServiceManager.InstanceDebugInfo> dump =
365                     serviceManager.debugDump();
366             HashSet<Integer> pids = new HashSet<>();
367             for (IServiceManager.InstanceDebugInfo info : dump) {
368                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
369                     continue;
370                 }
371 
372                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
373                     continue;
374                 }
375 
376                 pids.add(info.pid);
377             }
378             return new ArrayList<Integer>(pids);
379         } catch (RemoteException e) {
380             return new ArrayList<Integer>();
381         }
382     }
383 
getInterestingNativePids()384     private ArrayList<Integer> getInterestingNativePids() {
385         ArrayList<Integer> pids = getInterestingHalPids();
386 
387         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
388         if (nativePids != null) {
389             pids.ensureCapacity(pids.size() + nativePids.length);
390             for (int i : nativePids) {
391                 pids.add(i);
392             }
393         }
394 
395         return pids;
396     }
397 
398     @Override
run()399     public void run() {
400         boolean waitedHalf = false;
401         while (true) {
402             final ArrayList<HandlerChecker> blockedCheckers;
403             final String subject;
404             final boolean allowRestart;
405             int debuggerWasConnected = 0;
406             synchronized (this) {
407                 long timeout = CHECK_INTERVAL;
408                 // Make sure we (re)spin the checkers that have become idle within
409                 // this wait-and-check interval
410                 for (int i=0; i<mHandlerCheckers.size(); i++) {
411                     HandlerChecker hc = mHandlerCheckers.get(i);
412                     hc.scheduleCheckLocked();
413                 }
414 
415                 if (debuggerWasConnected > 0) {
416                     debuggerWasConnected--;
417                 }
418 
419                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
420                 // wait while asleep. If the device is asleep then the thing that we are waiting
421                 // to timeout on is asleep as well and won't have a chance to run, causing a false
422                 // positive on when to kill things.
423                 long start = SystemClock.uptimeMillis();
424                 while (timeout > 0) {
425                     if (Debug.isDebuggerConnected()) {
426                         debuggerWasConnected = 2;
427                     }
428                     try {
429                         wait(timeout);
430                     } catch (InterruptedException e) {
431                         Log.wtf(TAG, e);
432                     }
433                     if (Debug.isDebuggerConnected()) {
434                         debuggerWasConnected = 2;
435                     }
436                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
437                 }
438 
439                 final int waitState = evaluateCheckerCompletionLocked();
440                 if (waitState == COMPLETED) {
441                     // The monitors have returned; reset
442                     waitedHalf = false;
443                     continue;
444                 } else if (waitState == WAITING) {
445                     // still waiting but within their configured intervals; back off and recheck
446                     continue;
447                 } else if (waitState == WAITED_HALF) {
448                     if (!waitedHalf) {
449                         // We've waited half the deadlock-detection interval.  Pull a stack
450                         // trace and wait another half.
451                         ArrayList<Integer> pids = new ArrayList<Integer>();
452                         pids.add(Process.myPid());
453                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
454                             getInterestingNativePids());
455                         waitedHalf = true;
456                     }
457                     continue;
458                 }
459 
460                 // something is overdue!
461                 blockedCheckers = getBlockedCheckersLocked();
462                 subject = describeCheckersLocked(blockedCheckers);
463                 allowRestart = mAllowRestart;
464             }
465 
466             // If we got here, that means that the system is most likely hung.
467             // First collect stack traces from all threads of the system process.
468             // Then kill this process so that the system will restart.
469             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
470 
471             ArrayList<Integer> pids = new ArrayList<>();
472             pids.add(Process.myPid());
473             if (mPhonePid > 0) pids.add(mPhonePid);
474             // Pass !waitedHalf so that just in case we somehow wind up here without having
475             // dumped the halfway stacks, we properly re-initialize the trace file.
476             final File stack = ActivityManagerService.dumpStackTraces(
477                     !waitedHalf, pids, null, null, getInterestingNativePids());
478 
479             // Give some extra time to make sure the stack traces get written.
480             // The system's been hanging for a minute, another second or two won't hurt much.
481             SystemClock.sleep(2000);
482 
483             // Pull our own kernel thread stacks as well if we're configured for that
484             if (RECORD_KERNEL_THREADS) {
485                 dumpKernelStackTraces();
486             }
487 
488             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
489             doSysRq('w');
490             doSysRq('l');
491 
492             // Try to add the error to the dropbox, but assuming that the ActivityManager
493             // itself may be deadlocked.  (which has happened, causing this statement to
494             // deadlock and the watchdog as a whole to be ineffective)
495             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
496                     public void run() {
497                         mActivity.addErrorToDropBox(
498                                 "watchdog", null, "system_server", null, null,
499                                 subject, null, stack, null);
500                     }
501                 };
502             dropboxThread.start();
503             try {
504                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
505             } catch (InterruptedException ignored) {}
506 
507             IActivityController controller;
508             synchronized (this) {
509                 controller = mController;
510             }
511             if (controller != null) {
512                 Slog.i(TAG, "Reporting stuck state to activity controller");
513                 try {
514                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
515                     // 1 = keep waiting, -1 = kill system
516                     int res = controller.systemNotResponding(subject);
517                     if (res >= 0) {
518                         Slog.i(TAG, "Activity controller requested to coninue to wait");
519                         waitedHalf = false;
520                         continue;
521                     }
522                 } catch (RemoteException e) {
523                 }
524             }
525 
526             // Only kill the process if the debugger is not attached.
527             if (Debug.isDebuggerConnected()) {
528                 debuggerWasConnected = 2;
529             }
530             if (debuggerWasConnected >= 2) {
531                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
532             } else if (debuggerWasConnected > 0) {
533                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
534             } else if (!allowRestart) {
535                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
536             } else {
537                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
538                 for (int i=0; i<blockedCheckers.size(); i++) {
539                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
540                     StackTraceElement[] stackTrace
541                             = blockedCheckers.get(i).getThread().getStackTrace();
542                     for (StackTraceElement element: stackTrace) {
543                         Slog.w(TAG, "    at " + element);
544                     }
545                 }
546                 Slog.w(TAG, "*** GOODBYE!");
547                 Process.killProcess(Process.myPid());
548                 System.exit(10);
549             }
550 
551             waitedHalf = false;
552         }
553     }
554 
doSysRq(char c)555     private void doSysRq(char c) {
556         try {
557             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
558             sysrq_trigger.write(c);
559             sysrq_trigger.close();
560         } catch (IOException e) {
561             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
562         }
563     }
564 
dumpKernelStackTraces()565     private File dumpKernelStackTraces() {
566         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
567         if (tracesPath == null || tracesPath.length() == 0) {
568             return null;
569         }
570 
571         native_dumpKernelStacks(tracesPath);
572         return new File(tracesPath);
573     }
574 
native_dumpKernelStacks(String tracesPath)575     private native void native_dumpKernelStacks(String tracesPath);
576 }
577