1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.content.BroadcastReceiver;
21 import android.content.Context;
22 import android.content.Intent;
23 import android.content.IntentFilter;
24 import android.hidl.manager.V1_0.IServiceManager;
25 import android.os.Binder;
26 import android.os.Build;
27 import android.os.Debug;
28 import android.os.Handler;
29 import android.os.IPowerManager;
30 import android.os.Looper;
31 import android.os.Process;
32 import android.os.RemoteException;
33 import android.os.ServiceManager;
34 import android.os.SystemClock;
35 import android.system.ErrnoException;
36 import android.system.Os;
37 import android.system.OsConstants;
38 import android.system.StructRlimit;
39 import android.util.EventLog;
40 import android.util.Log;
41 import android.util.Slog;
42 import android.util.SparseArray;
43 
44 import com.android.internal.os.ProcessCpuTracker;
45 import com.android.internal.os.ZygoteConnectionConstants;
46 import com.android.internal.util.FrameworkStatsLog;
47 import com.android.server.am.ActivityManagerService;
48 import com.android.server.wm.SurfaceAnimationThread;
49 
50 import java.io.File;
51 import java.io.FileWriter;
52 import java.io.IOException;
53 import java.io.StringWriter;
54 import java.nio.charset.StandardCharsets;
55 import java.nio.file.Files;
56 import java.nio.file.Path;
57 import java.nio.file.Paths;
58 import java.util.ArrayList;
59 import java.util.Arrays;
60 import java.util.Collections;
61 import java.util.HashSet;
62 import java.util.List;
63 
/** This class periodically checks its monitors and handler threads, killing this process if they don't return in time. */
65 public class Watchdog extends Thread {
    /** Tag passed to the {@code Slog}/{@code Log} calls made by this class. */
    static final String TAG = "Watchdog";

    /** Debug flag. */
    public static final boolean DEBUG = false;

    // Set this to true to use debug default values.
    private static final boolean DB = false;

    // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
    //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
    //         can trigger the watchdog.
    // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
    //         applications may not work with a debug build. CTS will fail.
    // Milliseconds: 10 seconds when DB is set, 60 seconds otherwise.
    private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000;
    // How long the watchdog loop waits between evaluations of its checkers: half the timeout.
    private static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    // COMPLETED:   the checker has no check outstanding.
    // WAITING:     a check is outstanding for less than half of the checker's wait limit.
    // WAITED_HALF: a check is outstanding for at least half, but less than all, of the limit.
    // OVERDUE:     a check has been outstanding for the whole limit or longer.
    private static final int COMPLETED = 0;
    private static final int WAITING = 1;
    private static final int WAITED_HALF = 2;
    private static final int OVERDUE = 3;
87 
    // Which native processes to dump into dropbox's stack traces.
    // getInterestingNativePids() passes this array to Process.getPidsForCommands() to resolve
    // the entries to pids.
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/netd",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "/system/bin/vold",
        "media.extractor", // system/bin/mediaextractor
        "media.metrics", // system/bin/mediametrics
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
        "com.android.bluetooth",  // Bluetooth service
        "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
    };

    // HAL interface names whose hosting processes are of interest.
    // getInterestingHalPids() keeps the pid of every service-manager debug entry whose
    // interfaceName appears in this list.
    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
            "android.hardware.audio@2.0::IDevicesFactory",
            "android.hardware.audio@4.0::IDevicesFactory",
            "android.hardware.audio@5.0::IDevicesFactory",
            "android.hardware.audio@6.0::IDevicesFactory",
            "android.hardware.biometrics.face@1.0::IBiometricsFace",
            "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
            "android.hardware.bluetooth@1.0::IBluetoothHci",
            "android.hardware.camera.provider@2.4::ICameraProvider",
            "android.hardware.gnss@1.0::IGnss",
            "android.hardware.graphics.allocator@2.0::IAllocator",
            "android.hardware.graphics.composer@2.1::IComposer",
            "android.hardware.health@2.0::IHealth",
            "android.hardware.light@2.0::ILight",
            "android.hardware.media.c2@1.0::IComponentStore",
            "android.hardware.media.omx@1.0::IOmx",
            "android.hardware.media.omx@1.0::IOmxStore",
            "android.hardware.neuralnetworks@1.0::IDevice",
            "android.hardware.power.stats@1.0::IPowerStats",
            "android.hardware.sensors@1.0::ISensors",
            "android.hardware.sensors@2.0::ISensors",
            "android.hardware.sensors@2.1::ISensors",
            "android.hardware.vr@1.0::IVr",
            "android.system.suspend@1.0::ISystemSuspend"
    );
132 
    // Singleton instance, created lazily by getInstance().
    private static Watchdog sWatchdog;

    /* One checker per watched handler thread; populated by the constructor and addThread(). */
    private final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    // Checker of the foreground thread; it is also the one that runs the registered monitors.
    private final HandlerChecker mMonitorChecker;
    // Set by init(); remains null until then.
    private ActivityManagerService mActivity;

    // Optional controller that run() asks whether to keep waiting when a hang is detected.
    private IActivityController mController;
    // When false, run() logs the hang but does not kill the process.
    private boolean mAllowRestart = true;
    // Result of OpenFdMonitor.create(); run() checks it for null before use.
    private final OpenFdMonitor mOpenFdMonitor;
    // Pids of Java processes whose stacks get dumped; starts with this process's own pid and is
    // updated by processStarted()/processDied() under synchronized (this).
    private final List<Integer> mInterestingJavaPids = new ArrayList<>();
144 
145     /**
146      * Used for checking status of handle threads and scheduling monitor callbacks.
147      */
148     public final class HandlerChecker implements Runnable {
149         private final Handler mHandler;
150         private final String mName;
151         private final long mWaitMax;
152         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
153         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
154         private boolean mCompleted;
155         private Monitor mCurrentMonitor;
156         private long mStartTime;
157         private int mPauseCount;
158 
HandlerChecker(Handler handler, String name, long waitMaxMillis)159         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
160             mHandler = handler;
161             mName = name;
162             mWaitMax = waitMaxMillis;
163             mCompleted = true;
164         }
165 
addMonitorLocked(Monitor monitor)166         void addMonitorLocked(Monitor monitor) {
167             // We don't want to update mMonitors when the Handler is in the middle of checking
168             // all monitors. We will update mMonitors on the next schedule if it is safe
169             mMonitorQueue.add(monitor);
170         }
171 
scheduleCheckLocked()172         public void scheduleCheckLocked() {
173             if (mCompleted) {
174                 // Safe to update monitors in queue, Handler is not in the middle of work
175                 mMonitors.addAll(mMonitorQueue);
176                 mMonitorQueue.clear();
177             }
178             if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
179                     || (mPauseCount > 0)) {
180                 // Don't schedule until after resume OR
181                 // If the target looper has recently been polling, then
182                 // there is no reason to enqueue our checker on it since that
183                 // is as good as it not being deadlocked.  This avoid having
184                 // to do a context switch to check the thread. Note that we
185                 // only do this if we have no monitors since those would need to
186                 // be executed at this point.
187                 mCompleted = true;
188                 return;
189             }
190             if (!mCompleted) {
191                 // we already have a check in flight, so no need
192                 return;
193             }
194 
195             mCompleted = false;
196             mCurrentMonitor = null;
197             mStartTime = SystemClock.uptimeMillis();
198             mHandler.postAtFrontOfQueue(this);
199         }
200 
isOverdueLocked()201         boolean isOverdueLocked() {
202             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
203         }
204 
getCompletionStateLocked()205         public int getCompletionStateLocked() {
206             if (mCompleted) {
207                 return COMPLETED;
208             } else {
209                 long latency = SystemClock.uptimeMillis() - mStartTime;
210                 if (latency < mWaitMax/2) {
211                     return WAITING;
212                 } else if (latency < mWaitMax) {
213                     return WAITED_HALF;
214                 }
215             }
216             return OVERDUE;
217         }
218 
getThread()219         public Thread getThread() {
220             return mHandler.getLooper().getThread();
221         }
222 
getName()223         public String getName() {
224             return mName;
225         }
226 
describeBlockedStateLocked()227         String describeBlockedStateLocked() {
228             if (mCurrentMonitor == null) {
229                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
230             } else {
231                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
232                         + " on " + mName + " (" + getThread().getName() + ")";
233             }
234         }
235 
236         @Override
run()237         public void run() {
238             // Once we get here, we ensure that mMonitors does not change even if we call
239             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
240             // move them to mMonitors on the next schedule when mCompleted is true, at which
241             // point we have completed execution of this method.
242             final int size = mMonitors.size();
243             for (int i = 0 ; i < size ; i++) {
244                 synchronized (Watchdog.this) {
245                     mCurrentMonitor = mMonitors.get(i);
246                 }
247                 mCurrentMonitor.monitor();
248             }
249 
250             synchronized (Watchdog.this) {
251                 mCompleted = true;
252                 mCurrentMonitor = null;
253             }
254         }
255 
256         /** Pause the HandlerChecker. */
pauseLocked(String reason)257         public void pauseLocked(String reason) {
258             mPauseCount++;
259             // Mark as completed, because there's a chance we called this after the watchog
260             // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
261             // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
262             mCompleted = true;
263             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
264                     + reason + ". Pause count: " + mPauseCount);
265         }
266 
267         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)268         public void resumeLocked(String reason) {
269             if (mPauseCount > 0) {
270                 mPauseCount--;
271                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
272                         + reason + ". Pause count: " + mPauseCount);
273             } else {
274                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
275             }
276         }
277     }
278 
279     final class RebootRequestReceiver extends BroadcastReceiver {
280         @Override
onReceive(Context c, Intent intent)281         public void onReceive(Context c, Intent intent) {
282             if (intent.getIntExtra("nowait", 0) != 0) {
283                 rebootSystem("Received ACTION_REBOOT broadcast");
284                 return;
285             }
286             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
287         }
288     }
289 
    /**
     * Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs to make sure other processes
     * can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            // Returns only once a binder thread is free; until then the check stays outstanding.
            Binder.blockUntilThreadAvailable();
        }
    }
300 
    /**
     * Callback registered through {@link #addMonitor}. {@link HandlerChecker#run} invokes
     * every registered monitor in turn; a monitor that does not return keeps that check
     * outstanding.
     */
    public interface Monitor {
        /** Called during a watchdog check; expected to return when the monitored state is fine. */
        void monitor();
    }
304 
getInstance()305     public static Watchdog getInstance() {
306         if (sWatchdog == null) {
307             sWatchdog = new Watchdog();
308         }
309 
310         return sWatchdog;
311     }
312 
Watchdog()313     private Watchdog() {
314         super("watchdog");
315         // Initialize handler checkers for each common thread we want to check.  Note
316         // that we are not currently checking the background thread, since it can
317         // potentially hold longer running operations with no guarantees about the timeliness
318         // of operations there.
319 
320         // The shared foreground thread is the main checker.  It is where we
321         // will also dispatch monitor checks and do other work.
322         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
323                 "foreground thread", DEFAULT_TIMEOUT);
324         mHandlerCheckers.add(mMonitorChecker);
325         // Add checker for main thread.  We only do a quick check since there
326         // can be UI running on the thread.
327         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
328                 "main thread", DEFAULT_TIMEOUT));
329         // Add checker for shared UI thread.
330         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
331                 "ui thread", DEFAULT_TIMEOUT));
332         // And also check IO thread.
333         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
334                 "i/o thread", DEFAULT_TIMEOUT));
335         // And the display thread.
336         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
337                 "display thread", DEFAULT_TIMEOUT));
338         // And the animation thread.
339         mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
340                 "animation thread", DEFAULT_TIMEOUT));
341         // And the surface animation thread.
342         mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
343                 "surface animation thread", DEFAULT_TIMEOUT));
344 
345         // Initialize monitor for Binder threads.
346         addMonitor(new BinderThreadMonitor());
347 
348         mOpenFdMonitor = OpenFdMonitor.create();
349 
350         mInterestingJavaPids.add(Process.myPid());
351 
352         // See the notes on DEFAULT_TIMEOUT.
353         assert DB ||
354                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
355     }
356 
357     /**
358      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
359      * Should be called during boot after the ActivityManagerService is up and registered
360      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
361      */
init(Context context, ActivityManagerService activity)362     public void init(Context context, ActivityManagerService activity) {
363         mActivity = activity;
364         context.registerReceiver(new RebootRequestReceiver(),
365                 new IntentFilter(Intent.ACTION_REBOOT),
366                 android.Manifest.permission.REBOOT, null);
367     }
368 
isInterestingJavaProcess(String processName)369     private static boolean isInterestingJavaProcess(String processName) {
370         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
371                 || processName.equals("com.android.phone");
372     }
373 
374     /**
375      * Notifies the watchdog when a Java process with {@code pid} is started.
376      * This process may have its stack trace dumped during an ANR.
377      */
processStarted(String processName, int pid)378     public void processStarted(String processName, int pid) {
379         if (isInterestingJavaProcess(processName)) {
380             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
381             synchronized (this) {
382                 mInterestingJavaPids.add(pid);
383             }
384         }
385     }
386 
387     /**
388      * Notifies the watchdog when a Java process with {@code pid} dies.
389      */
processDied(String processName, int pid)390     public void processDied(String processName, int pid) {
391         if (isInterestingJavaProcess(processName)) {
392             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
393             synchronized (this) {
394                 mInterestingJavaPids.remove(Integer.valueOf(pid));
395             }
396         }
397     }
398 
    /**
     * Sets the controller that {@link #run} notifies, through
     * {@code systemNotResponding}, when it detects a hang. A null controller disables that
     * notification.
     */
    public void setActivityController(IActivityController controller) {
        synchronized (this) {
            mController = controller;
        }
    }
404 
    /**
     * Sets whether {@link #run} is allowed to kill the process after detecting a hang. When
     * false, the hang is still reported but the process is left running.
     */
    public void setAllowRestart(boolean allowRestart) {
        synchronized (this) {
            mAllowRestart = allowRestart;
        }
    }
410 
    /**
     * Registers {@code monitor} with the foreground-thread checker. The monitor is queued and
     * becomes active the next time that checker is scheduled while no check is outstanding.
     */
    public void addMonitor(Monitor monitor) {
        synchronized (this) {
            mMonitorChecker.addMonitorLocked(monitor);
        }
    }
416 
    /** Starts watching the thread behind {@code thread}, using the default timeout. */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }
420 
addThread(Handler thread, long timeoutMillis)421     public void addThread(Handler thread, long timeoutMillis) {
422         synchronized (this) {
423             final String name = thread.getLooper().getThread().getName();
424             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
425         }
426     }
427 
428     /**
429      * Pauses Watchdog action for the currently running thread. Useful before executing long running
430      * operations that could falsely trigger the watchdog. Each call to this will require a matching
431      * call to {@link #resumeWatchingCurrentThread}.
432      *
433      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
434      *
435      * <p>If the Watchdog is already paused for the current thread, this call adds
436      * adds another pause and will require an additional {@link #resumeCurrentThread} to resume.
437      *
438      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
439      * pauses have been resumed.
440      */
pauseWatchingCurrentThread(String reason)441     public void pauseWatchingCurrentThread(String reason) {
442         synchronized (this) {
443             for (HandlerChecker hc : mHandlerCheckers) {
444                 if (Thread.currentThread().equals(hc.getThread())) {
445                     hc.pauseLocked(reason);
446                 }
447             }
448         }
449     }
450 
451     /**
452      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
453      * thread.
454      *
455      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
456      *
457      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
458      *
459      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
460      * the Watchdog action for the current thread remains paused until resume is called at least
461      * as many times as the calls to pause.
462      */
resumeWatchingCurrentThread(String reason)463     public void resumeWatchingCurrentThread(String reason) {
464         synchronized (this) {
465             for (HandlerChecker hc : mHandlerCheckers) {
466                 if (Thread.currentThread().equals(hc.getThread())) {
467                     hc.resumeLocked(reason);
468                 }
469             }
470         }
471     }
472 
473     /**
474      * Perform a full reboot of the system.
475      */
rebootSystem(String reason)476     void rebootSystem(String reason) {
477         Slog.i(TAG, "Rebooting system because: " + reason);
478         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
479         try {
480             pms.reboot(false, reason, false);
481         } catch (RemoteException ex) {
482         }
483     }
484 
evaluateCheckerCompletionLocked()485     private int evaluateCheckerCompletionLocked() {
486         int state = COMPLETED;
487         for (int i=0; i<mHandlerCheckers.size(); i++) {
488             HandlerChecker hc = mHandlerCheckers.get(i);
489             state = Math.max(state, hc.getCompletionStateLocked());
490         }
491         return state;
492     }
493 
getBlockedCheckersLocked()494     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
495         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
496         for (int i=0; i<mHandlerCheckers.size(); i++) {
497             HandlerChecker hc = mHandlerCheckers.get(i);
498             if (hc.isOverdueLocked()) {
499                 checkers.add(hc);
500             }
501         }
502         return checkers;
503     }
504 
describeCheckersLocked(List<HandlerChecker> checkers)505     private String describeCheckersLocked(List<HandlerChecker> checkers) {
506         StringBuilder builder = new StringBuilder(128);
507         for (int i=0; i<checkers.size(); i++) {
508             if (builder.length() > 0) {
509                 builder.append(", ");
510             }
511             builder.append(checkers.get(i).describeBlockedStateLocked());
512         }
513         return builder.toString();
514     }
515 
getInterestingHalPids()516     private static ArrayList<Integer> getInterestingHalPids() {
517         try {
518             IServiceManager serviceManager = IServiceManager.getService();
519             ArrayList<IServiceManager.InstanceDebugInfo> dump =
520                     serviceManager.debugDump();
521             HashSet<Integer> pids = new HashSet<>();
522             for (IServiceManager.InstanceDebugInfo info : dump) {
523                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
524                     continue;
525                 }
526 
527                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
528                     continue;
529                 }
530 
531                 pids.add(info.pid);
532             }
533             return new ArrayList<Integer>(pids);
534         } catch (RemoteException e) {
535             return new ArrayList<Integer>();
536         }
537     }
538 
getInterestingNativePids()539     static ArrayList<Integer> getInterestingNativePids() {
540         ArrayList<Integer> pids = getInterestingHalPids();
541 
542         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
543         if (nativePids != null) {
544             pids.ensureCapacity(pids.size() + nativePids.length);
545             for (int i : nativePids) {
546                 pids.add(i);
547             }
548         }
549 
550         return pids;
551     }
552 
553     @Override
run()554     public void run() {
555         boolean waitedHalf = false;
556         while (true) {
557             final List<HandlerChecker> blockedCheckers;
558             final String subject;
559             final boolean allowRestart;
560             int debuggerWasConnected = 0;
561             synchronized (this) {
562                 long timeout = CHECK_INTERVAL;
563                 // Make sure we (re)spin the checkers that have become idle within
564                 // this wait-and-check interval
565                 for (int i=0; i<mHandlerCheckers.size(); i++) {
566                     HandlerChecker hc = mHandlerCheckers.get(i);
567                     hc.scheduleCheckLocked();
568                 }
569 
570                 if (debuggerWasConnected > 0) {
571                     debuggerWasConnected--;
572                 }
573 
574                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
575                 // wait while asleep. If the device is asleep then the thing that we are waiting
576                 // to timeout on is asleep as well and won't have a chance to run, causing a false
577                 // positive on when to kill things.
578                 long start = SystemClock.uptimeMillis();
579                 while (timeout > 0) {
580                     if (Debug.isDebuggerConnected()) {
581                         debuggerWasConnected = 2;
582                     }
583                     try {
584                         wait(timeout);
585                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
586                     } catch (InterruptedException e) {
587                         Log.wtf(TAG, e);
588                     }
589                     if (Debug.isDebuggerConnected()) {
590                         debuggerWasConnected = 2;
591                     }
592                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
593                 }
594 
595                 boolean fdLimitTriggered = false;
596                 if (mOpenFdMonitor != null) {
597                     fdLimitTriggered = mOpenFdMonitor.monitor();
598                 }
599 
600                 if (!fdLimitTriggered) {
601                     final int waitState = evaluateCheckerCompletionLocked();
602                     if (waitState == COMPLETED) {
603                         // The monitors have returned; reset
604                         waitedHalf = false;
605                         continue;
606                     } else if (waitState == WAITING) {
607                         // still waiting but within their configured intervals; back off and recheck
608                         continue;
609                     } else if (waitState == WAITED_HALF) {
610                         if (!waitedHalf) {
611                             Slog.i(TAG, "WAITED_HALF");
612                             // We've waited half the deadlock-detection interval.  Pull a stack
613                             // trace and wait another half.
614                             ArrayList<Integer> pids = new ArrayList<>(mInterestingJavaPids);
615                             ActivityManagerService.dumpStackTraces(pids, null, null,
616                                     getInterestingNativePids(), null);
617                             waitedHalf = true;
618                         }
619                         continue;
620                     }
621 
622                     // something is overdue!
623                     blockedCheckers = getBlockedCheckersLocked();
624                     subject = describeCheckersLocked(blockedCheckers);
625                 } else {
626                     blockedCheckers = Collections.emptyList();
627                     subject = "Open FD high water mark reached";
628                 }
629                 allowRestart = mAllowRestart;
630             }
631 
632             // If we got here, that means that the system is most likely hung.
633             // First collect stack traces from all threads of the system process.
634             // Then kill this process so that the system will restart.
635             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
636 
637             ArrayList<Integer> pids = new ArrayList<>(mInterestingJavaPids);
638 
639             long anrTime = SystemClock.uptimeMillis();
640             StringBuilder report = new StringBuilder();
641             report.append(MemoryPressureUtil.currentPsiState());
642             ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
643             StringWriter tracesFileException = new StringWriter();
644             final File stack = ActivityManagerService.dumpStackTraces(
645                     pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
646                     tracesFileException);
647 
648             // Give some extra time to make sure the stack traces get written.
649             // The system's been hanging for a minute, another second or two won't hurt much.
650             SystemClock.sleep(5000);
651 
652             processCpuTracker.update();
653             report.append(processCpuTracker.printCurrentState(anrTime));
654             report.append(tracesFileException.getBuffer());
655 
656             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
657             doSysRq('w');
658             doSysRq('l');
659 
660             // Try to add the error to the dropbox, but assuming that the ActivityManager
661             // itself may be deadlocked.  (which has happened, causing this statement to
662             // deadlock and the watchdog as a whole to be ineffective)
663             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
664                     public void run() {
665                         // If a watched thread hangs before init() is called, we don't have a
666                         // valid mActivity. So we can't log the error to dropbox.
667                         if (mActivity != null) {
668                             mActivity.addErrorToDropBox(
669                                     "watchdog", null, "system_server", null, null, null,
670                                     subject, report.toString(), stack, null);
671                         }
672                         FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED,
673                                 subject);
674                     }
675                 };
676             dropboxThread.start();
677             try {
678                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
679             } catch (InterruptedException ignored) {}
680 
681             IActivityController controller;
682             synchronized (this) {
683                 controller = mController;
684             }
685             if (controller != null) {
686                 Slog.i(TAG, "Reporting stuck state to activity controller");
687                 try {
688                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
689                     // 1 = keep waiting, -1 = kill system
690                     int res = controller.systemNotResponding(subject);
691                     if (res >= 0) {
692                         Slog.i(TAG, "Activity controller requested to coninue to wait");
693                         waitedHalf = false;
694                         continue;
695                     }
696                 } catch (RemoteException e) {
697                 }
698             }
699 
700             // Only kill the process if the debugger is not attached.
701             if (Debug.isDebuggerConnected()) {
702                 debuggerWasConnected = 2;
703             }
704             if (debuggerWasConnected >= 2) {
705                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
706             } else if (debuggerWasConnected > 0) {
707                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
708             } else if (!allowRestart) {
709                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
710             } else {
711                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
712                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
713                 Slog.w(TAG, "*** GOODBYE!");
714                 Process.killProcess(Process.myPid());
715                 System.exit(10);
716             }
717 
718             waitedHalf = false;
719         }
720     }
721 
doSysRq(char c)722     private void doSysRq(char c) {
723         try {
724             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
725             sysrq_trigger.write(c);
726             sysrq_trigger.close();
727         } catch (IOException e) {
728             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
729         }
730     }
731 
732     public static final class OpenFdMonitor {
733         /**
734          * Number of FDs below the soft limit that we trigger a runtime restart at. This was
735          * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number
736          * of FDs in reserve to complete a dump.
737          */
738         private static final int FD_HIGH_WATER_MARK = 12;
739 
740         private final File mDumpDir;
741         private final File mFdHighWaterMark;
742 
create()743         public static OpenFdMonitor create() {
744             // Only run the FD monitor on debuggable builds (such as userdebug and eng builds).
745             if (!Build.IS_DEBUGGABLE) {
746                 return null;
747             }
748 
749             final StructRlimit rlimit;
750             try {
751                 rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE);
752             } catch (ErrnoException errno) {
753                 Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno);
754                 return null;
755             }
756 
757             // The assumption we're making here is that FD numbers are allocated (more or less)
758             // sequentially, which is currently (and historically) true since open is currently
759             // specified to always return the lowest-numbered non-open file descriptor for the
760             // current process.
761             //
762             // We do this to avoid having to enumerate the contents of /proc/self/fd in order to
763             // count the number of descriptors open in the process.
764             final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK));
765             return new OpenFdMonitor(new File("/data/anr"), fdThreshold);
766         }
767 
OpenFdMonitor(File dumpDir, File fdThreshold)768         OpenFdMonitor(File dumpDir, File fdThreshold) {
769             mDumpDir = dumpDir;
770             mFdHighWaterMark = fdThreshold;
771         }
772 
773         /**
774          * Dumps open file descriptors and their full paths to a temporary file in {@code mDumpDir}.
775          */
dumpOpenDescriptors()776         private void dumpOpenDescriptors() {
777             // We cannot exec lsof to get more info about open file descriptors because a newly
778             // forked process will not have the permissions to readlink. Instead list all open
779             // descriptors from /proc/pid/fd and resolve them.
780             List<String> dumpInfo = new ArrayList<>();
781             String fdDirPath = String.format("/proc/%d/fd/", Process.myPid());
782             File[] fds = new File(fdDirPath).listFiles();
783             if (fds == null) {
784                 dumpInfo.add("Unable to list " + fdDirPath);
785             } else {
786                 for (File f : fds) {
787                     String fdSymLink = f.getAbsolutePath();
788                     String resolvedPath = "";
789                     try {
790                         resolvedPath = Os.readlink(fdSymLink);
791                     } catch (ErrnoException ex) {
792                         resolvedPath = ex.getMessage();
793                     }
794                     dumpInfo.add(fdSymLink + "\t" + resolvedPath);
795                 }
796             }
797 
798             // Dump the fds & paths to a temp file.
799             try {
800                 File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir);
801                 Path out = Paths.get(dumpFile.getAbsolutePath());
802                 Files.write(out, dumpInfo, StandardCharsets.UTF_8);
803             } catch (IOException ex) {
804                 Slog.w(TAG, "Unable to write open descriptors to file: " + ex);
805             }
806         }
807 
808         /**
809          * @return {@code true} if the high water mark was breached and a dump was written,
810          *     {@code false} otherwise.
811          */
monitor()812         public boolean monitor() {
813             if (mFdHighWaterMark.exists()) {
814                 dumpOpenDescriptors();
815                 return true;
816             }
817 
818             return false;
819         }
820     }
821 }
822