1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout;
20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout;
21 
22 import android.annotation.NonNull;
23 import android.annotation.Nullable;
24 import android.app.IActivityController;
25 import android.content.BroadcastReceiver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.database.ContentObserver;
30 import android.hidl.manager.V1_0.IServiceManager;
31 import android.net.Uri;
32 import android.os.Binder;
33 import android.os.Build;
34 import android.os.Debug;
35 import android.os.FileUtils;
36 import android.os.Handler;
37 import android.os.IPowerManager;
38 import android.os.Looper;
39 import android.os.Process;
40 import android.os.RemoteException;
41 import android.os.ServiceDebugInfo;
42 import android.os.ServiceManager;
43 import android.os.SystemClock;
44 import android.os.SystemProperties;
45 import android.os.UserHandle;
46 import android.provider.Settings;
47 import android.sysprop.WatchdogProperties;
48 import android.util.Dumpable;
49 import android.util.EventLog;
50 import android.util.Log;
51 import android.util.Slog;
52 import android.util.SparseBooleanArray;
53 
54 import com.android.internal.os.BackgroundThread;
55 import com.android.internal.os.ProcessCpuTracker;
56 import com.android.internal.os.ZygoteConnectionConstants;
57 import com.android.internal.util.FrameworkStatsLog;
58 import com.android.server.am.ActivityManagerService;
59 import com.android.server.am.StackTracesDumpHelper;
60 import com.android.server.am.TraceErrorLogger;
61 import com.android.server.criticalevents.CriticalEventLog;
62 import com.android.server.wm.SurfaceAnimationThread;
63 
64 import java.io.BufferedReader;
65 import java.io.File;
66 import java.io.FileNotFoundException;
67 import java.io.FileReader;
68 import java.io.FileWriter;
69 import java.io.IOException;
70 import java.io.PrintWriter;
71 import java.io.StringWriter;
72 import java.time.Clock;
73 import java.util.ArrayList;
74 import java.util.Arrays;
75 import java.util.Collections;
76 import java.util.HashSet;
77 import java.util.List;
78 import java.util.Optional;
79 import java.util.UUID;
80 import java.util.concurrent.CompletableFuture;
81 import java.util.concurrent.TimeUnit;
82 
83 /**
84  * This class calls its monitor every minute. Killing this process if they don't return
85  **/
86 public class Watchdog implements Dumpable {
87     static final String TAG = "Watchdog";
88 
89     /** Debug flag. */
90     public static final boolean DEBUG = false;
91 
92     // Set this to true to use debug default values.
93     private static final boolean DB = false;
94 
95     // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
96     //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
97     //         can trigger the watchdog.
98     // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
99     //         applications may not work with a debug build. CTS will fail.
100     private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000;
101 
102     // This ratio is used to compute the pre-watchdog timeout (2 means that the pre-watchdog timeout
103     // will be half the full timeout).
104     //
105     // The pre-watchdog event is similar to a full watchdog except it does not crash system server.
106     private static final int PRE_WATCHDOG_TIMEOUT_RATIO = 4;
107 
108     // These are temporally ordered: larger values as lateness increases
109     static final int COMPLETED = 0;
110     static final int WAITING = 1;
111     static final int WAITED_UNTIL_PRE_WATCHDOG = 2;
112     static final int OVERDUE = 3;
113 
114     // Track watchdog timeout history and break the crash loop if there is.
115     private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
116     private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
117     private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
118             "framework_watchdog.fatal_window.second";
119 
120     // Which native processes to dump into dropbox's stack traces
121     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
122         "/system/bin/audioserver",
123         "/system/bin/cameraserver",
124         "/system/bin/drmserver",
125         "/system/bin/keystore2",
126         "/system/bin/mediadrmserver",
127         "/system/bin/mediaserver",
128         "/system/bin/netd",
129         "/system/bin/sdcard",
130         "/system/bin/servicemanager",
131         "/system/bin/surfaceflinger",
132         "/system/bin/vold",
133         "media.extractor", // system/bin/mediaextractor
134         "media.metrics", // system/bin/mediametrics
135         "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
136         "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
137         "media.transcoding", // Media transcoding service
138         "com.android.bluetooth",  // Bluetooth service
139         "/apex/com.android.art/bin/artd",  // ART daemon
140         "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
141     };
142 
143     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
144             "android.hardware.audio@4.0::IDevicesFactory",
145             "android.hardware.audio@5.0::IDevicesFactory",
146             "android.hardware.audio@6.0::IDevicesFactory",
147             "android.hardware.audio@7.0::IDevicesFactory",
148             "android.hardware.biometrics.face@1.0::IBiometricsFace",
149             "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
150             "android.hardware.bluetooth@1.0::IBluetoothHci",
151             "android.hardware.camera.provider@2.4::ICameraProvider",
152             "android.hardware.gnss@1.0::IGnss",
153             "android.hardware.graphics.allocator@2.0::IAllocator",
154             "android.hardware.graphics.allocator@4.0::IAllocator",
155             "android.hardware.graphics.composer@2.1::IComposer",
156             "android.hardware.health@2.0::IHealth",
157             "android.hardware.light@2.0::ILight",
158             "android.hardware.media.c2@1.0::IComponentStore",
159             "android.hardware.media.omx@1.0::IOmx",
160             "android.hardware.media.omx@1.0::IOmxStore",
161             "android.hardware.neuralnetworks@1.0::IDevice",
162             "android.hardware.power@1.0::IPower",
163             "android.hardware.power.stats@1.0::IPowerStats",
164             "android.hardware.sensors@1.0::ISensors",
165             "android.hardware.sensors@2.0::ISensors",
166             "android.hardware.sensors@2.1::ISensors",
167             "android.hardware.vibrator@1.0::IVibrator",
168             "android.hardware.vr@1.0::IVr",
169             "android.system.suspend@1.0::ISystemSuspend"
170     );
171 
172     public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
173             "android.hardware.audio.core.IModule/",
174             "android.hardware.audio.core.IConfig/",
175             "android.hardware.audio.effect.IFactory/",
176             "android.hardware.biometrics.face.IFace/",
177             "android.hardware.biometrics.fingerprint.IFingerprint/",
178             "android.hardware.bluetooth.IBluetoothHci/",
179             "android.hardware.camera.provider.ICameraProvider/",
180             "android.hardware.drm.IDrmFactory/",
181             "android.hardware.gnss.IGnss/",
182             "android.hardware.graphics.allocator.IAllocator/",
183             "android.hardware.graphics.composer3.IComposer/",
184             "android.hardware.health.IHealth/",
185             "android.hardware.input.processor.IInputProcessor/",
186             "android.hardware.light.ILights/",
187             "android.hardware.neuralnetworks.IDevice/",
188             "android.hardware.power.IPower/",
189             "android.hardware.power.stats.IPowerStats/",
190             "android.hardware.sensors.ISensors/",
191             "android.hardware.vibrator.IVibrator/",
192             "android.hardware.vibrator.IVibratorManager/",
193             "android.system.suspend.ISystemSuspend/",
194     };
195 
196     private static Watchdog sWatchdog;
197 
198     private final Thread mThread;
199 
200     private final Object mLock = new Object();
201 
202     /* This handler will be used to post message back onto the main thread */
203     private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>();
204     private final HandlerChecker mMonitorChecker;
205     private ActivityManagerService mActivity;
206     private IActivityController mController;
207     private boolean mAllowRestart = true;
208     // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings
209     // once the settings provider is initialized.
210     private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT;
211     private final List<Integer> mInterestingJavaPids = new ArrayList<>();
212     private final TraceErrorLogger mTraceErrorLogger;
213 
214     /** Holds a checker and its timeout. */
215     static final class HandlerCheckerAndTimeout {
216         private final HandlerChecker mHandler;
217         private final Optional<Long> mCustomTimeoutMillis;
218 
HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)219         private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) {
220             this.mHandler = checker;
221             this.mCustomTimeoutMillis = timeoutMillis;
222         }
223 
checker()224         HandlerChecker checker() {
225             return mHandler;
226         }
227 
228         /** Returns the timeout. */
customTimeoutMillis()229         Optional<Long> customTimeoutMillis() {
230             return mCustomTimeoutMillis;
231         }
232 
233         /**
234          * Creates a checker with the default timeout. The timeout will use the default value which
235          * is configurable server-side.
236          */
withDefaultTimeout(HandlerChecker checker)237         static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) {
238             return new HandlerCheckerAndTimeout(checker, Optional.empty());
239         }
240 
241         /**
242          * Creates a checker with a custom timeout. The timeout overrides the default value and will
243          * always be used.
244          */
withCustomTimeout( HandlerChecker checker, long timeoutMillis)245         static HandlerCheckerAndTimeout withCustomTimeout(
246                 HandlerChecker checker, long timeoutMillis) {
247             return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis));
248         }
249     }
250 
251     /** Used for checking status of handle threads and scheduling monitor callbacks. */
252     public static class HandlerChecker implements Runnable {
253         private final Handler mHandler;
254         private final String mName;
255         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
256         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
257         private long mWaitMaxMillis;
258         private boolean mCompleted;
259         private Monitor mCurrentMonitor;
260         private long mStartTimeMillis;
261         private int mPauseCount;
262         private long mPauseEndTimeMillis;
263         private Clock mClock;
264         private Object mLock;
265 
HandlerChecker(Handler handler, String name, Object lock, Clock clock)266         HandlerChecker(Handler handler, String name, Object lock, Clock clock) {
267             mHandler = handler;
268             mName = name;
269             mLock = lock;
270             mCompleted = true;
271             mClock = clock;
272         }
273 
HandlerChecker(Handler handler, String name, Object lock)274         HandlerChecker(Handler handler, String name, Object lock) {
275             this(handler, name, lock, SystemClock.uptimeClock());
276         }
277 
addMonitorLocked(Monitor monitor)278         void addMonitorLocked(Monitor monitor) {
279             // We don't want to update mMonitors when the Handler is in the middle of checking
280             // all monitors. We will update mMonitors on the next schedule if it is safe
281             mMonitorQueue.add(monitor);
282         }
283 
284         /**
285          * Schedules a run on the handler thread.
286          *
287          * @param handlerCheckerTimeoutMillis the timeout to use for this run
288          */
scheduleCheckLocked(long handlerCheckerTimeoutMillis)289         public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
290             mWaitMaxMillis = handlerCheckerTimeoutMillis;
291 
292             if (mCompleted) {
293                 // Safe to update monitors in queue, Handler is not in the middle of work
294                 mMonitors.addAll(mMonitorQueue);
295                 mMonitorQueue.clear();
296             }
297 
298             long nowMillis = mClock.millis();
299             boolean isPaused = mPauseCount > 0 || mPauseEndTimeMillis > nowMillis;
300             if ((mMonitors.size() == 0 && isHandlerPolling()) || isPaused) {
301                 // Don't schedule until after resume OR
302                 // If the target looper has recently been polling, then
303                 // there is no reason to enqueue our checker on it since that
304                 // is as good as it not being deadlocked.  This avoid having
305                 // to do a context switch to check the thread. Note that we
306                 // only do this if we have no monitors since those would need to
307                 // be executed at this point.
308                 mCompleted = true;
309                 return;
310             }
311             if (!mCompleted) {
312                 // we already have a check in flight, so no need
313                 return;
314             }
315 
316             mCompleted = false;
317             mCurrentMonitor = null;
318             mStartTimeMillis = nowMillis;
319             mPauseEndTimeMillis = 0;
320             mHandler.postAtFrontOfQueue(this);
321         }
322 
isHandlerPolling()323         boolean isHandlerPolling() {
324             return mHandler.getLooper().getQueue().isPolling();
325         }
326 
getCompletionStateLocked()327         public int getCompletionStateLocked() {
328             if (mCompleted) {
329                 return COMPLETED;
330             } else {
331                 long latency = mClock.millis() - mStartTimeMillis;
332                 if (latency < mWaitMaxMillis / PRE_WATCHDOG_TIMEOUT_RATIO) {
333                     return WAITING;
334                 } else if (latency < mWaitMaxMillis) {
335                     return WAITED_UNTIL_PRE_WATCHDOG;
336                 }
337             }
338             return OVERDUE;
339         }
340 
getThread()341         public Thread getThread() {
342             return mHandler.getLooper().getThread();
343         }
344 
getName()345         public String getName() {
346             return mName;
347         }
348 
describeBlockedStateLocked()349         String describeBlockedStateLocked() {
350             final String prefix;
351             if (mCurrentMonitor == null) {
352                 prefix = "Blocked in handler";
353             } else {
354                 prefix =  "Blocked in monitor " + mCurrentMonitor.getClass().getName();
355             }
356             long latencySeconds = (mClock.millis() - mStartTimeMillis) / 1000;
357             return prefix + " on " + mName + " (" + getThread().getName() + ")"
358                 + " for " + latencySeconds + "s";
359         }
360 
361         @Override
run()362         public void run() {
363             // Once we get here, we ensure that mMonitors does not change even if we call
364             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
365             // move them to mMonitors on the next schedule when mCompleted is true, at which
366             // point we have completed execution of this method.
367             final int size = mMonitors.size();
368             for (int i = 0 ; i < size ; i++) {
369                 synchronized (mLock) {
370                     mCurrentMonitor = mMonitors.get(i);
371                 }
372                 mCurrentMonitor.monitor();
373             }
374 
375             synchronized (mLock) {
376                 mCompleted = true;
377                 mCurrentMonitor = null;
378             }
379         }
380 
381         /**
382          * Pauses the checks for the given time.
383          *
384          * <p>The current run will be ignored and another run will be scheduled after
385          * the given time.
386          */
pauseForLocked(int pauseMillis, String reason)387         public void pauseForLocked(int pauseMillis, String reason) {
388             mPauseEndTimeMillis = mClock.millis() + pauseMillis;
389             // Mark as completed, because there's a chance we called this after the watchog
390             // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we
391             // want to ensure the next call to #getCompletionStateLocked for this checker returns
392             // 'COMPLETED'
393             mCompleted = true;
394             Slog.i(TAG, "Pausing of HandlerChecker: " + mName + " for reason: "
395                     + reason + ". Pause end time: " + mPauseEndTimeMillis);
396         }
397 
398         /** Pause the HandlerChecker. */
pauseLocked(String reason)399         public void pauseLocked(String reason) {
400             mPauseCount++;
401             // Mark as completed, because there's a chance we called this after the watchog
402             // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we
403             // want to ensure the next call to #getCompletionStateLocked for this checker returns
404             // 'COMPLETED'
405             mCompleted = true;
406             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
407                     + reason + ". Pause count: " + mPauseCount);
408         }
409 
410         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)411         public void resumeLocked(String reason) {
412             if (mPauseCount > 0) {
413                 mPauseCount--;
414                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
415                         + reason + ". Pause count: " + mPauseCount);
416             } else {
417                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
418             }
419         }
420 
421         @Override
toString()422         public String toString() {
423             return "CheckerHandler for " + mName;
424         }
425     }
426 
427     final class RebootRequestReceiver extends BroadcastReceiver {
428         @Override
onReceive(Context c, Intent intent)429         public void onReceive(Context c, Intent intent) {
430             if (intent.getIntExtra("nowait", 0) != 0) {
431                 rebootSystem("Received ACTION_REBOOT broadcast");
432                 return;
433             }
434             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
435         }
436     }
437 
438     /** Monitor for checking the availability of binder threads. The monitor will block until
439      * there is a binder thread available to process in coming IPCs to make sure other processes
440      * can still communicate with the service.
441      */
442     private static final class BinderThreadMonitor implements Watchdog.Monitor {
443         @Override
monitor()444         public void monitor() {
445             Binder.blockUntilThreadAvailable();
446         }
447     }
448 
449     public interface Monitor {
monitor()450         void monitor();
451     }
452 
getInstance()453     public static Watchdog getInstance() {
454         if (sWatchdog == null) {
455             sWatchdog = new Watchdog();
456         }
457 
458         return sWatchdog;
459     }
460 
Watchdog()461     private Watchdog() {
462         mThread = new Thread(this::run, "watchdog");
463 
464         // Initialize handler checkers for each common thread we want to check.  Note
465         // that we are not currently checking the background thread, since it can
466         // potentially hold longer running operations with no guarantees about the timeliness
467         // of operations there.
468         //
469         // Use a custom thread to check monitors to avoid lock contention from impacted other
470         // threads.
471         ServiceThread t = new ServiceThread("watchdog.monitor",
472                 android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
473         t.start();
474         mMonitorChecker = new HandlerChecker(new Handler(t.getLooper()), "monitor thread", mLock);
475         mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));
476 
477         mHandlerCheckers.add(
478                 withDefaultTimeout(
479                         new HandlerChecker(FgThread.getHandler(), "foreground thread", mLock)));
480         // Add checker for main thread.  We only do a quick check since there
481         // can be UI running on the thread.
482         mHandlerCheckers.add(
483                 withDefaultTimeout(
484                         new HandlerChecker(
485                                 new Handler(Looper.getMainLooper()), "main thread", mLock)));
486         // Add checker for shared UI thread.
487         mHandlerCheckers.add(
488                 withDefaultTimeout(new HandlerChecker(UiThread.getHandler(), "ui thread", mLock)));
489         // And also check IO thread.
490         mHandlerCheckers.add(
491                 withDefaultTimeout(new HandlerChecker(IoThread.getHandler(), "i/o thread", mLock)));
492         // And the display thread.
493         mHandlerCheckers.add(
494                 withDefaultTimeout(
495                         new HandlerChecker(DisplayThread.getHandler(), "display thread", mLock)));
496         // And the animation thread.
497         mHandlerCheckers.add(
498                 withDefaultTimeout(
499                         new HandlerChecker(
500                                 AnimationThread.getHandler(), "animation thread", mLock)));
501         // And the surface animation thread.
502         mHandlerCheckers.add(
503                 withDefaultTimeout(
504                         new HandlerChecker(
505                                 SurfaceAnimationThread.getHandler(),
506                                 "surface animation thread",
507                                 mLock)));
508         // Initialize monitor for Binder threads.
509         addMonitor(new BinderThreadMonitor());
510 
511         mInterestingJavaPids.add(Process.myPid());
512 
513         // See the notes on DEFAULT_TIMEOUT.
514         assert DB || Build.IS_USERDEBUG ||
515                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
516 
517         mTraceErrorLogger = new TraceErrorLogger();
518     }
519 
520     /**
521      * Called by SystemServer to cause the internal thread to begin execution.
522      */
start()523     public void start() {
524         mThread.start();
525     }
526 
527     /**
528      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
529      * Should be called during boot after the ActivityManagerService is up and registered
530      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
531      */
init(Context context, ActivityManagerService activity)532     public void init(Context context, ActivityManagerService activity) {
533         mActivity = activity;
534         context.registerReceiver(new RebootRequestReceiver(),
535                 new IntentFilter(Intent.ACTION_REBOOT),
536                 android.Manifest.permission.REBOOT, null);
537     }
538 
539     private static class SettingsObserver extends ContentObserver {
540         private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS);
541         private final Context mContext;
542         private final Watchdog mWatchdog;
543 
SettingsObserver(Context context, Watchdog watchdog)544         SettingsObserver(Context context, Watchdog watchdog) {
545             super(BackgroundThread.getHandler());
546             mContext = context;
547             mWatchdog = watchdog;
548             // Always kick once to ensure that we match current state
549             onChange();
550         }
551 
552         @Override
onChange(boolean selfChange, Uri uri, int userId)553         public void onChange(boolean selfChange, Uri uri, int userId) {
554             if (mUri.equals(uri)) {
555                 onChange();
556             }
557         }
558 
onChange()559         public void onChange() {
560             try {
561                 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong(
562                         mContext.getContentResolver(),
563                         Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT));
564             } catch (RuntimeException e) {
565                 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e);
566             }
567         }
568     }
569 
570     /**
571      * Register an observer to listen to settings.
572      *
573      * It needs to be called after the settings service is initialized.
574      */
registerSettingsObserver(Context context)575     public void registerSettingsObserver(Context context) {
576         context.getContentResolver().registerContentObserver(
577                 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS),
578                 false,
579                 new SettingsObserver(context, this),
580                 UserHandle.USER_SYSTEM);
581     }
582 
583     /**
584      * Updates watchdog timeout values.
585      */
updateWatchdogTimeout(long timeoutMillis)586     void updateWatchdogTimeout(long timeoutMillis) {
587         // See the notes on DEFAULT_TIMEOUT.
588         if (!DB && !Build.IS_USERDEBUG
589                 && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) {
590             timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1;
591         }
592         mWatchdogTimeoutMillis = timeoutMillis;
593         Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis");
594     }
595 
isInterestingJavaProcess(String processName)596     private static boolean isInterestingJavaProcess(String processName) {
597         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
598                 || processName.equals("com.android.phone");
599     }
600 
601     /**
602      * Notifies the watchdog when a Java process with {@code pid} is started.
603      * This process may have its stack trace dumped during an ANR.
604      */
processStarted(String processName, int pid)605     public void processStarted(String processName, int pid) {
606         if (isInterestingJavaProcess(processName)) {
607             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
608             synchronized (mLock) {
609                 mInterestingJavaPids.add(pid);
610             }
611         }
612     }
613 
614     /**
615      * Notifies the watchdog when a Java process with {@code pid} dies.
616      */
processDied(String processName, int pid)617     public void processDied(String processName, int pid) {
618         if (isInterestingJavaProcess(processName)) {
619             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
620             synchronized (mLock) {
621                 mInterestingJavaPids.remove(Integer.valueOf(pid));
622             }
623         }
624     }
625 
setActivityController(IActivityController controller)626     public void setActivityController(IActivityController controller) {
627         synchronized (mLock) {
628             mController = controller;
629         }
630     }
631 
setAllowRestart(boolean allowRestart)632     public void setAllowRestart(boolean allowRestart) {
633         synchronized (mLock) {
634             mAllowRestart = allowRestart;
635         }
636     }
637 
addMonitor(Monitor monitor)638     public void addMonitor(Monitor monitor) {
639         synchronized (mLock) {
640             mMonitorChecker.addMonitorLocked(monitor);
641         }
642     }
643 
addThread(Handler thread)644     public void addThread(Handler thread) {
645         synchronized (mLock) {
646             final String name = thread.getLooper().getThread().getName();
647             mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name, mLock)));
648         }
649     }
650 
addThread(Handler thread, long timeoutMillis)651     public void addThread(Handler thread, long timeoutMillis) {
652         synchronized (mLock) {
653             final String name = thread.getLooper().getThread().getName();
654             mHandlerCheckers.add(
655                     withCustomTimeout(new HandlerChecker(thread, name, mLock), timeoutMillis));
656         }
657     }
658 
659      /**
660      * Pauses the checks of the watchdog for this thread. This is useful
661      * to run a slow operation on one of the monitored thread.
662      *
663      * <p>After the given time, the timeout will go back to the default value.
664      * <p>This method does not require resume to be called.
665      */
pauseWatchingCurrentThreadFor(int pauseMillis, String reason)666     public void pauseWatchingCurrentThreadFor(int pauseMillis, String reason) {
667         synchronized (mLock) {
668             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
669                 HandlerChecker checker = hc.checker();
670                 if (Thread.currentThread().equals(checker.getThread())) {
671                     checker.pauseForLocked(pauseMillis, reason);
672                 }
673             }
674         }
675     }
676 
677     /**
678      * Pauses the checks of the watchdog for the monitor thread for the given time
679      *
680      * <p>Similar to {@link pauseWatchingCurrentThreadFor} but used for monitors added
681      * through {@link #addMonitor}
682      * <p>This method does not require resume to be called.
683      */
pauseWatchingMonitorsFor(int pauseMillis, String reason)684     public void pauseWatchingMonitorsFor(int pauseMillis, String reason) {
685         mMonitorChecker.pauseForLocked(pauseMillis, reason);
686     }
687 
688     /**
689      * Pauses Watchdog action for the currently running thread. Useful before executing long running
690      * operations that could falsely trigger the watchdog. Each call to this will require a matching
691      * call to {@link #resumeWatchingCurrentThread}.
692      *
693      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
694      *
695      * <p>If the Watchdog is already paused for the current thread, this call adds
696      * adds another pause and will require an additional {@link #resumeCurrentThread} to resume.
697      *
698      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
699      * pauses have been resumed. Prefer to use #pauseWatchingCurrentThreadFor.
700      */
pauseWatchingCurrentThread(String reason)701     public void pauseWatchingCurrentThread(String reason) {
702         synchronized (mLock) {
703             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
704                 HandlerChecker checker = hc.checker();
705                 if (Thread.currentThread().equals(checker.getThread())) {
706                     checker.pauseLocked(reason);
707                 }
708             }
709         }
710     }
711 
712     /**
713      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
714      * thread.
715      *
716      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
717      *
718      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
719      *
720      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
721      * the Watchdog action for the current thread remains paused until resume is called at least
722      * as many times as the calls to pause.
723      */
resumeWatchingCurrentThread(String reason)724     public void resumeWatchingCurrentThread(String reason) {
725         synchronized (mLock) {
726             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
727                 HandlerChecker checker = hc.checker();
728                 if (Thread.currentThread().equals(checker.getThread())) {
729                     checker.resumeLocked(reason);
730                 }
731             }
732         }
733     }
734 
735     /**
736      * Perform a full reboot of the system.
737      */
rebootSystem(String reason)738     void rebootSystem(String reason) {
739         Slog.i(TAG, "Rebooting system because: " + reason);
740         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
741         try {
742             pms.reboot(false, reason, false);
743         } catch (RemoteException ex) {
744         }
745     }
746 
evaluateCheckerCompletionLocked()747     private int evaluateCheckerCompletionLocked() {
748         int state = COMPLETED;
749         for (int i=0; i<mHandlerCheckers.size(); i++) {
750             HandlerChecker hc = mHandlerCheckers.get(i).checker();
751             state = Math.max(state, hc.getCompletionStateLocked());
752         }
753         return state;
754     }
755 
getCheckersWithStateLocked(int completionState)756     private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) {
757         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
758         for (int i=0; i<mHandlerCheckers.size(); i++) {
759             HandlerChecker hc = mHandlerCheckers.get(i).checker();
760             if (hc.getCompletionStateLocked() == completionState) {
761                 checkers.add(hc);
762             }
763         }
764         return checkers;
765     }
766 
describeCheckersLocked(List<HandlerChecker> checkers)767     private String describeCheckersLocked(List<HandlerChecker> checkers) {
768         StringBuilder builder = new StringBuilder(128);
769         for (int i=0; i<checkers.size(); i++) {
770             if (builder.length() > 0) {
771                 builder.append(", ");
772             }
773             builder.append(checkers.get(i).describeBlockedStateLocked());
774         }
775         return builder.toString();
776     }
777 
addInterestingHidlPids(HashSet<Integer> pids)778     private static void addInterestingHidlPids(HashSet<Integer> pids) {
779         try {
780             IServiceManager serviceManager = IServiceManager.getService();
781             ArrayList<IServiceManager.InstanceDebugInfo> dump =
782                     serviceManager.debugDump();
783             for (IServiceManager.InstanceDebugInfo info : dump) {
784                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
785                     continue;
786                 }
787 
788                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
789                     continue;
790                 }
791 
792                 pids.add(info.pid);
793             }
794         } catch (RemoteException e) {
795             Log.w(TAG, e);
796         }
797     }
798 
addInterestingAidlPids(HashSet<Integer> pids)799     private static void addInterestingAidlPids(HashSet<Integer> pids) {
800         ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
801         if (infos == null) return;
802 
803         for (ServiceDebugInfo info : infos) {
804             for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
805                 if (info.name.startsWith(prefix)) {
806                     pids.add(info.debugPid);
807                 }
808             }
809         }
810     }
811 
getInterestingNativePids()812     static ArrayList<Integer> getInterestingNativePids() {
813         HashSet<Integer> pids = new HashSet<>();
814         addInterestingAidlPids(pids);
815         addInterestingHidlPids(pids);
816 
817         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
818         if (nativePids != null) {
819             for (int i : nativePids) {
820                 pids.add(i);
821             }
822         }
823 
824         return new ArrayList<Integer>(pids);
825     }
826 
run()827     private void run() {
828         boolean waitedHalf = false;
829 
830         while (true) {
831             List<HandlerChecker> blockedCheckers = Collections.emptyList();
832             String subject = "";
833             boolean allowRestart = true;
834             int debuggerWasConnected = 0;
835             boolean doWaitedPreDump = false;
836             // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
837             // We store the current value to use a consistent value for all handlers.
838             final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
839             final long checkIntervalMillis = watchdogTimeoutMillis / PRE_WATCHDOG_TIMEOUT_RATIO;
840             final ArrayList<Integer> pids;
841             synchronized (mLock) {
842                 long timeout = checkIntervalMillis;
843                 // Make sure we (re)spin the checkers that have become idle within
844                 // this wait-and-check interval
845                 for (int i=0; i<mHandlerCheckers.size(); i++) {
846                     HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
847                     // We pick the watchdog to apply every time we reschedule the checkers. The
848                     // default timeout might have changed since the last run.
849                     hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
850                             .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
851                 }
852 
853                 if (debuggerWasConnected > 0) {
854                     debuggerWasConnected--;
855                 }
856 
857                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
858                 // wait while asleep. If the device is asleep then the thing that we are waiting
859                 // to timeout on is asleep as well and won't have a chance to run, causing a false
860                 // positive on when to kill things.
861                 long start = SystemClock.uptimeMillis();
862                 while (timeout > 0) {
863                     if (Debug.isDebuggerConnected()) {
864                         debuggerWasConnected = 2;
865                     }
866                     try {
867                         mLock.wait(timeout);
868                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
869                     } catch (InterruptedException e) {
870                         Log.wtf(TAG, e);
871                     }
872                     if (Debug.isDebuggerConnected()) {
873                         debuggerWasConnected = 2;
874                     }
875                     timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
876                 }
877 
878                 final int waitState = evaluateCheckerCompletionLocked();
879                 if (waitState == COMPLETED) {
880                     // The monitors have returned; reset
881                     waitedHalf = false;
882                     continue;
883                 } else if (waitState == WAITING) {
884                     // still waiting but within their configured intervals; back off and recheck
885                     continue;
886                 } else if (waitState == WAITED_UNTIL_PRE_WATCHDOG) {
887                     if (!waitedHalf) {
888                         Slog.i(TAG, "WAITED_UNTIL_PRE_WATCHDOG");
889                         waitedHalf = true;
890                         // We've waited until the pre-watchdog, but we'd need to do the stack trace
891                         // dump w/o the lock.
892                         blockedCheckers = getCheckersWithStateLocked(WAITED_UNTIL_PRE_WATCHDOG);
893                         subject = describeCheckersLocked(blockedCheckers);
894                         pids = new ArrayList<>(mInterestingJavaPids);
895                         doWaitedPreDump = true;
896                     } else {
897                         continue;
898                     }
899                 } else {
900                     // something is overdue!
901                     blockedCheckers = getCheckersWithStateLocked(OVERDUE);
902                     subject = describeCheckersLocked(blockedCheckers);
903                     allowRestart = mAllowRestart;
904                     pids = new ArrayList<>(mInterestingJavaPids);
905                 }
906             } // END synchronized (mLock)
907 
908             // If we got here, that means that the system is most likely hung.
909             //
910             // First collect stack traces from all threads of the system process.
911             //
912             // Then, if we reached the full timeout, kill this process so that the system will
913             // restart. If we reached pre-watchdog timeout, just log some information and continue.
914             logWatchog(doWaitedPreDump, subject, pids);
915 
916             if (doWaitedPreDump) {
917                 // We have waited for only pre-watchdog timeout, we continue to wait for the
918                 // duration of the full timeout before killing the process.
919                 continue;
920             }
921 
922             IActivityController controller;
923             synchronized (mLock) {
924                 controller = mController;
925             }
926             if (controller != null) {
927                 Slog.i(TAG, "Reporting stuck state to activity controller");
928                 try {
929                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
930                     // 1 = keep waiting, -1 = kill system
931                     int res = controller.systemNotResponding(subject);
932                     if (res >= 0) {
933                         Slog.i(TAG, "Activity controller requested to coninue to wait");
934                         waitedHalf = false;
935                         continue;
936                     }
937                 } catch (RemoteException e) {
938                 }
939             }
940 
941             // Only kill the process if the debugger is not attached.
942             if (Debug.isDebuggerConnected()) {
943                 debuggerWasConnected = 2;
944             }
945             if (debuggerWasConnected >= 2) {
946                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
947             } else if (debuggerWasConnected > 0) {
948                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
949             } else if (!allowRestart) {
950                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
951             } else {
952                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
953                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
954                 Slog.w(TAG, "*** GOODBYE!");
955                 if (!Build.IS_USER && isCrashLoopFound()
956                         && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
957                     breakCrashLoop();
958                 }
959                 Process.killProcess(Process.myPid());
960                 System.exit(10);
961             }
962 
963             waitedHalf = false;
964         }
965     }
966 
logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids)967     private void logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids) {
968         // Get critical event log before logging the pre-watchdog so that it doesn't
969         // occur in the log.
970         String criticalEvents =
971                 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile();
972         final UUID errorId = mTraceErrorLogger.generateErrorId();
973         if (mTraceErrorLogger.isAddErrorIdEnabled()) {
974             mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(),
975                     errorId);
976             mTraceErrorLogger.addSubjectToTrace(subject, errorId);
977         }
978 
979         final String dropboxTag;
980         if (preWatchdog) {
981             dropboxTag = "pre_watchdog";
982             CriticalEventLog.getInstance().logHalfWatchdog(subject);
983             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED);
984         } else {
985             dropboxTag = "watchdog";
986             CriticalEventLog.getInstance().logWatchdog(subject, errorId);
987             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
988             // Log the atom as early as possible since it is used as a mechanism to trigger
989             // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
990             // point in time when the Watchdog happens as possible.
991             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
992         }
993 
994         long anrTime = SystemClock.uptimeMillis();
995         StringBuilder report = new StringBuilder();
996         report.append(ResourcePressureUtil.currentPsiState());
997         ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
998         StringWriter tracesFileException = new StringWriter();
999         final File stack = StackTracesDumpHelper.dumpStackTraces(
1000                 pids, processCpuTracker, new SparseBooleanArray(),
1001                 CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException,
1002                 subject, criticalEvents, Runnable::run, /* latencyTracker= */null);
1003         // Give some extra time to make sure the stack traces get written.
1004         // The system's been hanging for a whlie, another second or two won't hurt much.
1005         SystemClock.sleep(5000);
1006         processCpuTracker.update();
1007         report.append(processCpuTracker.printCurrentState(anrTime, 10));
1008         report.append(tracesFileException.getBuffer());
1009 
1010         if (!preWatchdog) {
1011             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the
1012             // kernel log
1013             doSysRq('w');
1014             doSysRq('l');
1015         }
1016 
1017         // Try to add the error to the dropbox, but assuming that the ActivityManager
1018         // itself may be deadlocked.  (which has happened, causing this statement to
1019         // deadlock and the watchdog as a whole to be ineffective)
1020         Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
1021                 public void run() {
1022                     // If a watched thread hangs before init() is called, we don't have a
1023                     // valid mActivity. So we can't log the error to dropbox.
1024                     if (mActivity != null) {
1025                         mActivity.addErrorToDropBox(
1026                                 dropboxTag, null, "system_server", null, null, null,
1027                                 null, report.toString(), stack, null, null, null,
1028                                 errorId, null);
1029                     }
1030                 }
1031             };
1032         dropboxThread.start();
1033         try {
1034             dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
1035         } catch (InterruptedException ignored) { }
1036     }
1037 
doSysRq(char c)1038     private void doSysRq(char c) {
1039         try {
1040             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
1041             sysrq_trigger.write(c);
1042             sysrq_trigger.close();
1043         } catch (IOException e) {
1044             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
1045         }
1046     }
1047 
resetTimeoutHistory()1048     private void resetTimeoutHistory() {
1049         writeTimeoutHistory(new ArrayList<String>());
1050     }
1051 
writeTimeoutHistory(Iterable<String> crashHistory)1052     private void writeTimeoutHistory(Iterable<String> crashHistory) {
1053         String data = String.join(",", crashHistory);
1054 
1055         try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
1056             writer.write(SystemProperties.get("ro.boottime.zygote"));
1057             writer.write(":");
1058             writer.write(data);
1059         } catch (IOException e) {
1060             Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
1061         }
1062     }
1063 
readTimeoutHistory()1064     private String[] readTimeoutHistory() {
1065         final String[] emptyStringArray = {};
1066 
1067         try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
1068             String line = reader.readLine();
1069             if (line == null) {
1070                 return emptyStringArray;
1071             }
1072 
1073             String[] data = line.trim().split(":");
1074             String boottime = data.length >= 1 ? data[0] : "";
1075             String history = data.length >= 2 ? data[1] : "";
1076             if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
1077                 return history.split(",");
1078             } else {
1079                 return emptyStringArray;
1080             }
1081         } catch (FileNotFoundException e) {
1082             return emptyStringArray;
1083         } catch (IOException e) {
1084             Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
1085             return emptyStringArray;
1086         }
1087     }
1088 
hasActiveUsbConnection()1089     private boolean hasActiveUsbConnection() {
1090         try {
1091             final String state = FileUtils.readTextFile(
1092                     new File("/sys/class/android_usb/android0/state"),
1093                     128 /*max*/, null /*ellipsis*/).trim();
1094             if ("CONFIGURED".equals(state)) {
1095                 return true;
1096             }
1097         } catch (IOException e) {
1098             Slog.w(TAG, "Failed to determine if device was on USB", e);
1099         }
1100         return false;
1101     }
1102 
isCrashLoopFound()1103     private boolean isCrashLoopFound() {
1104         int fatalCount = WatchdogProperties.fatal_count().orElse(0);
1105         long fatalWindowMs = TimeUnit.SECONDS.toMillis(
1106                 WatchdogProperties.fatal_window_seconds().orElse(0));
1107         if (fatalCount == 0 || fatalWindowMs == 0) {
1108             if (fatalCount != fatalWindowMs) {
1109                 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
1110                             PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
1111             }
1112             return false;
1113         }
1114 
1115         // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
1116         long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
1117         String[] rawCrashHistory = readTimeoutHistory();
1118         ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
1119                         rawCrashHistory,
1120                         Math.max(0, rawCrashHistory.length - fatalCount - 1),
1121                         rawCrashHistory.length)));
1122         // Something wrong here.
1123         crashHistory.add(String.valueOf(nowMs));
1124         writeTimeoutHistory(crashHistory);
1125 
1126         // Returns false if the device has an active USB connection.
1127         if (hasActiveUsbConnection()) {
1128             return false;
1129         }
1130 
1131         long firstCrashMs;
1132         try {
1133             firstCrashMs = Long.parseLong(crashHistory.get(0));
1134         } catch (NumberFormatException t) {
1135             Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
1136             resetTimeoutHistory();
1137             return false;
1138         }
1139         return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
1140     }
1141 
breakCrashLoop()1142     private void breakCrashLoop() {
1143         try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
1144             kmsg.append("Fatal reset to escape the system_server crashing loop\n");
1145         } catch (IOException e) {
1146             Slog.w(TAG, "Failed to append to kmsg", e);
1147         }
1148         doSysRq('c');
1149     }
1150 
1151     @Override
dump(@onNull PrintWriter pw, @Nullable String[] args)1152     public void dump(@NonNull PrintWriter pw, @Nullable String[] args) {
1153         pw.print("WatchdogTimeoutMillis=");
1154         pw.println(mWatchdogTimeoutMillis);
1155     }
1156 }
1157