1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static android.service.watchdog.ExplicitHealthCheckService.PackageConfig;
20 
21 import static java.lang.annotation.RetentionPolicy.SOURCE;
22 
23 import android.annotation.IntDef;
24 import android.annotation.Nullable;
25 import android.content.Context;
26 import android.content.pm.PackageManager;
27 import android.content.pm.VersionedPackage;
28 import android.net.ConnectivityModuleConnector;
29 import android.os.Environment;
30 import android.os.Handler;
31 import android.os.Looper;
32 import android.os.Process;
33 import android.os.SystemProperties;
34 import android.provider.DeviceConfig;
35 import android.text.TextUtils;
36 import android.util.ArrayMap;
37 import android.util.ArraySet;
38 import android.util.AtomicFile;
39 import android.util.LongArrayQueue;
40 import android.util.MathUtils;
41 import android.util.Slog;
42 import android.util.Xml;
43 
44 import com.android.internal.annotations.GuardedBy;
45 import com.android.internal.annotations.VisibleForTesting;
46 import com.android.internal.os.BackgroundThread;
47 import com.android.internal.util.FastXmlSerializer;
48 import com.android.internal.util.IndentingPrintWriter;
49 import com.android.internal.util.XmlUtils;
50 
51 import libcore.io.IoUtils;
52 
53 import org.xmlpull.v1.XmlPullParser;
54 import org.xmlpull.v1.XmlPullParserException;
55 import org.xmlpull.v1.XmlSerializer;
56 
57 import java.io.File;
58 import java.io.FileNotFoundException;
59 import java.io.FileOutputStream;
60 import java.io.IOException;
61 import java.io.InputStream;
62 import java.lang.annotation.Retention;
63 import java.lang.annotation.RetentionPolicy;
64 import java.nio.charset.StandardCharsets;
65 import java.util.ArrayList;
66 import java.util.Collections;
67 import java.util.Iterator;
68 import java.util.List;
69 import java.util.Map;
70 import java.util.Set;
71 import java.util.concurrent.TimeUnit;
72 
73 /**
74  * Monitors the health of packages on the system and notifies interested observers when packages
75  * fail. On failure, the registered observer with the least user impacting mitigation will
76  * be notified.
77  */
78 public class PackageWatchdog {
    // Tag used for every log statement emitted by this class.
    private static final String TAG = "PackageWatchdog";

    // Keys of the tunable watchdog properties.
    // NOTE(review): presumably read from DeviceConfig by updateConfigs() -- confirm there.
    static final String PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS =
            "watchdog_trigger_failure_duration_millis";
    static final String PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT =
            "watchdog_trigger_failure_count";
    static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED =
            "watchdog_explicit_health_check_enabled";

    // TODO: make the following values configurable via DeviceConfig
    // Delay between two consecutive polls of the native crash property.
    private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
            TimeUnit.SECONDS.toMillis(30);
    // Initial value of mNumberOfNativeCrashPollsRemaining.
    private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;


    // Failure reasons accepted by #onPackageFailure. NATIVE_CRASH and EXPLICIT_HEALTH_CHECK are
    // mitigated immediately; the remaining reasons go through per-observer failure accounting.
    public static final int FAILURE_REASON_UNKNOWN = 0;
    public static final int FAILURE_REASON_NATIVE_CRASH = 1;
    public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2;
    public static final int FAILURE_REASON_APP_CRASH = 3;
    public static final int FAILURE_REASON_APP_NOT_RESPONDING = 4;

    /** Marks an {@code int} that must be one of the {@code FAILURE_REASON_*} constants. */
    @IntDef(prefix = { "FAILURE_REASON_" }, value = {
            FAILURE_REASON_UNKNOWN,
            FAILURE_REASON_NATIVE_CRASH,
            FAILURE_REASON_EXPLICIT_HEALTH_CHECK,
            FAILURE_REASON_APP_CRASH,
            FAILURE_REASON_APP_NOT_RESPONDING
    })
    @Retention(RetentionPolicy.SOURCE)
    public @interface FailureReasons {}

    // Duration to count package failures before it resets to 0
    @VisibleForTesting
    static final int DEFAULT_TRIGGER_FAILURE_DURATION_MS =
            (int) TimeUnit.MINUTES.toMillis(1);
    // Number of package failures within the duration above before we notify observers
    @VisibleForTesting
    static final int DEFAULT_TRIGGER_FAILURE_COUNT = 5;
    // Monitoring duration used by #startObservingHealth when the caller passes a value below 1
    @VisibleForTesting
    static final long DEFAULT_OBSERVING_DURATION_MS = TimeUnit.DAYS.toMillis(2);
    // Whether explicit health checks are enabled or not
    private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true;

    // Count and window handed to the BootThreshold created in the constructor.
    @VisibleForTesting
    static final int DEFAULT_BOOT_LOOP_TRIGGER_COUNT = 5;
    static final long DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS = TimeUnit.MINUTES.toMillis(10);
    // NOTE(review): presumably the system properties backing BootThreshold -- confirm there.
    private static final String PROP_RESCUE_BOOT_COUNT = "sys.rescue_boot_count";
    private static final String PROP_RESCUE_BOOT_START = "sys.rescue_boot_start";

    // Remaining native crash polls; decremented by #checkAndMitigateNativeCrashes, which is
    // documented to run on mShortTaskHandler only.
    private long mNumberOfNativeCrashPollsRemaining;

    // NOTE(review): the names below look like the schema of the persisted policy XML
    // (see mPolicyFile); their use is outside this part of the file -- confirm.
    private static final int DB_VERSION = 1;
    private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog";
    private static final String TAG_PACKAGE = "package";
    private static final String TAG_OBSERVER = "observer";
    private static final String ATTR_VERSION = "version";
    private static final String ATTR_NAME = "name";
    private static final String ATTR_DURATION = "duration";
    private static final String ATTR_EXPLICIT_HEALTH_CHECK_DURATION = "health-check-duration";
    private static final String ATTR_PASSED_HEALTH_CHECK = "passed-health-check";
    // Singleton instance; published by the injecting constructor and returned by #getInstance.
    @GuardedBy("PackageWatchdog.class")
    private static PackageWatchdog sPackageWatchdog;

    // Guards the mutable watchdog state below (see the @GuardedBy annotations).
    private final Object mLock = new Object();
    // System server context
    private final Context mContext;
    // Handler to run short running tasks
    private final Handler mShortTaskHandler;
    // Handler for processing IO and long running tasks
    private final Handler mLongTaskHandler;
    // Contains (observer-name -> observer-handle) that have ever been registered from
    // previous boots. Observers with all packages expired are periodically pruned.
    // It is saved to disk on system shutdown and repopulated on startup so it survives reboots.
    @GuardedBy("mLock")
    private final ArrayMap<String, ObserverInternal> mAllObservers = new ArrayMap<>();
    // File containing the XML data of monitored packages /data/system/package-watchdog.xml
    private final AtomicFile mPolicyFile;
    // Receives the health check callbacks, enabled state and package requests from this class.
    private final ExplicitHealthCheckController mHealthCheckController;
    private final ConnectivityModuleConnector mConnectivityModuleConnector;
    // Cached Runnables so that pending posts can be removed from a handler by identity
    // (e.g. removeCallbacks(mSyncRequests), removeCallbacks(mSaveToFile)).
    private final Runnable mSyncRequests = this::syncRequests;
    private final Runnable mSyncStateWithScheduledReason = this::syncStateWithScheduledReason;
    private final Runnable mSaveToFile = this::saveToFile;
    // Injected uptime source; the production constructor binds android.os.SystemClock.
    private final SystemClock mSystemClock;
    // Boot loop detector consulted by #noteBoot.
    private final BootThreshold mBootThreshold;
    // The set of packages that have been synced with the ExplicitHealthCheckController
    @GuardedBy("mLock")
    private Set<String> mRequestedHealthCheckPackages = new ArraySet<>();
    // Set by #onPackagesReady; #syncRequests does nothing until it is true.
    @GuardedBy("mLock")
    private boolean mIsPackagesReady;
    // Flag to control whether explicit health checks are supported or not
    @GuardedBy("mLock")
    private boolean mIsHealthCheckEnabled = DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED;
    @GuardedBy("mLock")
    private int mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
    @GuardedBy("mLock")
    private int mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
    // SystemClock#uptimeMillis when we last executed #syncState
    // 0 if no prune is scheduled.
    @GuardedBy("mLock")
    private long mUptimeAtLastStateSync;
    // If true, sync explicit health check packages with the ExplicitHealthCheckController.
    @GuardedBy("mLock")
    private boolean mSyncRequired = false;
183 
    /**
     * Uptime source used by the watchdog. It is a constructor dependency so that a different
     * clock can be injected; the production constructor binds it to
     * {@code android.os.SystemClock::uptimeMillis}.
     */
    @FunctionalInterface
    @VisibleForTesting
    interface SystemClock {
        // TODO: Add elapsedRealtime to this interface
        /** Returns the current uptime reading, in milliseconds. */
        long uptimeMillis();
    }
190 
    /**
     * Creates the production instance by delegating to the injecting constructor with the
     * default dependencies: the policy file {@code package-watchdog.xml} in the {@code system}
     * directory under {@link Environment#getDataDirectory()}, a short-task handler bound to the
     * calling thread's looper, the {@link BackgroundThread} handler for long tasks, a new
     * {@link ExplicitHealthCheckController}, the {@link ConnectivityModuleConnector} singleton
     * and the {@code android.os.SystemClock} uptime clock.
     *
     * @param context context handed to the delegate constructor and the health check controller
     */
    private PackageWatchdog(Context context) {
        // Needs to be constructed inline
        this(context, new AtomicFile(
                        new File(new File(Environment.getDataDirectory(), "system"),
                                "package-watchdog.xml")),
                new Handler(Looper.myLooper()), BackgroundThread.getHandler(),
                new ExplicitHealthCheckController(context),
                ConnectivityModuleConnector.getInstance(),
                android.os.SystemClock::uptimeMillis);
    }
201 
202     /**
203      * Creates a PackageWatchdog that allows injecting dependencies.
204      */
205     @VisibleForTesting
PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler, Handler longTaskHandler, ExplicitHealthCheckController controller, ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock)206     PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler,
207             Handler longTaskHandler, ExplicitHealthCheckController controller,
208             ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock) {
209         mContext = context;
210         mPolicyFile = policyFile;
211         mShortTaskHandler = shortTaskHandler;
212         mLongTaskHandler = longTaskHandler;
213         mHealthCheckController = controller;
214         mConnectivityModuleConnector = connectivityModuleConnector;
215         mSystemClock = clock;
216         mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
217         mBootThreshold = new BootThreshold(DEFAULT_BOOT_LOOP_TRIGGER_COUNT,
218                 DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS);
219         loadFromFile();
220         sPackageWatchdog = this;
221     }
222 
223     /** Creates or gets singleton instance of PackageWatchdog. */
getInstance(Context context)224     public static PackageWatchdog getInstance(Context context) {
225         synchronized (PackageWatchdog.class) {
226             if (sPackageWatchdog == null) {
227                 new PackageWatchdog(context);
228             }
229             return sPackageWatchdog;
230         }
231     }
232 
233     /**
234      * Called during boot to notify when packages are ready on the device so we can start
235      * binding.
236      */
onPackagesReady()237     public void onPackagesReady() {
238         synchronized (mLock) {
239             mIsPackagesReady = true;
240             mHealthCheckController.setCallbacks(packageName -> onHealthCheckPassed(packageName),
241                     packages -> onSupportedPackages(packages),
242                     () -> {
243                             syncRequestsAsync();
244                             mSyncRequired = true;
245                     });
246             setPropertyChangedListenerLocked();
247             updateConfigs();
248             registerConnectivityModuleHealthListener();
249         }
250     }
251 
252     /**
253      * Registers {@code observer} to listen for package failures. Add a new ObserverInternal for
254      * this observer if it does not already exist.
255      *
256      * <p>Observers are expected to call this on boot. It does not specify any packages but
257      * it will resume observing any packages requested from a previous boot.
258      */
registerHealthObserver(PackageHealthObserver observer)259     public void registerHealthObserver(PackageHealthObserver observer) {
260         synchronized (mLock) {
261             ObserverInternal internalObserver = mAllObservers.get(observer.getName());
262             if (internalObserver != null) {
263                 internalObserver.registeredObserver = observer;
264             } else {
265                 internalObserver = new ObserverInternal(observer.getName(), new ArrayList<>());
266                 internalObserver.registeredObserver = observer;
267                 mAllObservers.put(observer.getName(), internalObserver);
268                 syncState("added new observer");
269             }
270         }
271     }
272 
273     /**
274      * Starts observing the health of the {@code packages} for {@code observer} and notifies
275      * {@code observer} of any package failures within the monitoring duration.
276      *
277      * <p>If monitoring a package supporting explicit health check, at the end of the monitoring
278      * duration if {@link #onHealthCheckPassed} was never called,
279      * {@link PackageHealthObserver#execute} will be called as if the package failed.
280      *
281      * <p>If {@code observer} is already monitoring a package in {@code packageNames},
282      * the monitoring window of that package will be reset to {@code durationMs} and the health
283      * check state will be reset to a default depending on if the package is contained in
284      * {@link mPackagesWithExplicitHealthCheckEnabled}.
285      *
286      * <p>If {@code packageNames} is empty, this will be a no-op.
287      *
288      * <p>If {@code durationMs} is less than 1, a default monitoring duration
289      * {@link #DEFAULT_OBSERVING_DURATION_MS} will be used.
290      */
startObservingHealth(PackageHealthObserver observer, List<String> packageNames, long durationMs)291     public void startObservingHealth(PackageHealthObserver observer, List<String> packageNames,
292             long durationMs) {
293         if (packageNames.isEmpty()) {
294             Slog.wtf(TAG, "No packages to observe, " + observer.getName());
295             return;
296         }
297         if (durationMs < 1) {
298             Slog.wtf(TAG, "Invalid duration " + durationMs + "ms for observer "
299                     + observer.getName() + ". Not observing packages " + packageNames);
300             durationMs = DEFAULT_OBSERVING_DURATION_MS;
301         }
302 
303         List<MonitoredPackage> packages = new ArrayList<>();
304         for (int i = 0; i < packageNames.size(); i++) {
305             // Health checks not available yet so health check state will start INACTIVE
306             MonitoredPackage pkg = newMonitoredPackage(packageNames.get(i), durationMs, false);
307             if (pkg != null) {
308                 packages.add(pkg);
309             }
310         }
311 
312         if (packages.isEmpty()) {
313             return;
314         }
315 
316         // Sync before we add the new packages to the observers. This will #pruneObservers,
317         // causing any elapsed time to be deducted from all existing packages before we add new
318         // packages. This maintains the invariant that the elapsed time for ALL (new and existing)
319         // packages is the same.
320         mLongTaskHandler.post(() -> {
321             syncState("observing new packages");
322 
323             synchronized (mLock) {
324                 ObserverInternal oldObserver = mAllObservers.get(observer.getName());
325                 if (oldObserver == null) {
326                     Slog.d(TAG, observer.getName() + " started monitoring health "
327                             + "of packages " + packageNames);
328                     mAllObservers.put(observer.getName(),
329                             new ObserverInternal(observer.getName(), packages));
330                 } else {
331                     Slog.d(TAG, observer.getName() + " added the following "
332                             + "packages to monitor " + packageNames);
333                     oldObserver.updatePackagesLocked(packages);
334                 }
335             }
336 
337             // Register observer in case not already registered
338             registerHealthObserver(observer);
339 
340             // Sync after we add the new packages to the observers. We may have received packges
341             // requiring an earlier schedule than we are currently scheduled for.
342             syncState("updated observers");
343         });
344 
345     }
346 
347     /**
348      * Unregisters {@code observer} from listening to package failure.
349      * Additionally, this stops observing any packages that may have previously been observed
350      * even from a previous boot.
351      */
unregisterHealthObserver(PackageHealthObserver observer)352     public void unregisterHealthObserver(PackageHealthObserver observer) {
353         synchronized (mLock) {
354             mAllObservers.remove(observer.getName());
355         }
356         syncState("unregistering observer: " + observer.getName());
357     }
358 
359     /**
360      * Called when a process fails due to a crash, ANR or explicit health check.
361      *
362      * <p>For each package contained in the process, one registered observer with the least user
363      * impact will be notified for mitigation.
364      *
365      * <p>This method could be called frequently if there is a severe problem on the device.
366      */
onPackageFailure(List<VersionedPackage> packages, @FailureReasons int failureReason)367     public void onPackageFailure(List<VersionedPackage> packages,
368             @FailureReasons int failureReason) {
369         if (packages == null) {
370             Slog.w(TAG, "Could not resolve a list of failing packages");
371             return;
372         }
373         mLongTaskHandler.post(() -> {
374             synchronized (mLock) {
375                 if (mAllObservers.isEmpty()) {
376                     return;
377                 }
378                 boolean requiresImmediateAction = (failureReason == FAILURE_REASON_NATIVE_CRASH
379                         || failureReason == FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
380                 if (requiresImmediateAction) {
381                     handleFailureImmediately(packages, failureReason);
382                 } else {
383                     for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
384                         VersionedPackage versionedPackage = packages.get(pIndex);
385                         // Observer that will receive failure for versionedPackage
386                         PackageHealthObserver currentObserverToNotify = null;
387                         int currentObserverImpact = Integer.MAX_VALUE;
388 
389                         // Find observer with least user impact
390                         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
391                             ObserverInternal observer = mAllObservers.valueAt(oIndex);
392                             PackageHealthObserver registeredObserver = observer.registeredObserver;
393                             if (registeredObserver != null
394                                     && observer.onPackageFailureLocked(
395                                     versionedPackage.getPackageName())) {
396                                 int impact = registeredObserver.onHealthCheckFailed(
397                                         versionedPackage, failureReason);
398                                 if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
399                                         && impact < currentObserverImpact) {
400                                     currentObserverToNotify = registeredObserver;
401                                     currentObserverImpact = impact;
402                                 }
403                             }
404                         }
405 
406                         // Execute action with least user impact
407                         if (currentObserverToNotify != null) {
408                             currentObserverToNotify.execute(versionedPackage, failureReason);
409                         }
410                     }
411                 }
412             }
413         });
414     }
415 
416     /**
417      * For native crashes or explicit health check failures, call directly into each observer to
418      * mitigate the error without going through failure threshold logic.
419      */
handleFailureImmediately(List<VersionedPackage> packages, @FailureReasons int failureReason)420     private void handleFailureImmediately(List<VersionedPackage> packages,
421             @FailureReasons int failureReason) {
422         VersionedPackage failingPackage = packages.size() > 0 ? packages.get(0) : null;
423         PackageHealthObserver currentObserverToNotify = null;
424         int currentObserverImpact = Integer.MAX_VALUE;
425         for (ObserverInternal observer: mAllObservers.values()) {
426             PackageHealthObserver registeredObserver = observer.registeredObserver;
427             if (registeredObserver != null) {
428                 int impact = registeredObserver.onHealthCheckFailed(
429                         failingPackage, failureReason);
430                 if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
431                         && impact < currentObserverImpact) {
432                     currentObserverToNotify = registeredObserver;
433                     currentObserverImpact = impact;
434                 }
435             }
436         }
437         if (currentObserverToNotify != null) {
438             currentObserverToNotify.execute(failingPackage,  failureReason);
439         }
440     }
441 
442     /**
443      * Called when the system server boots. If the system server is detected to be in a boot loop,
444      * query each observer and perform the mitigation action with the lowest user impact.
445      */
noteBoot()446     public void noteBoot() {
447         synchronized (mLock) {
448             if (mBootThreshold.incrementAndTest()) {
449                 mBootThreshold.reset();
450                 PackageHealthObserver currentObserverToNotify = null;
451                 int currentObserverImpact = Integer.MAX_VALUE;
452                 for (int i = 0; i < mAllObservers.size(); i++) {
453                     final ObserverInternal observer = mAllObservers.valueAt(i);
454                     PackageHealthObserver registeredObserver = observer.registeredObserver;
455                     if (registeredObserver != null) {
456                         int impact = registeredObserver.onBootLoop();
457                         if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
458                                 && impact < currentObserverImpact) {
459                             currentObserverToNotify = registeredObserver;
460                             currentObserverImpact = impact;
461                         }
462                     }
463                 }
464                 if (currentObserverToNotify != null) {
465                     currentObserverToNotify.executeBootLoopMitigation();
466                 }
467             }
468         }
469     }
470 
471     // TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also
472     // avoid holding lock?
473     // This currently adds about 7ms extra to shutdown thread
474     /** Writes the package information to file during shutdown. */
writeNow()475     public void writeNow() {
476         synchronized (mLock) {
477             // Must only run synchronous tasks as this runs on the ShutdownThread and no other
478             // thread is guaranteed to run during shutdown.
479             if (!mAllObservers.isEmpty()) {
480                 mLongTaskHandler.removeCallbacks(mSaveToFile);
481                 pruneObserversLocked();
482                 saveToFile();
483                 Slog.i(TAG, "Last write to update package durations");
484             }
485         }
486     }
487 
488     /**
489      * Enables or disables explicit health checks.
490      * <p> If explicit health checks are enabled, the health check service is started.
491      * <p> If explicit health checks are disabled, pending explicit health check requests are
492      * passed and the health check service is stopped.
493      */
setExplicitHealthCheckEnabled(boolean enabled)494     private void setExplicitHealthCheckEnabled(boolean enabled) {
495         synchronized (mLock) {
496             mIsHealthCheckEnabled = enabled;
497             mHealthCheckController.setEnabled(enabled);
498             // Prune to update internal state whenever health check is enabled/disabled
499             syncState("health check state " + (enabled ? "enabled" : "disabled"));
500         }
501     }
502 
503     /**
504      * This method should be only called on mShortTaskHandler, since it modifies
505      * {@link #mNumberOfNativeCrashPollsRemaining}.
506      */
checkAndMitigateNativeCrashes()507     private void checkAndMitigateNativeCrashes() {
508         mNumberOfNativeCrashPollsRemaining--;
509         // Check if native watchdog reported a crash
510         if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
511             // We rollback everything available when crash is unattributable
512             onPackageFailure(Collections.EMPTY_LIST, FAILURE_REASON_NATIVE_CRASH);
513             // we stop polling after an attempt to execute rollback, regardless of whether the
514             // attempt succeeds or not
515         } else {
516             if (mNumberOfNativeCrashPollsRemaining > 0) {
517                 mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
518                         NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
519             }
520         }
521     }
522 
523     /**
524      * Since this method can eventually trigger a rollback, it should be called
525      * only once boot has completed {@code onBootCompleted} and not earlier, because the install
526      * session must be entirely completed before we try to rollback.
527      */
scheduleCheckAndMitigateNativeCrashes()528     public void scheduleCheckAndMitigateNativeCrashes() {
529         Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check "
530                 + "and mitigate native crashes");
531         mShortTaskHandler.post(()->checkAndMitigateNativeCrashes());
532     }
533 
    /**
     * Possible severity values of the user impact of a {@link PackageHealthObserver#execute}.
     *
     * <p>Smaller values mean less impact: when several observers respond to a failure, the
     * watchdog picks the one reporting the smallest value other than {@link #USER_IMPACT_NONE}.
     */
    @Retention(SOURCE)
    @IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_NONE,
                     PackageHealthObserverImpact.USER_IMPACT_LOW,
                     PackageHealthObserverImpact.USER_IMPACT_MEDIUM,
                     PackageHealthObserverImpact.USER_IMPACT_HIGH})
    public @interface PackageHealthObserverImpact {
        /** No action to take. */
        int USER_IMPACT_NONE = 0;
        /** Action has low user impact, user of a device will barely notice. */
        int USER_IMPACT_LOW = 1;
        /** Action has medium user impact, user of a device will likely notice. */
        int USER_IMPACT_MEDIUM = 3;
        /** Action has high user impact, a last resort, user of a device will be very frustrated. */
        int USER_IMPACT_HIGH = 5;
    }
550 
    /**
     * Register instances of this interface to receive notifications on package failure.
     *
     * <p>The watchdog first asks every candidate observer for the impact of its mitigation
     * ({@link #onHealthCheckFailed} or {@link #onBootLoop}) and then runs the mitigation
     * ({@link #execute} or {@link #executeBootLoopMitigation}) of the single observer that
     * reported the lowest impact other than {@link PackageHealthObserverImpact#USER_IMPACT_NONE}.
     */
    public interface PackageHealthObserver {
        /**
         * Called when health check fails for the {@code versionedPackage}.
         *
         * @param versionedPackage the package that is failing. This may be null if a native
         *                          service is crashing.
         * @param failureReason   the type of failure that is occurring.
         *
         *
         * @return any one of {@link PackageHealthObserverImpact} to express the impact
         * to the user on {@link #execute}
         */
        @PackageHealthObserverImpact int onHealthCheckFailed(
                @Nullable VersionedPackage versionedPackage,
                @FailureReasons int failureReason);

        /**
         * Executes mitigation for {@link #onHealthCheckFailed}.
         *
         * @param versionedPackage the package that is failing. This may be null if a native
         *                          service is crashing.
         * @param failureReason   the type of failure that is occurring.
         * @return {@code true} if action was executed successfully, {@code false} otherwise
         */
        boolean execute(@Nullable VersionedPackage versionedPackage,
                @FailureReasons int failureReason);


        /**
         * Called when the system server has booted several times within a window of time, defined
         * by {@link PackageWatchdog#mBootThreshold}.
         *
         * @return any one of {@link PackageHealthObserverImpact} to express the impact to the
         * user on {@link #executeBootLoopMitigation}; the default implementation returns
         * {@link PackageHealthObserverImpact#USER_IMPACT_NONE}
         */
        default @PackageHealthObserverImpact int onBootLoop() {
            return PackageHealthObserverImpact.USER_IMPACT_NONE;
        }

        /**
         * Executes mitigation for {@link #onBootLoop}.
         *
         * @return the default implementation performs no mitigation and returns {@code false}
         */
        default boolean executeBootLoopMitigation() {
            return false;
        }

        // TODO(b/120598832): Ensure uniqueness?
        /**
         * Identifier for the observer, should not change across device updates otherwise the
         * watchdog may drop observing packages with the old name.
         */
        String getName();

        /**
         * An observer will not be pruned if this is set, even if the observer is not explicitly
         * monitoring any packages.
         *
         * @return {@code false} in the default implementation
         */
        default boolean isPersistent() {
            return false;
        }

        /**
         * Returns {@code true} if this observer wishes to observe the given package, {@code false}
         * otherwise
         *
         * <p> A persistent observer may choose to start observing certain failing packages, even if
         * it has not explicitly asked to watch the package with
         * {@link PackageWatchdog#startObservingHealth}.
         */
        default boolean mayObservePackage(String packageName) {
            return false;
        }
    }
621 
getTriggerFailureCount()622     long getTriggerFailureCount() {
623         synchronized (mLock) {
624             return mTriggerFailureCount;
625         }
626     }
627 
628     /**
629      * Serializes and syncs health check requests with the {@link ExplicitHealthCheckController}.
630      */
syncRequestsAsync()631     private void syncRequestsAsync() {
632         mShortTaskHandler.removeCallbacks(mSyncRequests);
633         mShortTaskHandler.post(mSyncRequests);
634     }
635 
636     /**
637      * Syncs health check requests with the {@link ExplicitHealthCheckController}.
638      * Calls to this must be serialized.
639      *
640      * @see #syncRequestsAsync
641      */
syncRequests()642     private void syncRequests() {
643         boolean syncRequired = false;
644         synchronized (mLock) {
645             if (mIsPackagesReady) {
646                 Set<String> packages = getPackagesPendingHealthChecksLocked();
647                 if (mSyncRequired || !packages.equals(mRequestedHealthCheckPackages)
648                         || packages.isEmpty()) {
649                     syncRequired = true;
650                     mRequestedHealthCheckPackages = packages;
651                 }
652             } // else, we will sync requests when packages become ready
653         }
654 
655         // Call outside lock to avoid holding lock when calling into the controller.
656         if (syncRequired) {
657             Slog.i(TAG, "Syncing health check requests for packages: "
658                     + mRequestedHealthCheckPackages);
659             mHealthCheckController.syncRequests(mRequestedHealthCheckPackages);
660             mSyncRequired = false;
661         }
662     }
663 
664     /**
665      * Updates the observers monitoring {@code packageName} that explicit health check has passed.
666      *
667      * <p> This update is strictly for registered observers at the time of the call
668      * Observers that register after this signal will have no knowledge of prior signals and will
669      * effectively behave as if the explicit health check hasn't passed for {@code packageName}.
670      *
671      * <p> {@code packageName} can still be considered failed if reported by
672      * {@link #onPackageFailureLocked} before the package expires.
673      *
674      * <p> Triggered by components outside the system server when they are fully functional after an
675      * update.
676      */
onHealthCheckPassed(String packageName)677     private void onHealthCheckPassed(String packageName) {
678         Slog.i(TAG, "Health check passed for package: " + packageName);
679         boolean isStateChanged = false;
680 
681         synchronized (mLock) {
682             for (int observerIdx = 0; observerIdx < mAllObservers.size(); observerIdx++) {
683                 ObserverInternal observer = mAllObservers.valueAt(observerIdx);
684                 MonitoredPackage monitoredPackage = observer.packages.get(packageName);
685 
686                 if (monitoredPackage != null) {
687                     int oldState = monitoredPackage.getHealthCheckStateLocked();
688                     int newState = monitoredPackage.tryPassHealthCheckLocked();
689                     isStateChanged |= oldState != newState;
690                 }
691             }
692         }
693 
694         if (isStateChanged) {
695             syncState("health check passed for " + packageName);
696         }
697     }
698 
onSupportedPackages(List<PackageConfig> supportedPackages)699     private void onSupportedPackages(List<PackageConfig> supportedPackages) {
700         boolean isStateChanged = false;
701 
702         Map<String, Long> supportedPackageTimeouts = new ArrayMap<>();
703         Iterator<PackageConfig> it = supportedPackages.iterator();
704         while (it.hasNext()) {
705             PackageConfig info = it.next();
706             supportedPackageTimeouts.put(info.getPackageName(), info.getHealthCheckTimeoutMillis());
707         }
708 
709         synchronized (mLock) {
710             Slog.d(TAG, "Received supported packages " + supportedPackages);
711             Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
712             while (oit.hasNext()) {
713                 Iterator<MonitoredPackage> pit = oit.next().packages.values().iterator();
714                 while (pit.hasNext()) {
715                     MonitoredPackage monitoredPackage = pit.next();
716                     String packageName = monitoredPackage.getName();
717                     int oldState = monitoredPackage.getHealthCheckStateLocked();
718                     int newState;
719 
720                     if (supportedPackageTimeouts.containsKey(packageName)) {
721                         // Supported packages become ACTIVE if currently INACTIVE
722                         newState = monitoredPackage.setHealthCheckActiveLocked(
723                                 supportedPackageTimeouts.get(packageName));
724                     } else {
725                         // Unsupported packages are marked as PASSED unless already FAILED
726                         newState = monitoredPackage.tryPassHealthCheckLocked();
727                     }
728                     isStateChanged |= oldState != newState;
729                 }
730             }
731         }
732 
733         if (isStateChanged) {
734             syncState("updated health check supported packages " + supportedPackages);
735         }
736     }
737 
738     @GuardedBy("mLock")
getPackagesPendingHealthChecksLocked()739     private Set<String> getPackagesPendingHealthChecksLocked() {
740         Slog.d(TAG, "Getting all observed packages pending health checks");
741         Set<String> packages = new ArraySet<>();
742         Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
743         while (oit.hasNext()) {
744             ObserverInternal observer = oit.next();
745             Iterator<MonitoredPackage> pit =
746                     observer.packages.values().iterator();
747             while (pit.hasNext()) {
748                 MonitoredPackage monitoredPackage = pit.next();
749                 String packageName = monitoredPackage.getName();
750                 if (monitoredPackage.isPendingHealthChecksLocked()) {
751                     packages.add(packageName);
752                 }
753             }
754         }
755         return packages;
756     }
757 
758     /**
759      * Syncs the state of the observers.
760      *
761      * <p> Prunes all observers, saves new state to disk, syncs health check requests with the
762      * health check service and schedules the next state sync.
763      */
syncState(String reason)764     private void syncState(String reason) {
765         synchronized (mLock) {
766             Slog.i(TAG, "Syncing state, reason: " + reason);
767             pruneObserversLocked();
768 
769             saveToFileAsync();
770             syncRequestsAsync();
771 
772             // Done syncing state, schedule the next state sync
773             scheduleNextSyncStateLocked();
774         }
775     }
776 
syncStateWithScheduledReason()777     private void syncStateWithScheduledReason() {
778         syncState("scheduled");
779     }
780 
781     @GuardedBy("mLock")
scheduleNextSyncStateLocked()782     private void scheduleNextSyncStateLocked() {
783         long durationMs = getNextStateSyncMillisLocked();
784         mShortTaskHandler.removeCallbacks(mSyncStateWithScheduledReason);
785         if (durationMs == Long.MAX_VALUE) {
786             Slog.i(TAG, "Cancelling state sync, nothing to sync");
787             mUptimeAtLastStateSync = 0;
788         } else {
789             Slog.i(TAG, "Scheduling next state sync in " + durationMs + "ms");
790             mUptimeAtLastStateSync = mSystemClock.uptimeMillis();
791             mShortTaskHandler.postDelayed(mSyncStateWithScheduledReason, durationMs);
792         }
793     }
794 
795     /**
796      * Returns the next duration in millis to sync the watchdog state.
797      *
798      * @returns Long#MAX_VALUE if there are no observed packages.
799      */
800     @GuardedBy("mLock")
getNextStateSyncMillisLocked()801     private long getNextStateSyncMillisLocked() {
802         long shortestDurationMs = Long.MAX_VALUE;
803         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
804             ArrayMap<String, MonitoredPackage> packages = mAllObservers.valueAt(oIndex).packages;
805             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
806                 MonitoredPackage mp = packages.valueAt(pIndex);
807                 long duration = mp.getShortestScheduleDurationMsLocked();
808                 if (duration < shortestDurationMs) {
809                     shortestDurationMs = duration;
810                 }
811             }
812         }
813         return shortestDurationMs;
814     }
815 
816     /**
817      * Removes {@code elapsedMs} milliseconds from all durations on monitored packages
818      * and updates other internal state.
819      */
820     @GuardedBy("mLock")
pruneObserversLocked()821     private void pruneObserversLocked() {
822         long elapsedMs = mUptimeAtLastStateSync == 0
823                 ? 0 : mSystemClock.uptimeMillis() - mUptimeAtLastStateSync;
824         if (elapsedMs <= 0) {
825             Slog.i(TAG, "Not pruning observers, elapsed time: " + elapsedMs + "ms");
826             return;
827         }
828 
829         Slog.i(TAG, "Removing " + elapsedMs + "ms from all packages on all observers");
830         Iterator<ObserverInternal> it = mAllObservers.values().iterator();
831         while (it.hasNext()) {
832             ObserverInternal observer = it.next();
833             Set<MonitoredPackage> failedPackages =
834                     observer.prunePackagesLocked(elapsedMs);
835             if (!failedPackages.isEmpty()) {
836                 onHealthCheckFailed(observer, failedPackages);
837             }
838             if (observer.packages.isEmpty() && (observer.registeredObserver == null
839                     || !observer.registeredObserver.isPersistent())) {
840                 Slog.i(TAG, "Discarding observer " + observer.name + ". All packages expired");
841                 it.remove();
842             }
843         }
844     }
845 
onHealthCheckFailed(ObserverInternal observer, Set<MonitoredPackage> failedPackages)846     private void onHealthCheckFailed(ObserverInternal observer,
847             Set<MonitoredPackage> failedPackages) {
848         mLongTaskHandler.post(() -> {
849             synchronized (mLock) {
850                 PackageHealthObserver registeredObserver = observer.registeredObserver;
851                 if (registeredObserver != null) {
852                     Iterator<MonitoredPackage> it = failedPackages.iterator();
853                     while (it.hasNext()) {
854                         VersionedPackage versionedPkg = it.next().mPackage;
855                         Slog.i(TAG, "Explicit health check failed for package " + versionedPkg);
856                         registeredObserver.execute(versionedPkg,
857                                 PackageWatchdog.FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
858                     }
859                 }
860             }
861         });
862     }
863 
864     @Nullable
getVersionedPackage(String packageName)865     private VersionedPackage getVersionedPackage(String packageName) {
866         final PackageManager pm = mContext.getPackageManager();
867         if (pm == null || TextUtils.isEmpty(packageName)) {
868             return null;
869         }
870         try {
871             final long versionCode = pm.getPackageInfo(
872                     packageName, 0 /* flags */).getLongVersionCode();
873             return new VersionedPackage(packageName, versionCode);
874         } catch (PackageManager.NameNotFoundException e) {
875             return null;
876         }
877     }
878 
879     /**
880      * Loads mAllObservers from file.
881      *
882      * <p>Note that this is <b>not</b> thread safe and should only called be called
883      * from the constructor.
884      */
loadFromFile()885     private void loadFromFile() {
886         InputStream infile = null;
887         mAllObservers.clear();
888         try {
889             infile = mPolicyFile.openRead();
890             final XmlPullParser parser = Xml.newPullParser();
891             parser.setInput(infile, StandardCharsets.UTF_8.name());
892             XmlUtils.beginDocument(parser, TAG_PACKAGE_WATCHDOG);
893             int outerDepth = parser.getDepth();
894             while (XmlUtils.nextElementWithin(parser, outerDepth)) {
895                 ObserverInternal observer = ObserverInternal.read(parser, this);
896                 if (observer != null) {
897                     mAllObservers.put(observer.name, observer);
898                 }
899             }
900         } catch (FileNotFoundException e) {
901             // Nothing to monitor
902         } catch (IOException | NumberFormatException | XmlPullParserException e) {
903             Slog.wtf(TAG, "Unable to read monitored packages, deleting file", e);
904             mPolicyFile.delete();
905         } finally {
906             IoUtils.closeQuietly(infile);
907         }
908     }
909 
910     /** Adds a {@link DeviceConfig#OnPropertiesChangedListener}. */
setPropertyChangedListenerLocked()911     private void setPropertyChangedListenerLocked() {
912         DeviceConfig.addOnPropertiesChangedListener(
913                 DeviceConfig.NAMESPACE_ROLLBACK,
914                 mContext.getMainExecutor(),
915                 (properties) -> {
916                     if (!DeviceConfig.NAMESPACE_ROLLBACK.equals(properties.getNamespace())) {
917                         return;
918                     }
919                     updateConfigs();
920                 });
921     }
922 
923     /**
924      * Health check is enabled or disabled after reading the flags
925      * from DeviceConfig.
926      */
updateConfigs()927     private void updateConfigs() {
928         synchronized (mLock) {
929             mTriggerFailureCount = DeviceConfig.getInt(
930                     DeviceConfig.NAMESPACE_ROLLBACK,
931                     PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT,
932                     DEFAULT_TRIGGER_FAILURE_COUNT);
933             if (mTriggerFailureCount <= 0) {
934                 mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
935             }
936 
937             mTriggerFailureDurationMs = DeviceConfig.getInt(
938                     DeviceConfig.NAMESPACE_ROLLBACK,
939                     PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS,
940                     DEFAULT_TRIGGER_FAILURE_DURATION_MS);
941             if (mTriggerFailureDurationMs <= 0) {
942                 mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
943             }
944 
945             setExplicitHealthCheckEnabled(DeviceConfig.getBoolean(
946                     DeviceConfig.NAMESPACE_ROLLBACK,
947                     PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED,
948                     DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED));
949         }
950     }
951 
registerConnectivityModuleHealthListener()952     private void registerConnectivityModuleHealthListener() {
953         // TODO: have an internal method to trigger a rollback by reporting high severity errors,
954         // and rely on ActivityManager to inform the watchdog of severe network stack crashes
955         // instead of having this listener in parallel.
956         mConnectivityModuleConnector.registerHealthListener(
957                 packageName -> {
958                     final VersionedPackage pkg = getVersionedPackage(packageName);
959                     if (pkg == null) {
960                         Slog.wtf(TAG, "NetworkStack failed but could not find its package");
961                         return;
962                     }
963                     final List<VersionedPackage> pkgList = Collections.singletonList(pkg);
964                     onPackageFailure(pkgList, FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
965                 });
966     }
967 
968     /**
969      * Persists mAllObservers to file. Threshold information is ignored.
970      */
saveToFile()971     private boolean saveToFile() {
972         Slog.i(TAG, "Saving observer state to file");
973         synchronized (mLock) {
974             FileOutputStream stream;
975             try {
976                 stream = mPolicyFile.startWrite();
977             } catch (IOException e) {
978                 Slog.w(TAG, "Cannot update monitored packages", e);
979                 return false;
980             }
981 
982             try {
983                 XmlSerializer out = new FastXmlSerializer();
984                 out.setOutput(stream, StandardCharsets.UTF_8.name());
985                 out.startDocument(null, true);
986                 out.startTag(null, TAG_PACKAGE_WATCHDOG);
987                 out.attribute(null, ATTR_VERSION, Integer.toString(DB_VERSION));
988                 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
989                     mAllObservers.valueAt(oIndex).writeLocked(out);
990                 }
991                 out.endTag(null, TAG_PACKAGE_WATCHDOG);
992                 out.endDocument();
993                 mPolicyFile.finishWrite(stream);
994                 return true;
995             } catch (IOException e) {
996                 Slog.w(TAG, "Failed to save monitored packages, restoring backup", e);
997                 mPolicyFile.failWrite(stream);
998                 return false;
999             } finally {
1000                 IoUtils.closeQuietly(stream);
1001             }
1002         }
1003     }
1004 
saveToFileAsync()1005     private void saveToFileAsync() {
1006         if (!mLongTaskHandler.hasCallbacks(mSaveToFile)) {
1007             mLongTaskHandler.post(mSaveToFile);
1008         }
1009     }
1010 
1011     /** Dump status of every observer in mAllObservers. */
dump(IndentingPrintWriter pw)1012     public void dump(IndentingPrintWriter pw) {
1013         pw.println("Package Watchdog status");
1014         pw.increaseIndent();
1015         synchronized (mLock) {
1016             for (String observerName : mAllObservers.keySet()) {
1017                 pw.println("Observer name: " + observerName);
1018                 pw.increaseIndent();
1019                 ObserverInternal observerInternal = mAllObservers.get(observerName);
1020                 observerInternal.dump(pw);
1021                 pw.decreaseIndent();
1022             }
1023         }
1024     }
1025 
1026     /**
1027      * Represents an observer monitoring a set of packages along with the failure thresholds for
1028      * each package.
1029      *
1030      * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing
1031      * instances of this class.
1032      */
1033     private static class ObserverInternal {
1034         public final String name;
1035         @GuardedBy("mLock")
1036         public final ArrayMap<String, MonitoredPackage> packages = new ArrayMap<>();
1037         @Nullable
1038         @GuardedBy("mLock")
1039         public PackageHealthObserver registeredObserver;
1040 
ObserverInternal(String name, List<MonitoredPackage> packages)1041         ObserverInternal(String name, List<MonitoredPackage> packages) {
1042             this.name = name;
1043             updatePackagesLocked(packages);
1044         }
1045 
1046         /**
1047          * Writes important {@link MonitoredPackage} details for this observer to file.
1048          * Does not persist any package failure thresholds.
1049          */
1050         @GuardedBy("mLock")
writeLocked(XmlSerializer out)1051         public boolean writeLocked(XmlSerializer out) {
1052             try {
1053                 out.startTag(null, TAG_OBSERVER);
1054                 out.attribute(null, ATTR_NAME, name);
1055                 for (int i = 0; i < packages.size(); i++) {
1056                     MonitoredPackage p = packages.valueAt(i);
1057                     p.writeLocked(out);
1058                 }
1059                 out.endTag(null, TAG_OBSERVER);
1060                 return true;
1061             } catch (IOException e) {
1062                 Slog.w(TAG, "Cannot save observer", e);
1063                 return false;
1064             }
1065         }
1066 
1067         @GuardedBy("mLock")
updatePackagesLocked(List<MonitoredPackage> packages)1068         public void updatePackagesLocked(List<MonitoredPackage> packages) {
1069             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
1070                 MonitoredPackage p = packages.get(pIndex);
1071                 MonitoredPackage existingPackage = this.packages.get(p.getName());
1072                 if (existingPackage != null) {
1073                     existingPackage.updateHealthCheckDuration(p.mDurationMs);
1074                 } else {
1075                     this.packages.put(p.getName(), p);
1076                 }
1077             }
1078         }
1079 
1080         /**
1081          * Reduces the monitoring durations of all packages observed by this observer by
1082          * {@code elapsedMs}. If any duration is less than 0, the package is removed from
1083          * observation. If any health check duration is less than 0, the health check result
1084          * is evaluated.
1085          *
1086          * @return a {@link Set} of packages that were removed from the observer without explicit
1087          * health check passing, or an empty list if no package expired for which an explicit health
1088          * check was still pending
1089          */
1090         @GuardedBy("mLock")
prunePackagesLocked(long elapsedMs)1091         private Set<MonitoredPackage> prunePackagesLocked(long elapsedMs) {
1092             Set<MonitoredPackage> failedPackages = new ArraySet<>();
1093             Iterator<MonitoredPackage> it = packages.values().iterator();
1094             while (it.hasNext()) {
1095                 MonitoredPackage p = it.next();
1096                 int oldState = p.getHealthCheckStateLocked();
1097                 int newState = p.handleElapsedTimeLocked(elapsedMs);
1098                 if (oldState != HealthCheckState.FAILED
1099                         && newState == HealthCheckState.FAILED) {
1100                     Slog.i(TAG, "Package " + p.getName() + " failed health check");
1101                     failedPackages.add(p);
1102                 }
1103                 if (p.isExpiredLocked()) {
1104                     it.remove();
1105                 }
1106             }
1107             return failedPackages;
1108         }
1109 
1110         /**
1111          * Increments failure counts of {@code packageName}.
1112          * @returns {@code true} if failure threshold is exceeded, {@code false} otherwise
1113          */
1114         @GuardedBy("mLock")
onPackageFailureLocked(String packageName)1115         public boolean onPackageFailureLocked(String packageName) {
1116             if (packages.get(packageName) == null && registeredObserver.isPersistent()
1117                     && registeredObserver.mayObservePackage(packageName)) {
1118                 packages.put(packageName, sPackageWatchdog.newMonitoredPackage(
1119                         packageName, DEFAULT_OBSERVING_DURATION_MS, false));
1120             }
1121             MonitoredPackage p = packages.get(packageName);
1122             if (p != null) {
1123                 return p.onFailureLocked();
1124             }
1125             return false;
1126         }
1127 
1128         /**
1129          * Returns one ObserverInternal from the {@code parser} and advances its state.
1130          *
1131          * <p>Note that this method is <b>not</b> thread safe. It should only be called from
1132          * #loadFromFile which in turn is only called on construction of the
1133          * singleton PackageWatchdog.
1134          **/
read(XmlPullParser parser, PackageWatchdog watchdog)1135         public static ObserverInternal read(XmlPullParser parser, PackageWatchdog watchdog) {
1136             String observerName = null;
1137             if (TAG_OBSERVER.equals(parser.getName())) {
1138                 observerName = parser.getAttributeValue(null, ATTR_NAME);
1139                 if (TextUtils.isEmpty(observerName)) {
1140                     Slog.wtf(TAG, "Unable to read observer name");
1141                     return null;
1142                 }
1143             }
1144             List<MonitoredPackage> packages = new ArrayList<>();
1145             int innerDepth = parser.getDepth();
1146             try {
1147                 while (XmlUtils.nextElementWithin(parser, innerDepth)) {
1148                     if (TAG_PACKAGE.equals(parser.getName())) {
1149                         try {
1150                             String packageName = parser.getAttributeValue(null, ATTR_NAME);
1151                             long duration = Long.parseLong(
1152                                     parser.getAttributeValue(null, ATTR_DURATION));
1153                             long healthCheckDuration = Long.parseLong(
1154                                     parser.getAttributeValue(null,
1155                                             ATTR_EXPLICIT_HEALTH_CHECK_DURATION));
1156                             boolean hasPassedHealthCheck = Boolean.parseBoolean(
1157                                     parser.getAttributeValue(null, ATTR_PASSED_HEALTH_CHECK));
1158                             MonitoredPackage pkg = watchdog.newMonitoredPackage(packageName,
1159                                     duration, healthCheckDuration, hasPassedHealthCheck);
1160                             if (pkg != null) {
1161                                 packages.add(pkg);
1162                             }
1163                         } catch (NumberFormatException e) {
1164                             Slog.wtf(TAG, "Skipping package for observer " + observerName, e);
1165                             continue;
1166                         }
1167                     }
1168                 }
1169             } catch (XmlPullParserException | IOException e) {
1170                 Slog.wtf(TAG, "Unable to read observer " + observerName, e);
1171                 return null;
1172             }
1173             if (packages.isEmpty()) {
1174                 return null;
1175             }
1176             return new ObserverInternal(observerName, packages);
1177         }
1178 
1179         /** Dumps information about this observer and the packages it watches. */
dump(IndentingPrintWriter pw)1180         public void dump(IndentingPrintWriter pw) {
1181             boolean isPersistent = registeredObserver != null && registeredObserver.isPersistent();
1182             pw.println("Persistent: " + isPersistent);
1183             for (String packageName : packages.keySet()) {
1184                 MonitoredPackage p = packages.get(packageName);
1185                 pw.println(packageName +  ": ");
1186                 pw.increaseIndent();
1187                 pw.println("# Failures: " + p.mFailureHistory.size());
1188                 pw.println("Monitoring duration remaining: " + p.mDurationMs + "ms");
1189                 pw.println("Explicit health check duration: " + p.mHealthCheckDurationMs + "ms");
1190                 pw.println("Health check state: " + p.toString(p.mHealthCheckState));
1191                 pw.decreaseIndent();
1192             }
1193         }
1194     }
1195 
1196     @Retention(SOURCE)
1197     @IntDef(value = {
1198             HealthCheckState.ACTIVE,
1199             HealthCheckState.INACTIVE,
1200             HealthCheckState.PASSED,
1201             HealthCheckState.FAILED})
1202     public @interface HealthCheckState {
1203         // The package has not passed health check but has requested a health check
1204         int ACTIVE = 0;
1205         // The package has not passed health check and has not requested a health check
1206         int INACTIVE = 1;
1207         // The package has passed health check
1208         int PASSED = 2;
1209         // The package has failed health check
1210         int FAILED = 3;
1211     }
1212 
newMonitoredPackage( String name, long durationMs, boolean hasPassedHealthCheck)1213     MonitoredPackage newMonitoredPackage(
1214             String name, long durationMs, boolean hasPassedHealthCheck) {
1215         return newMonitoredPackage(name, durationMs, Long.MAX_VALUE, hasPassedHealthCheck);
1216     }
1217 
newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck)1218     MonitoredPackage newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs,
1219             boolean hasPassedHealthCheck) {
1220         VersionedPackage pkg = getVersionedPackage(name);
1221         if (pkg == null) {
1222             return null;
1223         }
1224         return new MonitoredPackage(pkg, durationMs, healthCheckDurationMs, hasPassedHealthCheck);
1225     }
1226 
1227     /**
1228      * Represents a package and its health check state along with the time
1229      * it should be monitored for.
1230      *
1231      * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing
1232      * instances of this class.
1233      */
1234     class MonitoredPackage {
1235         private final VersionedPackage mPackage;
1236         // Times when package failures happen sorted in ascending order
1237         @GuardedBy("mLock")
1238         private final LongArrayQueue mFailureHistory = new LongArrayQueue();
        // One of the HealthCheckState constants (ACTIVE, INACTIVE, PASSED or FAILED). Updated
        // on construction and after methods that could change the health check state:
        // setHealthCheckActiveLocked, handleElapsedTimeLocked and tryPassHealthCheckLocked
1242         private int mHealthCheckState = HealthCheckState.INACTIVE;
1243         // Whether an explicit health check has passed.
1244         // This value in addition with mHealthCheckDurationMs determines the health check state
1245         // of the package, see #getHealthCheckStateLocked
1246         @GuardedBy("mLock")
1247         private boolean mHasPassedHealthCheck;
1248         // System uptime duration to monitor package.
1249         @GuardedBy("mLock")
1250         private long mDurationMs;
1251         // System uptime duration to check the result of an explicit health check
1252         // Initially, MAX_VALUE until we get a value from the health check service
1253         // and request health checks.
1254         // This value in addition with mHasPassedHealthCheck determines the health check state
1255         // of the package, see #getHealthCheckStateLocked
1256         @GuardedBy("mLock")
1257         private long mHealthCheckDurationMs = Long.MAX_VALUE;
1258 
MonitoredPackage(VersionedPackage pkg, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck)1259         private MonitoredPackage(VersionedPackage pkg, long durationMs,
1260                 long healthCheckDurationMs, boolean hasPassedHealthCheck) {
1261             mPackage = pkg;
1262             mDurationMs = durationMs;
1263             mHealthCheckDurationMs = healthCheckDurationMs;
1264             mHasPassedHealthCheck = hasPassedHealthCheck;
1265             updateHealthCheckStateLocked();
1266         }
1267 
1268         /** Writes the salient fields to disk using {@code out}. */
1269         @GuardedBy("mLock")
writeLocked(XmlSerializer out)1270         public void writeLocked(XmlSerializer out) throws IOException {
1271             out.startTag(null, TAG_PACKAGE);
1272             out.attribute(null, ATTR_NAME, getName());
1273             out.attribute(null, ATTR_DURATION, String.valueOf(mDurationMs));
1274             out.attribute(null, ATTR_EXPLICIT_HEALTH_CHECK_DURATION,
1275                     String.valueOf(mHealthCheckDurationMs));
1276             out.attribute(null, ATTR_PASSED_HEALTH_CHECK,
1277                     String.valueOf(mHasPassedHealthCheck));
1278             out.endTag(null, TAG_PACKAGE);
1279         }
1280 
1281         /**
1282          * Increment package failures or resets failure count depending on the last package failure.
1283          *
1284          * @return {@code true} if failure count exceeds a threshold, {@code false} otherwise
1285          */
1286         @GuardedBy("mLock")
onFailureLocked()1287         public boolean onFailureLocked() {
1288             // Sliding window algorithm: find out if there exists a window containing failures >=
1289             // mTriggerFailureCount.
1290             final long now = mSystemClock.uptimeMillis();
1291             mFailureHistory.addLast(now);
1292             while (now - mFailureHistory.peekFirst() > mTriggerFailureDurationMs) {
1293                 // Prune values falling out of the window
1294                 mFailureHistory.removeFirst();
1295             }
1296             boolean failed = mFailureHistory.size() >= mTriggerFailureCount;
1297             if (failed) {
1298                 mFailureHistory.clear();
1299             }
1300             return failed;
1301         }
1302 
1303         /**
1304          * Sets the initial health check duration.
1305          *
1306          * @return the new health check state
1307          */
1308         @GuardedBy("mLock")
setHealthCheckActiveLocked(long initialHealthCheckDurationMs)1309         public int setHealthCheckActiveLocked(long initialHealthCheckDurationMs) {
1310             if (initialHealthCheckDurationMs <= 0) {
1311                 Slog.wtf(TAG, "Cannot set non-positive health check duration "
1312                         + initialHealthCheckDurationMs + "ms for package " + getName()
1313                         + ". Using total duration " + mDurationMs + "ms instead");
1314                 initialHealthCheckDurationMs = mDurationMs;
1315             }
1316             if (mHealthCheckState == HealthCheckState.INACTIVE) {
1317                 // Transitions to ACTIVE
1318                 mHealthCheckDurationMs = initialHealthCheckDurationMs;
1319             }
1320             return updateHealthCheckStateLocked();
1321         }
1322 
1323         /**
1324          * Updates the monitoring durations of the package.
1325          *
1326          * @return the new health check state
1327          */
1328         @GuardedBy("mLock")
handleElapsedTimeLocked(long elapsedMs)1329         public int handleElapsedTimeLocked(long elapsedMs) {
1330             if (elapsedMs <= 0) {
1331                 Slog.w(TAG, "Cannot handle non-positive elapsed time for package " + getName());
1332                 return mHealthCheckState;
1333             }
1334             // Transitions to FAILED if now <= 0 and health check not passed
1335             mDurationMs -= elapsedMs;
1336             if (mHealthCheckState == HealthCheckState.ACTIVE) {
1337                 // We only update health check durations if we have #setHealthCheckActiveLocked
1338                 // This ensures we don't leave the INACTIVE state for an unexpected elapsed time
1339                 // Transitions to FAILED if now <= 0 and health check not passed
1340                 mHealthCheckDurationMs -= elapsedMs;
1341             }
1342             return updateHealthCheckStateLocked();
1343         }
1344 
1345         /** Explicitly update the monitoring duration of the package. */
1346         @GuardedBy("mLock")
updateHealthCheckDuration(long newDurationMs)1347         public void updateHealthCheckDuration(long newDurationMs) {
1348             mDurationMs = newDurationMs;
1349         }
1350 
1351         /**
1352          * Marks the health check as passed and transitions to {@link HealthCheckState.PASSED}
1353          * if not yet {@link HealthCheckState.FAILED}.
1354          *
1355          * @return the new {@link HealthCheckState health check state}
1356          */
1357         @GuardedBy("mLock")
1358         @HealthCheckState
tryPassHealthCheckLocked()1359         public int tryPassHealthCheckLocked() {
1360             if (mHealthCheckState != HealthCheckState.FAILED) {
1361                 // FAILED is a final state so only pass if we haven't failed
1362                 // Transition to PASSED
1363                 mHasPassedHealthCheck = true;
1364             }
1365             return updateHealthCheckStateLocked();
1366         }
1367 
1368         /** Returns the monitored package name. */
getName()1369         private String getName() {
1370             return mPackage.getPackageName();
1371         }
1372 
1373         /**
1374          * Returns the current {@link HealthCheckState health check state}.
1375          */
1376         @GuardedBy("mLock")
1377         @HealthCheckState
getHealthCheckStateLocked()1378         public int getHealthCheckStateLocked() {
1379             return mHealthCheckState;
1380         }
1381 
1382         /**
1383          * Returns the shortest duration before the package should be scheduled for a prune.
1384          *
1385          * @return the duration or {@link Long#MAX_VALUE} if the package should not be scheduled
1386          */
1387         @GuardedBy("mLock")
getShortestScheduleDurationMsLocked()1388         public long getShortestScheduleDurationMsLocked() {
1389             // Consider health check duration only if #isPendingHealthChecksLocked is true
1390             return Math.min(toPositive(mDurationMs),
1391                     isPendingHealthChecksLocked()
1392                     ? toPositive(mHealthCheckDurationMs) : Long.MAX_VALUE);
1393         }
1394 
1395         /**
1396          * Returns {@code true} if the total duration left to monitor the package is less than or
1397          * equal to 0 {@code false} otherwise.
1398          */
1399         @GuardedBy("mLock")
isExpiredLocked()1400         public boolean isExpiredLocked() {
1401             return mDurationMs <= 0;
1402         }
1403 
1404         /**
1405          * Returns {@code true} if the package, {@link #getName} is expecting health check results
1406          * {@code false} otherwise.
1407          */
1408         @GuardedBy("mLock")
isPendingHealthChecksLocked()1409         public boolean isPendingHealthChecksLocked() {
1410             return mHealthCheckState == HealthCheckState.ACTIVE
1411                     || mHealthCheckState == HealthCheckState.INACTIVE;
1412         }
1413 
1414         /**
1415          * Updates the health check state based on {@link #mHasPassedHealthCheck}
1416          * and {@link #mHealthCheckDurationMs}.
1417          *
1418          * @return the new {@link HealthCheckState health check state}
1419          */
1420         @GuardedBy("mLock")
1421         @HealthCheckState
updateHealthCheckStateLocked()1422         private int updateHealthCheckStateLocked() {
1423             int oldState = mHealthCheckState;
1424             if (mHasPassedHealthCheck) {
1425                 // Set final state first to avoid ambiguity
1426                 mHealthCheckState = HealthCheckState.PASSED;
1427             } else if (mHealthCheckDurationMs <= 0 || mDurationMs <= 0) {
1428                 // Set final state first to avoid ambiguity
1429                 mHealthCheckState = HealthCheckState.FAILED;
1430             } else if (mHealthCheckDurationMs == Long.MAX_VALUE) {
1431                 mHealthCheckState = HealthCheckState.INACTIVE;
1432             } else {
1433                 mHealthCheckState = HealthCheckState.ACTIVE;
1434             }
1435             Slog.i(TAG, "Updated health check state for package " + getName() + ": "
1436                     + toString(oldState) + " -> " + toString(mHealthCheckState));
1437             return mHealthCheckState;
1438         }
1439 
1440         /** Returns a {@link String} representation of the current health check state. */
toString(@ealthCheckState int state)1441         private String toString(@HealthCheckState int state) {
1442             switch (state) {
1443                 case HealthCheckState.ACTIVE:
1444                     return "ACTIVE";
1445                 case HealthCheckState.INACTIVE:
1446                     return "INACTIVE";
1447                 case HealthCheckState.PASSED:
1448                     return "PASSED";
1449                 case HealthCheckState.FAILED:
1450                     return "FAILED";
1451                 default:
1452                     return "UNKNOWN";
1453             }
1454         }
1455 
1456         /** Returns {@code value} if it is greater than 0 or {@link Long#MAX_VALUE} otherwise. */
toPositive(long value)1457         private long toPositive(long value) {
1458             return value > 0 ? value : Long.MAX_VALUE;
1459         }
1460     }
1461 
1462     /**
1463      * Handles the thresholding logic for system server boots.
1464      */
1465     static class BootThreshold {
1466 
1467         private final int mBootTriggerCount;
1468         private final long mTriggerWindow;
1469 
BootThreshold(int bootTriggerCount, long triggerWindow)1470         BootThreshold(int bootTriggerCount, long triggerWindow) {
1471             this.mBootTriggerCount = bootTriggerCount;
1472             this.mTriggerWindow = triggerWindow;
1473         }
1474 
reset()1475         public void reset() {
1476             setStart(0);
1477             setCount(0);
1478         }
1479 
getCount()1480         private int getCount() {
1481             return SystemProperties.getInt(PROP_RESCUE_BOOT_COUNT, 0);
1482         }
1483 
setCount(int count)1484         private void setCount(int count) {
1485             SystemProperties.set(PROP_RESCUE_BOOT_COUNT, Integer.toString(count));
1486         }
1487 
getStart()1488         public long getStart() {
1489             return SystemProperties.getLong(PROP_RESCUE_BOOT_START, 0);
1490         }
1491 
setStart(long start)1492         public void setStart(long start) {
1493             final long now = android.os.SystemClock.elapsedRealtime();
1494             final long newStart = MathUtils.constrain(start, 0, now);
1495             SystemProperties.set(PROP_RESCUE_BOOT_START, Long.toString(newStart));
1496         }
1497 
1498         /** Increments the boot counter, and returns whether the device is bootlooping. */
incrementAndTest()1499         public boolean incrementAndTest() {
1500             final long now = android.os.SystemClock.elapsedRealtime();
1501             if (now - getStart() < 0) {
1502                 Slog.e(TAG, "Window was less than zero. Resetting start to current time.");
1503                 setStart(now);
1504             }
1505             final long window = now - getStart();
1506             if (window >= mTriggerWindow) {
1507                 setCount(1);
1508                 setStart(now);
1509                 return false;
1510             } else {
1511                 int count = getCount() + 1;
1512                 setCount(count);
1513                 EventLogTags.writeRescueNote(Process.ROOT_UID, count, window);
1514                 return count >= mBootTriggerCount;
1515             }
1516         }
1517 
1518     }
1519 }
1520