1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static android.service.watchdog.ExplicitHealthCheckService.PackageConfig;
20 
21 import static java.lang.annotation.RetentionPolicy.SOURCE;
22 
23 import android.annotation.IntDef;
24 import android.annotation.Nullable;
25 import android.content.Context;
26 import android.content.pm.PackageInfo;
27 import android.content.pm.PackageManager;
28 import android.content.pm.VersionedPackage;
29 import android.crashrecovery.flags.Flags;
30 import android.net.ConnectivityModuleConnector;
31 import android.os.Environment;
32 import android.os.Handler;
33 import android.os.Looper;
34 import android.os.Process;
35 import android.os.SystemProperties;
36 import android.provider.DeviceConfig;
37 import android.sysprop.CrashRecoveryProperties;
38 import android.text.TextUtils;
39 import android.util.ArrayMap;
40 import android.util.ArraySet;
41 import android.util.AtomicFile;
42 import android.util.LongArrayQueue;
43 import android.util.Slog;
44 import android.util.Xml;
45 
46 import com.android.internal.annotations.GuardedBy;
47 import com.android.internal.annotations.VisibleForTesting;
48 import com.android.internal.os.BackgroundThread;
49 import com.android.internal.util.IndentingPrintWriter;
50 import com.android.internal.util.XmlUtils;
51 import com.android.modules.utils.TypedXmlPullParser;
52 import com.android.modules.utils.TypedXmlSerializer;
53 
54 import libcore.io.IoUtils;
55 
56 import org.xmlpull.v1.XmlPullParserException;
57 
58 import java.io.BufferedReader;
59 import java.io.BufferedWriter;
60 import java.io.File;
61 import java.io.FileInputStream;
62 import java.io.FileNotFoundException;
63 import java.io.FileOutputStream;
64 import java.io.FileReader;
65 import java.io.FileWriter;
66 import java.io.IOException;
67 import java.io.InputStream;
68 import java.io.ObjectInputStream;
69 import java.io.ObjectOutputStream;
70 import java.lang.annotation.Retention;
71 import java.lang.annotation.RetentionPolicy;
72 import java.util.ArrayList;
73 import java.util.Collections;
74 import java.util.HashMap;
75 import java.util.Iterator;
76 import java.util.List;
77 import java.util.Map;
78 import java.util.NoSuchElementException;
79 import java.util.Set;
80 import java.util.concurrent.TimeUnit;
81 
82 /**
83  * Monitors the health of packages on the system and notifies interested observers when packages
84  * fail. On failure, the registered observer with the least user impacting mitigation will
85  * be notified.
86  */
87 public class PackageWatchdog {
    // Tag used for this class's log output.
    private static final String TAG = "PackageWatchdog";

    // Names of the tunable watchdog properties.
    // NOTE(review): presumably DeviceConfig keys consumed by updateConfigs()/onPropertyChanged(),
    // which are outside this view -- confirm.
    static final String PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS =
            "watchdog_trigger_failure_duration_millis";
    static final String PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT =
            "watchdog_trigger_failure_count";
    static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED =
            "watchdog_explicit_health_check_enabled";

    // TODO: make the following values configurable via DeviceConfig
    // Delay between two consecutive native crash polls (see checkAndMitigateNativeCrashes).
    private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
            TimeUnit.SECONDS.toMillis(30);
    // Initial value of mNumberOfNativeCrashPollsRemaining.
    private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;


    // Failure reasons accepted by onPackageFailure. FAILURE_REASON_NATIVE_CRASH and
    // FAILURE_REASON_EXPLICIT_HEALTH_CHECK are handled immediately, bypassing the per-package
    // failure threshold logic.
    public static final int FAILURE_REASON_UNKNOWN = 0;
    public static final int FAILURE_REASON_NATIVE_CRASH = 1;
    public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2;
    public static final int FAILURE_REASON_APP_CRASH = 3;
    public static final int FAILURE_REASON_APP_NOT_RESPONDING = 4;
    public static final int FAILURE_REASON_BOOT_LOOP = 5;

    /** Marks an {@code int} that must hold one of the {@code FAILURE_REASON_*} constants. */
    @IntDef(prefix = { "FAILURE_REASON_" }, value = {
            FAILURE_REASON_UNKNOWN,
            FAILURE_REASON_NATIVE_CRASH,
            FAILURE_REASON_EXPLICIT_HEALTH_CHECK,
            FAILURE_REASON_APP_CRASH,
            FAILURE_REASON_APP_NOT_RESPONDING,
            FAILURE_REASON_BOOT_LOOP
    })
    @Retention(RetentionPolicy.SOURCE)
    public @interface FailureReasons {}

    // Duration to count package failures before it resets to 0
    @VisibleForTesting
    static final int DEFAULT_TRIGGER_FAILURE_DURATION_MS =
            (int) TimeUnit.MINUTES.toMillis(1);
    // Number of package failures within the duration above before we notify observers
    @VisibleForTesting
    static final int DEFAULT_TRIGGER_FAILURE_COUNT = 5;
    // Monitoring duration used by startObservingHealth when the caller passes a duration < 1
    @VisibleForTesting
    static final long DEFAULT_OBSERVING_DURATION_MS = TimeUnit.DAYS.toMillis(2);
    // Sliding window for tracking how many mitigation calls were made for a package.
    @VisibleForTesting
    static final long DEFAULT_DEESCALATION_WINDOW_MS = TimeUnit.HOURS.toMillis(1);
    // Whether explicit health checks are enabled or not
    private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true;

    // Count handed to the BootThreshold created in the constructor
    @VisibleForTesting
    static final int DEFAULT_BOOT_LOOP_TRIGGER_COUNT = 5;

    // Window handed to the BootThreshold created in the constructor
    static final long DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS = TimeUnit.MINUTES.toMillis(10);

    // Time needed to apply mitigation
    // (name of the system property read by getMitigationWindowMs)
    private static final String MITIGATION_WINDOW_MS =
            "persist.device_config.configuration.mitigation_window_ms";
    // Fallback used when the system property above is not set
    @VisibleForTesting
    static final long DEFAULT_MITIGATION_WINDOW_MS = TimeUnit.SECONDS.toMillis(5);

    // Threshold level at which or above user might experience significant disruption.
    // (name of the system property read by getUserImpactLevelLimit)
    private static final String MAJOR_USER_IMPACT_LEVEL_THRESHOLD =
            "persist.device_config.configuration.major_user_impact_level_threshold";
    // Fallback used when the system property above is not set
    private static final int DEFAULT_MAJOR_USER_IMPACT_LEVEL_THRESHOLD =
            PackageHealthObserverImpact.USER_IMPACT_LEVEL_71;

    // Native crash polls still allowed; decremented on every checkAndMitigateNativeCrashes run,
    // which is documented to run only on mShortTaskHandler.
    private long mNumberOfNativeCrashPollsRemaining;

    // NOTE(review): the version, tag and attribute names below are presumably used when reading
    // and writing the policy file; that code is outside this view -- confirm.
    private static final int DB_VERSION = 1;
    private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog";
    private static final String TAG_PACKAGE = "package";
    private static final String TAG_OBSERVER = "observer";
    private static final String ATTR_VERSION = "version";
    private static final String ATTR_NAME = "name";
    private static final String ATTR_DURATION = "duration";
    private static final String ATTR_EXPLICIT_HEALTH_CHECK_DURATION = "health-check-duration";
    private static final String ATTR_PASSED_HEALTH_CHECK = "passed-health-check";
    private static final String ATTR_MITIGATION_CALLS = "mitigation-calls";
    private static final String ATTR_MITIGATION_COUNT = "mitigation-count";

    // A file containing information about the current mitigation count in the case of a boot loop.
    // This allows boot loop information to persist in the case of an fs-checkpoint being
    // aborted.
    private static final String METADATA_FILE = "/metadata/watchdog/mitigation_count.txt";
171 
    // Singleton instance; assigned by the constructor and returned by getInstance.
    @GuardedBy("PackageWatchdog.class")
    private static PackageWatchdog sPackageWatchdog;

    // Lock guarding the mutable state annotated with @GuardedBy("mLock") below
    private final Object mLock = new Object();
    // System server context
    private final Context mContext;
    // Handler to run short running tasks
    private final Handler mShortTaskHandler;
    // Handler for processing IO and long running tasks
    private final Handler mLongTaskHandler;
    // Contains (observer-name -> observer-handle) that have ever been registered from
    // previous boots. Observers with all packages expired are periodically pruned.
    // It is saved to disk on system shutdown and repopulated on startup so it survives reboots.
    @GuardedBy("mLock")
    private final ArrayMap<String, ObserverInternal> mAllObservers = new ArrayMap<>();
    // File containing the XML data of monitored packages /data/system/package-watchdog.xml
    private final AtomicFile mPolicyFile;
    private final ExplicitHealthCheckController mHealthCheckController;
    private final ConnectivityModuleConnector mConnectivityModuleConnector;
    // Pre-bound runnables for the corresponding instance methods.
    // writeNow() passes mSaveToFile to mLongTaskHandler.removeCallbacks to cancel a pending save.
    private final Runnable mSyncRequests = this::syncRequests;
    private final Runnable mSyncStateWithScheduledReason = this::syncStateWithScheduledReason;
    private final Runnable mSaveToFile = this::saveToFile;
    // Source of uptime readings; injected through the constructor
    private final SystemClock mSystemClock;
    // Boot counter consulted by noteBoot to decide whether to start boot loop mitigation
    private final BootThreshold mBootThreshold;
    private final DeviceConfig.OnPropertiesChangedListener
            mOnPropertyChangedListener = this::onPropertyChanged;

    // The set of packages that have been synced with the ExplicitHealthCheckController
    @GuardedBy("mLock")
    private Set<String> mRequestedHealthCheckPackages = new ArraySet<>();
    // Set to true by onPackagesReady
    @GuardedBy("mLock")
    private boolean mIsPackagesReady;
    // Flag to control whether explicit health checks are supported or not
    @GuardedBy("mLock")
    private boolean mIsHealthCheckEnabled = DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED;
    @GuardedBy("mLock")
    private int mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
    @GuardedBy("mLock")
    private int mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
    // SystemClock#uptimeMillis when we last executed #syncState
    // 0 if no prune is scheduled.
    @GuardedBy("mLock")
    private long mUptimeAtLastStateSync;
    // If true, sync explicit health check packages with the ExplicitHealthCheckController.
    @GuardedBy("mLock")
    private boolean mSyncRequired = false;

    // mSystemClock uptime recorded by maybeExecute just before it runs a mitigation.
    // onPackageFailure skips failures reported within getMitigationWindowMs() of this value; the
    // initial value lies far in the past so the default window is not in effect at startup.
    @GuardedBy("mLock")
    private long mLastMitigation = -1000000;
221 
    /**
     * Source of uptime readings for the watchdog. It is a constructor dependency so that a
     * different clock can be injected; the default instance uses
     * {@code android.os.SystemClock::uptimeMillis}.
     */
    @FunctionalInterface
    @VisibleForTesting
    interface SystemClock {
        /** Returns the current uptime in milliseconds. */
        long uptimeMillis();
    }
227 
    /**
     * Creates the instance used by {@link #getInstance} by delegating to the injectable
     * constructor with the default dependencies: the policy file
     * {@code system/package-watchdog.xml} under {@link Environment#getDataDirectory()}, a short
     * task handler bound to the calling thread's looper, the {@link BackgroundThread} handler for
     * long tasks, a new {@link ExplicitHealthCheckController}, the shared
     * {@link ConnectivityModuleConnector} and the {@code android.os.SystemClock} uptime clock.
     *
     * @param context context passed on to the injectable constructor and to the
     *                {@link ExplicitHealthCheckController}
     */
    private PackageWatchdog(Context context) {
        // Needs to be constructed inline
        this(context, new AtomicFile(
                        new File(new File(Environment.getDataDirectory(), "system"),
                                "package-watchdog.xml")),
                new Handler(Looper.myLooper()), BackgroundThread.getHandler(),
                new ExplicitHealthCheckController(context),
                ConnectivityModuleConnector.getInstance(),
                android.os.SystemClock::uptimeMillis);
    }
238 
239     /**
240      * Creates a PackageWatchdog that allows injecting dependencies.
241      */
242     @VisibleForTesting
PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler, Handler longTaskHandler, ExplicitHealthCheckController controller, ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock)243     PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler,
244             Handler longTaskHandler, ExplicitHealthCheckController controller,
245             ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock) {
246         mContext = context;
247         mPolicyFile = policyFile;
248         mShortTaskHandler = shortTaskHandler;
249         mLongTaskHandler = longTaskHandler;
250         mHealthCheckController = controller;
251         mConnectivityModuleConnector = connectivityModuleConnector;
252         mSystemClock = clock;
253         mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
254         mBootThreshold = new BootThreshold(DEFAULT_BOOT_LOOP_TRIGGER_COUNT,
255                 DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS);
256 
257         loadFromFile();
258         sPackageWatchdog = this;
259     }
260 
261     /** Creates or gets singleton instance of PackageWatchdog. */
getInstance(Context context)262     public static PackageWatchdog getInstance(Context context) {
263         synchronized (PackageWatchdog.class) {
264             if (sPackageWatchdog == null) {
265                 new PackageWatchdog(context);
266             }
267             return sPackageWatchdog;
268         }
269     }
270 
    /**
     * Called during boot to notify when packages are ready on the device so we can start
     * binding.
     *
     * <p>While holding {@code mLock} this marks packages as ready, hands the health check
     * controller its three callbacks, installs the property changed listener, refreshes the
     * configuration values and registers the connectivity module health listener.
     */
    public void onPackagesReady() {
        synchronized (mLock) {
            mIsPackagesReady = true;
            // Callbacks, in order: a package passed its health check, the set of supported
            // packages is reported, and a sync request notification.
            mHealthCheckController.setCallbacks(packageName -> onHealthCheckPassed(packageName),
                    packages -> onSupportedPackages(packages),
                    this::onSyncRequestNotified);
            setPropertyChangedListenerLocked();
            updateConfigs();
            registerConnectivityModuleHealthListener();
        }
    }
286 
287     /**
288      * Registers {@code observer} to listen for package failures. Add a new ObserverInternal for
289      * this observer if it does not already exist.
290      *
291      * <p>Observers are expected to call this on boot. It does not specify any packages but
292      * it will resume observing any packages requested from a previous boot.
293      */
registerHealthObserver(PackageHealthObserver observer)294     public void registerHealthObserver(PackageHealthObserver observer) {
295         synchronized (mLock) {
296             ObserverInternal internalObserver = mAllObservers.get(observer.getName());
297             if (internalObserver != null) {
298                 internalObserver.registeredObserver = observer;
299             } else {
300                 internalObserver = new ObserverInternal(observer.getName(), new ArrayList<>());
301                 internalObserver.registeredObserver = observer;
302                 mAllObservers.put(observer.getName(), internalObserver);
303                 syncState("added new observer");
304             }
305         }
306     }
307 
308     /**
309      * Starts observing the health of the {@code packages} for {@code observer} and notifies
310      * {@code observer} of any package failures within the monitoring duration.
311      *
312      * <p>If monitoring a package supporting explicit health check, at the end of the monitoring
313      * duration if {@link #onHealthCheckPassed} was never called,
314      * {@link PackageHealthObserver#execute} will be called as if the package failed.
315      *
316      * <p>If {@code observer} is already monitoring a package in {@code packageNames},
317      * the monitoring window of that package will be reset to {@code durationMs} and the health
318      * check state will be reset to a default depending on if the package is contained in
319      * {@link mPackagesWithExplicitHealthCheckEnabled}.
320      *
321      * <p>If {@code packageNames} is empty, this will be a no-op.
322      *
323      * <p>If {@code durationMs} is less than 1, a default monitoring duration
324      * {@link #DEFAULT_OBSERVING_DURATION_MS} will be used.
325      */
startObservingHealth(PackageHealthObserver observer, List<String> packageNames, long durationMs)326     public void startObservingHealth(PackageHealthObserver observer, List<String> packageNames,
327             long durationMs) {
328         if (packageNames.isEmpty()) {
329             Slog.wtf(TAG, "No packages to observe, " + observer.getName());
330             return;
331         }
332         if (durationMs < 1) {
333             Slog.wtf(TAG, "Invalid duration " + durationMs + "ms for observer "
334                     + observer.getName() + ". Not observing packages " + packageNames);
335             durationMs = DEFAULT_OBSERVING_DURATION_MS;
336         }
337 
338         List<MonitoredPackage> packages = new ArrayList<>();
339         for (int i = 0; i < packageNames.size(); i++) {
340             // Health checks not available yet so health check state will start INACTIVE
341             MonitoredPackage pkg = newMonitoredPackage(packageNames.get(i), durationMs, false);
342             if (pkg != null) {
343                 packages.add(pkg);
344             } else {
345                 Slog.w(TAG, "Failed to create MonitoredPackage for pkg=" + packageNames.get(i));
346             }
347         }
348 
349         if (packages.isEmpty()) {
350             return;
351         }
352 
353         // Sync before we add the new packages to the observers. This will #pruneObservers,
354         // causing any elapsed time to be deducted from all existing packages before we add new
355         // packages. This maintains the invariant that the elapsed time for ALL (new and existing)
356         // packages is the same.
357         mLongTaskHandler.post(() -> {
358             syncState("observing new packages");
359 
360             synchronized (mLock) {
361                 ObserverInternal oldObserver = mAllObservers.get(observer.getName());
362                 if (oldObserver == null) {
363                     Slog.d(TAG, observer.getName() + " started monitoring health "
364                             + "of packages " + packageNames);
365                     mAllObservers.put(observer.getName(),
366                             new ObserverInternal(observer.getName(), packages));
367                 } else {
368                     Slog.d(TAG, observer.getName() + " added the following "
369                             + "packages to monitor " + packageNames);
370                     oldObserver.updatePackagesLocked(packages);
371                 }
372             }
373 
374             // Register observer in case not already registered
375             registerHealthObserver(observer);
376 
377             // Sync after we add the new packages to the observers. We may have received packges
378             // requiring an earlier schedule than we are currently scheduled for.
379             syncState("updated observers");
380         });
381 
382     }
383 
384     /**
385      * Unregisters {@code observer} from listening to package failure.
386      * Additionally, this stops observing any packages that may have previously been observed
387      * even from a previous boot.
388      */
unregisterHealthObserver(PackageHealthObserver observer)389     public void unregisterHealthObserver(PackageHealthObserver observer) {
390         mLongTaskHandler.post(() -> {
391             synchronized (mLock) {
392                 mAllObservers.remove(observer.getName());
393             }
394             syncState("unregistering observer: " + observer.getName());
395         });
396     }
397 
398     /**
399      * Called when a process fails due to a crash, ANR or explicit health check.
400      *
401      * <p>For each package contained in the process, one registered observer with the least user
402      * impact will be notified for mitigation.
403      *
404      * <p>This method could be called frequently if there is a severe problem on the device.
405      */
onPackageFailure(List<VersionedPackage> packages, @FailureReasons int failureReason)406     public void onPackageFailure(List<VersionedPackage> packages,
407             @FailureReasons int failureReason) {
408         if (packages == null) {
409             Slog.w(TAG, "Could not resolve a list of failing packages");
410             return;
411         }
412         synchronized (mLock) {
413             final long now = mSystemClock.uptimeMillis();
414             if (Flags.recoverabilityDetection()) {
415                 if (now >= mLastMitigation
416                         && (now - mLastMitigation) < getMitigationWindowMs()) {
417                     Slog.i(TAG, "Skipping onPackageFailure mitigation");
418                     return;
419                 }
420             }
421         }
422         mLongTaskHandler.post(() -> {
423             synchronized (mLock) {
424                 if (mAllObservers.isEmpty()) {
425                     return;
426                 }
427                 boolean requiresImmediateAction = (failureReason == FAILURE_REASON_NATIVE_CRASH
428                         || failureReason == FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
429                 if (requiresImmediateAction) {
430                     handleFailureImmediately(packages, failureReason);
431                 } else {
432                     for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
433                         VersionedPackage versionedPackage = packages.get(pIndex);
434                         // Observer that will receive failure for versionedPackage
435                         PackageHealthObserver currentObserverToNotify = null;
436                         int currentObserverImpact = Integer.MAX_VALUE;
437                         MonitoredPackage currentMonitoredPackage = null;
438 
439                         // Find observer with least user impact
440                         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
441                             ObserverInternal observer = mAllObservers.valueAt(oIndex);
442                             PackageHealthObserver registeredObserver = observer.registeredObserver;
443                             if (registeredObserver != null
444                                     && observer.onPackageFailureLocked(
445                                     versionedPackage.getPackageName())) {
446                                 MonitoredPackage p = observer.getMonitoredPackage(
447                                         versionedPackage.getPackageName());
448                                 int mitigationCount = 1;
449                                 if (p != null) {
450                                     mitigationCount = p.getMitigationCountLocked() + 1;
451                                 }
452                                 int impact = registeredObserver.onHealthCheckFailed(
453                                         versionedPackage, failureReason, mitigationCount);
454                                 if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
455                                         && impact < currentObserverImpact) {
456                                     currentObserverToNotify = registeredObserver;
457                                     currentObserverImpact = impact;
458                                     currentMonitoredPackage = p;
459                                 }
460                             }
461                         }
462 
463                         // Execute action with least user impact
464                         if (currentObserverToNotify != null) {
465                             int mitigationCount = 1;
466                             if (currentMonitoredPackage != null) {
467                                 currentMonitoredPackage.noteMitigationCallLocked();
468                                 mitigationCount =
469                                         currentMonitoredPackage.getMitigationCountLocked();
470                             }
471                             if (Flags.recoverabilityDetection()) {
472                                 maybeExecute(currentObserverToNotify, versionedPackage,
473                                         failureReason, currentObserverImpact, mitigationCount);
474                             } else {
475                                 currentObserverToNotify.execute(versionedPackage,
476                                         failureReason, mitigationCount);
477                             }
478                         }
479                     }
480                 }
481             }
482         });
483     }
484 
485     /**
486      * For native crashes or explicit health check failures, call directly into each observer to
487      * mitigate the error without going through failure threshold logic.
488      */
handleFailureImmediately(List<VersionedPackage> packages, @FailureReasons int failureReason)489     private void handleFailureImmediately(List<VersionedPackage> packages,
490             @FailureReasons int failureReason) {
491         VersionedPackage failingPackage = packages.size() > 0 ? packages.get(0) : null;
492         PackageHealthObserver currentObserverToNotify = null;
493         int currentObserverImpact = Integer.MAX_VALUE;
494         for (ObserverInternal observer: mAllObservers.values()) {
495             PackageHealthObserver registeredObserver = observer.registeredObserver;
496             if (registeredObserver != null) {
497                 int impact = registeredObserver.onHealthCheckFailed(
498                         failingPackage, failureReason, 1);
499                 if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
500                         && impact < currentObserverImpact) {
501                     currentObserverToNotify = registeredObserver;
502                     currentObserverImpact = impact;
503                 }
504             }
505         }
506         if (currentObserverToNotify != null) {
507             if (Flags.recoverabilityDetection()) {
508                 maybeExecute(currentObserverToNotify, failingPackage, failureReason,
509                         currentObserverImpact, /*mitigationCount=*/ 1);
510             } else {
511                 currentObserverToNotify.execute(failingPackage,  failureReason, 1);
512             }
513         }
514     }
515 
maybeExecute(PackageHealthObserver currentObserverToNotify, VersionedPackage versionedPackage, @FailureReasons int failureReason, int currentObserverImpact, int mitigationCount)516     private void maybeExecute(PackageHealthObserver currentObserverToNotify,
517                               VersionedPackage versionedPackage,
518                               @FailureReasons int failureReason,
519                               int currentObserverImpact,
520                               int mitigationCount) {
521         if (currentObserverImpact < getUserImpactLevelLimit()) {
522             synchronized (mLock) {
523                 mLastMitigation = mSystemClock.uptimeMillis();
524             }
525             currentObserverToNotify.execute(versionedPackage, failureReason, mitigationCount);
526         }
527     }
528 
getMitigationWindowMs()529     private long getMitigationWindowMs() {
530         return SystemProperties.getLong(MITIGATION_WINDOW_MS, DEFAULT_MITIGATION_WINDOW_MS);
531     }
532 
533 
534     /**
535      * Called when the system server boots. If the system server is detected to be in a boot loop,
536      * query each observer and perform the mitigation action with the lowest user impact.
537      *
538      * Note: PackageWatchdog considers system_server restart loop as bootloop. Full reboots
539      * are not counted in bootloop.
540      */
541     @SuppressWarnings("GuardedBy")
noteBoot()542     public void noteBoot() {
543         synchronized (mLock) {
544             // if boot count has reached threshold, start mitigation.
545             // We wait until threshold number of restarts only for the first time. Perform
546             // mitigations for every restart after that.
547             boolean mitigate = mBootThreshold.incrementAndTest();
548             if (mitigate) {
549                 if (!Flags.recoverabilityDetection()) {
550                     mBootThreshold.reset();
551                 }
552                 int mitigationCount = mBootThreshold.getMitigationCount() + 1;
553                 PackageHealthObserver currentObserverToNotify = null;
554                 ObserverInternal currentObserverInternal = null;
555                 int currentObserverImpact = Integer.MAX_VALUE;
556                 for (int i = 0; i < mAllObservers.size(); i++) {
557                     final ObserverInternal observer = mAllObservers.valueAt(i);
558                     PackageHealthObserver registeredObserver = observer.registeredObserver;
559                     if (registeredObserver != null) {
560                         int impact = Flags.recoverabilityDetection()
561                                 ? registeredObserver.onBootLoop(
562                                         observer.getBootMitigationCount() + 1)
563                                 : registeredObserver.onBootLoop(mitigationCount);
564                         if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
565                                 && impact < currentObserverImpact) {
566                             currentObserverToNotify = registeredObserver;
567                             currentObserverInternal = observer;
568                             currentObserverImpact = impact;
569                         }
570                     }
571                 }
572                 if (currentObserverToNotify != null) {
573                     if (Flags.recoverabilityDetection()) {
574                         int currentObserverMitigationCount =
575                                 currentObserverInternal.getBootMitigationCount() + 1;
576                         currentObserverInternal.setBootMitigationCount(
577                                 currentObserverMitigationCount);
578                         saveAllObserversBootMitigationCountToMetadata(METADATA_FILE);
579                         currentObserverToNotify.executeBootLoopMitigation(
580                                 currentObserverMitigationCount);
581                     } else {
582                         mBootThreshold.setMitigationCount(mitigationCount);
583                         mBootThreshold.saveMitigationCountToMetadata();
584                         currentObserverToNotify.executeBootLoopMitigation(mitigationCount);
585                     }
586                 }
587             }
588         }
589     }
590 
591     // TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also
592     // avoid holding lock?
593     // This currently adds about 7ms extra to shutdown thread
594     /** Writes the package information to file during shutdown. */
writeNow()595     public void writeNow() {
596         synchronized (mLock) {
597             // Must only run synchronous tasks as this runs on the ShutdownThread and no other
598             // thread is guaranteed to run during shutdown.
599             if (!mAllObservers.isEmpty()) {
600                 mLongTaskHandler.removeCallbacks(mSaveToFile);
601                 pruneObserversLocked();
602                 saveToFile();
603                 Slog.i(TAG, "Last write to update package durations");
604             }
605         }
606     }
607 
608     /**
609      * Enables or disables explicit health checks.
610      * <p> If explicit health checks are enabled, the health check service is started.
611      * <p> If explicit health checks are disabled, pending explicit health check requests are
612      * passed and the health check service is stopped.
613      */
setExplicitHealthCheckEnabled(boolean enabled)614     private void setExplicitHealthCheckEnabled(boolean enabled) {
615         synchronized (mLock) {
616             mIsHealthCheckEnabled = enabled;
617             mHealthCheckController.setEnabled(enabled);
618             mSyncRequired = true;
619             // Prune to update internal state whenever health check is enabled/disabled
620             syncState("health check state " + (enabled ? "enabled" : "disabled"));
621         }
622     }
623 
624     /**
625      * This method should be only called on mShortTaskHandler, since it modifies
626      * {@link #mNumberOfNativeCrashPollsRemaining}.
627      */
checkAndMitigateNativeCrashes()628     private void checkAndMitigateNativeCrashes() {
629         mNumberOfNativeCrashPollsRemaining--;
630         // Check if native watchdog reported a crash
631         if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
632             // We rollback all available low impact rollbacks when crash is unattributable
633             onPackageFailure(Collections.EMPTY_LIST, FAILURE_REASON_NATIVE_CRASH);
634             // we stop polling after an attempt to execute rollback, regardless of whether the
635             // attempt succeeds or not
636         } else {
637             if (mNumberOfNativeCrashPollsRemaining > 0) {
638                 mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
639                         NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
640             }
641         }
642     }
643 
644     /**
645      * Since this method can eventually trigger a rollback, it should be called
646      * only once boot has completed {@code onBootCompleted} and not earlier, because the install
647      * session must be entirely completed before we try to rollback.
648      */
scheduleCheckAndMitigateNativeCrashes()649     public void scheduleCheckAndMitigateNativeCrashes() {
650         Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check "
651                 + "and mitigate native crashes");
652         mShortTaskHandler.post(()->checkAndMitigateNativeCrashes());
653     }
654 
getUserImpactLevelLimit()655     private int getUserImpactLevelLimit() {
656         return SystemProperties.getInt(MAJOR_USER_IMPACT_LEVEL_THRESHOLD,
657                 DEFAULT_MAJOR_USER_IMPACT_LEVEL_THRESHOLD);
658     }
659 
    /**
     * Possible severity values of the user impact of a {@link PackageHealthObserver#execute}.
     *
     * <p>Values are returned by {@link PackageHealthObserver#onHealthCheckFailed} and
     * {@link PackageHealthObserver#onBootLoop}. The numeric value of each constant matches the
     * number in its name.
     */
    @Retention(SOURCE)
    @IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_LEVEL_0,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_10,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_20,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_30,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_40,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_50,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_70,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_71,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_75,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_80,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_90,
                     PackageHealthObserverImpact.USER_IMPACT_LEVEL_100})
    public @interface PackageHealthObserverImpact {
        /** No action to take. */
        int USER_IMPACT_LEVEL_0 = 0;
        /** Action has low user impact, user of a device will barely notice. */
        int USER_IMPACT_LEVEL_10 = 10;
        /** Actions having medium user impact, user of a device will likely notice. */
        int USER_IMPACT_LEVEL_20 = 20;
        // NOTE(review): levels 30 through 70 carry no description of their own; they appear to
        // share the "medium user impact" description given for level 20 -- confirm.
        int USER_IMPACT_LEVEL_30 = 30;
        int USER_IMPACT_LEVEL_40 = 40;
        int USER_IMPACT_LEVEL_50 = 50;
        int USER_IMPACT_LEVEL_70 = 70;
        /**
         * Action has high user impact, a last resort, user of a device will be very frustrated.
         */
        int USER_IMPACT_LEVEL_71 = 71;
        // NOTE(review): levels 75 through 100 carry no description of their own; they appear to
        // share the "high user impact" description given for level 71 -- confirm.
        int USER_IMPACT_LEVEL_75 = 75;
        int USER_IMPACT_LEVEL_80 = 80;
        int USER_IMPACT_LEVEL_90 = 90;
        int USER_IMPACT_LEVEL_100 = 100;
    }
692 
    /**
     * Register instances of this interface to receive notifications on package failure.
     *
     * <p>Implementations must provide {@link #onHealthCheckFailed}, {@link #execute} and
     * {@link #getName}; the remaining methods have defaults that report no impact, perform no
     * mitigation, and opt out of persistence and of observing unrequested packages.
     */
    public interface PackageHealthObserver {
        /**
         * Called when health check fails for the {@code versionedPackage}.
         *
         * @param versionedPackage the package that is failing. This may be null if a native
         *                          service is crashing.
         * @param failureReason   the type of failure that is occurring.
         * @param mitigationCount the number of times mitigation has been called for this package
         *                        (including this time).
         *
         * @return any one of {@link PackageHealthObserverImpact} to express the impact
         * to the user on {@link #execute}
         */
        @PackageHealthObserverImpact int onHealthCheckFailed(
                @Nullable VersionedPackage versionedPackage,
                @FailureReasons int failureReason,
                int mitigationCount);

        /**
         * Executes mitigation for {@link #onHealthCheckFailed}.
         *
         * @param versionedPackage the package that is failing. This may be null if a native
         *                          service is crashing.
         * @param failureReason   the type of failure that is occurring.
         * @param mitigationCount the number of times mitigation has been called for this package
         *                        (including this time).
         * @return {@code true} if action was executed successfully, {@code false} otherwise
         */
        boolean execute(@Nullable VersionedPackage versionedPackage,
                @FailureReasons int failureReason, int mitigationCount);


        /**
         * Called when the system server has booted several times within a window of time, defined
         * by {@link PackageWatchdog#mBootThreshold}
         *
         * @param mitigationCount the number of times mitigation has been attempted for this
         *                        boot loop (including this time).
         * @return any one of {@link PackageHealthObserverImpact}; the default implementation
         * returns {@link PackageHealthObserverImpact#USER_IMPACT_LEVEL_0}
         */
        default @PackageHealthObserverImpact int onBootLoop(int mitigationCount) {
            return PackageHealthObserverImpact.USER_IMPACT_LEVEL_0;
        }

        /**
         * Executes mitigation for {@link #onBootLoop}
         * @param mitigationCount the number of times mitigation has been attempted for this
         *                        boot loop (including this time).
         * @return presumably {@code true} if the mitigation was executed successfully, mirroring
         * {@link #execute} (TODO confirm); the default implementation returns {@code false}
         */
        default boolean executeBootLoopMitigation(int mitigationCount) {
            return false;
        }

        // TODO(b/120598832): Ensure uniqueness?
        /**
         * Identifier for the observer, should not change across device updates otherwise the
         * watchdog may drop observing packages with the old name.
         */
        String getName();

        /**
         * An observer will not be pruned if this is set, even if the observer is not explicitly
         * monitoring any packages.
         *
         * @return {@code false} in the default implementation
         */
        default boolean isPersistent() {
            return false;
        }

        /**
         * Returns {@code true} if this observer wishes to observe the given package, {@code false}
         * otherwise
         *
         * <p> A persistent observer may choose to start observing certain failing packages, even if
         * it has not explicitly asked to watch the package with {@link #startObservingHealth}.
         *
         * @param packageName name of the package the observer is asked about
         * @return {@code false} in the default implementation
         */
        default boolean mayObservePackage(String packageName) {
            return false;
        }
    }
773 
774     @VisibleForTesting
getTriggerFailureCount()775     long getTriggerFailureCount() {
776         synchronized (mLock) {
777             return mTriggerFailureCount;
778         }
779     }
780 
781     @VisibleForTesting
getTriggerFailureDurationMs()782     long getTriggerFailureDurationMs() {
783         synchronized (mLock) {
784             return mTriggerFailureDurationMs;
785         }
786     }
787 
788     /**
789      * Serializes and syncs health check requests with the {@link ExplicitHealthCheckController}.
790      */
syncRequestsAsync()791     private void syncRequestsAsync() {
792         mShortTaskHandler.removeCallbacks(mSyncRequests);
793         mShortTaskHandler.post(mSyncRequests);
794     }
795 
796     /**
797      * Syncs health check requests with the {@link ExplicitHealthCheckController}.
798      * Calls to this must be serialized.
799      *
800      * @see #syncRequestsAsync
801      */
syncRequests()802     private void syncRequests() {
803         boolean syncRequired = false;
804         synchronized (mLock) {
805             if (mIsPackagesReady) {
806                 Set<String> packages = getPackagesPendingHealthChecksLocked();
807                 if (mSyncRequired || !packages.equals(mRequestedHealthCheckPackages)
808                         || packages.isEmpty()) {
809                     syncRequired = true;
810                     mRequestedHealthCheckPackages = packages;
811                 }
812             } // else, we will sync requests when packages become ready
813         }
814 
815         // Call outside lock to avoid holding lock when calling into the controller.
816         if (syncRequired) {
817             Slog.i(TAG, "Syncing health check requests for packages: "
818                     + mRequestedHealthCheckPackages);
819             mHealthCheckController.syncRequests(mRequestedHealthCheckPackages);
820             mSyncRequired = false;
821         }
822     }
823 
824     /**
825      * Updates the observers monitoring {@code packageName} that explicit health check has passed.
826      *
827      * <p> This update is strictly for registered observers at the time of the call
828      * Observers that register after this signal will have no knowledge of prior signals and will
829      * effectively behave as if the explicit health check hasn't passed for {@code packageName}.
830      *
831      * <p> {@code packageName} can still be considered failed if reported by
832      * {@link #onPackageFailureLocked} before the package expires.
833      *
834      * <p> Triggered by components outside the system server when they are fully functional after an
835      * update.
836      */
onHealthCheckPassed(String packageName)837     private void onHealthCheckPassed(String packageName) {
838         Slog.i(TAG, "Health check passed for package: " + packageName);
839         boolean isStateChanged = false;
840 
841         synchronized (mLock) {
842             for (int observerIdx = 0; observerIdx < mAllObservers.size(); observerIdx++) {
843                 ObserverInternal observer = mAllObservers.valueAt(observerIdx);
844                 MonitoredPackage monitoredPackage = observer.getMonitoredPackage(packageName);
845 
846                 if (monitoredPackage != null) {
847                     int oldState = monitoredPackage.getHealthCheckStateLocked();
848                     int newState = monitoredPackage.tryPassHealthCheckLocked();
849                     isStateChanged |= oldState != newState;
850                 }
851             }
852         }
853 
854         if (isStateChanged) {
855             syncState("health check passed for " + packageName);
856         }
857     }
858 
onSupportedPackages(List<PackageConfig> supportedPackages)859     private void onSupportedPackages(List<PackageConfig> supportedPackages) {
860         boolean isStateChanged = false;
861 
862         Map<String, Long> supportedPackageTimeouts = new ArrayMap<>();
863         Iterator<PackageConfig> it = supportedPackages.iterator();
864         while (it.hasNext()) {
865             PackageConfig info = it.next();
866             supportedPackageTimeouts.put(info.getPackageName(), info.getHealthCheckTimeoutMillis());
867         }
868 
869         synchronized (mLock) {
870             Slog.d(TAG, "Received supported packages " + supportedPackages);
871             Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
872             while (oit.hasNext()) {
873                 Iterator<MonitoredPackage> pit = oit.next().getMonitoredPackages()
874                         .values().iterator();
875                 while (pit.hasNext()) {
876                     MonitoredPackage monitoredPackage = pit.next();
877                     String packageName = monitoredPackage.getName();
878                     int oldState = monitoredPackage.getHealthCheckStateLocked();
879                     int newState;
880 
881                     if (supportedPackageTimeouts.containsKey(packageName)) {
882                         // Supported packages become ACTIVE if currently INACTIVE
883                         newState = monitoredPackage.setHealthCheckActiveLocked(
884                                 supportedPackageTimeouts.get(packageName));
885                     } else {
886                         // Unsupported packages are marked as PASSED unless already FAILED
887                         newState = monitoredPackage.tryPassHealthCheckLocked();
888                     }
889                     isStateChanged |= oldState != newState;
890                 }
891             }
892         }
893 
894         if (isStateChanged) {
895             syncState("updated health check supported packages " + supportedPackages);
896         }
897     }
898 
onSyncRequestNotified()899     private void onSyncRequestNotified() {
900         synchronized (mLock) {
901             mSyncRequired = true;
902             syncRequestsAsync();
903         }
904     }
905 
906     @GuardedBy("mLock")
getPackagesPendingHealthChecksLocked()907     private Set<String> getPackagesPendingHealthChecksLocked() {
908         Set<String> packages = new ArraySet<>();
909         Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
910         while (oit.hasNext()) {
911             ObserverInternal observer = oit.next();
912             Iterator<MonitoredPackage> pit =
913                     observer.getMonitoredPackages().values().iterator();
914             while (pit.hasNext()) {
915                 MonitoredPackage monitoredPackage = pit.next();
916                 String packageName = monitoredPackage.getName();
917                 if (monitoredPackage.isPendingHealthChecksLocked()) {
918                     packages.add(packageName);
919                 }
920             }
921         }
922         return packages;
923     }
924 
925     /**
926      * Syncs the state of the observers.
927      *
928      * <p> Prunes all observers, saves new state to disk, syncs health check requests with the
929      * health check service and schedules the next state sync.
930      */
syncState(String reason)931     private void syncState(String reason) {
932         synchronized (mLock) {
933             Slog.i(TAG, "Syncing state, reason: " + reason);
934             pruneObserversLocked();
935 
936             saveToFileAsync();
937             syncRequestsAsync();
938 
939             // Done syncing state, schedule the next state sync
940             scheduleNextSyncStateLocked();
941         }
942     }
943 
syncStateWithScheduledReason()944     private void syncStateWithScheduledReason() {
945         syncState("scheduled");
946     }
947 
948     @GuardedBy("mLock")
scheduleNextSyncStateLocked()949     private void scheduleNextSyncStateLocked() {
950         long durationMs = getNextStateSyncMillisLocked();
951         mShortTaskHandler.removeCallbacks(mSyncStateWithScheduledReason);
952         if (durationMs == Long.MAX_VALUE) {
953             Slog.i(TAG, "Cancelling state sync, nothing to sync");
954             mUptimeAtLastStateSync = 0;
955         } else {
956             mUptimeAtLastStateSync = mSystemClock.uptimeMillis();
957             mShortTaskHandler.postDelayed(mSyncStateWithScheduledReason, durationMs);
958         }
959     }
960 
961     /**
962      * Returns the next duration in millis to sync the watchdog state.
963      *
964      * @returns Long#MAX_VALUE if there are no observed packages.
965      */
966     @GuardedBy("mLock")
getNextStateSyncMillisLocked()967     private long getNextStateSyncMillisLocked() {
968         long shortestDurationMs = Long.MAX_VALUE;
969         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
970             ArrayMap<String, MonitoredPackage> packages = mAllObservers.valueAt(oIndex)
971                     .getMonitoredPackages();
972             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
973                 MonitoredPackage mp = packages.valueAt(pIndex);
974                 long duration = mp.getShortestScheduleDurationMsLocked();
975                 if (duration < shortestDurationMs) {
976                     shortestDurationMs = duration;
977                 }
978             }
979         }
980         return shortestDurationMs;
981     }
982 
983     /**
984      * Removes {@code elapsedMs} milliseconds from all durations on monitored packages
985      * and updates other internal state.
986      */
987     @GuardedBy("mLock")
pruneObserversLocked()988     private void pruneObserversLocked() {
989         long elapsedMs = mUptimeAtLastStateSync == 0
990                 ? 0 : mSystemClock.uptimeMillis() - mUptimeAtLastStateSync;
991         if (elapsedMs <= 0) {
992             Slog.i(TAG, "Not pruning observers, elapsed time: " + elapsedMs + "ms");
993             return;
994         }
995 
996         Iterator<ObserverInternal> it = mAllObservers.values().iterator();
997         while (it.hasNext()) {
998             ObserverInternal observer = it.next();
999             Set<MonitoredPackage> failedPackages =
1000                     observer.prunePackagesLocked(elapsedMs);
1001             if (!failedPackages.isEmpty()) {
1002                 onHealthCheckFailed(observer, failedPackages);
1003             }
1004             if (observer.getMonitoredPackages().isEmpty() && (observer.registeredObserver == null
1005                     || !observer.registeredObserver.isPersistent())) {
1006                 Slog.i(TAG, "Discarding observer " + observer.name + ". All packages expired");
1007                 it.remove();
1008             }
1009         }
1010     }
1011 
onHealthCheckFailed(ObserverInternal observer, Set<MonitoredPackage> failedPackages)1012     private void onHealthCheckFailed(ObserverInternal observer,
1013             Set<MonitoredPackage> failedPackages) {
1014         mLongTaskHandler.post(() -> {
1015             synchronized (mLock) {
1016                 PackageHealthObserver registeredObserver = observer.registeredObserver;
1017                 if (registeredObserver != null) {
1018                     Iterator<MonitoredPackage> it = failedPackages.iterator();
1019                     while (it.hasNext()) {
1020                         VersionedPackage versionedPkg = getVersionedPackage(it.next().getName());
1021                         if (versionedPkg != null) {
1022                             Slog.i(TAG,
1023                                     "Explicit health check failed for package " + versionedPkg);
1024                             registeredObserver.execute(versionedPkg,
1025                                     PackageWatchdog.FAILURE_REASON_EXPLICIT_HEALTH_CHECK, 1);
1026                         }
1027                     }
1028                 }
1029             }
1030         });
1031     }
1032 
1033     /**
1034      * Gets PackageInfo for the given package. Matches any user and apex.
1035      *
1036      * @throws PackageManager.NameNotFoundException if no such package is installed.
1037      */
getPackageInfo(String packageName)1038     private PackageInfo getPackageInfo(String packageName)
1039             throws PackageManager.NameNotFoundException {
1040         PackageManager pm = mContext.getPackageManager();
1041         try {
1042             // The MATCH_ANY_USER flag doesn't mix well with the MATCH_APEX
1043             // flag, so make two separate attempts to get the package info.
1044             // We don't need both flags at the same time because we assume
1045             // apex files are always installed for all users.
1046             return pm.getPackageInfo(packageName, PackageManager.MATCH_ANY_USER);
1047         } catch (PackageManager.NameNotFoundException e) {
1048             return pm.getPackageInfo(packageName, PackageManager.MATCH_APEX);
1049         }
1050     }
1051 
1052     @Nullable
getVersionedPackage(String packageName)1053     private VersionedPackage getVersionedPackage(String packageName) {
1054         final PackageManager pm = mContext.getPackageManager();
1055         if (pm == null || TextUtils.isEmpty(packageName)) {
1056             return null;
1057         }
1058         try {
1059             final long versionCode = getPackageInfo(packageName).getLongVersionCode();
1060             return new VersionedPackage(packageName, versionCode);
1061         } catch (PackageManager.NameNotFoundException e) {
1062             return null;
1063         }
1064     }
1065 
1066     /**
1067      * Loads mAllObservers from file.
1068      *
1069      * <p>Note that this is <b>not</b> thread safe and should only called be called
1070      * from the constructor.
1071      */
loadFromFile()1072     private void loadFromFile() {
1073         InputStream infile = null;
1074         mAllObservers.clear();
1075         try {
1076             infile = mPolicyFile.openRead();
1077             final TypedXmlPullParser parser = Xml.resolvePullParser(infile);
1078             XmlUtils.beginDocument(parser, TAG_PACKAGE_WATCHDOG);
1079             int outerDepth = parser.getDepth();
1080             while (XmlUtils.nextElementWithin(parser, outerDepth)) {
1081                 ObserverInternal observer = ObserverInternal.read(parser, this);
1082                 if (observer != null) {
1083                     mAllObservers.put(observer.name, observer);
1084                 }
1085             }
1086         } catch (FileNotFoundException e) {
1087             // Nothing to monitor
1088         } catch (IOException | NumberFormatException | XmlPullParserException e) {
1089             Slog.wtf(TAG, "Unable to read monitored packages, deleting file", e);
1090             mPolicyFile.delete();
1091         } finally {
1092             IoUtils.closeQuietly(infile);
1093         }
1094     }
1095 
onPropertyChanged(DeviceConfig.Properties properties)1096     private void onPropertyChanged(DeviceConfig.Properties properties) {
1097         try {
1098             updateConfigs();
1099         } catch (Exception ignore) {
1100             Slog.w(TAG, "Failed to reload device config changes");
1101         }
1102     }
1103 
1104     /** Adds a {@link DeviceConfig#OnPropertiesChangedListener}. */
setPropertyChangedListenerLocked()1105     private void setPropertyChangedListenerLocked() {
1106         DeviceConfig.addOnPropertiesChangedListener(
1107                 DeviceConfig.NAMESPACE_ROLLBACK,
1108                 mContext.getMainExecutor(),
1109                 mOnPropertyChangedListener);
1110     }
1111 
1112     @VisibleForTesting
removePropertyChangedListener()1113     void removePropertyChangedListener() {
1114         DeviceConfig.removeOnPropertiesChangedListener(mOnPropertyChangedListener);
1115     }
1116 
1117     /**
1118      * Health check is enabled or disabled after reading the flags
1119      * from DeviceConfig.
1120      */
1121     @VisibleForTesting
updateConfigs()1122     void updateConfigs() {
1123         synchronized (mLock) {
1124             mTriggerFailureCount = DeviceConfig.getInt(
1125                     DeviceConfig.NAMESPACE_ROLLBACK,
1126                     PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT,
1127                     DEFAULT_TRIGGER_FAILURE_COUNT);
1128             if (mTriggerFailureCount <= 0) {
1129                 mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
1130             }
1131 
1132             mTriggerFailureDurationMs = DeviceConfig.getInt(
1133                     DeviceConfig.NAMESPACE_ROLLBACK,
1134                     PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS,
1135                     DEFAULT_TRIGGER_FAILURE_DURATION_MS);
1136             if (mTriggerFailureDurationMs <= 0) {
1137                 mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
1138             }
1139 
1140             setExplicitHealthCheckEnabled(DeviceConfig.getBoolean(
1141                     DeviceConfig.NAMESPACE_ROLLBACK,
1142                     PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED,
1143                     DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED));
1144         }
1145     }
1146 
registerConnectivityModuleHealthListener()1147     private void registerConnectivityModuleHealthListener() {
1148         // TODO: have an internal method to trigger a rollback by reporting high severity errors,
1149         // and rely on ActivityManager to inform the watchdog of severe network stack crashes
1150         // instead of having this listener in parallel.
1151         mConnectivityModuleConnector.registerHealthListener(
1152                 packageName -> {
1153                     final VersionedPackage pkg = getVersionedPackage(packageName);
1154                     if (pkg == null) {
1155                         Slog.wtf(TAG, "NetworkStack failed but could not find its package");
1156                         return;
1157                     }
1158                     final List<VersionedPackage> pkgList = Collections.singletonList(pkg);
1159                     onPackageFailure(pkgList, FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
1160                 });
1161     }
1162 
1163     /**
1164      * Persists mAllObservers to file. Threshold information is ignored.
1165      */
saveToFile()1166     private boolean saveToFile() {
1167         Slog.i(TAG, "Saving observer state to file");
1168         synchronized (mLock) {
1169             FileOutputStream stream;
1170             try {
1171                 stream = mPolicyFile.startWrite();
1172             } catch (IOException e) {
1173                 Slog.w(TAG, "Cannot update monitored packages", e);
1174                 return false;
1175             }
1176 
1177             try {
1178                 TypedXmlSerializer out = Xml.resolveSerializer(stream);
1179                 out.startDocument(null, true);
1180                 out.startTag(null, TAG_PACKAGE_WATCHDOG);
1181                 out.attributeInt(null, ATTR_VERSION, DB_VERSION);
1182                 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
1183                     mAllObservers.valueAt(oIndex).writeLocked(out);
1184                 }
1185                 out.endTag(null, TAG_PACKAGE_WATCHDOG);
1186                 out.endDocument();
1187                 mPolicyFile.finishWrite(stream);
1188                 return true;
1189             } catch (IOException e) {
1190                 Slog.w(TAG, "Failed to save monitored packages, restoring backup", e);
1191                 mPolicyFile.failWrite(stream);
1192                 return false;
1193             } finally {
1194                 IoUtils.closeQuietly(stream);
1195             }
1196         }
1197     }
1198 
saveToFileAsync()1199     private void saveToFileAsync() {
1200         if (!mLongTaskHandler.hasCallbacks(mSaveToFile)) {
1201             mLongTaskHandler.post(mSaveToFile);
1202         }
1203     }
1204 
1205     /** Convert a {@code LongArrayQueue} to a String of comma-separated values. */
longArrayQueueToString(LongArrayQueue queue)1206     public static String longArrayQueueToString(LongArrayQueue queue) {
1207         if (queue.size() > 0) {
1208             StringBuilder sb = new StringBuilder();
1209             sb.append(queue.get(0));
1210             for (int i = 1; i < queue.size(); i++) {
1211                 sb.append(",");
1212                 sb.append(queue.get(i));
1213             }
1214             return sb.toString();
1215         }
1216         return "";
1217     }
1218 
1219     /** Parse a comma-separated String of longs into a LongArrayQueue. */
parseLongArrayQueue(String commaSeparatedValues)1220     public static LongArrayQueue parseLongArrayQueue(String commaSeparatedValues) {
1221         LongArrayQueue result = new LongArrayQueue();
1222         if (!TextUtils.isEmpty(commaSeparatedValues)) {
1223             String[] values = commaSeparatedValues.split(",");
1224             for (String value : values) {
1225                 result.addLast(Long.parseLong(value));
1226             }
1227         }
1228         return result;
1229     }
1230 
1231 
1232     /** Dump status of every observer in mAllObservers. */
dump(IndentingPrintWriter pw)1233     public void dump(IndentingPrintWriter pw) {
1234         pw.println("Package Watchdog status");
1235         pw.increaseIndent();
1236         synchronized (mLock) {
1237             for (String observerName : mAllObservers.keySet()) {
1238                 pw.println("Observer name: " + observerName);
1239                 pw.increaseIndent();
1240                 ObserverInternal observerInternal = mAllObservers.get(observerName);
1241                 observerInternal.dump(pw);
1242                 pw.decreaseIndent();
1243             }
1244         }
1245     }
1246 
1247     @VisibleForTesting
1248     @GuardedBy("mLock")
registerObserverInternal(ObserverInternal observerInternal)1249     void registerObserverInternal(ObserverInternal observerInternal) {
1250         mAllObservers.put(observerInternal.name, observerInternal);
1251     }
1252 
1253     /**
1254      * Represents an observer monitoring a set of packages along with the failure thresholds for
1255      * each package.
1256      *
1257      * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing
1258      * instances of this class.
1259      */
1260     static class ObserverInternal {
1261         public final String name;
1262         @GuardedBy("mLock")
1263         private final ArrayMap<String, MonitoredPackage> mPackages = new ArrayMap<>();
1264         @Nullable
1265         @GuardedBy("mLock")
1266         public PackageHealthObserver registeredObserver;
1267         private int mMitigationCount;
1268 
ObserverInternal(String name, List<MonitoredPackage> packages)1269         ObserverInternal(String name, List<MonitoredPackage> packages) {
1270             this(name, packages, /*mitigationCount=*/ 0);
1271         }
1272 
ObserverInternal(String name, List<MonitoredPackage> packages, int mitigationCount)1273         ObserverInternal(String name, List<MonitoredPackage> packages, int mitigationCount) {
1274             this.name = name;
1275             updatePackagesLocked(packages);
1276             this.mMitigationCount = mitigationCount;
1277         }
1278 
1279         /**
1280          * Writes important {@link MonitoredPackage} details for this observer to file.
1281          * Does not persist any package failure thresholds.
1282          */
1283         @GuardedBy("mLock")
writeLocked(TypedXmlSerializer out)1284         public boolean writeLocked(TypedXmlSerializer out) {
1285             try {
1286                 out.startTag(null, TAG_OBSERVER);
1287                 out.attribute(null, ATTR_NAME, name);
1288                 if (Flags.recoverabilityDetection()) {
1289                     out.attributeInt(null, ATTR_MITIGATION_COUNT, mMitigationCount);
1290                 }
1291                 for (int i = 0; i < mPackages.size(); i++) {
1292                     MonitoredPackage p = mPackages.valueAt(i);
1293                     p.writeLocked(out);
1294                 }
1295                 out.endTag(null, TAG_OBSERVER);
1296                 return true;
1297             } catch (IOException e) {
1298                 Slog.w(TAG, "Cannot save observer", e);
1299                 return false;
1300             }
1301         }
1302 
getBootMitigationCount()1303         public int getBootMitigationCount() {
1304             return mMitigationCount;
1305         }
1306 
setBootMitigationCount(int mitigationCount)1307         public void setBootMitigationCount(int mitigationCount) {
1308             mMitigationCount = mitigationCount;
1309         }
1310 
1311         @GuardedBy("mLock")
updatePackagesLocked(List<MonitoredPackage> packages)1312         public void updatePackagesLocked(List<MonitoredPackage> packages) {
1313             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
1314                 MonitoredPackage p = packages.get(pIndex);
1315                 MonitoredPackage existingPackage = getMonitoredPackage(p.getName());
1316                 if (existingPackage != null) {
1317                     existingPackage.updateHealthCheckDuration(p.mDurationMs);
1318                 } else {
1319                     putMonitoredPackage(p);
1320                 }
1321             }
1322         }
1323 
1324         /**
1325          * Reduces the monitoring durations of all packages observed by this observer by
1326          * {@code elapsedMs}. If any duration is less than 0, the package is removed from
1327          * observation. If any health check duration is less than 0, the health check result
1328          * is evaluated.
1329          *
1330          * @return a {@link Set} of packages that were removed from the observer without explicit
1331          * health check passing, or an empty list if no package expired for which an explicit health
1332          * check was still pending
1333          */
1334         @GuardedBy("mLock")
prunePackagesLocked(long elapsedMs)1335         private Set<MonitoredPackage> prunePackagesLocked(long elapsedMs) {
1336             Set<MonitoredPackage> failedPackages = new ArraySet<>();
1337             Iterator<MonitoredPackage> it = mPackages.values().iterator();
1338             while (it.hasNext()) {
1339                 MonitoredPackage p = it.next();
1340                 int oldState = p.getHealthCheckStateLocked();
1341                 int newState = p.handleElapsedTimeLocked(elapsedMs);
1342                 if (oldState != HealthCheckState.FAILED
1343                         && newState == HealthCheckState.FAILED) {
1344                     Slog.i(TAG, "Package " + p.getName() + " failed health check");
1345                     failedPackages.add(p);
1346                 }
1347                 if (p.isExpiredLocked()) {
1348                     it.remove();
1349                 }
1350             }
1351             return failedPackages;
1352         }
1353 
1354         /**
1355          * Increments failure counts of {@code packageName}.
1356          * @returns {@code true} if failure threshold is exceeded, {@code false} otherwise
1357          */
1358         @GuardedBy("mLock")
onPackageFailureLocked(String packageName)1359         public boolean onPackageFailureLocked(String packageName) {
1360             if (getMonitoredPackage(packageName) == null && registeredObserver.isPersistent()
1361                     && registeredObserver.mayObservePackage(packageName)) {
1362                 putMonitoredPackage(sPackageWatchdog.newMonitoredPackage(
1363                         packageName, DEFAULT_OBSERVING_DURATION_MS, false));
1364             }
1365             MonitoredPackage p = getMonitoredPackage(packageName);
1366             if (p != null) {
1367                 return p.onFailureLocked();
1368             }
1369             return false;
1370         }
1371 
1372         /**
1373          * Returns the map of packages monitored by this observer.
1374          *
1375          * @return a mapping of package names to {@link MonitoredPackage} objects.
1376          */
1377         @GuardedBy("mLock")
getMonitoredPackages()1378         public ArrayMap<String, MonitoredPackage> getMonitoredPackages() {
1379             return mPackages;
1380         }
1381 
1382         /**
1383          * Returns the {@link MonitoredPackage} associated with a given package name if the
1384          * package is being monitored by this observer.
1385          *
1386          * @param packageName: the name of the package.
1387          * @return the {@link MonitoredPackage} object associated with the package name if one
1388          *         exists, {@code null} otherwise.
1389          */
1390         @GuardedBy("mLock")
1391         @Nullable
getMonitoredPackage(String packageName)1392         public MonitoredPackage getMonitoredPackage(String packageName) {
1393             return mPackages.get(packageName);
1394         }
1395 
1396         /**
1397          * Associates a {@link MonitoredPackage} with the observer.
1398          *
1399          * @param p: the {@link MonitoredPackage} to store.
1400          */
1401         @GuardedBy("mLock")
putMonitoredPackage(MonitoredPackage p)1402         public void putMonitoredPackage(MonitoredPackage p) {
1403             mPackages.put(p.getName(), p);
1404         }
1405 
1406         /**
1407          * Returns one ObserverInternal from the {@code parser} and advances its state.
1408          *
1409          * <p>Note that this method is <b>not</b> thread safe. It should only be called from
1410          * #loadFromFile which in turn is only called on construction of the
1411          * singleton PackageWatchdog.
1412          **/
read(TypedXmlPullParser parser, PackageWatchdog watchdog)1413         public static ObserverInternal read(TypedXmlPullParser parser, PackageWatchdog watchdog) {
1414             String observerName = null;
1415             int observerMitigationCount = 0;
1416             if (TAG_OBSERVER.equals(parser.getName())) {
1417                 observerName = parser.getAttributeValue(null, ATTR_NAME);
1418                 if (TextUtils.isEmpty(observerName)) {
1419                     Slog.wtf(TAG, "Unable to read observer name");
1420                     return null;
1421                 }
1422             }
1423             List<MonitoredPackage> packages = new ArrayList<>();
1424             int innerDepth = parser.getDepth();
1425             try {
1426                 if (Flags.recoverabilityDetection()) {
1427                     try {
1428                         observerMitigationCount =
1429                                 parser.getAttributeInt(null, ATTR_MITIGATION_COUNT);
1430                     } catch (XmlPullParserException e) {
1431                         Slog.i(
1432                             TAG,
1433                             "ObserverInternal mitigation count was not present.");
1434                     }
1435                 }
1436                 while (XmlUtils.nextElementWithin(parser, innerDepth)) {
1437                     if (TAG_PACKAGE.equals(parser.getName())) {
1438                         try {
1439                             MonitoredPackage pkg = watchdog.parseMonitoredPackage(parser);
1440                             if (pkg != null) {
1441                                 packages.add(pkg);
1442                             }
1443                         } catch (NumberFormatException e) {
1444                             Slog.wtf(TAG, "Skipping package for observer " + observerName, e);
1445                             continue;
1446                         }
1447                     }
1448                 }
1449             } catch (XmlPullParserException | IOException e) {
1450                 Slog.wtf(TAG, "Unable to read observer " + observerName, e);
1451                 return null;
1452             }
1453             if (packages.isEmpty()) {
1454                 return null;
1455             }
1456             return new ObserverInternal(observerName, packages, observerMitigationCount);
1457         }
1458 
1459         /** Dumps information about this observer and the packages it watches. */
dump(IndentingPrintWriter pw)1460         public void dump(IndentingPrintWriter pw) {
1461             boolean isPersistent = registeredObserver != null && registeredObserver.isPersistent();
1462             pw.println("Persistent: " + isPersistent);
1463             for (String packageName : mPackages.keySet()) {
1464                 MonitoredPackage p = getMonitoredPackage(packageName);
1465                 pw.println(packageName +  ": ");
1466                 pw.increaseIndent();
1467                 pw.println("# Failures: " + p.mFailureHistory.size());
1468                 pw.println("Monitoring duration remaining: " + p.mDurationMs + "ms");
1469                 pw.println("Explicit health check duration: " + p.mHealthCheckDurationMs + "ms");
1470                 pw.println("Health check state: " + p.toString(p.mHealthCheckState));
1471                 pw.decreaseIndent();
1472             }
1473         }
1474     }
1475 
1476     @Retention(SOURCE)
1477     @IntDef(value = {
1478             HealthCheckState.ACTIVE,
1479             HealthCheckState.INACTIVE,
1480             HealthCheckState.PASSED,
1481             HealthCheckState.FAILED})
1482     public @interface HealthCheckState {
1483         // The package has not passed health check but has requested a health check
1484         int ACTIVE = 0;
1485         // The package has not passed health check and has not requested a health check
1486         int INACTIVE = 1;
1487         // The package has passed health check
1488         int PASSED = 2;
1489         // The package has failed health check
1490         int FAILED = 3;
1491     }
1492 
newMonitoredPackage( String name, long durationMs, boolean hasPassedHealthCheck)1493     MonitoredPackage newMonitoredPackage(
1494             String name, long durationMs, boolean hasPassedHealthCheck) {
1495         return newMonitoredPackage(name, durationMs, Long.MAX_VALUE, hasPassedHealthCheck,
1496                 new LongArrayQueue());
1497     }
1498 
newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck, LongArrayQueue mitigationCalls)1499     MonitoredPackage newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs,
1500             boolean hasPassedHealthCheck, LongArrayQueue mitigationCalls) {
1501         return new MonitoredPackage(name, durationMs, healthCheckDurationMs,
1502                 hasPassedHealthCheck, mitigationCalls);
1503     }
1504 
parseMonitoredPackage(TypedXmlPullParser parser)1505     MonitoredPackage parseMonitoredPackage(TypedXmlPullParser parser)
1506             throws XmlPullParserException {
1507         String packageName = parser.getAttributeValue(null, ATTR_NAME);
1508         long duration = parser.getAttributeLong(null, ATTR_DURATION);
1509         long healthCheckDuration = parser.getAttributeLong(null,
1510                         ATTR_EXPLICIT_HEALTH_CHECK_DURATION);
1511         boolean hasPassedHealthCheck = parser.getAttributeBoolean(null, ATTR_PASSED_HEALTH_CHECK);
1512         LongArrayQueue mitigationCalls = parseLongArrayQueue(
1513                 parser.getAttributeValue(null, ATTR_MITIGATION_CALLS));
1514         return newMonitoredPackage(packageName,
1515                 duration, healthCheckDuration, hasPassedHealthCheck, mitigationCalls);
1516     }
1517 
1518     /**
1519      * Represents a package and its health check state along with the time
1520      * it should be monitored for.
1521      *
1522      * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing
1523      * instances of this class.
1524      */
1525     class MonitoredPackage {
1526         private final String mPackageName;
1527         // Times when package failures happen sorted in ascending order
1528         @GuardedBy("mLock")
1529         private final LongArrayQueue mFailureHistory = new LongArrayQueue();
1530         // Times when an observer was called to mitigate this package's failure. Sorted in
1531         // ascending order.
1532         @GuardedBy("mLock")
1533         private final LongArrayQueue mMitigationCalls;
1534         // One of STATE_[ACTIVE|INACTIVE|PASSED|FAILED]. Updated on construction and after
1535         // methods that could change the health check state: handleElapsedTimeLocked and
1536         // tryPassHealthCheckLocked
1537         private int mHealthCheckState = HealthCheckState.INACTIVE;
1538         // Whether an explicit health check has passed.
1539         // This value in addition with mHealthCheckDurationMs determines the health check state
1540         // of the package, see #getHealthCheckStateLocked
1541         @GuardedBy("mLock")
1542         private boolean mHasPassedHealthCheck;
1543         // System uptime duration to monitor package.
1544         @GuardedBy("mLock")
1545         private long mDurationMs;
1546         // System uptime duration to check the result of an explicit health check
1547         // Initially, MAX_VALUE until we get a value from the health check service
1548         // and request health checks.
1549         // This value in addition with mHasPassedHealthCheck determines the health check state
1550         // of the package, see #getHealthCheckStateLocked
1551         @GuardedBy("mLock")
1552         private long mHealthCheckDurationMs = Long.MAX_VALUE;
1553 
MonitoredPackage(String packageName, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck, LongArrayQueue mitigationCalls)1554         MonitoredPackage(String packageName, long durationMs,
1555                 long healthCheckDurationMs, boolean hasPassedHealthCheck,
1556                 LongArrayQueue mitigationCalls) {
1557             mPackageName = packageName;
1558             mDurationMs = durationMs;
1559             mHealthCheckDurationMs = healthCheckDurationMs;
1560             mHasPassedHealthCheck = hasPassedHealthCheck;
1561             mMitigationCalls = mitigationCalls;
1562             updateHealthCheckStateLocked();
1563         }
1564 
1565         /** Writes the salient fields to disk using {@code out}. */
1566         @GuardedBy("mLock")
writeLocked(TypedXmlSerializer out)1567         public void writeLocked(TypedXmlSerializer out) throws IOException {
1568             out.startTag(null, TAG_PACKAGE);
1569             out.attribute(null, ATTR_NAME, getName());
1570             out.attributeLong(null, ATTR_DURATION, mDurationMs);
1571             out.attributeLong(null, ATTR_EXPLICIT_HEALTH_CHECK_DURATION, mHealthCheckDurationMs);
1572             out.attributeBoolean(null, ATTR_PASSED_HEALTH_CHECK, mHasPassedHealthCheck);
1573             LongArrayQueue normalizedCalls = normalizeMitigationCalls();
1574             out.attribute(null, ATTR_MITIGATION_CALLS, longArrayQueueToString(normalizedCalls));
1575             out.endTag(null, TAG_PACKAGE);
1576         }
1577 
1578         /**
1579          * Increment package failures or resets failure count depending on the last package failure.
1580          *
1581          * @return {@code true} if failure count exceeds a threshold, {@code false} otherwise
1582          */
1583         @GuardedBy("mLock")
onFailureLocked()1584         public boolean onFailureLocked() {
1585             // Sliding window algorithm: find out if there exists a window containing failures >=
1586             // mTriggerFailureCount.
1587             final long now = mSystemClock.uptimeMillis();
1588             mFailureHistory.addLast(now);
1589             while (now - mFailureHistory.peekFirst() > mTriggerFailureDurationMs) {
1590                 // Prune values falling out of the window
1591                 mFailureHistory.removeFirst();
1592             }
1593             boolean failed = mFailureHistory.size() >= mTriggerFailureCount;
1594             if (failed) {
1595                 mFailureHistory.clear();
1596             }
1597             return failed;
1598         }
1599 
1600         /**
1601          * Notes the timestamp of a mitigation call into the observer.
1602          */
1603         @GuardedBy("mLock")
noteMitigationCallLocked()1604         public void noteMitigationCallLocked() {
1605             mMitigationCalls.addLast(mSystemClock.uptimeMillis());
1606         }
1607 
1608         /**
1609          * Prunes any mitigation calls outside of the de-escalation window, and returns the
1610          * number of calls that are in the window afterwards.
1611          *
1612          * @return the number of mitigation calls made in the de-escalation window.
1613          */
1614         @GuardedBy("mLock")
getMitigationCountLocked()1615         public int getMitigationCountLocked() {
1616             try {
1617                 final long now = mSystemClock.uptimeMillis();
1618                 while (now - mMitigationCalls.peekFirst() > DEFAULT_DEESCALATION_WINDOW_MS) {
1619                     mMitigationCalls.removeFirst();
1620                 }
1621             } catch (NoSuchElementException ignore) {
1622             }
1623 
1624             return mMitigationCalls.size();
1625         }
1626 
1627         /**
1628          * Before writing to disk, make the mitigation call timestamps relative to the current
1629          * system uptime. This is because they need to be relative to the uptime which will reset
1630          * at the next boot.
1631          *
1632          * @return a LongArrayQueue of the mitigation calls relative to the current system uptime.
1633          */
1634         @GuardedBy("mLock")
normalizeMitigationCalls()1635         public LongArrayQueue normalizeMitigationCalls() {
1636             LongArrayQueue normalized = new LongArrayQueue();
1637             final long now = mSystemClock.uptimeMillis();
1638             for (int i = 0; i < mMitigationCalls.size(); i++) {
1639                 normalized.addLast(mMitigationCalls.get(i) - now);
1640             }
1641             return normalized;
1642         }
1643 
1644         /**
1645          * Sets the initial health check duration.
1646          *
1647          * @return the new health check state
1648          */
1649         @GuardedBy("mLock")
setHealthCheckActiveLocked(long initialHealthCheckDurationMs)1650         public int setHealthCheckActiveLocked(long initialHealthCheckDurationMs) {
1651             if (initialHealthCheckDurationMs <= 0) {
1652                 Slog.wtf(TAG, "Cannot set non-positive health check duration "
1653                         + initialHealthCheckDurationMs + "ms for package " + getName()
1654                         + ". Using total duration " + mDurationMs + "ms instead");
1655                 initialHealthCheckDurationMs = mDurationMs;
1656             }
1657             if (mHealthCheckState == HealthCheckState.INACTIVE) {
1658                 // Transitions to ACTIVE
1659                 mHealthCheckDurationMs = initialHealthCheckDurationMs;
1660             }
1661             return updateHealthCheckStateLocked();
1662         }
1663 
1664         /**
1665          * Updates the monitoring durations of the package.
1666          *
1667          * @return the new health check state
1668          */
1669         @GuardedBy("mLock")
handleElapsedTimeLocked(long elapsedMs)1670         public int handleElapsedTimeLocked(long elapsedMs) {
1671             if (elapsedMs <= 0) {
1672                 Slog.w(TAG, "Cannot handle non-positive elapsed time for package " + getName());
1673                 return mHealthCheckState;
1674             }
1675             // Transitions to FAILED if now <= 0 and health check not passed
1676             mDurationMs -= elapsedMs;
1677             if (mHealthCheckState == HealthCheckState.ACTIVE) {
1678                 // We only update health check durations if we have #setHealthCheckActiveLocked
1679                 // This ensures we don't leave the INACTIVE state for an unexpected elapsed time
1680                 // Transitions to FAILED if now <= 0 and health check not passed
1681                 mHealthCheckDurationMs -= elapsedMs;
1682             }
1683             return updateHealthCheckStateLocked();
1684         }
1685 
1686         /** Explicitly update the monitoring duration of the package. */
1687         @GuardedBy("mLock")
updateHealthCheckDuration(long newDurationMs)1688         public void updateHealthCheckDuration(long newDurationMs) {
1689             mDurationMs = newDurationMs;
1690         }
1691 
1692         /**
1693          * Marks the health check as passed and transitions to {@link HealthCheckState.PASSED}
1694          * if not yet {@link HealthCheckState.FAILED}.
1695          *
1696          * @return the new {@link HealthCheckState health check state}
1697          */
1698         @GuardedBy("mLock")
1699         @HealthCheckState
tryPassHealthCheckLocked()1700         public int tryPassHealthCheckLocked() {
1701             if (mHealthCheckState != HealthCheckState.FAILED) {
1702                 // FAILED is a final state so only pass if we haven't failed
1703                 // Transition to PASSED
1704                 mHasPassedHealthCheck = true;
1705             }
1706             return updateHealthCheckStateLocked();
1707         }
1708 
1709         /** Returns the monitored package name. */
getName()1710         private String getName() {
1711             return mPackageName;
1712         }
1713 
1714         /**
1715          * Returns the current {@link HealthCheckState health check state}.
1716          */
1717         @GuardedBy("mLock")
1718         @HealthCheckState
getHealthCheckStateLocked()1719         public int getHealthCheckStateLocked() {
1720             return mHealthCheckState;
1721         }
1722 
1723         /**
1724          * Returns the shortest duration before the package should be scheduled for a prune.
1725          *
1726          * @return the duration or {@link Long#MAX_VALUE} if the package should not be scheduled
1727          */
1728         @GuardedBy("mLock")
getShortestScheduleDurationMsLocked()1729         public long getShortestScheduleDurationMsLocked() {
1730             // Consider health check duration only if #isPendingHealthChecksLocked is true
1731             return Math.min(toPositive(mDurationMs),
1732                     isPendingHealthChecksLocked()
1733                     ? toPositive(mHealthCheckDurationMs) : Long.MAX_VALUE);
1734         }
1735 
1736         /**
1737          * Returns {@code true} if the total duration left to monitor the package is less than or
1738          * equal to 0 {@code false} otherwise.
1739          */
1740         @GuardedBy("mLock")
isExpiredLocked()1741         public boolean isExpiredLocked() {
1742             return mDurationMs <= 0;
1743         }
1744 
1745         /**
1746          * Returns {@code true} if the package, {@link #getName} is expecting health check results
1747          * {@code false} otherwise.
1748          */
1749         @GuardedBy("mLock")
isPendingHealthChecksLocked()1750         public boolean isPendingHealthChecksLocked() {
1751             return mHealthCheckState == HealthCheckState.ACTIVE
1752                     || mHealthCheckState == HealthCheckState.INACTIVE;
1753         }
1754 
1755         /**
1756          * Updates the health check state based on {@link #mHasPassedHealthCheck}
1757          * and {@link #mHealthCheckDurationMs}.
1758          *
1759          * @return the new {@link HealthCheckState health check state}
1760          */
1761         @GuardedBy("mLock")
1762         @HealthCheckState
updateHealthCheckStateLocked()1763         private int updateHealthCheckStateLocked() {
1764             int oldState = mHealthCheckState;
1765             if (mHasPassedHealthCheck) {
1766                 // Set final state first to avoid ambiguity
1767                 mHealthCheckState = HealthCheckState.PASSED;
1768             } else if (mHealthCheckDurationMs <= 0 || mDurationMs <= 0) {
1769                 // Set final state first to avoid ambiguity
1770                 mHealthCheckState = HealthCheckState.FAILED;
1771             } else if (mHealthCheckDurationMs == Long.MAX_VALUE) {
1772                 mHealthCheckState = HealthCheckState.INACTIVE;
1773             } else {
1774                 mHealthCheckState = HealthCheckState.ACTIVE;
1775             }
1776 
1777             if (oldState != mHealthCheckState) {
1778                 Slog.i(TAG, "Updated health check state for package " + getName() + ": "
1779                         + toString(oldState) + " -> " + toString(mHealthCheckState));
1780             }
1781             return mHealthCheckState;
1782         }
1783 
1784         /** Returns a {@link String} representation of the current health check state. */
toString(@ealthCheckState int state)1785         private String toString(@HealthCheckState int state) {
1786             switch (state) {
1787                 case HealthCheckState.ACTIVE:
1788                     return "ACTIVE";
1789                 case HealthCheckState.INACTIVE:
1790                     return "INACTIVE";
1791                 case HealthCheckState.PASSED:
1792                     return "PASSED";
1793                 case HealthCheckState.FAILED:
1794                     return "FAILED";
1795                 default:
1796                     return "UNKNOWN";
1797             }
1798         }
1799 
1800         /** Returns {@code value} if it is greater than 0 or {@link Long#MAX_VALUE} otherwise. */
toPositive(long value)1801         private long toPositive(long value) {
1802             return value > 0 ? value : Long.MAX_VALUE;
1803         }
1804 
1805         /** Compares the equality of this object with another {@link MonitoredPackage}. */
1806         @VisibleForTesting
isEqualTo(MonitoredPackage pkg)1807         boolean isEqualTo(MonitoredPackage pkg) {
1808             return (getName().equals(pkg.getName()))
1809                     && mDurationMs == pkg.mDurationMs
1810                     && mHasPassedHealthCheck == pkg.mHasPassedHealthCheck
1811                     && mHealthCheckDurationMs == pkg.mHealthCheckDurationMs
1812                     && (mMitigationCalls.toString()).equals(pkg.mMitigationCalls.toString());
1813         }
1814     }
1815 
1816     @GuardedBy("mLock")
1817     @SuppressWarnings("GuardedBy")
saveAllObserversBootMitigationCountToMetadata(String filePath)1818     void saveAllObserversBootMitigationCountToMetadata(String filePath) {
1819         HashMap<String, Integer> bootMitigationCounts = new HashMap<>();
1820         for (int i = 0; i < mAllObservers.size(); i++) {
1821             final ObserverInternal observer = mAllObservers.valueAt(i);
1822             bootMitigationCounts.put(observer.name, observer.getBootMitigationCount());
1823         }
1824 
1825         try {
1826             FileOutputStream fileStream = new FileOutputStream(new File(filePath));
1827             ObjectOutputStream objectStream = new ObjectOutputStream(fileStream);
1828             objectStream.writeObject(bootMitigationCounts);
1829             objectStream.flush();
1830             objectStream.close();
1831             fileStream.close();
1832         } catch (Exception e) {
1833             Slog.i(TAG, "Could not save observers metadata to file: " + e);
1834         }
1835     }
1836 
1837     /**
1838      * Handles the thresholding logic for system server boots.
1839      */
1840     class BootThreshold {
1841 
1842         private final int mBootTriggerCount;
1843         private final long mTriggerWindow;
1844 
BootThreshold(int bootTriggerCount, long triggerWindow)1845         BootThreshold(int bootTriggerCount, long triggerWindow) {
1846             this.mBootTriggerCount = bootTriggerCount;
1847             this.mTriggerWindow = triggerWindow;
1848         }
1849 
reset()1850         public void reset() {
1851             setStart(0);
1852             setCount(0);
1853         }
1854 
getCount()1855         protected int getCount() {
1856             return CrashRecoveryProperties.rescueBootCount().orElse(0);
1857         }
1858 
setCount(int count)1859         protected void setCount(int count) {
1860             CrashRecoveryProperties.rescueBootCount(count);
1861         }
1862 
getStart()1863         public long getStart() {
1864             return CrashRecoveryProperties.rescueBootStart().orElse(0L);
1865         }
1866 
getMitigationCount()1867         public int getMitigationCount() {
1868             return CrashRecoveryProperties.bootMitigationCount().orElse(0);
1869         }
1870 
setStart(long start)1871         public void setStart(long start) {
1872             CrashRecoveryProperties.rescueBootStart(getStartTime(start));
1873         }
1874 
setMitigationStart(long start)1875         public void setMitigationStart(long start) {
1876             CrashRecoveryProperties.bootMitigationStart(getStartTime(start));
1877         }
1878 
getMitigationStart()1879         public long getMitigationStart() {
1880             return CrashRecoveryProperties.bootMitigationStart().orElse(0L);
1881         }
1882 
setMitigationCount(int count)1883         public void setMitigationCount(int count) {
1884             CrashRecoveryProperties.bootMitigationCount(count);
1885         }
1886 
constrain(long amount, long low, long high)1887         private static long constrain(long amount, long low, long high) {
1888             return amount < low ? low : (amount > high ? high : amount);
1889         }
1890 
getStartTime(long start)1891         public long getStartTime(long start) {
1892             final long now = mSystemClock.uptimeMillis();
1893             return constrain(start, 0, now);
1894         }
1895 
saveMitigationCountToMetadata()1896         public void saveMitigationCountToMetadata() {
1897             try (BufferedWriter writer = new BufferedWriter(new FileWriter(METADATA_FILE))) {
1898                 writer.write(String.valueOf(getMitigationCount()));
1899             } catch (Exception e) {
1900                 Slog.e(TAG, "Could not save metadata to file: " + e);
1901             }
1902         }
1903 
readMitigationCountFromMetadataIfNecessary()1904         public void readMitigationCountFromMetadataIfNecessary() {
1905             File bootPropsFile = new File(METADATA_FILE);
1906             if (bootPropsFile.exists()) {
1907                 try (BufferedReader reader = new BufferedReader(new FileReader(METADATA_FILE))) {
1908                     String mitigationCount = reader.readLine();
1909                     setMitigationCount(Integer.parseInt(mitigationCount));
1910                     bootPropsFile.delete();
1911                 } catch (Exception e) {
1912                     Slog.i(TAG, "Could not read metadata file: " + e);
1913                 }
1914             }
1915         }
1916 
1917 
1918         /** Increments the boot counter, and returns whether the device is bootlooping. */
1919         @GuardedBy("mLock")
incrementAndTest()1920         public boolean incrementAndTest() {
1921             if (Flags.recoverabilityDetection()) {
1922                 readAllObserversBootMitigationCountIfNecessary(METADATA_FILE);
1923             } else {
1924                 readMitigationCountFromMetadataIfNecessary();
1925             }
1926 
1927             final long now = mSystemClock.uptimeMillis();
1928             if (now - getStart() < 0) {
1929                 Slog.e(TAG, "Window was less than zero. Resetting start to current time.");
1930                 setStart(now);
1931                 setMitigationStart(now);
1932             }
1933             if (now - getMitigationStart() > DEFAULT_DEESCALATION_WINDOW_MS) {
1934                 setMitigationStart(now);
1935                 if (Flags.recoverabilityDetection()) {
1936                     resetAllObserversBootMitigationCount();
1937                 } else {
1938                     setMitigationCount(0);
1939                 }
1940             }
1941             final long window = now - getStart();
1942             if (window >= mTriggerWindow) {
1943                 setCount(1);
1944                 setStart(now);
1945                 return false;
1946             } else {
1947                 int count = getCount() + 1;
1948                 setCount(count);
1949                 EventLogTags.writeRescueNote(Process.ROOT_UID, count, window);
1950                 if (Flags.recoverabilityDetection()) {
1951                     // After a reboot (e.g. by WARM_REBOOT or mainline rollback) we apply
1952                     // mitigations without waiting for DEFAULT_BOOT_LOOP_TRIGGER_COUNT.
1953                     return (count >= mBootTriggerCount)
1954                             || (performedMitigationsDuringWindow() && count > 1);
1955                 }
1956                 return count >= mBootTriggerCount;
1957             }
1958         }
1959 
1960         @GuardedBy("mLock")
performedMitigationsDuringWindow()1961         private boolean performedMitigationsDuringWindow() {
1962             for (ObserverInternal observerInternal: mAllObservers.values()) {
1963                 if (observerInternal.getBootMitigationCount() > 0) {
1964                     return true;
1965                 }
1966             }
1967             return false;
1968         }
1969 
1970         @GuardedBy("mLock")
resetAllObserversBootMitigationCount()1971         private void resetAllObserversBootMitigationCount() {
1972             for (int i = 0; i < mAllObservers.size(); i++) {
1973                 final ObserverInternal observer = mAllObservers.valueAt(i);
1974                 observer.setBootMitigationCount(0);
1975             }
1976             saveAllObserversBootMitigationCountToMetadata(METADATA_FILE);
1977         }
1978 
1979         @GuardedBy("mLock")
1980         @SuppressWarnings("GuardedBy")
readAllObserversBootMitigationCountIfNecessary(String filePath)1981         void readAllObserversBootMitigationCountIfNecessary(String filePath) {
1982             File metadataFile = new File(filePath);
1983             if (metadataFile.exists()) {
1984                 try {
1985                     FileInputStream fileStream = new FileInputStream(metadataFile);
1986                     ObjectInputStream objectStream = new ObjectInputStream(fileStream);
1987                     HashMap<String, Integer> bootMitigationCounts =
1988                             (HashMap<String, Integer>) objectStream.readObject();
1989                     objectStream.close();
1990                     fileStream.close();
1991 
1992                     for (int i = 0; i < mAllObservers.size(); i++) {
1993                         final ObserverInternal observer = mAllObservers.valueAt(i);
1994                         if (bootMitigationCounts.containsKey(observer.name)) {
1995                             observer.setBootMitigationCount(
1996                                     bootMitigationCounts.get(observer.name));
1997                         }
1998                     }
1999                 } catch (Exception e) {
2000                     Slog.i(TAG, "Could not read observer metadata file: " + e);
2001                 }
2002             }
2003         }
2004 
2005     }
2006 }
2007