1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import static android.service.watchdog.ExplicitHealthCheckService.PackageConfig; 20 21 import static java.lang.annotation.RetentionPolicy.SOURCE; 22 23 import android.annotation.IntDef; 24 import android.annotation.Nullable; 25 import android.content.Context; 26 import android.content.pm.PackageManager; 27 import android.content.pm.VersionedPackage; 28 import android.net.ConnectivityModuleConnector; 29 import android.os.Environment; 30 import android.os.Handler; 31 import android.os.Looper; 32 import android.os.Process; 33 import android.os.SystemProperties; 34 import android.provider.DeviceConfig; 35 import android.text.TextUtils; 36 import android.util.ArrayMap; 37 import android.util.ArraySet; 38 import android.util.AtomicFile; 39 import android.util.LongArrayQueue; 40 import android.util.MathUtils; 41 import android.util.Slog; 42 import android.util.Xml; 43 44 import com.android.internal.annotations.GuardedBy; 45 import com.android.internal.annotations.VisibleForTesting; 46 import com.android.internal.os.BackgroundThread; 47 import com.android.internal.util.FastXmlSerializer; 48 import com.android.internal.util.IndentingPrintWriter; 49 import com.android.internal.util.XmlUtils; 50 51 import libcore.io.IoUtils; 52 53 import org.xmlpull.v1.XmlPullParser; 54 import 
org.xmlpull.v1.XmlPullParserException; 55 import org.xmlpull.v1.XmlSerializer; 56 57 import java.io.File; 58 import java.io.FileNotFoundException; 59 import java.io.FileOutputStream; 60 import java.io.IOException; 61 import java.io.InputStream; 62 import java.lang.annotation.Retention; 63 import java.lang.annotation.RetentionPolicy; 64 import java.nio.charset.StandardCharsets; 65 import java.util.ArrayList; 66 import java.util.Collections; 67 import java.util.Iterator; 68 import java.util.List; 69 import java.util.Map; 70 import java.util.Set; 71 import java.util.concurrent.TimeUnit; 72 73 /** 74 * Monitors the health of packages on the system and notifies interested observers when packages 75 * fail. On failure, the registered observer with the least user impacting mitigation will 76 * be notified. 77 */ 78 public class PackageWatchdog { 79 private static final String TAG = "PackageWatchdog"; 80 81 static final String PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS = 82 "watchdog_trigger_failure_duration_millis"; 83 static final String PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT = 84 "watchdog_trigger_failure_count"; 85 static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED = 86 "watchdog_explicit_health_check_enabled"; 87 88 // TODO: make the following values configurable via DeviceConfig 89 private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS = 90 TimeUnit.SECONDS.toMillis(30); 91 private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10; 92 93 94 public static final int FAILURE_REASON_UNKNOWN = 0; 95 public static final int FAILURE_REASON_NATIVE_CRASH = 1; 96 public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2; 97 public static final int FAILURE_REASON_APP_CRASH = 3; 98 public static final int FAILURE_REASON_APP_NOT_RESPONDING = 4; 99 100 @IntDef(prefix = { "FAILURE_REASON_" }, value = { 101 FAILURE_REASON_UNKNOWN, 102 FAILURE_REASON_NATIVE_CRASH, 103 FAILURE_REASON_EXPLICIT_HEALTH_CHECK, 104 FAILURE_REASON_APP_CRASH, 105 
FAILURE_REASON_APP_NOT_RESPONDING 106 }) 107 @Retention(RetentionPolicy.SOURCE) 108 public @interface FailureReasons {} 109 110 // Duration to count package failures before it resets to 0 111 @VisibleForTesting 112 static final int DEFAULT_TRIGGER_FAILURE_DURATION_MS = 113 (int) TimeUnit.MINUTES.toMillis(1); 114 // Number of package failures within the duration above before we notify observers 115 @VisibleForTesting 116 static final int DEFAULT_TRIGGER_FAILURE_COUNT = 5; 117 @VisibleForTesting 118 static final long DEFAULT_OBSERVING_DURATION_MS = TimeUnit.DAYS.toMillis(2); 119 // Whether explicit health checks are enabled or not 120 private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true; 121 122 @VisibleForTesting 123 static final int DEFAULT_BOOT_LOOP_TRIGGER_COUNT = 5; 124 static final long DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS = TimeUnit.MINUTES.toMillis(10); 125 private static final String PROP_RESCUE_BOOT_COUNT = "sys.rescue_boot_count"; 126 private static final String PROP_RESCUE_BOOT_START = "sys.rescue_boot_start"; 127 128 private long mNumberOfNativeCrashPollsRemaining; 129 130 private static final int DB_VERSION = 1; 131 private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog"; 132 private static final String TAG_PACKAGE = "package"; 133 private static final String TAG_OBSERVER = "observer"; 134 private static final String ATTR_VERSION = "version"; 135 private static final String ATTR_NAME = "name"; 136 private static final String ATTR_DURATION = "duration"; 137 private static final String ATTR_EXPLICIT_HEALTH_CHECK_DURATION = "health-check-duration"; 138 private static final String ATTR_PASSED_HEALTH_CHECK = "passed-health-check"; 139 140 @GuardedBy("PackageWatchdog.class") 141 private static PackageWatchdog sPackageWatchdog; 142 143 private final Object mLock = new Object(); 144 // System server context 145 private final Context mContext; 146 // Handler to run short running tasks 147 private final Handler mShortTaskHandler; 
148 // Handler for processing IO and long running tasks 149 private final Handler mLongTaskHandler; 150 // Contains (observer-name -> observer-handle) that have ever been registered from 151 // previous boots. Observers with all packages expired are periodically pruned. 152 // It is saved to disk on system shutdown and repouplated on startup so it survives reboots. 153 @GuardedBy("mLock") 154 private final ArrayMap<String, ObserverInternal> mAllObservers = new ArrayMap<>(); 155 // File containing the XML data of monitored packages /data/system/package-watchdog.xml 156 private final AtomicFile mPolicyFile; 157 private final ExplicitHealthCheckController mHealthCheckController; 158 private final ConnectivityModuleConnector mConnectivityModuleConnector; 159 private final Runnable mSyncRequests = this::syncRequests; 160 private final Runnable mSyncStateWithScheduledReason = this::syncStateWithScheduledReason; 161 private final Runnable mSaveToFile = this::saveToFile; 162 private final SystemClock mSystemClock; 163 private final BootThreshold mBootThreshold; 164 // The set of packages that have been synced with the ExplicitHealthCheckController 165 @GuardedBy("mLock") 166 private Set<String> mRequestedHealthCheckPackages = new ArraySet<>(); 167 @GuardedBy("mLock") 168 private boolean mIsPackagesReady; 169 // Flag to control whether explicit health checks are supported or not 170 @GuardedBy("mLock") 171 private boolean mIsHealthCheckEnabled = DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED; 172 @GuardedBy("mLock") 173 private int mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS; 174 @GuardedBy("mLock") 175 private int mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT; 176 // SystemClock#uptimeMillis when we last executed #syncState 177 // 0 if no prune is scheduled. 178 @GuardedBy("mLock") 179 private long mUptimeAtLastStateSync; 180 // If true, sync explicit health check packages with the ExplicitHealthCheckController. 
181 @GuardedBy("mLock") 182 private boolean mSyncRequired = false; 183 184 @FunctionalInterface 185 @VisibleForTesting 186 interface SystemClock { 187 // TODO: Add elapsedRealtime to this interface uptimeMillis()188 long uptimeMillis(); 189 } 190 PackageWatchdog(Context context)191 private PackageWatchdog(Context context) { 192 // Needs to be constructed inline 193 this(context, new AtomicFile( 194 new File(new File(Environment.getDataDirectory(), "system"), 195 "package-watchdog.xml")), 196 new Handler(Looper.myLooper()), BackgroundThread.getHandler(), 197 new ExplicitHealthCheckController(context), 198 ConnectivityModuleConnector.getInstance(), 199 android.os.SystemClock::uptimeMillis); 200 } 201 202 /** 203 * Creates a PackageWatchdog that allows injecting dependencies. 204 */ 205 @VisibleForTesting PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler, Handler longTaskHandler, ExplicitHealthCheckController controller, ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock)206 PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler, 207 Handler longTaskHandler, ExplicitHealthCheckController controller, 208 ConnectivityModuleConnector connectivityModuleConnector, SystemClock clock) { 209 mContext = context; 210 mPolicyFile = policyFile; 211 mShortTaskHandler = shortTaskHandler; 212 mLongTaskHandler = longTaskHandler; 213 mHealthCheckController = controller; 214 mConnectivityModuleConnector = connectivityModuleConnector; 215 mSystemClock = clock; 216 mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS; 217 mBootThreshold = new BootThreshold(DEFAULT_BOOT_LOOP_TRIGGER_COUNT, 218 DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS); 219 loadFromFile(); 220 sPackageWatchdog = this; 221 } 222 223 /** Creates or gets singleton instance of PackageWatchdog. 
*/ getInstance(Context context)224 public static PackageWatchdog getInstance(Context context) { 225 synchronized (PackageWatchdog.class) { 226 if (sPackageWatchdog == null) { 227 new PackageWatchdog(context); 228 } 229 return sPackageWatchdog; 230 } 231 } 232 233 /** 234 * Called during boot to notify when packages are ready on the device so we can start 235 * binding. 236 */ onPackagesReady()237 public void onPackagesReady() { 238 synchronized (mLock) { 239 mIsPackagesReady = true; 240 mHealthCheckController.setCallbacks(packageName -> onHealthCheckPassed(packageName), 241 packages -> onSupportedPackages(packages), 242 () -> { 243 syncRequestsAsync(); 244 mSyncRequired = true; 245 }); 246 setPropertyChangedListenerLocked(); 247 updateConfigs(); 248 registerConnectivityModuleHealthListener(); 249 } 250 } 251 252 /** 253 * Registers {@code observer} to listen for package failures. Add a new ObserverInternal for 254 * this observer if it does not already exist. 255 * 256 * <p>Observers are expected to call this on boot. It does not specify any packages but 257 * it will resume observing any packages requested from a previous boot. 258 */ registerHealthObserver(PackageHealthObserver observer)259 public void registerHealthObserver(PackageHealthObserver observer) { 260 synchronized (mLock) { 261 ObserverInternal internalObserver = mAllObservers.get(observer.getName()); 262 if (internalObserver != null) { 263 internalObserver.registeredObserver = observer; 264 } else { 265 internalObserver = new ObserverInternal(observer.getName(), new ArrayList<>()); 266 internalObserver.registeredObserver = observer; 267 mAllObservers.put(observer.getName(), internalObserver); 268 syncState("added new observer"); 269 } 270 } 271 } 272 273 /** 274 * Starts observing the health of the {@code packages} for {@code observer} and notifies 275 * {@code observer} of any package failures within the monitoring duration. 
276 * 277 * <p>If monitoring a package supporting explicit health check, at the end of the monitoring 278 * duration if {@link #onHealthCheckPassed} was never called, 279 * {@link PackageHealthObserver#execute} will be called as if the package failed. 280 * 281 * <p>If {@code observer} is already monitoring a package in {@code packageNames}, 282 * the monitoring window of that package will be reset to {@code durationMs} and the health 283 * check state will be reset to a default depending on if the package is contained in 284 * {@link mPackagesWithExplicitHealthCheckEnabled}. 285 * 286 * <p>If {@code packageNames} is empty, this will be a no-op. 287 * 288 * <p>If {@code durationMs} is less than 1, a default monitoring duration 289 * {@link #DEFAULT_OBSERVING_DURATION_MS} will be used. 290 */ startObservingHealth(PackageHealthObserver observer, List<String> packageNames, long durationMs)291 public void startObservingHealth(PackageHealthObserver observer, List<String> packageNames, 292 long durationMs) { 293 if (packageNames.isEmpty()) { 294 Slog.wtf(TAG, "No packages to observe, " + observer.getName()); 295 return; 296 } 297 if (durationMs < 1) { 298 Slog.wtf(TAG, "Invalid duration " + durationMs + "ms for observer " 299 + observer.getName() + ". Not observing packages " + packageNames); 300 durationMs = DEFAULT_OBSERVING_DURATION_MS; 301 } 302 303 List<MonitoredPackage> packages = new ArrayList<>(); 304 for (int i = 0; i < packageNames.size(); i++) { 305 // Health checks not available yet so health check state will start INACTIVE 306 MonitoredPackage pkg = newMonitoredPackage(packageNames.get(i), durationMs, false); 307 if (pkg != null) { 308 packages.add(pkg); 309 } 310 } 311 312 if (packages.isEmpty()) { 313 return; 314 } 315 316 // Sync before we add the new packages to the observers. This will #pruneObservers, 317 // causing any elapsed time to be deducted from all existing packages before we add new 318 // packages. 
This maintains the invariant that the elapsed time for ALL (new and existing) 319 // packages is the same. 320 mLongTaskHandler.post(() -> { 321 syncState("observing new packages"); 322 323 synchronized (mLock) { 324 ObserverInternal oldObserver = mAllObservers.get(observer.getName()); 325 if (oldObserver == null) { 326 Slog.d(TAG, observer.getName() + " started monitoring health " 327 + "of packages " + packageNames); 328 mAllObservers.put(observer.getName(), 329 new ObserverInternal(observer.getName(), packages)); 330 } else { 331 Slog.d(TAG, observer.getName() + " added the following " 332 + "packages to monitor " + packageNames); 333 oldObserver.updatePackagesLocked(packages); 334 } 335 } 336 337 // Register observer in case not already registered 338 registerHealthObserver(observer); 339 340 // Sync after we add the new packages to the observers. We may have received packges 341 // requiring an earlier schedule than we are currently scheduled for. 342 syncState("updated observers"); 343 }); 344 345 } 346 347 /** 348 * Unregisters {@code observer} from listening to package failure. 349 * Additionally, this stops observing any packages that may have previously been observed 350 * even from a previous boot. 351 */ unregisterHealthObserver(PackageHealthObserver observer)352 public void unregisterHealthObserver(PackageHealthObserver observer) { 353 synchronized (mLock) { 354 mAllObservers.remove(observer.getName()); 355 } 356 syncState("unregistering observer: " + observer.getName()); 357 } 358 359 /** 360 * Called when a process fails due to a crash, ANR or explicit health check. 361 * 362 * <p>For each package contained in the process, one registered observer with the least user 363 * impact will be notified for mitigation. 364 * 365 * <p>This method could be called frequently if there is a severe problem on the device. 
366 */ onPackageFailure(List<VersionedPackage> packages, @FailureReasons int failureReason)367 public void onPackageFailure(List<VersionedPackage> packages, 368 @FailureReasons int failureReason) { 369 if (packages == null) { 370 Slog.w(TAG, "Could not resolve a list of failing packages"); 371 return; 372 } 373 mLongTaskHandler.post(() -> { 374 synchronized (mLock) { 375 if (mAllObservers.isEmpty()) { 376 return; 377 } 378 boolean requiresImmediateAction = (failureReason == FAILURE_REASON_NATIVE_CRASH 379 || failureReason == FAILURE_REASON_EXPLICIT_HEALTH_CHECK); 380 if (requiresImmediateAction) { 381 handleFailureImmediately(packages, failureReason); 382 } else { 383 for (int pIndex = 0; pIndex < packages.size(); pIndex++) { 384 VersionedPackage versionedPackage = packages.get(pIndex); 385 // Observer that will receive failure for versionedPackage 386 PackageHealthObserver currentObserverToNotify = null; 387 int currentObserverImpact = Integer.MAX_VALUE; 388 389 // Find observer with least user impact 390 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) { 391 ObserverInternal observer = mAllObservers.valueAt(oIndex); 392 PackageHealthObserver registeredObserver = observer.registeredObserver; 393 if (registeredObserver != null 394 && observer.onPackageFailureLocked( 395 versionedPackage.getPackageName())) { 396 int impact = registeredObserver.onHealthCheckFailed( 397 versionedPackage, failureReason); 398 if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE 399 && impact < currentObserverImpact) { 400 currentObserverToNotify = registeredObserver; 401 currentObserverImpact = impact; 402 } 403 } 404 } 405 406 // Execute action with least user impact 407 if (currentObserverToNotify != null) { 408 currentObserverToNotify.execute(versionedPackage, failureReason); 409 } 410 } 411 } 412 } 413 }); 414 } 415 416 /** 417 * For native crashes or explicit health check failures, call directly into each observer to 418 * mitigate the error without going through 
failure threshold logic. 419 */ handleFailureImmediately(List<VersionedPackage> packages, @FailureReasons int failureReason)420 private void handleFailureImmediately(List<VersionedPackage> packages, 421 @FailureReasons int failureReason) { 422 VersionedPackage failingPackage = packages.size() > 0 ? packages.get(0) : null; 423 PackageHealthObserver currentObserverToNotify = null; 424 int currentObserverImpact = Integer.MAX_VALUE; 425 for (ObserverInternal observer: mAllObservers.values()) { 426 PackageHealthObserver registeredObserver = observer.registeredObserver; 427 if (registeredObserver != null) { 428 int impact = registeredObserver.onHealthCheckFailed( 429 failingPackage, failureReason); 430 if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE 431 && impact < currentObserverImpact) { 432 currentObserverToNotify = registeredObserver; 433 currentObserverImpact = impact; 434 } 435 } 436 } 437 if (currentObserverToNotify != null) { 438 currentObserverToNotify.execute(failingPackage, failureReason); 439 } 440 } 441 442 /** 443 * Called when the system server boots. If the system server is detected to be in a boot loop, 444 * query each observer and perform the mitigation action with the lowest user impact. 
445 */ noteBoot()446 public void noteBoot() { 447 synchronized (mLock) { 448 if (mBootThreshold.incrementAndTest()) { 449 mBootThreshold.reset(); 450 PackageHealthObserver currentObserverToNotify = null; 451 int currentObserverImpact = Integer.MAX_VALUE; 452 for (int i = 0; i < mAllObservers.size(); i++) { 453 final ObserverInternal observer = mAllObservers.valueAt(i); 454 PackageHealthObserver registeredObserver = observer.registeredObserver; 455 if (registeredObserver != null) { 456 int impact = registeredObserver.onBootLoop(); 457 if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE 458 && impact < currentObserverImpact) { 459 currentObserverToNotify = registeredObserver; 460 currentObserverImpact = impact; 461 } 462 } 463 } 464 if (currentObserverToNotify != null) { 465 currentObserverToNotify.executeBootLoopMitigation(); 466 } 467 } 468 } 469 } 470 471 // TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also 472 // avoid holding lock? 473 // This currently adds about 7ms extra to shutdown thread 474 /** Writes the package information to file during shutdown. */ writeNow()475 public void writeNow() { 476 synchronized (mLock) { 477 // Must only run synchronous tasks as this runs on the ShutdownThread and no other 478 // thread is guaranteed to run during shutdown. 479 if (!mAllObservers.isEmpty()) { 480 mLongTaskHandler.removeCallbacks(mSaveToFile); 481 pruneObserversLocked(); 482 saveToFile(); 483 Slog.i(TAG, "Last write to update package durations"); 484 } 485 } 486 } 487 488 /** 489 * Enables or disables explicit health checks. 490 * <p> If explicit health checks are enabled, the health check service is started. 491 * <p> If explicit health checks are disabled, pending explicit health check requests are 492 * passed and the health check service is stopped. 
493 */ setExplicitHealthCheckEnabled(boolean enabled)494 private void setExplicitHealthCheckEnabled(boolean enabled) { 495 synchronized (mLock) { 496 mIsHealthCheckEnabled = enabled; 497 mHealthCheckController.setEnabled(enabled); 498 // Prune to update internal state whenever health check is enabled/disabled 499 syncState("health check state " + (enabled ? "enabled" : "disabled")); 500 } 501 } 502 503 /** 504 * This method should be only called on mShortTaskHandler, since it modifies 505 * {@link #mNumberOfNativeCrashPollsRemaining}. 506 */ checkAndMitigateNativeCrashes()507 private void checkAndMitigateNativeCrashes() { 508 mNumberOfNativeCrashPollsRemaining--; 509 // Check if native watchdog reported a crash 510 if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) { 511 // We rollback everything available when crash is unattributable 512 onPackageFailure(Collections.EMPTY_LIST, FAILURE_REASON_NATIVE_CRASH); 513 // we stop polling after an attempt to execute rollback, regardless of whether the 514 // attempt succeeds or not 515 } else { 516 if (mNumberOfNativeCrashPollsRemaining > 0) { 517 mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(), 518 NATIVE_CRASH_POLLING_INTERVAL_MILLIS); 519 } 520 } 521 } 522 523 /** 524 * Since this method can eventually trigger a rollback, it should be called 525 * only once boot has completed {@code onBootCompleted} and not earlier, because the install 526 * session must be entirely completed before we try to rollback. 527 */ scheduleCheckAndMitigateNativeCrashes()528 public void scheduleCheckAndMitigateNativeCrashes() { 529 Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check " 530 + "and mitigate native crashes"); 531 mShortTaskHandler.post(()->checkAndMitigateNativeCrashes()); 532 } 533 534 /** Possible severity values of the user impact of a {@link PackageHealthObserver#execute}. 
*/ 535 @Retention(SOURCE) 536 @IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_NONE, 537 PackageHealthObserverImpact.USER_IMPACT_LOW, 538 PackageHealthObserverImpact.USER_IMPACT_MEDIUM, 539 PackageHealthObserverImpact.USER_IMPACT_HIGH}) 540 public @interface PackageHealthObserverImpact { 541 /** No action to take. */ 542 int USER_IMPACT_NONE = 0; 543 /* Action has low user impact, user of a device will barely notice. */ 544 int USER_IMPACT_LOW = 1; 545 /* Action has medium user impact, user of a device will likely notice. */ 546 int USER_IMPACT_MEDIUM = 3; 547 /* Action has high user impact, a last resort, user of a device will be very frustrated. */ 548 int USER_IMPACT_HIGH = 5; 549 } 550 551 /** Register instances of this interface to receive notifications on package failure. */ 552 public interface PackageHealthObserver { 553 /** 554 * Called when health check fails for the {@code versionedPackage}. 555 * 556 * @param versionedPackage the package that is failing. This may be null if a native 557 * service is crashing. 558 * @param failureReason the type of failure that is occurring. 559 * 560 * 561 * @return any one of {@link PackageHealthObserverImpact} to express the impact 562 * to the user on {@link #execute} 563 */ onHealthCheckFailed( @ullable VersionedPackage versionedPackage, @FailureReasons int failureReason)564 @PackageHealthObserverImpact int onHealthCheckFailed( 565 @Nullable VersionedPackage versionedPackage, 566 @FailureReasons int failureReason); 567 568 /** 569 * Executes mitigation for {@link #onHealthCheckFailed}. 570 * 571 * @param versionedPackage the package that is failing. This may be null if a native 572 * service is crashing. 573 * @param failureReason the type of failure that is occurring. 
574 * @return {@code true} if action was executed successfully, {@code false} otherwise 575 */ execute(@ullable VersionedPackage versionedPackage, @FailureReasons int failureReason)576 boolean execute(@Nullable VersionedPackage versionedPackage, 577 @FailureReasons int failureReason); 578 579 580 /** 581 * Called when the system server has booted several times within a window of time, defined 582 * by {@link #mBootThreshold} 583 */ onBootLoop()584 default @PackageHealthObserverImpact int onBootLoop() { 585 return PackageHealthObserverImpact.USER_IMPACT_NONE; 586 } 587 588 /** 589 * Executes mitigation for {@link #onBootLoop} 590 */ executeBootLoopMitigation()591 default boolean executeBootLoopMitigation() { 592 return false; 593 } 594 595 // TODO(b/120598832): Ensure uniqueness? 596 /** 597 * Identifier for the observer, should not change across device updates otherwise the 598 * watchdog may drop observing packages with the old name. 599 */ getName()600 String getName(); 601 602 /** 603 * An observer will not be pruned if this is set, even if the observer is not explicitly 604 * monitoring any packages. 605 */ isPersistent()606 default boolean isPersistent() { 607 return false; 608 } 609 610 /** 611 * Returns {@code true} if this observer wishes to observe the given package, {@code false} 612 * otherwise 613 * 614 * <p> A persistent observer may choose to start observing certain failing packages, even if 615 * it has not explicitly asked to watch the package with {@link #startObservingHealth}. 616 */ mayObservePackage(String packageName)617 default boolean mayObservePackage(String packageName) { 618 return false; 619 } 620 } 621 getTriggerFailureCount()622 long getTriggerFailureCount() { 623 synchronized (mLock) { 624 return mTriggerFailureCount; 625 } 626 } 627 628 /** 629 * Serializes and syncs health check requests with the {@link ExplicitHealthCheckController}. 
630 */ syncRequestsAsync()631 private void syncRequestsAsync() { 632 mShortTaskHandler.removeCallbacks(mSyncRequests); 633 mShortTaskHandler.post(mSyncRequests); 634 } 635 636 /** 637 * Syncs health check requests with the {@link ExplicitHealthCheckController}. 638 * Calls to this must be serialized. 639 * 640 * @see #syncRequestsAsync 641 */ syncRequests()642 private void syncRequests() { 643 boolean syncRequired = false; 644 synchronized (mLock) { 645 if (mIsPackagesReady) { 646 Set<String> packages = getPackagesPendingHealthChecksLocked(); 647 if (mSyncRequired || !packages.equals(mRequestedHealthCheckPackages) 648 || packages.isEmpty()) { 649 syncRequired = true; 650 mRequestedHealthCheckPackages = packages; 651 } 652 } // else, we will sync requests when packages become ready 653 } 654 655 // Call outside lock to avoid holding lock when calling into the controller. 656 if (syncRequired) { 657 Slog.i(TAG, "Syncing health check requests for packages: " 658 + mRequestedHealthCheckPackages); 659 mHealthCheckController.syncRequests(mRequestedHealthCheckPackages); 660 mSyncRequired = false; 661 } 662 } 663 664 /** 665 * Updates the observers monitoring {@code packageName} that explicit health check has passed. 666 * 667 * <p> This update is strictly for registered observers at the time of the call 668 * Observers that register after this signal will have no knowledge of prior signals and will 669 * effectively behave as if the explicit health check hasn't passed for {@code packageName}. 670 * 671 * <p> {@code packageName} can still be considered failed if reported by 672 * {@link #onPackageFailureLocked} before the package expires. 673 * 674 * <p> Triggered by components outside the system server when they are fully functional after an 675 * update. 
676 */ onHealthCheckPassed(String packageName)677 private void onHealthCheckPassed(String packageName) { 678 Slog.i(TAG, "Health check passed for package: " + packageName); 679 boolean isStateChanged = false; 680 681 synchronized (mLock) { 682 for (int observerIdx = 0; observerIdx < mAllObservers.size(); observerIdx++) { 683 ObserverInternal observer = mAllObservers.valueAt(observerIdx); 684 MonitoredPackage monitoredPackage = observer.packages.get(packageName); 685 686 if (monitoredPackage != null) { 687 int oldState = monitoredPackage.getHealthCheckStateLocked(); 688 int newState = monitoredPackage.tryPassHealthCheckLocked(); 689 isStateChanged |= oldState != newState; 690 } 691 } 692 } 693 694 if (isStateChanged) { 695 syncState("health check passed for " + packageName); 696 } 697 } 698 onSupportedPackages(List<PackageConfig> supportedPackages)699 private void onSupportedPackages(List<PackageConfig> supportedPackages) { 700 boolean isStateChanged = false; 701 702 Map<String, Long> supportedPackageTimeouts = new ArrayMap<>(); 703 Iterator<PackageConfig> it = supportedPackages.iterator(); 704 while (it.hasNext()) { 705 PackageConfig info = it.next(); 706 supportedPackageTimeouts.put(info.getPackageName(), info.getHealthCheckTimeoutMillis()); 707 } 708 709 synchronized (mLock) { 710 Slog.d(TAG, "Received supported packages " + supportedPackages); 711 Iterator<ObserverInternal> oit = mAllObservers.values().iterator(); 712 while (oit.hasNext()) { 713 Iterator<MonitoredPackage> pit = oit.next().packages.values().iterator(); 714 while (pit.hasNext()) { 715 MonitoredPackage monitoredPackage = pit.next(); 716 String packageName = monitoredPackage.getName(); 717 int oldState = monitoredPackage.getHealthCheckStateLocked(); 718 int newState; 719 720 if (supportedPackageTimeouts.containsKey(packageName)) { 721 // Supported packages become ACTIVE if currently INACTIVE 722 newState = monitoredPackage.setHealthCheckActiveLocked( 723 supportedPackageTimeouts.get(packageName)); 
724 } else { 725 // Unsupported packages are marked as PASSED unless already FAILED 726 newState = monitoredPackage.tryPassHealthCheckLocked(); 727 } 728 isStateChanged |= oldState != newState; 729 } 730 } 731 } 732 733 if (isStateChanged) { 734 syncState("updated health check supported packages " + supportedPackages); 735 } 736 } 737 738 @GuardedBy("mLock") getPackagesPendingHealthChecksLocked()739 private Set<String> getPackagesPendingHealthChecksLocked() { 740 Slog.d(TAG, "Getting all observed packages pending health checks"); 741 Set<String> packages = new ArraySet<>(); 742 Iterator<ObserverInternal> oit = mAllObservers.values().iterator(); 743 while (oit.hasNext()) { 744 ObserverInternal observer = oit.next(); 745 Iterator<MonitoredPackage> pit = 746 observer.packages.values().iterator(); 747 while (pit.hasNext()) { 748 MonitoredPackage monitoredPackage = pit.next(); 749 String packageName = monitoredPackage.getName(); 750 if (monitoredPackage.isPendingHealthChecksLocked()) { 751 packages.add(packageName); 752 } 753 } 754 } 755 return packages; 756 } 757 758 /** 759 * Syncs the state of the observers. 760 * 761 * <p> Prunes all observers, saves new state to disk, syncs health check requests with the 762 * health check service and schedules the next state sync. 
763 */ syncState(String reason)764 private void syncState(String reason) { 765 synchronized (mLock) { 766 Slog.i(TAG, "Syncing state, reason: " + reason); 767 pruneObserversLocked(); 768 769 saveToFileAsync(); 770 syncRequestsAsync(); 771 772 // Done syncing state, schedule the next state sync 773 scheduleNextSyncStateLocked(); 774 } 775 } 776 syncStateWithScheduledReason()777 private void syncStateWithScheduledReason() { 778 syncState("scheduled"); 779 } 780 781 @GuardedBy("mLock") scheduleNextSyncStateLocked()782 private void scheduleNextSyncStateLocked() { 783 long durationMs = getNextStateSyncMillisLocked(); 784 mShortTaskHandler.removeCallbacks(mSyncStateWithScheduledReason); 785 if (durationMs == Long.MAX_VALUE) { 786 Slog.i(TAG, "Cancelling state sync, nothing to sync"); 787 mUptimeAtLastStateSync = 0; 788 } else { 789 Slog.i(TAG, "Scheduling next state sync in " + durationMs + "ms"); 790 mUptimeAtLastStateSync = mSystemClock.uptimeMillis(); 791 mShortTaskHandler.postDelayed(mSyncStateWithScheduledReason, durationMs); 792 } 793 } 794 795 /** 796 * Returns the next duration in millis to sync the watchdog state. 797 * 798 * @returns Long#MAX_VALUE if there are no observed packages. 799 */ 800 @GuardedBy("mLock") getNextStateSyncMillisLocked()801 private long getNextStateSyncMillisLocked() { 802 long shortestDurationMs = Long.MAX_VALUE; 803 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) { 804 ArrayMap<String, MonitoredPackage> packages = mAllObservers.valueAt(oIndex).packages; 805 for (int pIndex = 0; pIndex < packages.size(); pIndex++) { 806 MonitoredPackage mp = packages.valueAt(pIndex); 807 long duration = mp.getShortestScheduleDurationMsLocked(); 808 if (duration < shortestDurationMs) { 809 shortestDurationMs = duration; 810 } 811 } 812 } 813 return shortestDurationMs; 814 } 815 816 /** 817 * Removes {@code elapsedMs} milliseconds from all durations on monitored packages 818 * and updates other internal state. 
819 */ 820 @GuardedBy("mLock") pruneObserversLocked()821 private void pruneObserversLocked() { 822 long elapsedMs = mUptimeAtLastStateSync == 0 823 ? 0 : mSystemClock.uptimeMillis() - mUptimeAtLastStateSync; 824 if (elapsedMs <= 0) { 825 Slog.i(TAG, "Not pruning observers, elapsed time: " + elapsedMs + "ms"); 826 return; 827 } 828 829 Slog.i(TAG, "Removing " + elapsedMs + "ms from all packages on all observers"); 830 Iterator<ObserverInternal> it = mAllObservers.values().iterator(); 831 while (it.hasNext()) { 832 ObserverInternal observer = it.next(); 833 Set<MonitoredPackage> failedPackages = 834 observer.prunePackagesLocked(elapsedMs); 835 if (!failedPackages.isEmpty()) { 836 onHealthCheckFailed(observer, failedPackages); 837 } 838 if (observer.packages.isEmpty() && (observer.registeredObserver == null 839 || !observer.registeredObserver.isPersistent())) { 840 Slog.i(TAG, "Discarding observer " + observer.name + ". All packages expired"); 841 it.remove(); 842 } 843 } 844 } 845 onHealthCheckFailed(ObserverInternal observer, Set<MonitoredPackage> failedPackages)846 private void onHealthCheckFailed(ObserverInternal observer, 847 Set<MonitoredPackage> failedPackages) { 848 mLongTaskHandler.post(() -> { 849 synchronized (mLock) { 850 PackageHealthObserver registeredObserver = observer.registeredObserver; 851 if (registeredObserver != null) { 852 Iterator<MonitoredPackage> it = failedPackages.iterator(); 853 while (it.hasNext()) { 854 VersionedPackage versionedPkg = it.next().mPackage; 855 Slog.i(TAG, "Explicit health check failed for package " + versionedPkg); 856 registeredObserver.execute(versionedPkg, 857 PackageWatchdog.FAILURE_REASON_EXPLICIT_HEALTH_CHECK); 858 } 859 } 860 } 861 }); 862 } 863 864 @Nullable getVersionedPackage(String packageName)865 private VersionedPackage getVersionedPackage(String packageName) { 866 final PackageManager pm = mContext.getPackageManager(); 867 if (pm == null || TextUtils.isEmpty(packageName)) { 868 return null; 869 } 870 try { 
871 final long versionCode = pm.getPackageInfo( 872 packageName, 0 /* flags */).getLongVersionCode(); 873 return new VersionedPackage(packageName, versionCode); 874 } catch (PackageManager.NameNotFoundException e) { 875 return null; 876 } 877 } 878 879 /** 880 * Loads mAllObservers from file. 881 * 882 * <p>Note that this is <b>not</b> thread safe and should only called be called 883 * from the constructor. 884 */ loadFromFile()885 private void loadFromFile() { 886 InputStream infile = null; 887 mAllObservers.clear(); 888 try { 889 infile = mPolicyFile.openRead(); 890 final XmlPullParser parser = Xml.newPullParser(); 891 parser.setInput(infile, StandardCharsets.UTF_8.name()); 892 XmlUtils.beginDocument(parser, TAG_PACKAGE_WATCHDOG); 893 int outerDepth = parser.getDepth(); 894 while (XmlUtils.nextElementWithin(parser, outerDepth)) { 895 ObserverInternal observer = ObserverInternal.read(parser, this); 896 if (observer != null) { 897 mAllObservers.put(observer.name, observer); 898 } 899 } 900 } catch (FileNotFoundException e) { 901 // Nothing to monitor 902 } catch (IOException | NumberFormatException | XmlPullParserException e) { 903 Slog.wtf(TAG, "Unable to read monitored packages, deleting file", e); 904 mPolicyFile.delete(); 905 } finally { 906 IoUtils.closeQuietly(infile); 907 } 908 } 909 910 /** Adds a {@link DeviceConfig#OnPropertiesChangedListener}. */ setPropertyChangedListenerLocked()911 private void setPropertyChangedListenerLocked() { 912 DeviceConfig.addOnPropertiesChangedListener( 913 DeviceConfig.NAMESPACE_ROLLBACK, 914 mContext.getMainExecutor(), 915 (properties) -> { 916 if (!DeviceConfig.NAMESPACE_ROLLBACK.equals(properties.getNamespace())) { 917 return; 918 } 919 updateConfigs(); 920 }); 921 } 922 923 /** 924 * Health check is enabled or disabled after reading the flags 925 * from DeviceConfig. 
926 */ updateConfigs()927 private void updateConfigs() { 928 synchronized (mLock) { 929 mTriggerFailureCount = DeviceConfig.getInt( 930 DeviceConfig.NAMESPACE_ROLLBACK, 931 PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT, 932 DEFAULT_TRIGGER_FAILURE_COUNT); 933 if (mTriggerFailureCount <= 0) { 934 mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT; 935 } 936 937 mTriggerFailureDurationMs = DeviceConfig.getInt( 938 DeviceConfig.NAMESPACE_ROLLBACK, 939 PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS, 940 DEFAULT_TRIGGER_FAILURE_DURATION_MS); 941 if (mTriggerFailureDurationMs <= 0) { 942 mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS; 943 } 944 945 setExplicitHealthCheckEnabled(DeviceConfig.getBoolean( 946 DeviceConfig.NAMESPACE_ROLLBACK, 947 PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED, 948 DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED)); 949 } 950 } 951 registerConnectivityModuleHealthListener()952 private void registerConnectivityModuleHealthListener() { 953 // TODO: have an internal method to trigger a rollback by reporting high severity errors, 954 // and rely on ActivityManager to inform the watchdog of severe network stack crashes 955 // instead of having this listener in parallel. 956 mConnectivityModuleConnector.registerHealthListener( 957 packageName -> { 958 final VersionedPackage pkg = getVersionedPackage(packageName); 959 if (pkg == null) { 960 Slog.wtf(TAG, "NetworkStack failed but could not find its package"); 961 return; 962 } 963 final List<VersionedPackage> pkgList = Collections.singletonList(pkg); 964 onPackageFailure(pkgList, FAILURE_REASON_EXPLICIT_HEALTH_CHECK); 965 }); 966 } 967 968 /** 969 * Persists mAllObservers to file. Threshold information is ignored. 
970 */ saveToFile()971 private boolean saveToFile() { 972 Slog.i(TAG, "Saving observer state to file"); 973 synchronized (mLock) { 974 FileOutputStream stream; 975 try { 976 stream = mPolicyFile.startWrite(); 977 } catch (IOException e) { 978 Slog.w(TAG, "Cannot update monitored packages", e); 979 return false; 980 } 981 982 try { 983 XmlSerializer out = new FastXmlSerializer(); 984 out.setOutput(stream, StandardCharsets.UTF_8.name()); 985 out.startDocument(null, true); 986 out.startTag(null, TAG_PACKAGE_WATCHDOG); 987 out.attribute(null, ATTR_VERSION, Integer.toString(DB_VERSION)); 988 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) { 989 mAllObservers.valueAt(oIndex).writeLocked(out); 990 } 991 out.endTag(null, TAG_PACKAGE_WATCHDOG); 992 out.endDocument(); 993 mPolicyFile.finishWrite(stream); 994 return true; 995 } catch (IOException e) { 996 Slog.w(TAG, "Failed to save monitored packages, restoring backup", e); 997 mPolicyFile.failWrite(stream); 998 return false; 999 } finally { 1000 IoUtils.closeQuietly(stream); 1001 } 1002 } 1003 } 1004 saveToFileAsync()1005 private void saveToFileAsync() { 1006 if (!mLongTaskHandler.hasCallbacks(mSaveToFile)) { 1007 mLongTaskHandler.post(mSaveToFile); 1008 } 1009 } 1010 1011 /** Dump status of every observer in mAllObservers. */ dump(IndentingPrintWriter pw)1012 public void dump(IndentingPrintWriter pw) { 1013 pw.println("Package Watchdog status"); 1014 pw.increaseIndent(); 1015 synchronized (mLock) { 1016 for (String observerName : mAllObservers.keySet()) { 1017 pw.println("Observer name: " + observerName); 1018 pw.increaseIndent(); 1019 ObserverInternal observerInternal = mAllObservers.get(observerName); 1020 observerInternal.dump(pw); 1021 pw.decreaseIndent(); 1022 } 1023 } 1024 } 1025 1026 /** 1027 * Represents an observer monitoring a set of packages along with the failure thresholds for 1028 * each package. 
1029 * 1030 * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing 1031 * instances of this class. 1032 */ 1033 private static class ObserverInternal { 1034 public final String name; 1035 @GuardedBy("mLock") 1036 public final ArrayMap<String, MonitoredPackage> packages = new ArrayMap<>(); 1037 @Nullable 1038 @GuardedBy("mLock") 1039 public PackageHealthObserver registeredObserver; 1040 ObserverInternal(String name, List<MonitoredPackage> packages)1041 ObserverInternal(String name, List<MonitoredPackage> packages) { 1042 this.name = name; 1043 updatePackagesLocked(packages); 1044 } 1045 1046 /** 1047 * Writes important {@link MonitoredPackage} details for this observer to file. 1048 * Does not persist any package failure thresholds. 1049 */ 1050 @GuardedBy("mLock") writeLocked(XmlSerializer out)1051 public boolean writeLocked(XmlSerializer out) { 1052 try { 1053 out.startTag(null, TAG_OBSERVER); 1054 out.attribute(null, ATTR_NAME, name); 1055 for (int i = 0; i < packages.size(); i++) { 1056 MonitoredPackage p = packages.valueAt(i); 1057 p.writeLocked(out); 1058 } 1059 out.endTag(null, TAG_OBSERVER); 1060 return true; 1061 } catch (IOException e) { 1062 Slog.w(TAG, "Cannot save observer", e); 1063 return false; 1064 } 1065 } 1066 1067 @GuardedBy("mLock") updatePackagesLocked(List<MonitoredPackage> packages)1068 public void updatePackagesLocked(List<MonitoredPackage> packages) { 1069 for (int pIndex = 0; pIndex < packages.size(); pIndex++) { 1070 MonitoredPackage p = packages.get(pIndex); 1071 MonitoredPackage existingPackage = this.packages.get(p.getName()); 1072 if (existingPackage != null) { 1073 existingPackage.updateHealthCheckDuration(p.mDurationMs); 1074 } else { 1075 this.packages.put(p.getName(), p); 1076 } 1077 } 1078 } 1079 1080 /** 1081 * Reduces the monitoring durations of all packages observed by this observer by 1082 * {@code elapsedMs}. If any duration is less than 0, the package is removed from 1083 * observation. 
If any health check duration is less than 0, the health check result 1084 * is evaluated. 1085 * 1086 * @return a {@link Set} of packages that were removed from the observer without explicit 1087 * health check passing, or an empty list if no package expired for which an explicit health 1088 * check was still pending 1089 */ 1090 @GuardedBy("mLock") prunePackagesLocked(long elapsedMs)1091 private Set<MonitoredPackage> prunePackagesLocked(long elapsedMs) { 1092 Set<MonitoredPackage> failedPackages = new ArraySet<>(); 1093 Iterator<MonitoredPackage> it = packages.values().iterator(); 1094 while (it.hasNext()) { 1095 MonitoredPackage p = it.next(); 1096 int oldState = p.getHealthCheckStateLocked(); 1097 int newState = p.handleElapsedTimeLocked(elapsedMs); 1098 if (oldState != HealthCheckState.FAILED 1099 && newState == HealthCheckState.FAILED) { 1100 Slog.i(TAG, "Package " + p.getName() + " failed health check"); 1101 failedPackages.add(p); 1102 } 1103 if (p.isExpiredLocked()) { 1104 it.remove(); 1105 } 1106 } 1107 return failedPackages; 1108 } 1109 1110 /** 1111 * Increments failure counts of {@code packageName}. 1112 * @returns {@code true} if failure threshold is exceeded, {@code false} otherwise 1113 */ 1114 @GuardedBy("mLock") onPackageFailureLocked(String packageName)1115 public boolean onPackageFailureLocked(String packageName) { 1116 if (packages.get(packageName) == null && registeredObserver.isPersistent() 1117 && registeredObserver.mayObservePackage(packageName)) { 1118 packages.put(packageName, sPackageWatchdog.newMonitoredPackage( 1119 packageName, DEFAULT_OBSERVING_DURATION_MS, false)); 1120 } 1121 MonitoredPackage p = packages.get(packageName); 1122 if (p != null) { 1123 return p.onFailureLocked(); 1124 } 1125 return false; 1126 } 1127 1128 /** 1129 * Returns one ObserverInternal from the {@code parser} and advances its state. 1130 * 1131 * <p>Note that this method is <b>not</b> thread safe. 
It should only be called from 1132 * #loadFromFile which in turn is only called on construction of the 1133 * singleton PackageWatchdog. 1134 **/ read(XmlPullParser parser, PackageWatchdog watchdog)1135 public static ObserverInternal read(XmlPullParser parser, PackageWatchdog watchdog) { 1136 String observerName = null; 1137 if (TAG_OBSERVER.equals(parser.getName())) { 1138 observerName = parser.getAttributeValue(null, ATTR_NAME); 1139 if (TextUtils.isEmpty(observerName)) { 1140 Slog.wtf(TAG, "Unable to read observer name"); 1141 return null; 1142 } 1143 } 1144 List<MonitoredPackage> packages = new ArrayList<>(); 1145 int innerDepth = parser.getDepth(); 1146 try { 1147 while (XmlUtils.nextElementWithin(parser, innerDepth)) { 1148 if (TAG_PACKAGE.equals(parser.getName())) { 1149 try { 1150 String packageName = parser.getAttributeValue(null, ATTR_NAME); 1151 long duration = Long.parseLong( 1152 parser.getAttributeValue(null, ATTR_DURATION)); 1153 long healthCheckDuration = Long.parseLong( 1154 parser.getAttributeValue(null, 1155 ATTR_EXPLICIT_HEALTH_CHECK_DURATION)); 1156 boolean hasPassedHealthCheck = Boolean.parseBoolean( 1157 parser.getAttributeValue(null, ATTR_PASSED_HEALTH_CHECK)); 1158 MonitoredPackage pkg = watchdog.newMonitoredPackage(packageName, 1159 duration, healthCheckDuration, hasPassedHealthCheck); 1160 if (pkg != null) { 1161 packages.add(pkg); 1162 } 1163 } catch (NumberFormatException e) { 1164 Slog.wtf(TAG, "Skipping package for observer " + observerName, e); 1165 continue; 1166 } 1167 } 1168 } 1169 } catch (XmlPullParserException | IOException e) { 1170 Slog.wtf(TAG, "Unable to read observer " + observerName, e); 1171 return null; 1172 } 1173 if (packages.isEmpty()) { 1174 return null; 1175 } 1176 return new ObserverInternal(observerName, packages); 1177 } 1178 1179 /** Dumps information about this observer and the packages it watches. 
*/ dump(IndentingPrintWriter pw)1180 public void dump(IndentingPrintWriter pw) { 1181 boolean isPersistent = registeredObserver != null && registeredObserver.isPersistent(); 1182 pw.println("Persistent: " + isPersistent); 1183 for (String packageName : packages.keySet()) { 1184 MonitoredPackage p = packages.get(packageName); 1185 pw.println(packageName + ": "); 1186 pw.increaseIndent(); 1187 pw.println("# Failures: " + p.mFailureHistory.size()); 1188 pw.println("Monitoring duration remaining: " + p.mDurationMs + "ms"); 1189 pw.println("Explicit health check duration: " + p.mHealthCheckDurationMs + "ms"); 1190 pw.println("Health check state: " + p.toString(p.mHealthCheckState)); 1191 pw.decreaseIndent(); 1192 } 1193 } 1194 } 1195 1196 @Retention(SOURCE) 1197 @IntDef(value = { 1198 HealthCheckState.ACTIVE, 1199 HealthCheckState.INACTIVE, 1200 HealthCheckState.PASSED, 1201 HealthCheckState.FAILED}) 1202 public @interface HealthCheckState { 1203 // The package has not passed health check but has requested a health check 1204 int ACTIVE = 0; 1205 // The package has not passed health check and has not requested a health check 1206 int INACTIVE = 1; 1207 // The package has passed health check 1208 int PASSED = 2; 1209 // The package has failed health check 1210 int FAILED = 3; 1211 } 1212 newMonitoredPackage( String name, long durationMs, boolean hasPassedHealthCheck)1213 MonitoredPackage newMonitoredPackage( 1214 String name, long durationMs, boolean hasPassedHealthCheck) { 1215 return newMonitoredPackage(name, durationMs, Long.MAX_VALUE, hasPassedHealthCheck); 1216 } 1217 newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck)1218 MonitoredPackage newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs, 1219 boolean hasPassedHealthCheck) { 1220 VersionedPackage pkg = getVersionedPackage(name); 1221 if (pkg == null) { 1222 return null; 1223 } 1224 return new MonitoredPackage(pkg, durationMs, 
healthCheckDurationMs, hasPassedHealthCheck); 1225 } 1226 1227 /** 1228 * Represents a package and its health check state along with the time 1229 * it should be monitored for. 1230 * 1231 * <p> Note, the PackageWatchdog#mLock must always be held when reading or writing 1232 * instances of this class. 1233 */ 1234 class MonitoredPackage { 1235 private final VersionedPackage mPackage; 1236 // Times when package failures happen sorted in ascending order 1237 @GuardedBy("mLock") 1238 private final LongArrayQueue mFailureHistory = new LongArrayQueue(); 1239 // One of STATE_[ACTIVE|INACTIVE|PASSED|FAILED]. Updated on construction and after 1240 // methods that could change the health check state: handleElapsedTimeLocked and 1241 // tryPassHealthCheckLocked 1242 private int mHealthCheckState = HealthCheckState.INACTIVE; 1243 // Whether an explicit health check has passed. 1244 // This value in addition with mHealthCheckDurationMs determines the health check state 1245 // of the package, see #getHealthCheckStateLocked 1246 @GuardedBy("mLock") 1247 private boolean mHasPassedHealthCheck; 1248 // System uptime duration to monitor package. 1249 @GuardedBy("mLock") 1250 private long mDurationMs; 1251 // System uptime duration to check the result of an explicit health check 1252 // Initially, MAX_VALUE until we get a value from the health check service 1253 // and request health checks. 
1254 // This value in addition with mHasPassedHealthCheck determines the health check state 1255 // of the package, see #getHealthCheckStateLocked 1256 @GuardedBy("mLock") 1257 private long mHealthCheckDurationMs = Long.MAX_VALUE; 1258 MonitoredPackage(VersionedPackage pkg, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck)1259 private MonitoredPackage(VersionedPackage pkg, long durationMs, 1260 long healthCheckDurationMs, boolean hasPassedHealthCheck) { 1261 mPackage = pkg; 1262 mDurationMs = durationMs; 1263 mHealthCheckDurationMs = healthCheckDurationMs; 1264 mHasPassedHealthCheck = hasPassedHealthCheck; 1265 updateHealthCheckStateLocked(); 1266 } 1267 1268 /** Writes the salient fields to disk using {@code out}. */ 1269 @GuardedBy("mLock") writeLocked(XmlSerializer out)1270 public void writeLocked(XmlSerializer out) throws IOException { 1271 out.startTag(null, TAG_PACKAGE); 1272 out.attribute(null, ATTR_NAME, getName()); 1273 out.attribute(null, ATTR_DURATION, String.valueOf(mDurationMs)); 1274 out.attribute(null, ATTR_EXPLICIT_HEALTH_CHECK_DURATION, 1275 String.valueOf(mHealthCheckDurationMs)); 1276 out.attribute(null, ATTR_PASSED_HEALTH_CHECK, 1277 String.valueOf(mHasPassedHealthCheck)); 1278 out.endTag(null, TAG_PACKAGE); 1279 } 1280 1281 /** 1282 * Increment package failures or resets failure count depending on the last package failure. 1283 * 1284 * @return {@code true} if failure count exceeds a threshold, {@code false} otherwise 1285 */ 1286 @GuardedBy("mLock") onFailureLocked()1287 public boolean onFailureLocked() { 1288 // Sliding window algorithm: find out if there exists a window containing failures >= 1289 // mTriggerFailureCount. 
1290 final long now = mSystemClock.uptimeMillis(); 1291 mFailureHistory.addLast(now); 1292 while (now - mFailureHistory.peekFirst() > mTriggerFailureDurationMs) { 1293 // Prune values falling out of the window 1294 mFailureHistory.removeFirst(); 1295 } 1296 boolean failed = mFailureHistory.size() >= mTriggerFailureCount; 1297 if (failed) { 1298 mFailureHistory.clear(); 1299 } 1300 return failed; 1301 } 1302 1303 /** 1304 * Sets the initial health check duration. 1305 * 1306 * @return the new health check state 1307 */ 1308 @GuardedBy("mLock") setHealthCheckActiveLocked(long initialHealthCheckDurationMs)1309 public int setHealthCheckActiveLocked(long initialHealthCheckDurationMs) { 1310 if (initialHealthCheckDurationMs <= 0) { 1311 Slog.wtf(TAG, "Cannot set non-positive health check duration " 1312 + initialHealthCheckDurationMs + "ms for package " + getName() 1313 + ". Using total duration " + mDurationMs + "ms instead"); 1314 initialHealthCheckDurationMs = mDurationMs; 1315 } 1316 if (mHealthCheckState == HealthCheckState.INACTIVE) { 1317 // Transitions to ACTIVE 1318 mHealthCheckDurationMs = initialHealthCheckDurationMs; 1319 } 1320 return updateHealthCheckStateLocked(); 1321 } 1322 1323 /** 1324 * Updates the monitoring durations of the package. 
1325 * 1326 * @return the new health check state 1327 */ 1328 @GuardedBy("mLock") handleElapsedTimeLocked(long elapsedMs)1329 public int handleElapsedTimeLocked(long elapsedMs) { 1330 if (elapsedMs <= 0) { 1331 Slog.w(TAG, "Cannot handle non-positive elapsed time for package " + getName()); 1332 return mHealthCheckState; 1333 } 1334 // Transitions to FAILED if now <= 0 and health check not passed 1335 mDurationMs -= elapsedMs; 1336 if (mHealthCheckState == HealthCheckState.ACTIVE) { 1337 // We only update health check durations if we have #setHealthCheckActiveLocked 1338 // This ensures we don't leave the INACTIVE state for an unexpected elapsed time 1339 // Transitions to FAILED if now <= 0 and health check not passed 1340 mHealthCheckDurationMs -= elapsedMs; 1341 } 1342 return updateHealthCheckStateLocked(); 1343 } 1344 1345 /** Explicitly update the monitoring duration of the package. */ 1346 @GuardedBy("mLock") updateHealthCheckDuration(long newDurationMs)1347 public void updateHealthCheckDuration(long newDurationMs) { 1348 mDurationMs = newDurationMs; 1349 } 1350 1351 /** 1352 * Marks the health check as passed and transitions to {@link HealthCheckState.PASSED} 1353 * if not yet {@link HealthCheckState.FAILED}. 1354 * 1355 * @return the new {@link HealthCheckState health check state} 1356 */ 1357 @GuardedBy("mLock") 1358 @HealthCheckState tryPassHealthCheckLocked()1359 public int tryPassHealthCheckLocked() { 1360 if (mHealthCheckState != HealthCheckState.FAILED) { 1361 // FAILED is a final state so only pass if we haven't failed 1362 // Transition to PASSED 1363 mHasPassedHealthCheck = true; 1364 } 1365 return updateHealthCheckStateLocked(); 1366 } 1367 1368 /** Returns the monitored package name. */ getName()1369 private String getName() { 1370 return mPackage.getPackageName(); 1371 } 1372 1373 /** 1374 * Returns the current {@link HealthCheckState health check state}. 
1375 */ 1376 @GuardedBy("mLock") 1377 @HealthCheckState getHealthCheckStateLocked()1378 public int getHealthCheckStateLocked() { 1379 return mHealthCheckState; 1380 } 1381 1382 /** 1383 * Returns the shortest duration before the package should be scheduled for a prune. 1384 * 1385 * @return the duration or {@link Long#MAX_VALUE} if the package should not be scheduled 1386 */ 1387 @GuardedBy("mLock") getShortestScheduleDurationMsLocked()1388 public long getShortestScheduleDurationMsLocked() { 1389 // Consider health check duration only if #isPendingHealthChecksLocked is true 1390 return Math.min(toPositive(mDurationMs), 1391 isPendingHealthChecksLocked() 1392 ? toPositive(mHealthCheckDurationMs) : Long.MAX_VALUE); 1393 } 1394 1395 /** 1396 * Returns {@code true} if the total duration left to monitor the package is less than or 1397 * equal to 0 {@code false} otherwise. 1398 */ 1399 @GuardedBy("mLock") isExpiredLocked()1400 public boolean isExpiredLocked() { 1401 return mDurationMs <= 0; 1402 } 1403 1404 /** 1405 * Returns {@code true} if the package, {@link #getName} is expecting health check results 1406 * {@code false} otherwise. 1407 */ 1408 @GuardedBy("mLock") isPendingHealthChecksLocked()1409 public boolean isPendingHealthChecksLocked() { 1410 return mHealthCheckState == HealthCheckState.ACTIVE 1411 || mHealthCheckState == HealthCheckState.INACTIVE; 1412 } 1413 1414 /** 1415 * Updates the health check state based on {@link #mHasPassedHealthCheck} 1416 * and {@link #mHealthCheckDurationMs}. 
1417 * 1418 * @return the new {@link HealthCheckState health check state} 1419 */ 1420 @GuardedBy("mLock") 1421 @HealthCheckState updateHealthCheckStateLocked()1422 private int updateHealthCheckStateLocked() { 1423 int oldState = mHealthCheckState; 1424 if (mHasPassedHealthCheck) { 1425 // Set final state first to avoid ambiguity 1426 mHealthCheckState = HealthCheckState.PASSED; 1427 } else if (mHealthCheckDurationMs <= 0 || mDurationMs <= 0) { 1428 // Set final state first to avoid ambiguity 1429 mHealthCheckState = HealthCheckState.FAILED; 1430 } else if (mHealthCheckDurationMs == Long.MAX_VALUE) { 1431 mHealthCheckState = HealthCheckState.INACTIVE; 1432 } else { 1433 mHealthCheckState = HealthCheckState.ACTIVE; 1434 } 1435 Slog.i(TAG, "Updated health check state for package " + getName() + ": " 1436 + toString(oldState) + " -> " + toString(mHealthCheckState)); 1437 return mHealthCheckState; 1438 } 1439 1440 /** Returns a {@link String} representation of the current health check state. */ toString(@ealthCheckState int state)1441 private String toString(@HealthCheckState int state) { 1442 switch (state) { 1443 case HealthCheckState.ACTIVE: 1444 return "ACTIVE"; 1445 case HealthCheckState.INACTIVE: 1446 return "INACTIVE"; 1447 case HealthCheckState.PASSED: 1448 return "PASSED"; 1449 case HealthCheckState.FAILED: 1450 return "FAILED"; 1451 default: 1452 return "UNKNOWN"; 1453 } 1454 } 1455 1456 /** Returns {@code value} if it is greater than 0 or {@link Long#MAX_VALUE} otherwise. */ toPositive(long value)1457 private long toPositive(long value) { 1458 return value > 0 ? value : Long.MAX_VALUE; 1459 } 1460 } 1461 1462 /** 1463 * Handles the thresholding logic for system server boots. 
*
 * <p>The boot count and the start of the current window are kept in the system properties
 * {@code PROP_RESCUE_BOOT_COUNT} and {@code PROP_RESCUE_BOOT_START}.
 */
static class BootThreshold {

    private final int mBootTriggerCount;
    private final long mTriggerWindow;

    /**
     * @param bootTriggerCount number of boots within the window at which
     *        {@link #incrementAndTest()} starts returning {@code true}
     * @param triggerWindow length of the window, compared against
     *        {@code SystemClock.elapsedRealtime()} differences
     */
    BootThreshold(int bootTriggerCount, long triggerWindow) {
        mBootTriggerCount = bootTriggerCount;
        mTriggerWindow = triggerWindow;
    }

    /** Resets both the window start and the boot count to 0. */
    public void reset() {
        setStart(0);
        setCount(0);
    }

    /** Returns the stored boot count, or 0 if none is stored. */
    private int getCount() {
        return SystemProperties.getInt(PROP_RESCUE_BOOT_COUNT, 0);
    }

    /** Stores {@code count} as the boot count. */
    private void setCount(int count) {
        SystemProperties.set(PROP_RESCUE_BOOT_COUNT, Integer.toString(count));
    }

    /** Returns the stored start of the current window, or 0 if none is stored. */
    public long getStart() {
        return SystemProperties.getLong(PROP_RESCUE_BOOT_START, 0);
    }

    /**
     * Stores {@code start} as the start of the current window, constrained to the range
     * from 0 to the current {@code SystemClock.elapsedRealtime()}.
     */
    public void setStart(long start) {
        final long upperBound = android.os.SystemClock.elapsedRealtime();
        final long constrainedStart = MathUtils.constrain(start, 0, upperBound);
        SystemProperties.set(PROP_RESCUE_BOOT_START, Long.toString(constrainedStart));
    }

    /** Increments the boot counter, and returns whether the device is bootlooping. */
    public boolean incrementAndTest() {
        final long now = android.os.SystemClock.elapsedRealtime();
        if (now - getStart() < 0) {
            Slog.e(TAG, "Window was less than zero. Resetting start to current time.");
            setStart(now);
        }
        final long window = now - getStart();
        if (window < mTriggerWindow) {
            // Still inside the current window: count this boot and test the threshold.
            final int count = getCount() + 1;
            setCount(count);
            EventLogTags.writeRescueNote(Process.ROOT_UID, count, window);
            return count >= mBootTriggerCount;
        }
        // The window has elapsed: this boot becomes the first one of a new window.
        setCount(1);
        setStart(now);
        return false;
    }

}
}