/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.android.server;
18 
19 import android.content.ContentResolver;
20 import android.content.Context;
21 import android.os.Build;
22 import android.os.Environment;
23 import android.os.FileUtils;
24 import android.os.RecoverySystem;
25 import android.os.SystemClock;
26 import android.os.SystemProperties;
27 import android.os.UserHandle;
28 import android.provider.Settings;
29 import android.text.format.DateUtils;
30 import android.util.ExceptionUtils;
31 import android.util.Log;
32 import android.util.MathUtils;
33 import android.util.Slog;
34 import android.util.SparseArray;
35 
36 import com.android.internal.util.ArrayUtils;
37 import com.android.server.pm.PackageManagerService;
38 
39 import java.io.File;
40 
41 /**
42  * Utilities to help rescue the system from crash loops. Callers are expected to
43  * report boot events and persistent app crashes, and if they happen frequently
44  * enough this class will slowly escalate through several rescue operations
45  * before finally rebooting and prompting the user if they want to wipe data as
46  * a last resort.
47  *
48  * @hide
49  */
50 public class RescueParty {
51     private static final String TAG = "RescueParty";
52 
53     private static final String PROP_ENABLE_RESCUE = "persist.sys.enable_rescue";
54     private static final String PROP_DISABLE_RESCUE = "persist.sys.disable_rescue";
55     private static final String PROP_RESCUE_LEVEL = "sys.rescue_level";
56     private static final String PROP_RESCUE_BOOT_COUNT = "sys.rescue_boot_count";
57     private static final String PROP_RESCUE_BOOT_START = "sys.rescue_boot_start";
58 
59     private static final int LEVEL_NONE = 0;
60     private static final int LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS = 1;
61     private static final int LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES = 2;
62     private static final int LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS = 3;
63     private static final int LEVEL_FACTORY_RESET = 4;
64 
65     /** Threshold for boot loops */
66     private static final Threshold sBoot = new BootThreshold();
67     /** Threshold for app crash loops */
68     private static SparseArray<Threshold> sApps = new SparseArray<>();
69 
isDisabled()70     private static boolean isDisabled() {
71         // Check if we're explicitly enabled for testing
72         if (SystemProperties.getBoolean(PROP_ENABLE_RESCUE, false)) {
73             return false;
74         }
75 
76         // We're disabled on all engineering devices
77         if (Build.IS_ENG) {
78             Slog.v(TAG, "Disabled because of eng build");
79             return true;
80         }
81 
82         // We're disabled on userdebug devices connected over USB, since that's
83         // a decent signal that someone is actively trying to debug the device,
84         // or that it's in a lab environment.
85         if (Build.IS_USERDEBUG && isUsbActive()) {
86             Slog.v(TAG, "Disabled because of active USB connection");
87             return true;
88         }
89 
90         // One last-ditch check
91         if (SystemProperties.getBoolean(PROP_DISABLE_RESCUE, false)) {
92             Slog.v(TAG, "Disabled because of manual property");
93             return true;
94         }
95 
96         return false;
97     }
98 
99     /**
100      * Take note of a boot event. If we notice too many of these events
101      * happening in rapid succession, we'll send out a rescue party.
102      */
noteBoot(Context context)103     public static void noteBoot(Context context) {
104         if (isDisabled()) return;
105         if (sBoot.incrementAndTest()) {
106             sBoot.reset();
107             incrementRescueLevel(sBoot.uid);
108             executeRescueLevel(context);
109         }
110     }
111 
112     /**
113      * Take note of a persistent app crash. If we notice too many of these
114      * events happening in rapid succession, we'll send out a rescue party.
115      */
notePersistentAppCrash(Context context, int uid)116     public static void notePersistentAppCrash(Context context, int uid) {
117         if (isDisabled()) return;
118         Threshold t = sApps.get(uid);
119         if (t == null) {
120             t = new AppThreshold(uid);
121             sApps.put(uid, t);
122         }
123         if (t.incrementAndTest()) {
124             t.reset();
125             incrementRescueLevel(t.uid);
126             executeRescueLevel(context);
127         }
128     }
129 
130     /**
131      * Check if we're currently attempting to reboot for a factory reset.
132      */
isAttemptingFactoryReset()133     public static boolean isAttemptingFactoryReset() {
134         return SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE) == LEVEL_FACTORY_RESET;
135     }
136 
137     /**
138      * Escalate to the next rescue level. After incrementing the level you'll
139      * probably want to call {@link #executeRescueLevel(Context)}.
140      */
incrementRescueLevel(int triggerUid)141     private static void incrementRescueLevel(int triggerUid) {
142         final int level = MathUtils.constrain(
143                 SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE) + 1,
144                 LEVEL_NONE, LEVEL_FACTORY_RESET);
145         SystemProperties.set(PROP_RESCUE_LEVEL, Integer.toString(level));
146 
147         EventLogTags.writeRescueLevel(level, triggerUid);
148         PackageManagerService.logCriticalInfo(Log.WARN, "Incremented rescue level to "
149                 + levelToString(level) + " triggered by UID " + triggerUid);
150     }
151 
152     /**
153      * Called when {@code SettingsProvider} has been published, which is a good
154      * opportunity to reset any settings depending on our rescue level.
155      */
onSettingsProviderPublished(Context context)156     public static void onSettingsProviderPublished(Context context) {
157         executeRescueLevel(context);
158     }
159 
executeRescueLevel(Context context)160     private static void executeRescueLevel(Context context) {
161         final int level = SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE);
162         if (level == LEVEL_NONE) return;
163 
164         Slog.w(TAG, "Attempting rescue level " + levelToString(level));
165         try {
166             executeRescueLevelInternal(context, level);
167             EventLogTags.writeRescueSuccess(level);
168             PackageManagerService.logCriticalInfo(Log.DEBUG,
169                     "Finished rescue level " + levelToString(level));
170         } catch (Throwable t) {
171             final String msg = ExceptionUtils.getCompleteMessage(t);
172             EventLogTags.writeRescueFailure(level, msg);
173             PackageManagerService.logCriticalInfo(Log.ERROR,
174                     "Failed rescue level " + levelToString(level) + ": " + msg);
175         }
176     }
177 
executeRescueLevelInternal(Context context, int level)178     private static void executeRescueLevelInternal(Context context, int level) throws Exception {
179         switch (level) {
180             case LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS:
181                 resetAllSettings(context, Settings.RESET_MODE_UNTRUSTED_DEFAULTS);
182                 break;
183             case LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES:
184                 resetAllSettings(context, Settings.RESET_MODE_UNTRUSTED_CHANGES);
185                 break;
186             case LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS:
187                 resetAllSettings(context, Settings.RESET_MODE_TRUSTED_DEFAULTS);
188                 break;
189             case LEVEL_FACTORY_RESET:
190                 RecoverySystem.rebootPromptAndWipeUserData(context, TAG);
191                 break;
192         }
193     }
194 
resetAllSettings(Context context, int mode)195     private static void resetAllSettings(Context context, int mode) throws Exception {
196         // Try our best to reset all settings possible, and once finished
197         // rethrow any exception that we encountered
198         Exception res = null;
199         final ContentResolver resolver = context.getContentResolver();
200         try {
201             Settings.Global.resetToDefaultsAsUser(resolver, null, mode, UserHandle.USER_SYSTEM);
202         } catch (Throwable t) {
203             res = new RuntimeException("Failed to reset global settings", t);
204         }
205         for (int userId : getAllUserIds()) {
206             try {
207                 Settings.Secure.resetToDefaultsAsUser(resolver, null, mode, userId);
208             } catch (Throwable t) {
209                 res = new RuntimeException("Failed to reset secure settings for " + userId, t);
210             }
211         }
212         if (res != null) {
213             throw res;
214         }
215     }
216 
217     /**
218      * Threshold that can be triggered if a number of events occur within a
219      * window of time.
220      */
221     private abstract static class Threshold {
getCount()222         public abstract int getCount();
setCount(int count)223         public abstract void setCount(int count);
getStart()224         public abstract long getStart();
setStart(long start)225         public abstract void setStart(long start);
226 
227         private final int uid;
228         private final int triggerCount;
229         private final long triggerWindow;
230 
Threshold(int uid, int triggerCount, long triggerWindow)231         public Threshold(int uid, int triggerCount, long triggerWindow) {
232             this.uid = uid;
233             this.triggerCount = triggerCount;
234             this.triggerWindow = triggerWindow;
235         }
236 
reset()237         public void reset() {
238             setCount(0);
239             setStart(0);
240         }
241 
242         /**
243          * @return if this threshold has been triggered
244          */
incrementAndTest()245         public boolean incrementAndTest() {
246             final long now = SystemClock.elapsedRealtime();
247             final long window = now - getStart();
248             if (window > triggerWindow) {
249                 setCount(1);
250                 setStart(now);
251                 return false;
252             } else {
253                 int count = getCount() + 1;
254                 setCount(count);
255                 EventLogTags.writeRescueNote(uid, count, window);
256                 Slog.w(TAG, "Noticed " + count + " events for UID " + uid + " in last "
257                         + (window / 1000) + " sec");
258                 return (count >= triggerCount);
259             }
260         }
261     }
262 
263     /**
264      * Specialization of {@link Threshold} for monitoring boot events. It stores
265      * counters in system properties for robustness.
266      */
267     private static class BootThreshold extends Threshold {
BootThreshold()268         public BootThreshold() {
269             // We're interested in 5 events in any 300 second period; this
270             // window is super relaxed because booting can take a long time if
271             // forced to dexopt things.
272             super(android.os.Process.ROOT_UID, 5, 300 * DateUtils.SECOND_IN_MILLIS);
273         }
274 
275         @Override
getCount()276         public int getCount() {
277             return SystemProperties.getInt(PROP_RESCUE_BOOT_COUNT, 0);
278         }
279 
280         @Override
setCount(int count)281         public void setCount(int count) {
282             SystemProperties.set(PROP_RESCUE_BOOT_COUNT, Integer.toString(count));
283         }
284 
285         @Override
getStart()286         public long getStart() {
287             return SystemProperties.getLong(PROP_RESCUE_BOOT_START, 0);
288         }
289 
290         @Override
setStart(long start)291         public void setStart(long start) {
292             SystemProperties.set(PROP_RESCUE_BOOT_START, Long.toString(start));
293         }
294     }
295 
296     /**
297      * Specialization of {@link Threshold} for monitoring app crashes. It stores
298      * counters in memory.
299      */
300     private static class AppThreshold extends Threshold {
301         private int count;
302         private long start;
303 
AppThreshold(int uid)304         public AppThreshold(int uid) {
305             // We're interested in 5 events in any 30 second period; apps crash
306             // pretty quickly so we can keep a tight leash on them.
307             super(uid, 5, 30 * DateUtils.SECOND_IN_MILLIS);
308         }
309 
getCount()310         @Override public int getCount() { return count; }
setCount(int count)311         @Override public void setCount(int count) { this.count = count; }
getStart()312         @Override public long getStart() { return start; }
setStart(long start)313         @Override public void setStart(long start) { this.start = start; }
314     }
315 
getAllUserIds()316     private static int[] getAllUserIds() {
317         int[] userIds = { UserHandle.USER_SYSTEM };
318         try {
319             for (File file : FileUtils.listFilesOrEmpty(Environment.getDataSystemDeDirectory())) {
320                 try {
321                     final int userId = Integer.parseInt(file.getName());
322                     if (userId != UserHandle.USER_SYSTEM) {
323                         userIds = ArrayUtils.appendInt(userIds, userId);
324                     }
325                 } catch (NumberFormatException ignored) {
326                 }
327             }
328         } catch (Throwable t) {
329             Slog.w(TAG, "Trouble discovering users", t);
330         }
331         return userIds;
332     }
333 
334     /**
335      * Hacky test to check if the device has an active USB connection, which is
336      * a good proxy for someone doing local development work.
337      */
isUsbActive()338     private static boolean isUsbActive() {
339         try {
340             final String state = FileUtils
341                     .readTextFile(new File("/sys/class/android_usb/android0/state"), 128, "");
342             return "CONFIGURED".equals(state.trim());
343         } catch (Throwable t) {
344             Slog.w(TAG, "Failed to determine if device was on USB", t);
345             return false;
346         }
347     }
348 
levelToString(int level)349     private static String levelToString(int level) {
350         switch (level) {
351             case LEVEL_NONE: return "NONE";
352             case LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS: return "RESET_SETTINGS_UNTRUSTED_DEFAULTS";
353             case LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES: return "RESET_SETTINGS_UNTRUSTED_CHANGES";
354             case LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS: return "RESET_SETTINGS_TRUSTED_DEFAULTS";
355             case LEVEL_FACTORY_RESET: return "FACTORY_RESET";
356             default: return Integer.toString(level);
357         }
358     }
359 }
360