1 /**
2  * Copyright (c) 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef CPP_WATCHDOG_SERVER_SRC_PERFORMANCEPROFILER_H_
18 #define CPP_WATCHDOG_SERVER_SRC_PERFORMANCEPROFILER_H_
19 
20 #include "PressureMonitor.h"
21 #include "ProcDiskStatsCollector.h"
22 #include "ProcStatCollector.h"
23 #include "UidStatsCollector.h"
24 #include "WatchdogPerfService.h"
25 
26 #include <android-base/chrono_utils.h>
27 #include <android-base/result.h>
28 #include <android/util/ProtoOutputStream.h>
29 #include <cutils/multiuser.h>
30 #include <gtest/gtest_prod.h>
31 #include <meminfo/procmeminfo.h>
32 #include <utils/Errors.h>
33 #include <utils/Mutex.h>
34 #include <utils/RefBase.h>
35 #include <utils/SystemClock.h>
36 
37 #include <android_car_feature.h>
38 
39 #include <ctime>
40 #include <string>
41 #include <unordered_set>
42 #include <variant>
43 #include <vector>
44 
45 namespace android {
46 namespace automotive {
47 namespace watchdog {
48 
// Default count of periodic collection records kept in the in-memory cache.
constexpr int32_t kDefaultPeriodicCollectionBufferSize{180};
// Placeholder text for a collection that has no records.
constexpr char kEmptyCollectionMessage[] = "No collection recorded\n";
52 
// Test-only peer class, forward declared here so it can be named as a friend.
namespace internal {
class PerformanceProfilerPeer;
}  // namespace internal
59 
// The classes, structs and enums below are meant for the implementation and unit tests only.
//
// Kinds of per-process stats. PROC_STAT_TYPES is the trailing sentinel and equals the number of
// real stat types.
enum ProcStatType {
    IO_BLOCKED_TASKS_COUNT = 0,
    MAJOR_FAULTS = 1,
    CPU_TIME = 2,
    MEMORY_STATS = 3,
    PROC_STAT_TYPES = 4,
};
68 
69 // UserPackageStats represents the user package performance stats.
70 class UserPackageStats {
71 public:
72     // TODO(b/332773702): Rename nested structs
73     //  first-level IoStatsView, ProcSingleStatsView, and ProcCpuStatsView renames to Uid*Stats
74     //  second-level ProcessValue and ProcessCpuValue renames to Process*Stats
75     struct IoStatsView {
76         int64_t bytes[UID_STATES] = {0};
77         int64_t fsync[UID_STATES] = {0};
78 
totalBytesIoStatsView79         int64_t totalBytes() const {
80             return std::numeric_limits<int64_t>::max() - bytes[UidState::FOREGROUND] >
81                             bytes[UidState::BACKGROUND]
82                     ? bytes[UidState::FOREGROUND] + bytes[UidState::BACKGROUND]
83                     : std::numeric_limits<int64_t>::max();
84         }
85     };
86     struct ProcSingleStatsView {
87         uint64_t value = 0;
88         struct ProcessValue {
89             std::string comm = "";
90             uint64_t value = 0;
91         };
92         std::vector<ProcessValue> topNProcesses = {};
93     };
94     struct ProcCpuStatsView {
95         int64_t cpuTimeMillis = 0;
96         int64_t cpuCycles = 0;
97         struct ProcessCpuValue {
98             int32_t pid = -1;
99             std::string comm = "";
100             int64_t cpuTimeMillis = 0;
101             int64_t cpuCycles = 0;
102         };
103         std::vector<ProcessCpuValue> topNProcesses = {};
104     };
105     struct MemoryStats {
106         uint64_t rssKb = 0;
107         uint64_t pssKb = 0;
108         uint64_t ussKb = 0;
109         uint64_t swapPssKb = 0;
110     };
111     struct UidMemoryStats {
112         MemoryStats memoryStats;
113         bool isSmapsRollupSupported;
114         struct ProcessMemoryStats {
115             std::string comm = "";
116             MemoryStats memoryStats;
117         };
118         std::vector<ProcessMemoryStats> topNProcesses = {};
119     };
120 
121     UserPackageStats(MetricType metricType, const UidStats& uidStats);
122     UserPackageStats(ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount,
123                      bool isSmapsRollupSupported);
124 
125     // Class must be DefaultInsertable for std::vector<T>::resize to work
UserPackageStats()126     UserPackageStats() : uid(0), genericPackageName("") {}
127     // For unit test case only
UserPackageStats(uid_t uid,std::string genericPackageName,std::variant<std::monostate,IoStatsView,ProcSingleStatsView,ProcCpuStatsView,UidMemoryStats> statsView)128     UserPackageStats(uid_t uid, std::string genericPackageName,
129                      std::variant<std::monostate, IoStatsView, ProcSingleStatsView,
130                                   ProcCpuStatsView, UidMemoryStats>
131                              statsView) :
132           uid(uid),
133           genericPackageName(std::move(genericPackageName)),
134           statsView(std::move(statsView)) {}
135 
136     // Returns the primary value of the current StatsView. If the variant has value
137     // |std::monostate|, returns 0.
138     //
139     // This value should be used to sort the StatsViews.
140     uint64_t getValue() const;
141     std::string toString(MetricType metricsType, const int64_t totalIoStats[][UID_STATES]) const;
142     std::string toString(int64_t totalValue) const;
143     std::string toString(int64_t totalRssKb, int64_t totalPssKb) const;
144 
145     uid_t uid;
146     std::string genericPackageName;
147     std::variant<std::monostate, IoStatsView, ProcSingleStatsView, ProcCpuStatsView, UidMemoryStats>
148             statsView;
149 
150 private:
151     void cacheTopNProcessSingleStats(
152             ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount,
153             std::vector<UserPackageStats::ProcSingleStatsView::ProcessValue>* topNProcesses);
154     void cacheTopNProcessCpuStats(
155             const UidStats& uidStats, int topNProcessCount,
156             std::vector<UserPackageStats::ProcCpuStatsView::ProcessCpuValue>* topNProcesses);
157     void cacheTopNProcessMemStats(
158             const UidStats& uidStats, int topNProcessCount, bool isSmapsRollupSupported,
159             std::vector<UserPackageStats::UidMemoryStats::ProcessMemoryStats>* topNProcesses);
160 };
161 
162 /**
163  * User package summary performance stats collected from the `/proc/uid_io/stats`,
164  * `/proc/[pid]/stat`, `/proc/[pid]/task/[tid]/stat`, and /proc/[pid]/status` files.
165  */
166 struct UserPackageSummaryStats {
167     std::vector<UserPackageStats> topNCpuTimes = {};
168     std::vector<UserPackageStats> topNIoReads = {};
169     std::vector<UserPackageStats> topNIoWrites = {};
170     std::vector<UserPackageStats> topNIoBlocked = {};
171     std::vector<UserPackageStats> topNMajorFaults = {};
172     std::vector<UserPackageStats> topNMemStats = {};
173     int64_t totalIoStats[METRIC_TYPES][UID_STATES] = {{0}};
174     std::unordered_map<uid_t, uint64_t> taskCountByUid = {};
175     // TODO(b/337115923): Clean up below duplicate fields and report `totalMajorFaults`,
176     //  `totalRssKb`, `totalPssKb`, and `majorFaultsPercentChange` as part of `SystemSummaryStats`.
177     int64_t totalCpuTimeMillis = 0;
178     uint64_t totalCpuCycles = 0;
179     uint64_t totalMajorFaults = 0;
180     uint64_t totalRssKb = 0;
181     uint64_t totalPssKb = 0;
182     // Percentage of increase/decrease in the major page faults since last collection.
183     double majorFaultsPercentChange = 0.0;
184     std::string toString() const;
185 };
186 
// TODO(b/268402964): Calculate the total CPU cycles using the per-UID BPF tool.
// System-wide performance stats collected from the `/proc/stat` file.
struct SystemSummaryStats {
    int64_t cpuIoWaitTimeMillis{0};
    int64_t cpuIdleTimeMillis{0};
    int64_t totalCpuTimeMillis{0};
    uint64_t totalCpuCycles{0};
    uint64_t contextSwitchesCount{0};
    uint32_t ioBlockedProcessCount{0};
    uint32_t totalProcessCount{0};
    std::string toString() const;
};
199 
200 // Performance record collected during a sampling/collection period.
201 struct PerfStatsRecord {
202     time_point_millis collectionTimeMillis;
203     SystemSummaryStats systemSummaryStats;
204     UserPackageSummaryStats userPackageSummaryStats;
205     std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
206             memoryPressureLevelDurations;
207     std::string toString() const;
208 };
209 
210 // Group of performance records collected for a collection event.
211 struct CollectionInfo {
212     size_t maxCacheSize = 0;               // Maximum cache size for the collection.
213     std::vector<PerfStatsRecord> records;  // Cache of collected performance records.
214     std::string toString() const;
215 };
216 
217 // Group of performance records collected for a user switch collection event.
218 struct UserSwitchCollectionInfo : CollectionInfo {
219     userid_t from = 0;
220     userid_t to = 0;
221 };
222 
223 // PerformanceProfiler implements the I/O performance data collection module.
224 class PerformanceProfiler final :
225       public DataProcessorInterface,
226       public PressureMonitorInterface::PressureChangeCallbackInterface {
227 public:
228     PerformanceProfiler(
229             const android::sp<PressureMonitorInterface>& pressureMonitor,
230             const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc = &elapsedRealtime) :
kPressureMonitor(pressureMonitor)231           kPressureMonitor(pressureMonitor),
232           kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc),
233           mTopNStatsPerCategory(0),
234           mTopNStatsPerSubcategory(0),
235           mMaxUserSwitchEvents(0),
236           mSystemEventDataCacheDurationSec(0),
237           // TODO(b/333722043): Once carwatchdogd has sys_ptrace capability, set
238           // mIsSmapsRollupSupported field from `android::meminfo::IsSmapsRollupSupported()`.
239           // Disabling smaps_rollup support because this file cannot be read without sys_ptrace
240           // capability.
241           mIsSmapsRollupSupported(false),
242           mIsMemoryProfilingEnabled(android::car::feature::car_watchdog_memory_profiling()),
243           mBoottimeCollection({}),
244           mPeriodicCollection({}),
245           mUserSwitchCollections({}),
246           mWakeUpCollection({}),
247           mCustomCollection({}),
248           mLastMajorFaults(0),
249           mDoSendResourceUsageStats(false),
250           mMemoryPressureLevelDeltaInfo(PressureLevelDeltaInfo(getElapsedTimeSinceBootMillisFunc)) {
251     }
252 
~PerformanceProfiler()253     ~PerformanceProfiler() { terminate(); }
254 
name()255     std::string name() const override { return "PerformanceProfiler"; }
256 
257     // Implements DataProcessorInterface.
258     android::base::Result<void> onSystemStartup() override;
259 
260     void onCarWatchdogServiceRegistered() override;
261 
262     android::base::Result<void> onBoottimeCollection(
263             time_point_millis time,
264             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
265             const android::wp<ProcStatCollectorInterface>& procStatCollector,
266             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
267 
268     android::base::Result<void> onWakeUpCollection(
269             time_point_millis time,
270             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
271             const android::wp<ProcStatCollectorInterface>& procStatCollector) override;
272 
273     android::base::Result<void> onPeriodicCollection(
274             time_point_millis time, SystemState systemState,
275             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
276             const android::wp<ProcStatCollectorInterface>& procStatCollector,
277             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
278 
279     android::base::Result<void> onUserSwitchCollection(
280             time_point_millis time, userid_t from, userid_t to,
281             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
282             const android::wp<ProcStatCollectorInterface>& procStatCollector) override;
283 
284     android::base::Result<void> onCustomCollection(
285             time_point_millis time, SystemState systemState,
286             const std::unordered_set<std::string>& filterPackages,
287             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
288             const android::wp<ProcStatCollectorInterface>& procStatCollector,
289             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
290 
onPeriodicMonitor(time_t time,const android::wp<ProcDiskStatsCollectorInterface> & procDiskStatsCollector,const std::function<void ()> & alertHandler)291     android::base::Result<void> onPeriodicMonitor(
292             [[maybe_unused]] time_t time,
293             [[maybe_unused]] const android::wp<ProcDiskStatsCollectorInterface>&
294                     procDiskStatsCollector,
295             [[maybe_unused]] const std::function<void()>& alertHandler) override {
296         // No monitoring done here as this DataProcessor only collects I/O performance records.
297         return {};
298     }
299 
300     android::base::Result<void> onDump(int fd) const override;
301 
302     android::base::Result<void> onDumpProto(
303             const CollectionIntervals& collectionIntervals,
304             android::util::ProtoOutputStream& outProto) const override;
305 
306     android::base::Result<void> onCustomCollectionDump(int fd) override;
307 
308     void onPressureChanged(PressureMonitorInterface::PressureLevel) override;
309 
310 protected:
311     android::base::Result<void> init();
312 
313     // Clears in-memory cache.
314     void terminate();
315 
316 private:
317     class PressureLevelDeltaInfo {
318     public:
PressureLevelDeltaInfo(const std::function<int64_t ()> & getElapsedTimeSinceBootMillisFunc)319         explicit PressureLevelDeltaInfo(
320                 const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc) :
321               kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc),
322               mLatestPressureLevel(PressureMonitorInterface::PRESSURE_LEVEL_NONE),
323               mLatestPressureLevelElapsedRealtimeMillis(getElapsedTimeSinceBootMillisFunc()) {}
324 
325         // Calculates the duration for the previously reported pressure level, updates it in
326         // mPressureLevelDurations, and sets the latest pressure level and its elapsed realtime.
327         void setLatestPressureLevelLocked(PressureMonitorInterface::PressureLevel pressureLevel);
328 
329         // Returns the latest pressure stats and flushes stats to mPressureLevelDurations.
330         std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
331         onCollectionLocked();
332 
333     private:
334         // Updated by test for mocking elapsed time.
335         const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc;
336 
337         // Latest pressure level reported by the PressureMonitor.
338         PressureMonitorInterface::PressureLevel mLatestPressureLevel;
339 
340         // Time when the latest pressure level was recorded. Used to calculate
341         // pressureLevelDurations.
342         int64_t mLatestPressureLevelElapsedRealtimeMillis = 0;
343 
344         // Duration spent in different pressure levels since the last poll.
345         std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
346                 mPressureLevelDurations = {};
347     };
348 
349     // Processes the collected data.
350     android::base::Result<void> processLocked(
351             time_point_millis time, SystemState systemState,
352             const std::unordered_set<std::string>& filterPackages,
353             const android::sp<UidStatsCollectorInterface>& uidStatsCollector,
354             const android::sp<ProcStatCollectorInterface>& procStatCollector,
355             CollectionInfo* collectionInfo,
356             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats);
357 
358     // Processes per-UID performance data.
359     void processUidStatsLocked(
360             bool isGarageModeActive, int64_t totalCpuTimeMillis,
361             const std::unordered_set<std::string>& filterPackages,
362             const android::sp<UidStatsCollectorInterface>& uidStatsCollector,
363             std::vector<aidl::android::automotive::watchdog::internal::UidResourceUsageStats>*
364                     uidResourceUsageStats,
365             UserPackageSummaryStats* userPackageSummaryStats);
366 
367     // Processes system performance data from the `/proc/stats` file.
368     void processProcStatLocked(const android::sp<ProcStatCollectorInterface>& procStatCollector,
369                                SystemSummaryStats* systemSummaryStats) const;
370 
371     // Dump the user switch collection
372     android::base::Result<void> onUserSwitchCollectionDump(int fd) const;
373 
374     void clearExpiredSystemEventCollections(time_point_millis time);
375 
376     void dumpStatsRecordsProto(const CollectionInfo& collection,
377                                android::util::ProtoOutputStream& outProto) const;
378 
379     void dumpPackageCpuStatsProto(const std::vector<UserPackageStats>& userPackageStats,
380                                   android::util::ProtoOutputStream& outProto) const;
381 
382     void dumpPackageStorageIoStatsProto(const std::vector<UserPackageStats>& userPackageStats,
383                                         const uint64_t storageStatsFieldId,
384                                         android::util::ProtoOutputStream& outProto) const;
385 
386     void dumpPackageTaskStateStatsProto(const std::vector<UserPackageStats>& userPackageStats,
387                                         const std::unordered_map<uid_t, uint64_t>& taskCountByUid,
388                                         android::util::ProtoOutputStream& outProto) const;
389 
390     void dumpPackageMajorPageFaultsProto(const std::vector<UserPackageStats>& userPackageStats,
391                                          android::util::ProtoOutputStream& outProto) const;
392 
393     // Pressure monitor instance.
394     const android::sp<PressureMonitorInterface> kPressureMonitor;
395 
396     // Updated by test for mocking elapsed time.
397     const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc;
398 
399     // Top N per-UID stats per category.
400     int mTopNStatsPerCategory;
401 
402     // Top N per-process stats per subcategory.
403     int mTopNStatsPerSubcategory;
404 
405     // Max amount of user switch events cached in |mUserSwitchCollections|.
406     size_t mMaxUserSwitchEvents;
407 
408     // Amount of seconds before a system event's cache is cleared.
409     std::chrono::seconds mSystemEventDataCacheDurationSec;
410 
411     // Smaps rollup is supported by kernel or not.
412     bool mIsSmapsRollupSupported;
413 
414     // Memory Profiling feature flag is enabled or not.
415     bool mIsMemoryProfilingEnabled;
416 
417     // Makes sure only one collection is running at any given time.
418     mutable Mutex mMutex;
419 
420     // Info for the boot-time collection event. The cache is persisted until system shutdown/reboot
421     // or a wake-up collection occurs.
422     CollectionInfo mBoottimeCollection GUARDED_BY(mMutex);
423 
424     // Info for the periodic collection event. The cache size is limited by
425     // |ro.carwatchdog.periodic_collection_buffer_size|.
426     CollectionInfo mPeriodicCollection GUARDED_BY(mMutex);
427 
428     // Cache for user switch collection events. Events are cached from oldest to newest.
429     std::vector<UserSwitchCollectionInfo> mUserSwitchCollections GUARDED_BY(mMutex);
430 
431     // Info for the wake-up collection event. Only the latest wake-up collection is cached.
432     CollectionInfo mWakeUpCollection GUARDED_BY(mMutex);
433 
434     // Info for the custom collection event. The info is cleared at the end of every custom
435     // collection.
436     CollectionInfo mCustomCollection GUARDED_BY(mMutex);
437 
438     // Major faults delta from last collection. Useful when calculating the percentage change in
439     // major faults since last collection.
440     uint64_t mLastMajorFaults GUARDED_BY(mMutex);
441 
442     // Enables the sending of resource usage stats to CarService.
443     bool mDoSendResourceUsageStats GUARDED_BY(mMutex);
444 
445     // Aggregated pressure level changes occurred since the last collection.
446     PressureLevelDeltaInfo mMemoryPressureLevelDeltaInfo GUARDED_BY(mMutex);
447 
448     friend class WatchdogPerfService;
449 
450     // For unit tests.
451     friend class internal::PerformanceProfilerPeer;
452 };
453 
454 }  // namespace watchdog
455 }  // namespace automotive
456 }  // namespace android
457 
458 #endif  //  CPP_WATCHDOG_SERVER_SRC_PERFORMANCEPROFILER_H_
459