1 /**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18
19 #include "IoPerfCollection.h"
20
21 #include <WatchdogProperties.sysprop.h>
22 #include <android-base/file.h>
23 #include <android-base/parseint.h>
24 #include <android-base/stringprintf.h>
25 #include <android-base/strings.h>
26 #include <binder/IServiceManager.h>
27 #include <cutils/android_filesystem_config.h>
28 #include <inttypes.h>
29 #include <log/log.h>
30 #include <processgroup/sched_policy.h>
31 #include <pthread.h>
32 #include <pwd.h>
33
34 #include <algorithm>
35 #include <iomanip>
36 #include <iterator>
37 #include <limits>
38 #include <string>
39 #include <thread>
40 #include <unordered_map>
41 #include <unordered_set>
42 #include <vector>
43
44 namespace android {
45 namespace automotive {
46 namespace watchdog {
47
48 using android::defaultServiceManager;
49 using android::IBinder;
50 using android::IServiceManager;
51 using android::sp;
52 using android::String16;
53 using android::base::Error;
54 using android::base::ParseUint;
55 using android::base::Result;
56 using android::base::Split;
57 using android::base::StringAppendF;
58 using android::base::StringPrintf;
59 using android::base::WriteStringToFd;
60 using android::content::pm::IPackageManagerNative;
61
62 namespace {
63
64 const int32_t kDefaultTopNStatsPerCategory = 10;
65 const int32_t kDefaultTopNStatsPerSubcategory = 5;
66 const std::chrono::seconds kDefaultBoottimeCollectionInterval = 1s;
67 const std::chrono::seconds kDefaultPeriodicCollectionInterval = 10s;
68 // Number of periodic collection perf data snapshots to cache in memory.
69 const int32_t kDefaultPeriodicCollectionBufferSize = 180;
70
71 // Minimum collection interval between subsequent collections.
72 const std::chrono::nanoseconds kMinCollectionInterval = 1s;
73
74 // Default values for the custom collection interval and max_duration.
75 const std::chrono::nanoseconds kCustomCollectionInterval = 10s;
76 const std::chrono::nanoseconds kCustomCollectionDuration = 30min;
77
78 const std::string kDumpMajorDelimiter = std::string(100, '-') + "\n";
79
80 constexpr const char* kHelpText =
81 "\nCustom I/O performance data collection dump options:\n"
82 "%s: Starts custom I/O performance data collection. Customize the collection behavior with "
83 "the following optional arguments:\n"
84 "\t%s <seconds>: Modifies the collection interval. Default behavior is to collect once "
85 "every %lld seconds.\n"
86 "\t%s <seconds>: Modifies the maximum collection duration. Default behavior is to collect "
87 "until %ld minutes before automatically stopping the custom collection and discarding "
88 "the collected data.\n"
89 "\t%s <package name>,<package, name>,...: Comma-separated value containing package names. "
90 "When provided, the results are filtered only to the provided package names. Default "
91 "behavior is to list the results for the top %d packages.\n"
92 "%s: Stops custom I/O performance data collection and generates a dump of "
93 "the collection report.\n\n"
94 "When no options are specified, the carwatchdog report contains the I/O performance "
95 "data collected during boot-time and over the last %ld minutes before the report "
96 "generation.";
97
percentage(uint64_t numer,uint64_t denom)98 double percentage(uint64_t numer, uint64_t denom) {
99 return denom == 0 ? 0.0 : (static_cast<double>(numer) / static_cast<double>(denom)) * 100.0;
100 }
101
102 struct UidProcessStats {
103 struct ProcessInfo {
104 std::string comm = "";
105 uint64_t count = 0;
106 };
107 uint64_t uid = 0;
108 uint32_t ioBlockedTasksCnt = 0;
109 uint32_t totalTasksCnt = 0;
110 uint64_t majorFaults = 0;
111 std::vector<ProcessInfo> topNIoBlockedProcesses = {};
112 std::vector<ProcessInfo> topNMajorFaultProcesses = {};
113 };
114
getUidProcessStats(const std::vector<ProcessStats> & processStats,int topNStatsPerSubCategory)115 std::unique_ptr<std::unordered_map<uint32_t, UidProcessStats>> getUidProcessStats(
116 const std::vector<ProcessStats>& processStats, int topNStatsPerSubCategory) {
117 std::unique_ptr<std::unordered_map<uint32_t, UidProcessStats>> uidProcessStats(
118 new std::unordered_map<uint32_t, UidProcessStats>());
119 for (const auto& stats : processStats) {
120 if (stats.uid < 0) {
121 continue;
122 }
123 uint32_t uid = static_cast<uint32_t>(stats.uid);
124 if (uidProcessStats->find(uid) == uidProcessStats->end()) {
125 (*uidProcessStats)[uid] = UidProcessStats{
126 .uid = uid,
127 .topNIoBlockedProcesses = std::vector<
128 UidProcessStats::ProcessInfo>(topNStatsPerSubCategory,
129 UidProcessStats::ProcessInfo{}),
130 .topNMajorFaultProcesses = std::vector<
131 UidProcessStats::ProcessInfo>(topNStatsPerSubCategory,
132 UidProcessStats::ProcessInfo{}),
133 };
134 }
135 auto& curUidProcessStats = (*uidProcessStats)[uid];
136 // Top-level process stats has the aggregated major page faults count and this should be
137 // persistent across thread creation/termination. Thus use the value from this field.
138 curUidProcessStats.majorFaults += stats.process.majorFaults;
139 curUidProcessStats.totalTasksCnt += stats.threads.size();
140 // The process state is the same as the main thread state. Thus to avoid double counting
141 // ignore the process state.
142 uint32_t ioBlockedTasksCnt = 0;
143 for (const auto& threadStat : stats.threads) {
144 ioBlockedTasksCnt += threadStat.second.state == "D" ? 1 : 0;
145 }
146 curUidProcessStats.ioBlockedTasksCnt += ioBlockedTasksCnt;
147 for (auto it = curUidProcessStats.topNIoBlockedProcesses.begin();
148 it != curUidProcessStats.topNIoBlockedProcesses.end(); ++it) {
149 if (it->count < ioBlockedTasksCnt) {
150 curUidProcessStats.topNIoBlockedProcesses
151 .emplace(it,
152 UidProcessStats::ProcessInfo{
153 .comm = stats.process.comm,
154 .count = ioBlockedTasksCnt,
155 });
156 curUidProcessStats.topNIoBlockedProcesses.pop_back();
157 break;
158 }
159 }
160 for (auto it = curUidProcessStats.topNMajorFaultProcesses.begin();
161 it != curUidProcessStats.topNMajorFaultProcesses.end(); ++it) {
162 if (it->count < stats.process.majorFaults) {
163 curUidProcessStats.topNMajorFaultProcesses
164 .emplace(it,
165 UidProcessStats::ProcessInfo{
166 .comm = stats.process.comm,
167 .count = stats.process.majorFaults,
168 });
169 curUidProcessStats.topNMajorFaultProcesses.pop_back();
170 break;
171 }
172 }
173 }
174 return uidProcessStats;
175 }
176
parseSecondsFlag(Vector<String16> args,size_t pos)177 Result<std::chrono::seconds> parseSecondsFlag(Vector<String16> args, size_t pos) {
178 if (args.size() < pos) {
179 return Error() << "Value not provided";
180 }
181
182 uint64_t value;
183 std::string strValue = std::string(String8(args[pos]).string());
184 if (!ParseUint(strValue, &value)) {
185 return Error() << "Invalid value " << args[pos].string() << ", must be an integer";
186 }
187 return std::chrono::seconds(value);
188 }
189
190 } // namespace
191
toString(const UidIoPerfData & data)192 std::string toString(const UidIoPerfData& data) {
193 std::string buffer;
194 if (data.topNReads.size() > 0) {
195 StringAppendF(&buffer, "\nTop N Reads:\n%s\n", std::string(12, '-').c_str());
196 StringAppendF(&buffer,
197 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
198 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
199 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
200 }
201 for (const auto& stat : data.topNReads) {
202 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
203 for (int i = 0; i < UID_STATES; ++i) {
204 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
205 percentage(stat.bytes[i], data.total[READ_BYTES][i]), stat.fsync[i],
206 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
207 }
208 StringAppendF(&buffer, "\n");
209 }
210 if (data.topNWrites.size() > 0) {
211 StringAppendF(&buffer, "\nTop N Writes:\n%s\n", std::string(13, '-').c_str());
212 StringAppendF(&buffer,
213 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
214 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
215 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
216 }
217 for (const auto& stat : data.topNWrites) {
218 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
219 for (int i = 0; i < UID_STATES; ++i) {
220 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
221 percentage(stat.bytes[i], data.total[WRITE_BYTES][i]), stat.fsync[i],
222 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
223 }
224 StringAppendF(&buffer, "\n");
225 }
226 return buffer;
227 }
228
toString(const SystemIoPerfData & data)229 std::string toString(const SystemIoPerfData& data) {
230 std::string buffer;
231 StringAppendF(&buffer, "CPU I/O wait time/percent: %" PRIu64 " / %.2f%%\n", data.cpuIoWaitTime,
232 percentage(data.cpuIoWaitTime, data.totalCpuTime));
233 StringAppendF(&buffer, "Number of I/O blocked processes/percent: %" PRIu32 " / %.2f%%\n",
234 data.ioBlockedProcessesCnt,
235 percentage(data.ioBlockedProcessesCnt, data.totalProcessesCnt));
236 return buffer;
237 }
238
toString(const ProcessIoPerfData & data)239 std::string toString(const ProcessIoPerfData& data) {
240 std::string buffer;
241 StringAppendF(&buffer, "Number of major page faults since last collection: %" PRIu64 "\n",
242 data.totalMajorFaults);
243 StringAppendF(&buffer,
244 "Percentage of change in major page faults since last collection: %.2f%%\n",
245 data.majorFaultsPercentChange);
246 if (data.topNMajorFaultUids.size() > 0) {
247 StringAppendF(&buffer, "\nTop N major page faults:\n%s\n", std::string(24, '-').c_str());
248 StringAppendF(&buffer,
249 "Android User ID, Package Name, Number of major page faults, "
250 "Percentage of total major page faults\n");
251 StringAppendF(&buffer,
252 "\tCommand, Number of major page faults, Percentage of UID's major page "
253 "faults\n");
254 }
255 for (const auto& uidStats : data.topNMajorFaultUids) {
256 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", uidStats.userId,
257 uidStats.packageName.c_str(), uidStats.count,
258 percentage(uidStats.count, data.totalMajorFaults));
259 for (const auto& procStats : uidStats.topNProcesses) {
260 StringAppendF(&buffer, "\t%s, %" PRIu64 ", %.2f%%\n", procStats.comm.c_str(),
261 procStats.count, percentage(procStats.count, uidStats.count));
262 }
263 }
264 if (data.topNIoBlockedUids.size() > 0) {
265 StringAppendF(&buffer, "\nTop N I/O waiting UIDs:\n%s\n", std::string(23, '-').c_str());
266 StringAppendF(&buffer,
267 "Android User ID, Package Name, Number of owned tasks waiting for I/O, "
268 "Percentage of owned tasks waiting for I/O\n");
269 StringAppendF(&buffer,
270 "\tCommand, Number of I/O waiting tasks, Percentage of UID's tasks waiting "
271 "for I/O\n");
272 }
273 for (size_t i = 0; i < data.topNIoBlockedUids.size(); ++i) {
274 const auto& uidStats = data.topNIoBlockedUids[i];
275 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", uidStats.userId,
276 uidStats.packageName.c_str(), uidStats.count,
277 percentage(uidStats.count, data.topNIoBlockedUidsTotalTaskCnt[i]));
278 for (const auto& procStats : uidStats.topNProcesses) {
279 StringAppendF(&buffer, "\t%s, %" PRIu64 ", %.2f%%\n", procStats.comm.c_str(),
280 procStats.count, percentage(procStats.count, uidStats.count));
281 }
282 }
283 return buffer;
284 }
285
toString(const IoPerfRecord & record)286 std::string toString(const IoPerfRecord& record) {
287 std::string buffer;
288 StringAppendF(&buffer, "%s%s%s", toString(record.systemIoPerfData).c_str(),
289 toString(record.processIoPerfData).c_str(),
290 toString(record.uidIoPerfData).c_str());
291 return buffer;
292 }
293
toString(const CollectionInfo & collectionInfo)294 std::string toString(const CollectionInfo& collectionInfo) {
295 std::string buffer;
296 StringAppendF(&buffer, "Number of collections: %zu\n", collectionInfo.records.size());
297 auto interval =
298 std::chrono::duration_cast<std::chrono::seconds>(collectionInfo.interval).count();
299 StringAppendF(&buffer, "Collection interval: %lld second%s\n", interval,
300 ((interval > 1) ? "s" : ""));
301 for (size_t i = 0; i < collectionInfo.records.size(); ++i) {
302 const auto& record = collectionInfo.records[i];
303 std::stringstream timestamp;
304 timestamp << std::put_time(std::localtime(&record.time), "%c %Z");
305 StringAppendF(&buffer, "Collection %zu: <%s>\n%s\n%s\n", i, timestamp.str().c_str(),
306 std::string(45, '=').c_str(), toString(record).c_str());
307 }
308 return buffer;
309 }
310
start()311 Result<void> IoPerfCollection::start() {
312 {
313 Mutex::Autolock lock(mMutex);
314 if (mCurrCollectionEvent != CollectionEvent::INIT || mCollectionThread.joinable()) {
315 return Error(INVALID_OPERATION)
316 << "Cannot start I/O performance collection more than once";
317 }
318 mTopNStatsPerCategory = static_cast<int>(
319 sysprop::topNStatsPerCategory().value_or(kDefaultTopNStatsPerCategory));
320 mTopNStatsPerSubcategory = static_cast<int>(
321 sysprop::topNStatsPerSubcategory().value_or(kDefaultTopNStatsPerSubcategory));
322 std::chrono::nanoseconds boottimeCollectionInterval =
323 std::chrono::duration_cast<std::chrono::nanoseconds>(
324 std::chrono::seconds(sysprop::boottimeCollectionInterval().value_or(
325 kDefaultBoottimeCollectionInterval.count())));
326 std::chrono::nanoseconds periodicCollectionInterval =
327 std::chrono::duration_cast<std::chrono::nanoseconds>(
328 std::chrono::seconds(sysprop::periodicCollectionInterval().value_or(
329 kDefaultPeriodicCollectionInterval.count())));
330 size_t periodicCollectionBufferSize =
331 static_cast<size_t>(sysprop::periodicCollectionBufferSize().value_or(
332 kDefaultPeriodicCollectionBufferSize));
333 mBoottimeCollection = {
334 .interval = boottimeCollectionInterval,
335 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
336 .lastCollectionUptime = 0,
337 .records = {},
338 };
339 mPeriodicCollection = {
340 .interval = periodicCollectionInterval,
341 .maxCacheSize = periodicCollectionBufferSize,
342 .lastCollectionUptime = 0,
343 .records = {},
344 };
345 }
346
347 mCollectionThread = std::thread([&]() {
348 {
349 Mutex::Autolock lock(mMutex);
350 if (mCurrCollectionEvent != CollectionEvent::INIT) {
351 ALOGE("Skipping I/O performance data collection as the current collection event "
352 "%s != %s",
353 toString(mCurrCollectionEvent).c_str(),
354 toString(CollectionEvent::INIT).c_str());
355 return;
356 }
357 mCurrCollectionEvent = CollectionEvent::BOOT_TIME;
358 mBoottimeCollection.lastCollectionUptime = mHandlerLooper->now();
359 mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
360 mHandlerLooper->sendMessage(this, CollectionEvent::BOOT_TIME);
361 }
362 if (set_sched_policy(0, SP_BACKGROUND) != 0) {
363 ALOGW("Failed to set background scheduling priority to I/O performance data collection "
364 "thread");
365 }
366 int ret = pthread_setname_np(pthread_self(), "IoPerfCollect");
367 if (ret != 0) {
368 ALOGE("Failed to set I/O perf collection thread name: %d", ret);
369 }
370 bool isCollectionActive = true;
371 // Loop until the collection is not active -- I/O perf collection runs on this thread in a
372 // handler.
373 while (isCollectionActive) {
374 mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
375 Mutex::Autolock lock(mMutex);
376 isCollectionActive = mCurrCollectionEvent != CollectionEvent::TERMINATED;
377 }
378 });
379 return {};
380 }
381
terminate()382 void IoPerfCollection::terminate() {
383 {
384 Mutex::Autolock lock(mMutex);
385 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
386 ALOGE("I/O performance data collection was terminated already");
387 return;
388 }
389 ALOGE("Terminating I/O performance data collection");
390 mCurrCollectionEvent = CollectionEvent::TERMINATED;
391 }
392 if (mCollectionThread.joinable()) {
393 mHandlerLooper->removeMessages(this);
394 mHandlerLooper->wake();
395 mCollectionThread.join();
396 }
397 }
398
onBootFinished()399 Result<void> IoPerfCollection::onBootFinished() {
400 Mutex::Autolock lock(mMutex);
401 if (mCurrCollectionEvent != CollectionEvent::BOOT_TIME) {
402 // This case happens when either the I/O perf collection has prematurely terminated before
403 // boot complete notification is received or multiple boot complete notifications are
404 // received. In either case don't return error as this will lead to runtime exception and
405 // cause system to boot loop.
406 ALOGE("Current I/O performance data collection event %s != %s",
407 toString(mCurrCollectionEvent).c_str(),
408 toString(CollectionEvent::BOOT_TIME).c_str());
409 return {};
410 }
411 mBoottimeCollection.lastCollectionUptime = mHandlerLooper->now();
412 mHandlerLooper->removeMessages(this);
413 mHandlerLooper->sendMessage(this, SwitchEvent::END_BOOTTIME_COLLECTION);
414 return {};
415 }
416
dump(int fd,const Vector<String16> & args)417 Result<void> IoPerfCollection::dump(int fd, const Vector<String16>& args) {
418 if (args.empty()) {
419 const auto& ret = dumpCollection(fd);
420 if (!ret) {
421 return ret;
422 }
423 return {};
424 }
425
426 if (args[0] == String16(kStartCustomCollectionFlag)) {
427 if (args.size() > 7) {
428 return Error(BAD_VALUE) << "Number of arguments to start custom I/O performance data "
429 << "collection cannot exceed 7";
430 }
431 std::chrono::nanoseconds interval = kCustomCollectionInterval;
432 std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
433 std::unordered_set<std::string> filterPackages;
434 for (size_t i = 1; i < args.size(); ++i) {
435 if (args[i] == String16(kIntervalFlag)) {
436 const auto& ret = parseSecondsFlag(args, i + 1);
437 if (!ret) {
438 return Error(BAD_VALUE)
439 << "Failed to parse " << kIntervalFlag << ": " << ret.error();
440 }
441 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*ret);
442 ++i;
443 continue;
444 }
445 if (args[i] == String16(kMaxDurationFlag)) {
446 const auto& ret = parseSecondsFlag(args, i + 1);
447 if (!ret) {
448 return Error(BAD_VALUE)
449 << "Failed to parse " << kMaxDurationFlag << ": " << ret.error();
450 }
451 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*ret);
452 ++i;
453 continue;
454 }
455 if (args[i] == String16(kFilterPackagesFlag)) {
456 if (args.size() < i + 1) {
457 return Error(BAD_VALUE)
458 << "Must provide value for '" << kFilterPackagesFlag << "' flag";
459 }
460 std::vector<std::string> packages =
461 Split(std::string(String8(args[i + 1]).string()), ",");
462 std::copy(packages.begin(), packages.end(),
463 std::inserter(filterPackages, filterPackages.end()));
464 ++i;
465 continue;
466 }
467 ALOGW("Unknown flag %s provided to start custom I/O performance data collection",
468 String8(args[i]).string());
469 return Error(BAD_VALUE) << "Unknown flag " << String8(args[i]).string()
470 << " provided to start custom I/O performance data "
471 << "collection";
472 }
473 const auto& ret = startCustomCollection(interval, maxDuration, filterPackages);
474 if (!ret) {
475 WriteStringToFd(ret.error().message(), fd);
476 return ret;
477 }
478 return {};
479 }
480
481 if (args[0] == String16(kEndCustomCollectionFlag)) {
482 if (args.size() != 1) {
483 ALOGW("Number of arguments to stop custom I/O performance data collection cannot "
484 "exceed 1. Stopping the data collection.");
485 WriteStringToFd("Number of arguments to stop custom I/O performance data collection "
486 "cannot exceed 1. Stopping the data collection.",
487 fd);
488 }
489 return endCustomCollection(fd);
490 }
491
492 return Error(BAD_VALUE) << "I/O perf collection dump arguments start neither with "
493 << kStartCustomCollectionFlag << " nor with "
494 << kEndCustomCollectionFlag << " flags";
495 }
496
dumpHelpText(int fd)497 bool IoPerfCollection::dumpHelpText(int fd) {
498 long periodicCacheMinutes =
499 (std::chrono::duration_cast<std::chrono::seconds>(mPeriodicCollection.interval)
500 .count() *
501 mPeriodicCollection.maxCacheSize) /
502 60;
503 return WriteStringToFd(StringPrintf(kHelpText, kStartCustomCollectionFlag, kIntervalFlag,
504 std::chrono::duration_cast<std::chrono::seconds>(
505 kCustomCollectionInterval)
506 .count(),
507 kMaxDurationFlag,
508 std::chrono::duration_cast<std::chrono::minutes>(
509 kCustomCollectionDuration)
510 .count(),
511 kFilterPackagesFlag, mTopNStatsPerCategory,
512 kEndCustomCollectionFlag, periodicCacheMinutes),
513 fd);
514 }
515
dumpCollection(int fd)516 Result<void> IoPerfCollection::dumpCollection(int fd) {
517 Mutex::Autolock lock(mMutex);
518 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
519 ALOGW("I/O performance data collection not active. Dumping cached data");
520 if (!WriteStringToFd("I/O performance data collection not active. Dumping cached data.",
521 fd)) {
522 return Error(FAILED_TRANSACTION) << "Failed to write I/O performance collection status";
523 }
524 }
525
526 const auto& ret = dumpCollectorsStatusLocked(fd);
527 if (!ret) {
528 return Error(FAILED_TRANSACTION) << ret.error();
529 }
530
531 if (!WriteStringToFd(StringPrintf("%sI/O performance data reports:\n%sBoot-time collection "
532 "report:\n%s\n",
533 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str(),
534 std::string(28, '=').c_str()),
535 fd) ||
536 !WriteStringToFd(toString(mBoottimeCollection), fd) ||
537 !WriteStringToFd(StringPrintf("%s\nPeriodic collection report:\n%s\n",
538 std::string(75, '-').c_str(), std::string(27, '=').c_str()),
539 fd) ||
540 !WriteStringToFd(toString(mPeriodicCollection), fd) ||
541 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
542 return Error(FAILED_TRANSACTION)
543 << "Failed to dump the boot-time and periodic collection reports.";
544 }
545 return {};
546 }
547
dumpCollectorsStatusLocked(int fd)548 Result<void> IoPerfCollection::dumpCollectorsStatusLocked(int fd) {
549 if (!mUidIoStats->enabled() &&
550 !WriteStringToFd(StringPrintf("UidIoStats collector failed to access the file %s",
551 mUidIoStats->filePath().c_str()),
552 fd)) {
553 return Error() << "Failed to write UidIoStats collector status";
554 }
555 if (!mProcStat->enabled() &&
556 !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
557 mProcStat->filePath().c_str()),
558 fd)) {
559 return Error() << "Failed to write ProcStat collector status";
560 }
561 if (!mProcPidStat->enabled() &&
562 !WriteStringToFd(StringPrintf("ProcPidStat collector failed to access the directory %s",
563 mProcPidStat->dirPath().c_str()),
564 fd)) {
565 return Error() << "Failed to write ProcPidStat collector status";
566 }
567 return {};
568 }
569
startCustomCollection(std::chrono::nanoseconds interval,std::chrono::nanoseconds maxDuration,const std::unordered_set<std::string> & filterPackages)570 Result<void> IoPerfCollection::startCustomCollection(
571 std::chrono::nanoseconds interval, std::chrono::nanoseconds maxDuration,
572 const std::unordered_set<std::string>& filterPackages) {
573 if (interval < kMinCollectionInterval || maxDuration < kMinCollectionInterval) {
574 return Error(INVALID_OPERATION)
575 << "Collection interval and maximum duration must be >= "
576 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinCollectionInterval)
577 .count()
578 << " milliseconds.";
579 }
580 Mutex::Autolock lock(mMutex);
581 if (mCurrCollectionEvent != CollectionEvent::PERIODIC) {
582 return Error(INVALID_OPERATION)
583 << "Cannot start a custom collection when "
584 << "the current collection event " << toString(mCurrCollectionEvent)
585 << " != " << toString(CollectionEvent::PERIODIC) << " collection event";
586 }
587
588 mCustomCollection = {
589 .interval = interval,
590 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
591 .filterPackages = filterPackages,
592 .lastCollectionUptime = mHandlerLooper->now(),
593 .records = {},
594 };
595
596 mHandlerLooper->removeMessages(this);
597 nsecs_t uptime = mHandlerLooper->now() + maxDuration.count();
598 mHandlerLooper->sendMessageAtTime(uptime, this, SwitchEvent::END_CUSTOM_COLLECTION);
599 mCurrCollectionEvent = CollectionEvent::CUSTOM;
600 mHandlerLooper->sendMessage(this, CollectionEvent::CUSTOM);
601 return {};
602 }
603
endCustomCollection(int fd)604 Result<void> IoPerfCollection::endCustomCollection(int fd) {
605 Mutex::Autolock lock(mMutex);
606 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
607 return Error(INVALID_OPERATION) << "No custom collection is running";
608 }
609
610 mHandlerLooper->removeMessages(this);
611 mHandlerLooper->sendMessage(this, SwitchEvent::END_CUSTOM_COLLECTION);
612
613 const auto& ret = dumpCollectorsStatusLocked(fd);
614 if (!ret) {
615 return Error(FAILED_TRANSACTION) << ret.error();
616 }
617
618 if (!WriteStringToFd(StringPrintf("%sI/O performance data report for custom collection:\n%s",
619 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
620 fd) ||
621 !WriteStringToFd(toString(mCustomCollection), fd) ||
622 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
623 return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
624 }
625
626 return {};
627 }
628
handleMessage(const Message & message)629 void IoPerfCollection::handleMessage(const Message& message) {
630 Result<void> result;
631
632 switch (message.what) {
633 case static_cast<int>(CollectionEvent::BOOT_TIME):
634 result = processCollectionEvent(CollectionEvent::BOOT_TIME, &mBoottimeCollection);
635 break;
636 case static_cast<int>(SwitchEvent::END_BOOTTIME_COLLECTION):
637 result = processCollectionEvent(CollectionEvent::BOOT_TIME, &mBoottimeCollection);
638 if (result.ok()) {
639 mHandlerLooper->removeMessages(this);
640 mCurrCollectionEvent = CollectionEvent::PERIODIC;
641 mPeriodicCollection.lastCollectionUptime =
642 mHandlerLooper->now() + mPeriodicCollection.interval.count();
643 mHandlerLooper->sendMessageAtTime(mPeriodicCollection.lastCollectionUptime, this,
644 CollectionEvent::PERIODIC);
645 }
646 break;
647 case static_cast<int>(CollectionEvent::PERIODIC):
648 result = processCollectionEvent(CollectionEvent::PERIODIC, &mPeriodicCollection);
649 break;
650 case static_cast<int>(CollectionEvent::CUSTOM):
651 result = processCollectionEvent(CollectionEvent::CUSTOM, &mCustomCollection);
652 break;
653 case static_cast<int>(SwitchEvent::END_CUSTOM_COLLECTION): {
654 Mutex::Autolock lock(mMutex);
655 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
656 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
657 toString(mCurrCollectionEvent).c_str(),
658 toString(CollectionEvent::CUSTOM).c_str());
659 return;
660 }
661 mCustomCollection = {};
662 mHandlerLooper->removeMessages(this);
663 mCurrCollectionEvent = CollectionEvent::PERIODIC;
664 mPeriodicCollection.lastCollectionUptime = mHandlerLooper->now();
665 mHandlerLooper->sendMessage(this, CollectionEvent::PERIODIC);
666 return;
667 }
668 default:
669 result = Error() << "Unknown message: " << message.what;
670 }
671
672 if (!result.ok()) {
673 Mutex::Autolock lock(mMutex);
674 ALOGE("Terminating I/O performance data collection: %s", result.error().message().c_str());
675 // DO NOT CALL terminate() as it tries to join the collection thread but this code is
676 // executed on the collection thread. Thus it will result in a deadlock.
677 mCurrCollectionEvent = CollectionEvent::TERMINATED;
678 mHandlerLooper->removeMessages(this);
679 mHandlerLooper->wake();
680 }
681 }
682
processCollectionEvent(CollectionEvent event,CollectionInfo * info)683 Result<void> IoPerfCollection::processCollectionEvent(CollectionEvent event, CollectionInfo* info) {
684 Mutex::Autolock lock(mMutex);
685 // Messages sent to the looper are intrinsically racy such that a message from the previous
686 // collection event may land in the looper after the current collection has already begun. Thus
687 // verify the current collection event before starting the collection.
688 if (mCurrCollectionEvent != event) {
689 ALOGW("Skipping %s collection message on collection event %s", toString(event).c_str(),
690 toString(mCurrCollectionEvent).c_str());
691 return {};
692 }
693 if (info->maxCacheSize == 0) {
694 return Error() << "Maximum cache size for " << toString(event) << " collection cannot be 0";
695 }
696 if (info->interval < kMinCollectionInterval) {
697 return Error()
698 << "Collection interval of "
699 << std::chrono::duration_cast<std::chrono::seconds>(info->interval).count()
700 << " seconds for " << toString(event) << " collection cannot be less than "
701 << std::chrono::duration_cast<std::chrono::seconds>(kMinCollectionInterval).count()
702 << " seconds";
703 }
704 auto ret = collectLocked(info);
705 if (!ret) {
706 return Error() << toString(event) << " collection failed: " << ret.error();
707 }
708 info->lastCollectionUptime += info->interval.count();
709 mHandlerLooper->sendMessageAtTime(info->lastCollectionUptime, this, event);
710 return {};
711 }
712
collectLocked(CollectionInfo * collectionInfo)713 Result<void> IoPerfCollection::collectLocked(CollectionInfo* collectionInfo) {
714 if (!mUidIoStats->enabled() && !mProcStat->enabled() && !mProcPidStat->enabled()) {
715 return Error() << "No collectors enabled";
716 }
717 IoPerfRecord record{
718 .time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()),
719 };
720 auto ret = collectSystemIoPerfDataLocked(&record.systemIoPerfData);
721 if (!ret) {
722 return ret;
723 }
724 ret = collectProcessIoPerfDataLocked(*collectionInfo, &record.processIoPerfData);
725 if (!ret) {
726 return ret;
727 }
728 ret = collectUidIoPerfDataLocked(*collectionInfo, &record.uidIoPerfData);
729 if (!ret) {
730 return ret;
731 }
732 if (collectionInfo->records.size() > collectionInfo->maxCacheSize) {
733 collectionInfo->records.erase(collectionInfo->records.begin()); // Erase the oldest record.
734 }
735 collectionInfo->records.emplace_back(record);
736 return {};
737 }
738
collectUidIoPerfDataLocked(const CollectionInfo & collectionInfo,UidIoPerfData * uidIoPerfData)739 Result<void> IoPerfCollection::collectUidIoPerfDataLocked(const CollectionInfo& collectionInfo,
740 UidIoPerfData* uidIoPerfData) {
741 if (!mUidIoStats->enabled()) {
742 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
743 // collectors.
744 return {};
745 }
746
747 const Result<std::unordered_map<uint32_t, UidIoUsage>>& usage = mUidIoStats->collect();
748 if (!usage) {
749 return Error() << "Failed to collect uid I/O usage: " << usage.error();
750 }
751
752 // Fetch only the top N reads and writes from the usage records.
753 UidIoUsage tempUsage = {};
754 std::vector<const UidIoUsage*> topNReads(mTopNStatsPerCategory, &tempUsage);
755 std::vector<const UidIoUsage*> topNWrites(mTopNStatsPerCategory, &tempUsage);
756 std::unordered_set<uint32_t> unmappedUids;
757
758 for (const auto& uIt : *usage) {
759 const UidIoUsage& curUsage = uIt.second;
760 if (curUsage.ios.isZero()) {
761 continue;
762 }
763 if (mUidToPackageNameMapping.find(curUsage.uid) == mUidToPackageNameMapping.end()) {
764 unmappedUids.insert(curUsage.uid);
765 }
766 uidIoPerfData->total[READ_BYTES][FOREGROUND] +=
767 curUsage.ios.metrics[READ_BYTES][FOREGROUND];
768 uidIoPerfData->total[READ_BYTES][BACKGROUND] +=
769 curUsage.ios.metrics[READ_BYTES][BACKGROUND];
770 uidIoPerfData->total[WRITE_BYTES][FOREGROUND] +=
771 curUsage.ios.metrics[WRITE_BYTES][FOREGROUND];
772 uidIoPerfData->total[WRITE_BYTES][BACKGROUND] +=
773 curUsage.ios.metrics[WRITE_BYTES][BACKGROUND];
774 uidIoPerfData->total[FSYNC_COUNT][FOREGROUND] +=
775 curUsage.ios.metrics[FSYNC_COUNT][FOREGROUND];
776 uidIoPerfData->total[FSYNC_COUNT][BACKGROUND] +=
777 curUsage.ios.metrics[FSYNC_COUNT][BACKGROUND];
778
779 for (auto it = topNReads.begin(); it != topNReads.end(); ++it) {
780 const UidIoUsage* curRead = *it;
781 if (curRead->ios.sumReadBytes() < curUsage.ios.sumReadBytes()) {
782 topNReads.emplace(it, &curUsage);
783 if (collectionInfo.filterPackages.empty()) {
784 topNReads.pop_back();
785 }
786 break;
787 }
788 }
789 for (auto it = topNWrites.begin(); it != topNWrites.end(); ++it) {
790 const UidIoUsage* curWrite = *it;
791 if (curWrite->ios.sumWriteBytes() < curUsage.ios.sumWriteBytes()) {
792 topNWrites.emplace(it, &curUsage);
793 if (collectionInfo.filterPackages.empty()) {
794 topNWrites.pop_back();
795 }
796 break;
797 }
798 }
799 }
800
801 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
802 if (!ret) {
803 ALOGW("%s", ret.error().message().c_str());
804 }
805
806 // Convert the top N I/O usage to UidIoPerfData.
807 for (const auto& usage : topNReads) {
808 if (usage->ios.isZero()) {
809 // End of non-zero usage records. This case occurs when the number of UIDs with active
810 // I/O operations is < |ro.carwatchdog.top_n_stats_per_category|.
811 break;
812 }
813 UidIoPerfData::Stats stats = {
814 .userId = multiuser_get_user_id(usage->uid),
815 .packageName = std::to_string(usage->uid),
816 .bytes = {usage->ios.metrics[READ_BYTES][FOREGROUND],
817 usage->ios.metrics[READ_BYTES][BACKGROUND]},
818 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
819 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
820 };
821 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
822 stats.packageName = mUidToPackageNameMapping[usage->uid];
823 }
824 if (!collectionInfo.filterPackages.empty() &&
825 collectionInfo.filterPackages.find(stats.packageName) ==
826 collectionInfo.filterPackages.end()) {
827 continue;
828 }
829 uidIoPerfData->topNReads.emplace_back(stats);
830 }
831
832 for (const auto& usage : topNWrites) {
833 if (usage->ios.isZero()) {
834 // End of non-zero usage records. This case occurs when the number of UIDs with active
835 // I/O operations is < |ro.carwatchdog.top_n_stats_per_category|.
836 break;
837 }
838 UidIoPerfData::Stats stats = {
839 .userId = multiuser_get_user_id(usage->uid),
840 .packageName = std::to_string(usage->uid),
841 .bytes = {usage->ios.metrics[WRITE_BYTES][FOREGROUND],
842 usage->ios.metrics[WRITE_BYTES][BACKGROUND]},
843 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
844 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
845 };
846 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
847 stats.packageName = mUidToPackageNameMapping[usage->uid];
848 }
849 if (!collectionInfo.filterPackages.empty() &&
850 collectionInfo.filterPackages.find(stats.packageName) ==
851 collectionInfo.filterPackages.end()) {
852 continue;
853 }
854 uidIoPerfData->topNWrites.emplace_back(stats);
855 }
856 return {};
857 }
858
collectSystemIoPerfDataLocked(SystemIoPerfData * systemIoPerfData)859 Result<void> IoPerfCollection::collectSystemIoPerfDataLocked(SystemIoPerfData* systemIoPerfData) {
860 if (!mProcStat->enabled()) {
861 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
862 // collectors.
863 return {};
864 }
865
866 const Result<ProcStatInfo>& procStatInfo = mProcStat->collect();
867 if (!procStatInfo) {
868 return Error() << "Failed to collect proc stats: " << procStatInfo.error();
869 }
870
871 systemIoPerfData->cpuIoWaitTime = procStatInfo->cpuStats.ioWaitTime;
872 systemIoPerfData->totalCpuTime = procStatInfo->totalCpuTime();
873 systemIoPerfData->ioBlockedProcessesCnt = procStatInfo->ioBlockedProcessesCnt;
874 systemIoPerfData->totalProcessesCnt = procStatInfo->totalProcessesCnt();
875 return {};
876 }
877
collectProcessIoPerfDataLocked(const CollectionInfo & collectionInfo,ProcessIoPerfData * processIoPerfData)878 Result<void> IoPerfCollection::collectProcessIoPerfDataLocked(
879 const CollectionInfo& collectionInfo, ProcessIoPerfData* processIoPerfData) {
880 if (!mProcPidStat->enabled()) {
881 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
882 // collectors.
883 return {};
884 }
885
886 const Result<std::vector<ProcessStats>>& processStats = mProcPidStat->collect();
887 if (!processStats) {
888 return Error() << "Failed to collect process stats: " << processStats.error();
889 }
890
891 const auto& uidProcessStats = getUidProcessStats(*processStats, mTopNStatsPerSubcategory);
892 std::unordered_set<uint32_t> unmappedUids;
893 // Fetch only the top N I/O blocked UIDs and UIDs with most major page faults.
894 UidProcessStats temp = {};
895 std::vector<const UidProcessStats*> topNIoBlockedUids(mTopNStatsPerCategory, &temp);
896 std::vector<const UidProcessStats*> topNMajorFaultUids(mTopNStatsPerCategory, &temp);
897 processIoPerfData->totalMajorFaults = 0;
898 for (const auto& it : *uidProcessStats) {
899 const UidProcessStats& curStats = it.second;
900 if (mUidToPackageNameMapping.find(curStats.uid) == mUidToPackageNameMapping.end()) {
901 unmappedUids.insert(curStats.uid);
902 }
903 processIoPerfData->totalMajorFaults += curStats.majorFaults;
904 for (auto it = topNIoBlockedUids.begin(); it != topNIoBlockedUids.end(); ++it) {
905 const UidProcessStats* topStats = *it;
906 if (topStats->ioBlockedTasksCnt < curStats.ioBlockedTasksCnt) {
907 topNIoBlockedUids.emplace(it, &curStats);
908 if (collectionInfo.filterPackages.empty()) {
909 topNIoBlockedUids.pop_back();
910 }
911 break;
912 }
913 }
914 for (auto it = topNMajorFaultUids.begin(); it != topNMajorFaultUids.end(); ++it) {
915 const UidProcessStats* topStats = *it;
916 if (topStats->majorFaults < curStats.majorFaults) {
917 topNMajorFaultUids.emplace(it, &curStats);
918 if (collectionInfo.filterPackages.empty()) {
919 topNMajorFaultUids.pop_back();
920 }
921 break;
922 }
923 }
924 }
925
926 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
927 if (!ret) {
928 ALOGW("%s", ret.error().message().c_str());
929 }
930
931 // Convert the top N uid process stats to ProcessIoPerfData.
932 for (const auto& it : topNIoBlockedUids) {
933 if (it->ioBlockedTasksCnt == 0) {
934 // End of non-zero elements. This case occurs when the number of UIDs with I/O blocked
935 // processes is < |ro.carwatchdog.top_n_stats_per_category|.
936 break;
937 }
938 ProcessIoPerfData::UidStats stats = {
939 .userId = multiuser_get_user_id(it->uid),
940 .packageName = std::to_string(it->uid),
941 .count = it->ioBlockedTasksCnt,
942 };
943 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
944 stats.packageName = mUidToPackageNameMapping[it->uid];
945 }
946 if (!collectionInfo.filterPackages.empty() &&
947 collectionInfo.filterPackages.find(stats.packageName) ==
948 collectionInfo.filterPackages.end()) {
949 continue;
950 }
951 for (const auto& pIt : it->topNIoBlockedProcesses) {
952 if (pIt.count == 0) {
953 break;
954 }
955 stats.topNProcesses.emplace_back(
956 ProcessIoPerfData::UidStats::ProcessStats{pIt.comm, pIt.count});
957 }
958 processIoPerfData->topNIoBlockedUids.emplace_back(stats);
959 processIoPerfData->topNIoBlockedUidsTotalTaskCnt.emplace_back(it->totalTasksCnt);
960 }
961 for (const auto& it : topNMajorFaultUids) {
962 if (it->majorFaults == 0) {
963 // End of non-zero elements. This case occurs when the number of UIDs with major faults
964 // is < |ro.carwatchdog.top_n_stats_per_category|.
965 break;
966 }
967 ProcessIoPerfData::UidStats stats = {
968 .userId = multiuser_get_user_id(it->uid),
969 .packageName = std::to_string(it->uid),
970 .count = it->majorFaults,
971 };
972 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
973 stats.packageName = mUidToPackageNameMapping[it->uid];
974 }
975 if (!collectionInfo.filterPackages.empty() &&
976 collectionInfo.filterPackages.find(stats.packageName) ==
977 collectionInfo.filterPackages.end()) {
978 continue;
979 }
980 for (const auto& pIt : it->topNMajorFaultProcesses) {
981 if (pIt.count == 0) {
982 break;
983 }
984 stats.topNProcesses.emplace_back(
985 ProcessIoPerfData::UidStats::ProcessStats{pIt.comm, pIt.count});
986 }
987 processIoPerfData->topNMajorFaultUids.emplace_back(stats);
988 }
989 if (mLastMajorFaults == 0) {
990 processIoPerfData->majorFaultsPercentChange = 0;
991 } else {
992 int64_t increase = processIoPerfData->totalMajorFaults - mLastMajorFaults;
993 processIoPerfData->majorFaultsPercentChange =
994 (static_cast<double>(increase) / static_cast<double>(mLastMajorFaults)) * 100.0;
995 }
996 mLastMajorFaults = processIoPerfData->totalMajorFaults;
997 return {};
998 }
999
updateUidToPackageNameMapping(const std::unordered_set<uint32_t> & uids)1000 Result<void> IoPerfCollection::updateUidToPackageNameMapping(
1001 const std::unordered_set<uint32_t>& uids) {
1002 std::vector<int32_t> appUids;
1003
1004 for (const auto& uid : uids) {
1005 if (uid >= AID_APP_START) {
1006 appUids.emplace_back(static_cast<int32_t>(uid));
1007 continue;
1008 }
1009 // System/native UIDs.
1010 passwd* usrpwd = getpwuid(uid);
1011 if (!usrpwd) {
1012 continue;
1013 }
1014 mUidToPackageNameMapping[uid] = std::string(usrpwd->pw_name);
1015 }
1016
1017 if (appUids.empty()) {
1018 return {};
1019 }
1020
1021 if (mPackageManager == nullptr) {
1022 auto ret = retrievePackageManager();
1023 if (!ret) {
1024 return Error() << "Failed to retrieve package manager: " << ret.error();
1025 }
1026 }
1027
1028 std::vector<std::string> packageNames;
1029 const binder::Status& status = mPackageManager->getNamesForUids(appUids, &packageNames);
1030 if (!status.isOk()) {
1031 return Error() << "package_native::getNamesForUids failed: " << status.exceptionMessage();
1032 }
1033
1034 for (uint32_t i = 0; i < appUids.size(); i++) {
1035 if (!packageNames[i].empty()) {
1036 mUidToPackageNameMapping[appUids[i]] = packageNames[i];
1037 }
1038 }
1039
1040 return {};
1041 }
1042
retrievePackageManager()1043 Result<void> IoPerfCollection::retrievePackageManager() {
1044 const sp<IServiceManager> sm = defaultServiceManager();
1045 if (sm == nullptr) {
1046 return Error() << "Failed to retrieve defaultServiceManager";
1047 }
1048
1049 sp<IBinder> binder = sm->getService(String16("package_native"));
1050 if (binder == nullptr) {
1051 return Error() << "Failed to get service package_native";
1052 }
1053 mPackageManager = interface_cast<IPackageManagerNative>(binder);
1054 return {};
1055 }
1056
1057 } // namespace watchdog
1058 } // namespace automotive
1059 } // namespace android
1060