1 /*
2  * Copyright 2017, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
#include "config/ConfigKey.h"
#include "statslog.h"

#include <gtest/gtest_prod.h>
#include <log/log_time.h>

#include <climits>
#include <cstddef>
#include <cstdint>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
28 
29 namespace android {
30 namespace os {
31 namespace statsd {
32 
// Statistics recorded for a single config.
//
// Every scalar field has a default member initializer so that a default-initialized
// ConfigStats never exposes indeterminate values; the containers start out empty.
//
// NOTE(review): several comments below mention kMaxConfigCount, but no constant with that
// name is declared in this header -- confirm which limit actually applies.
struct ConfigStats {
    int32_t uid = 0;
    int64_t id = 0;
    int32_t creation_time_sec = 0;
    int32_t deletion_time_sec = 0;
    int32_t reset_time_sec = 0;
    int32_t metric_count = 0;
    int32_t condition_count = 0;
    int32_t matcher_count = 0;
    int32_t alert_count = 0;
    bool is_valid = false;

    std::list<int32_t> broadcast_sent_time_sec;

    // Times at which this config is activated.
    std::list<int32_t> activation_time_sec;

    // Times at which this config is deactivated.
    std::list<int32_t> deactivation_time_sec;

    std::list<int32_t> data_drop_time_sec;
    // Number of bytes dropped at corresponding time.
    std::list<int64_t> data_drop_bytes;
    // NOTE(review): presumably (time in seconds, report size in bytes) per dump report --
    // confirm against the code that appends to this list.
    std::list<std::pair<int32_t, int64_t>> dump_report_stats;

    // Stores how many times a matcher has been matched. The map size is capped by
    // kMaxConfigCount.
    std::map<const int64_t, int> matcher_stats;

    // Stores the number of output tuples of condition trackers when it's bigger than
    // kDimensionKeySizeSoftLimit. When you see the number is kDimensionKeySizeHardLimit + 1,
    // it means some data has been dropped. The map size is capped by kMaxConfigCount.
    std::map<const int64_t, int> condition_stats;

    // Stores the number of output tuples of metric producers when it's bigger than
    // kDimensionKeySizeSoftLimit. When you see the number is kDimensionKeySizeHardLimit + 1,
    // it means some data has been dropped. The map size is capped by kMaxConfigCount.
    std::map<const int64_t, int> metric_stats;

    // Stores the max number of output tuples of dimensions in condition across dimensions in
    // what when it's bigger than kDimensionKeySizeSoftLimit. When you see the number is
    // kDimensionKeySizeHardLimit + 1, it means some data has been dropped. The map size is
    // capped by kMaxConfigCount.
    std::map<const int64_t, int> metric_dimension_in_condition_stats;

    // Stores the number of times an anomaly detection alert has been declared.
    // The map size is capped by kMaxConfigCount.
    std::map<const int64_t, int> alert_stats;

    // Stores the config ID for each sub-config used.
    // NOTE(review): the element type matches the `annotations` argument of
    // StatsdStats::noteConfigReceived(); confirm that the description above is still accurate.
    std::list<std::pair<const int64_t, const int32_t>> annotations;
};
84 
// Counters describing the uid map, as reported through StatsdStats.
//
// All fields are zero-initialized so that an instance embedded in an object with automatic
// or dynamic storage never starts out with indeterminate counters.
struct UidMapStats {
    int32_t changes = 0;
    int32_t bytes_used = 0;
    int32_t dropped_changes = 0;
    int32_t deleted_apps = 0;
};
91 
92 // Keeps track of stats of statsd.
93 // Single instance shared across the process. All public methods are thread safe.
94 class StatsdStats {
95 public:
96     static StatsdStats& getInstance();
~StatsdStats()97     ~StatsdStats(){};
98 
99     const static int kDimensionKeySizeSoftLimit = 500;
100     const static int kDimensionKeySizeHardLimit = 800;
101 
102     // Per atom dimension key size limit
103     static const std::map<int, std::pair<size_t, size_t>> kAtomDimensionKeySizeLimitMap;
104 
105     const static int kMaxConfigCountPerUid = 10;
106     const static int kMaxAlertCountPerConfig = 100;
107     const static int kMaxConditionCountPerConfig = 300;
108     const static int kMaxMetricCountPerConfig = 1000;
109     const static int kMaxMatcherCountPerConfig = 800;
110 
111     // The max number of old config stats we keep.
112     const static int kMaxIceBoxSize = 20;
113 
114     const static int kMaxLoggerErrors = 20;
115 
116     const static int kMaxSystemServerRestarts = 20;
117 
118     const static int kMaxTimestampCount = 20;
119 
120     const static int kMaxLogSourceCount = 50;
121 
122     // Max memory allowed for storing metrics per configuration. If this limit is exceeded, statsd
123     // drops the metrics data in memory.
124     static const size_t kMaxMetricsBytesPerConfig = 2 * 1024 * 1024;
125 
126     // Soft memory limit per configuration. Once this limit is exceeded, we begin notifying the
127     // data subscriber that it's time to call getData.
128     static const size_t kBytesPerConfigTriggerGetData = 192 * 1024;
129 
130     // Cap the UID map's memory usage to this. This should be fairly high since the UID information
131     // is critical for understanding the metrics.
132     const static size_t kMaxBytesUsedUidMap = 50 * 1024;
133 
134     // The number of deleted apps that are stored in the uid map.
135     const static int kMaxDeletedAppsInUidMap = 100;
136 
137     /* Minimum period between two broadcasts in nanoseconds. */
138     static const int64_t kMinBroadcastPeriodNs = 60 * NS_PER_SEC;
139 
140     /* Min period between two checks of byte size per config key in nanoseconds. */
141     static const int64_t kMinByteSizeCheckPeriodNs = 60 * NS_PER_SEC;
142 
143     /* Minimum period between two activation broadcasts in nanoseconds. */
144     static const int64_t kMinActivationBroadcastPeriodNs = 10 * NS_PER_SEC;
145 
146     // Maximum age (30 days) that files on disk can exist in seconds.
147     static const int kMaxAgeSecond = 60 * 60 * 24 * 30;
148 
149     // Maximum age (2 days) that local history files on disk can exist in seconds.
150     static const int kMaxLocalHistoryAgeSecond = 60 * 60 * 24 * 2;
151 
152     // Maximum number of files (1000) that can be in stats directory on disk.
153     static const int kMaxFileNumber = 1000;
154 
155     // Maximum size of all files that can be written to stats directory on disk.
156     static const int kMaxFileSize = 50 * 1024 * 1024;
157 
158     // How long to try to clear puller cache from last time
159     static const long kPullerCacheClearIntervalSec = 1;
160 
161     // Max time to do a pull.
162     static const int64_t kPullMaxDelayNs = 10 * NS_PER_SEC;
163 
164     // Maximum number of pushed atoms statsd stats will track above kMaxPushedAtomId.
165     static const int kMaxNonPlatformPushedAtoms = 100;
166 
167     // Max platform atom tag number.
168     static const int32_t kMaxPlatformAtomTag = 100000;
169 
170     // Vendor pulled atom start id.
171     static const int32_t kVendorPulledAtomStartTag = 150000;
172 
173     // Beginning of range for timestamp truncation.
174     static const int32_t kTimestampTruncationStartTag = 300000;
175 
176     // End of range for timestamp truncation.
177     static const int32_t kTimestampTruncationEndTag = 304999;
178 
179     // Max accepted atom id.
180     static const int32_t kMaxAtomTag = 200000;
181 
182     static const int64_t kInt64Max = 0x7fffffffffffffffLL;
183 
184     /**
185      * Report a new config has been received and report the static stats about the config.
186      *
187      * The static stats include: the count of metrics, conditions, matchers, and alerts.
188      * If the config is not valid, this config stats will be put into icebox immediately.
189      */
190     void noteConfigReceived(const ConfigKey& key, int metricsCount, int conditionsCount,
191                             int matchersCount, int alertCount,
192                             const std::list<std::pair<const int64_t, const int32_t>>& annotations,
193                             bool isValid);
194     /**
195      * Report a config has been removed.
196      */
197     void noteConfigRemoved(const ConfigKey& key);
198     /**
199      * Report a config has been reset when ttl expires.
200      */
201     void noteConfigReset(const ConfigKey& key);
202 
203     /**
204      * Report a broadcast has been sent to a config owner to collect the data.
205      */
206     void noteBroadcastSent(const ConfigKey& key);
207 
208     /**
209      * Report that a config has become activated or deactivated.
210      * This can be different from whether or not a broadcast is sent if the
211      * guardrail prevented the broadcast from being sent.
212      */
213     void noteActiveStatusChanged(const ConfigKey& key, bool activate);
214 
215     /**
216      * Report a config's metrics data has been dropped.
217      */
218     void noteDataDropped(const ConfigKey& key, const size_t totalBytes);
219 
220     /**
221      * Report metrics data report has been sent.
222      *
223      * The report may be requested via StatsManager API, or through adb cmd.
224      */
225     void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes);
226 
227     /**
228      * Report the size of output tuple of a condition.
229      *
230      * Note: only report when the condition has an output dimension, and the tuple
231      * count > kDimensionKeySizeSoftLimit.
232      *
233      * [key]: The config key that this condition belongs to.
234      * [id]: The id of the condition.
235      * [size]: The output tuple size.
236      */
237     void noteConditionDimensionSize(const ConfigKey& key, const int64_t& id, int size);
238 
239     /**
240      * Report the size of output tuple of a metric.
241      *
242      * Note: only report when the metric has an output dimension, and the tuple
243      * count > kDimensionKeySizeSoftLimit.
244      *
245      * [key]: The config key that this metric belongs to.
246      * [id]: The id of the metric.
247      * [size]: The output tuple size.
248      */
249     void noteMetricDimensionSize(const ConfigKey& key, const int64_t& id, int size);
250 
251     /**
252      * Report the max size of output tuple of dimension in condition across dimensions in what.
253      *
254      * Note: only report when the metric has an output dimension in condition, and the max tuple
255      * count > kDimensionKeySizeSoftLimit.
256      *
257      * [key]: The config key that this metric belongs to.
258      * [id]: The id of the metric.
259      * [size]: The output tuple size.
260      */
261     void noteMetricDimensionInConditionSize(const ConfigKey& key, const int64_t& id, int size);
262 
263     /**
264      * Report a matcher has been matched.
265      *
266      * [key]: The config key that this matcher belongs to.
267      * [id]: The id of the matcher.
268      */
269     void noteMatcherMatched(const ConfigKey& key, const int64_t& id);
270 
271     /**
272      * Report that an anomaly detection alert has been declared.
273      *
274      * [key]: The config key that this alert belongs to.
275      * [id]: The id of the alert.
276      */
277     void noteAnomalyDeclared(const ConfigKey& key, const int64_t& id);
278 
279     /**
280      * Report an atom event has been logged.
281      */
282     void noteAtomLogged(int atomId, int32_t timeSec);
283 
284     /**
285      * Report that statsd modified the anomaly alarm registered with StatsCompanionService.
286      */
287     void noteRegisteredAnomalyAlarmChanged();
288 
289     /**
290      * Report that statsd modified the periodic alarm registered with StatsCompanionService.
291      */
292     void noteRegisteredPeriodicAlarmChanged();
293 
294     /**
295      * Records the number of delta entries that are being dropped from the uid map.
296      */
297     void noteUidMapDropped(int deltas);
298 
299     /**
300      * Records that an app was deleted (from statsd's map).
301      */
302     void noteUidMapAppDeletionDropped();
303 
304     /**
305      * Updates the number of changes currently stored in the uid map.
306      */
307     void setUidMapChanges(int changes);
308     void setCurrentUidMapMemory(int bytes);
309 
310     /*
311      * Updates minimum interval between pulls for an pulled atom.
312      */
313     void updateMinPullIntervalSec(int pullAtomId, long intervalSec);
314 
315     /*
316      * Notes an atom is pulled.
317      */
318     void notePull(int pullAtomId);
319 
320     /*
321      * Notes an atom is served from puller cache.
322      */
323     void notePullFromCache(int pullAtomId);
324 
325     /*
326      * Notify data error for pulled atom.
327      */
328     void notePullDataError(int pullAtomId);
329 
330     /*
331      * Records time for actual pulling, not including those served from cache and not including
332      * statsd processing delays.
333      */
334     void notePullTime(int pullAtomId, int64_t pullTimeNs);
335 
336     /*
337      * Records pull delay for a pulled atom, including those served from cache and including statsd
338      * processing delays.
339      */
340     void notePullDelay(int pullAtomId, int64_t pullDelayNs);
341 
342     /*
343      * Records pull exceeds timeout for the puller.
344      */
345     void notePullTimeout(int pullAtomId);
346 
347     /*
348      * Records pull exceeds max delay for a metric.
349      */
350     void notePullExceedMaxDelay(int pullAtomId);
351 
352     /*
353      * Records when system server restarts.
354      */
355     void noteSystemServerRestart(int32_t timeSec);
356 
357     /**
358      * Records statsd skipped an event.
359      */
360     void noteLogLost(int32_t wallClockTimeSec, int32_t count, int32_t lastError,
361                      int32_t lastAtomTag, int32_t uid, int32_t pid);
362 
363     /**
364      * Records that the pull of an atom has failed
365      */
366     void notePullFailed(int atomId);
367 
368     /**
369      * Records that the pull of StatsCompanionService atom has failed
370      */
371     void noteStatsCompanionPullFailed(int atomId);
372 
373     /**
374      * Records that the pull of a StatsCompanionService atom has failed due to a failed binder
375      * transaction. This can happen when StatsCompanionService returns too
376      * much data (the max Binder parcel size is 1MB)
377      */
378     void noteStatsCompanionPullBinderTransactionFailed(int atomId);
379 
380     /**
381      * A pull with no data occurred
382      */
383     void noteEmptyData(int atomId);
384 
385     /**
386      * Records that a puller callback for the given atomId was registered or unregistered.
387      *
388      * @param registered True if the callback was registered, false if was unregistered.
389      */
390     void notePullerCallbackRegistrationChanged(int atomId, bool registered);
391 
392     /**
393      * Hard limit was reached in the cardinality of an atom
394      */
395     void noteHardDimensionLimitReached(int64_t metricId);
396 
397     /**
398      * A log event was too late, arrived in the wrong bucket and was skipped
399      */
400     void noteLateLogEventSkipped(int64_t metricId);
401 
402     /**
403      * Buckets were skipped as time elapsed without any data for them
404      */
405     void noteSkippedForwardBuckets(int64_t metricId);
406 
407     /**
408      * An unsupported value type was received
409      */
410     void noteBadValueType(int64_t metricId);
411 
412     /**
413      * Buckets were dropped due to reclaim memory.
414      */
415     void noteBucketDropped(int64_t metricId);
416 
417     /**
418      * A condition change was too late, arrived in the wrong bucket and was skipped
419      */
420     void noteConditionChangeInNextBucket(int64_t metricId);
421 
422     /**
423      * A bucket has been tagged as invalid.
424      */
425     void noteInvalidatedBucket(int64_t metricId);
426 
427     /**
428      * Tracks the total number of buckets (include skipped/invalid buckets).
429      */
430     void noteBucketCount(int64_t metricId);
431 
432     /**
433      * For pulls at bucket boundaries, it represents the misalignment between the real timestamp and
434      * the end of the bucket.
435      */
436     void noteBucketBoundaryDelayNs(int64_t metricId, int64_t timeDelayNs);
437 
438     /**
439      * Number of buckets with unknown condition.
440      */
441     void noteBucketUnknownCondition(int64_t metricId);
442 
443     /* Reports one event has been dropped due to queue overflow, and the oldest event timestamp in
444      * the queue */
445     void noteEventQueueOverflow(int64_t oldestEventTimestampNs);
446 
447     /**
448      * Reports that the activation broadcast guardrail was hit for this uid. Namely, the broadcast
449      * should have been sent, but instead was skipped due to hitting the guardrail.
450      */
451      void noteActivationBroadcastGuardrailHit(const int uid);
452 
453     /**
454      * Reset the historical stats. Including all stats in icebox, and the tracked stats about
455      * metrics, matchers, and atoms. The active configs will be kept and StatsdStats will continue
456      * to collect stats after reset() has been called.
457      */
458     void reset();
459 
460     /**
461      * Output the stats in protobuf binary format to [buffer].
462      *
463      * [reset]: whether to clear the historical stats after the call.
464      */
465     void dumpStats(std::vector<uint8_t>* buffer, bool reset);
466 
467     /**
468      * Output statsd stats in human readable format to [out] file descriptor.
469      */
470     void dumpStats(int outFd) const;
471 
472     typedef struct {
473         long totalPull = 0;
474         long totalPullFromCache = 0;
475         long minPullIntervalSec = LONG_MAX;
476         int64_t avgPullTimeNs = 0;
477         int64_t maxPullTimeNs = 0;
478         long numPullTime = 0;
479         int64_t avgPullDelayNs = 0;
480         int64_t maxPullDelayNs = 0;
481         long numPullDelay = 0;
482         long dataError = 0;
483         long pullTimeout = 0;
484         long pullExceedMaxDelay = 0;
485         long pullFailed = 0;
486         long statsCompanionPullFailed = 0;
487         long statsCompanionPullBinderTransactionFailed = 0;
488         long emptyData = 0;
489         long registeredCount = 0;
490         long unregisteredCount = 0;
491     } PulledAtomStats;
492 
493     typedef struct {
494         long hardDimensionLimitReached = 0;
495         long lateLogEventSkipped = 0;
496         long skippedForwardBuckets = 0;
497         long badValueType = 0;
498         long conditionChangeInNextBucket = 0;
499         long invalidatedBucket = 0;
500         long bucketDropped = 0;
501         int64_t minBucketBoundaryDelayNs = 0;
502         int64_t maxBucketBoundaryDelayNs = 0;
503         long bucketUnknownCondition = 0;
504         long bucketCount = 0;
505     } AtomMetricStats;
506 
507 private:
508     StatsdStats();
509 
510     mutable std::mutex mLock;
511 
512     int32_t mStartTimeSec;
513 
514     // Track the number of dropped entries used by the uid map.
515     UidMapStats mUidMapStats;
516 
517     // The stats about the configs that are still in use.
518     // The map size is capped by kMaxConfigCount.
519     std::map<const ConfigKey, std::shared_ptr<ConfigStats>> mConfigStats;
520 
521     // Stores the stats for the configs that are no longer in use.
522     // The size of the vector is capped by kMaxIceBoxSize.
523     std::list<const std::shared_ptr<ConfigStats>> mIceBox;
524 
525     // Stores the number of times a pushed atom is logged.
526     // The size of the vector is the largest pushed atom id in atoms.proto + 1. Atoms
527     // out of that range will be put in mNonPlatformPushedAtomStats.
528     // This is a vector, not a map because it will be accessed A LOT -- for each stats log.
529     std::vector<int> mPushedAtomStats;
530 
531     // Stores the number of times a pushed atom is logged for atom ids above kMaxPushedAtomId.
532     // The max size of the map is kMaxNonPlatformPushedAtoms.
533     std::unordered_map<int, int> mNonPlatformPushedAtomStats;
534 
535     // Maps PullAtomId to its stats. The size is capped by the puller atom counts.
536     std::map<int, PulledAtomStats> mPulledAtomStats;
537 
538     // Maps metric ID to its stats. The size is capped by the number of metrics.
539     std::map<int64_t, AtomMetricStats> mAtomMetricStats;
540 
541     // Maps uids to times when the activation changed broadcast not sent due to hitting the
542     // guardrail. The size is capped by the number of configs, and up to 20 times per uid.
543     std::map<int, std::list<int32_t>> mActivationBroadcastGuardrailStats;
544 
545     struct LogLossStats {
LogLossStatsLogLossStats546         LogLossStats(int32_t sec, int32_t count, int32_t error, int32_t tag, int32_t uid,
547                      int32_t pid)
548             : mWallClockSec(sec),
549               mCount(count),
550               mLastError(error),
551               mLastTag(tag),
552               mUid(uid),
553               mPid(pid) {
554         }
555         int32_t mWallClockSec;
556         int32_t mCount;
557         // error code defined in linux/errno.h
558         int32_t mLastError;
559         int32_t mLastTag;
560         int32_t mUid;
561         int32_t mPid;
562     };
563 
564     // Max of {(now - oldestEventTimestamp) when overflow happens}.
565     // This number is helpful to understand how SLOW statsd can be.
566     int64_t mMaxQueueHistoryNs = 0;
567 
568     // Min of {(now - oldestEventTimestamp) when overflow happens}.
569     // This number is helpful to understand how FAST the events floods to statsd.
570     int64_t mMinQueueHistoryNs = kInt64Max;
571 
572     // Total number of events that are lost due to queue overflow.
573     int32_t mOverflowCount = 0;
574 
575     // Timestamps when we detect log loss, and the number of logs lost.
576     std::list<LogLossStats> mLogLossStats;
577 
578     std::list<int32_t> mSystemServerRestartSec;
579 
580     // Stores the number of times statsd modified the anomaly alarm registered with
581     // StatsCompanionService.
582     int mAnomalyAlarmRegisteredStats = 0;
583 
584     // Stores the number of times statsd registers the periodic alarm changes
585     int mPeriodicAlarmRegisteredStats = 0;
586 
587     void noteConfigResetInternalLocked(const ConfigKey& key);
588 
589     void noteConfigRemovedInternalLocked(const ConfigKey& key);
590 
591     void resetInternalLocked();
592 
593     void noteDataDropped(const ConfigKey& key, const size_t totalBytes, int32_t timeSec);
594 
595     void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes, int32_t timeSec);
596 
597     void noteBroadcastSent(const ConfigKey& key, int32_t timeSec);
598 
599     void noteActiveStatusChanged(const ConfigKey& key, bool activate, int32_t timeSec);
600 
601     void noteActivationBroadcastGuardrailHit(const int uid, int32_t timeSec);
602 
603     void addToIceBoxLocked(std::shared_ptr<ConfigStats>& stats);
604 
605     /**
606      * Get a reference to AtomMetricStats for a metric. If none exists, create it. The reference
607      * will live as long as `this`.
608      */
609     StatsdStats::AtomMetricStats& getAtomMetricStats(int64_t metricId);
610 
611     FRIEND_TEST(StatsdStatsTest, TestValidConfigAdd);
612     FRIEND_TEST(StatsdStatsTest, TestInvalidConfigAdd);
613     FRIEND_TEST(StatsdStatsTest, TestConfigRemove);
614     FRIEND_TEST(StatsdStatsTest, TestSubStats);
615     FRIEND_TEST(StatsdStatsTest, TestAtomLog);
616     FRIEND_TEST(StatsdStatsTest, TestNonPlatformAtomLog);
617     FRIEND_TEST(StatsdStatsTest, TestTimestampThreshold);
618     FRIEND_TEST(StatsdStatsTest, TestAnomalyMonitor);
619     FRIEND_TEST(StatsdStatsTest, TestSystemServerCrash);
620     FRIEND_TEST(StatsdStatsTest, TestPullAtomStats);
621     FRIEND_TEST(StatsdStatsTest, TestAtomMetricsStats);
622     FRIEND_TEST(StatsdStatsTest, TestActivationBroadcastGuardrailHit);
623 };
624 
625 }  // namespace statsd
626 }  // namespace os
627 }  // namespace android
628