1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <csignal>
18 #include "mediautils/TimerThread.h"
19 #define LOG_TAG "TimeCheck"
20 
21 #include <optional>
22 
23 #include <android-base/logging.h>
24 #include <android-base/strings.h>
25 #include <audio_utils/clock.h>
26 #include <mediautils/EventLog.h>
27 #include <mediautils/FixedString.h>
28 #include <mediautils/MethodStatistics.h>
29 #include <mediautils/TimeCheck.h>
30 #include <mediautils/TidWrapper.h>
31 #include <utils/Log.h>
32 
33 #if defined(__ANDROID__)
34 #include "debuggerd/handler.h"
35 #endif
36 
37 
38 namespace android::mediautils {
39 // This function appropriately signals a pid to dump a backtrace if we are
40 // running on device (and the HAL exists). If we are not running on an Android
41 // device, there is no HAL to signal (so we do nothing).
signalAudioHAL(pid_t pid)42 static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
43 #if defined(__ANDROID__)
44     sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
45 #endif
46 }
47 
48 /**
49  * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
50  */
formatTime(std::chrono::system_clock::time_point t)51 std::string formatTime(std::chrono::system_clock::time_point t) {
52     auto time_string = audio_utils_time_string_from_ns(
53             std::chrono::nanoseconds(t.time_since_epoch()).count());
54 
55     // The time string is 19 characters (including null termination).
56     // Example: "03-27 16:47:06.187"
57     //           MM DD HH MM SS MS
58     // We offset by 6 to get HH:MM:SS.MSc
59     //
60     return time_string.time + 6; // offset to remove month/day.
61 }
62 
63 /**
64  * Finds the end of the common time prefix.
65  *
66  * This is as an option to remove the common time prefix to avoid
67  * unnecessary duplicated strings.
68  *
69  * \param time1 a time string
70  * \param time2 a time string
71  * \return      the position where the common time prefix ends. For abbreviated
72  *              printing of time2, offset the character pointer by this position.
73  */
commonTimePrefixPosition(std::string_view time1,std::string_view time2)74 static size_t commonTimePrefixPosition(std::string_view time1, std::string_view time2) {
75     const size_t endPos = std::min(time1.size(), time2.size());
76     size_t i;
77 
78     // Find location of the first mismatch between strings
79     for (i = 0; ; ++i) {
80         if (i == endPos) {
81             return i; // strings match completely to the length of one of the strings.
82         }
83         if (time1[i] != time2[i]) {
84             break;
85         }
86         if (time1[i] == '\0') {
87             return i; // "printed" strings match completely.  No need to check further.
88         }
89     }
90 
91     // Go backwards until we find a delimeter or space.
92     for (; i > 0
93            && isdigit(time1[i]) // still a number
94            && time1[i - 1] != ' '
95          ; --i) {
96     }
97     return i;
98 }
99 
100 /**
101  * Returns the unique suffix of time2 that isn't present in time1.
102  *
103  * If time2 is identical to time1, then an empty string_view is returned.
104  * This method is used to elide the common prefix when printing times.
105  */
timeSuffix(std::string_view time1,std::string_view time2)106 std::string_view timeSuffix(std::string_view time1, std::string_view time2) {
107     const size_t pos = commonTimePrefixPosition(time1, time2);
108     return time2.substr(pos);
109 }
110 
111 // Audio HAL server pids vector used to generate audio HAL processes tombstone
112 // when audioserver watchdog triggers.
113 // We use a lockless storage to avoid potential deadlocks in the context of watchdog
114 // trigger.
115 // Protection again simultaneous writes is not needed given one update takes place
116 // during AudioFlinger construction and other comes necessarily later once the IAudioFlinger
117 // interface is available.
118 // The use of an atomic index just guaranties that current vector is fully initialized
119 // when read.
120 /* static */
accessAudioHalPids(std::vector<pid_t> * pids,bool update)121 void TimeCheck::accessAudioHalPids(std::vector<pid_t>* pids, bool update) {
122     static constexpr int kNumAudioHalPidsVectors = 3;
123     static std::vector<pid_t> audioHalPids[kNumAudioHalPidsVectors];
124     static std::atomic<unsigned> curAudioHalPids = 0;
125 
126     if (update) {
127         audioHalPids[(curAudioHalPids++ + 1) % kNumAudioHalPidsVectors] = *pids;
128     } else {
129         *pids = audioHalPids[curAudioHalPids % kNumAudioHalPidsVectors];
130     }
131 }
132 
133 /* static */
setAudioHalPids(const std::vector<pid_t> & pids)134 void TimeCheck::setAudioHalPids(const std::vector<pid_t>& pids) {
135     accessAudioHalPids(&(const_cast<std::vector<pid_t>&>(pids)), true);
136 }
137 
138 /* static */
getAudioHalPids()139 std::vector<pid_t> TimeCheck::getAudioHalPids() {
140     std::vector<pid_t> pids;
141     accessAudioHalPids(&pids, false);
142     return pids;
143 }
144 
145 /* static */
getTimeCheckThread()146 TimerThread& TimeCheck::getTimeCheckThread() {
147     static TimerThread sTimeCheckThread{};
148     return sTimeCheckThread;
149 }
150 
151 /* static */
toString()152 std::string TimeCheck::toString() {
153     // note pending and retired are individually locked for maximum concurrency,
154     // snapshot is not instantaneous at a single time.
155     return getTimeCheckThread().getSnapshotAnalysis().toString();
156 }
157 
TimeCheck(std::string_view tag,OnTimerFunc && onTimer,Duration requestedTimeoutDuration,Duration secondChanceDuration,bool crashOnTimeout)158 TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
159         Duration secondChanceDuration, bool crashOnTimeout)
160     : mTimeCheckHandler{ std::make_shared<TimeCheckHandler>(
161             tag, std::move(onTimer), crashOnTimeout, requestedTimeoutDuration,
162             secondChanceDuration, std::chrono::system_clock::now(), getThreadIdWrapper()) }
163     , mTimerHandle(requestedTimeoutDuration.count() == 0
164               /* for TimeCheck we don't consider a non-zero secondChanceDuration here */
165               ? getTimeCheckThread().trackTask(mTimeCheckHandler->tag)
166               : getTimeCheckThread().scheduleTask(
167                       mTimeCheckHandler->tag,
168                       // Pass in all the arguments by value to this task for safety.
169                       // The thread could call the callback before the constructor is finished.
170                       // The destructor is not blocked on callback.
__anon6842e7020102(TimerThread::Handle timerHandle) 171                       [ timeCheckHandler = mTimeCheckHandler ](TimerThread::Handle timerHandle) {
172                           timeCheckHandler->onTimeout(timerHandle);
173                       },
174                       requestedTimeoutDuration,
175                       secondChanceDuration)) {}
176 
~TimeCheck()177 TimeCheck::~TimeCheck() {
178     if (mTimeCheckHandler) {
179         mTimeCheckHandler->onCancel(mTimerHandle);
180     }
181 }
182 
183 /* static */
analyzeTimeouts(float requestedTimeoutMs,float elapsedSteadyMs,float elapsedSystemMs)184 std::string TimeCheck::analyzeTimeouts(
185         float requestedTimeoutMs, float elapsedSteadyMs, float elapsedSystemMs) {
186     // Track any OS clock issues with suspend.
187     // It is possible that the elapsedSystemMs is much greater than elapsedSteadyMs if
188     // a suspend occurs; however, we always expect the timeout ms should always be slightly
189     // less than the elapsed steady ms regardless of whether a suspend occurs or not.
190 
191     std::string s("Timeout ms ");
192     s.append(std::to_string(requestedTimeoutMs))
193         .append(" elapsed steady ms ").append(std::to_string(elapsedSteadyMs))
194         .append(" elapsed system ms ").append(std::to_string(elapsedSystemMs));
195 
196     // Is there something unusual?
197     static constexpr float TOLERANCE_CONTEXT_SWITCH_MS = 200.f;
198 
199     if (requestedTimeoutMs > elapsedSteadyMs || requestedTimeoutMs > elapsedSystemMs) {
200         s.append("\nError: early expiration - "
201                 "requestedTimeoutMs should be less than elapsed time");
202     }
203 
204     if (elapsedSteadyMs > elapsedSystemMs + TOLERANCE_CONTEXT_SWITCH_MS) {
205         s.append("\nWarning: steady time should not advance faster than system time");
206     }
207 
208     // This has been found in suspend stress testing.
209     if (elapsedSteadyMs > requestedTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
210         s.append("\nWarning: steady time significantly exceeds timeout "
211                 "- possible thread stall or aborted suspend");
212     }
213 
214     // This has been found in suspend stress testing.
215     if (elapsedSystemMs > requestedTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
216         s.append("\nInformation: system time significantly exceeds timeout "
217                 "- possible suspend");
218     }
219     return s;
220 }
221 
222 // To avoid any potential race conditions, the timer handle
223 // (expiration = clock steady start + timeout) is passed into the callback.
onCancel(TimerThread::Handle timerHandle) const224 void TimeCheck::TimeCheckHandler::onCancel(TimerThread::Handle timerHandle) const
225 {
226     if (TimeCheck::getTimeCheckThread().cancelTask(timerHandle) && onTimer) {
227         const std::chrono::steady_clock::time_point endSteadyTime =
228                 std::chrono::steady_clock::now();
229         const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
230                 endSteadyTime - timerHandle + timeoutDuration).count();
231         // send the elapsed steady time for statistics.
232         onTimer(false /* timeout */, elapsedSteadyMs);
233     }
234 }
235 
236 // To avoid any potential race conditions, the timer handle
237 // (expiration = clock steady start + timeout) is passed into the callback.
onTimeout(TimerThread::Handle timerHandle) const238 void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) const
239 {
240     const std::chrono::steady_clock::time_point endSteadyTime = std::chrono::steady_clock::now();
241     const std::chrono::system_clock::time_point endSystemTime = std::chrono::system_clock::now();
242     // timerHandle incorporates the timeout
243     const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
244             endSteadyTime - (timerHandle - timeoutDuration)).count();
245     const float elapsedSystemMs = std::chrono::duration_cast<FloatMs>(
246             endSystemTime - startSystemTime).count();
247     const float requestedTimeoutMs = std::chrono::duration_cast<FloatMs>(
248             timeoutDuration).count();
249     const float secondChanceMs = std::chrono::duration_cast<FloatMs>(
250             secondChanceDuration).count();
251 
252     if (onTimer) {
253         onTimer(true /* timeout */, elapsedSteadyMs);
254     }
255 
256     if (!crashOnTimeout) return;
257 
258     // Generate the TimerThread summary string early before sending signals to the
259     // HAL processes which can affect thread behavior.
260     const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
261 
262     // Generate audio HAL processes tombstones and allow time to complete
263     // before forcing restart
264     std::vector<pid_t> pids = TimeCheck::getAudioHalPids();
265     std::string halPids = "HAL pids [ ";
266     if (pids.size() != 0) {
267         for (const auto& pid : pids) {
268             ALOGI("requesting tombstone for pid: %d", pid);
269             halPids.append(std::to_string(pid)).append(" ");
270             signalAudioHAL(pid);
271         }
272         sleep(1);
273     } else {
274         ALOGI("No HAL process pid available, skipping tombstones");
275     }
276     halPids.append("]");
277 
278     LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
279 
280     // Create abort message string - caution: this can be very large.
281     const std::string abortMessage = std::string("TimeCheck timeout for ")
282             .append(tag)
283             .append(" scheduled ").append(formatTime(startSystemTime))
284             .append(" on thread ").append(std::to_string(tid)).append("\n")
285             .append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
286                     elapsedSteadyMs, elapsedSystemMs)).append("\n")
287             .append(halPids).append("\n")
288             .append(snapshotAnalysis.toString());
289 
290     // In many cases, the initial timeout stack differs from the abort backtrace because
291     // (1) the time difference between initial timeout and the final abort signal
292     // and (2) signalling the HAL audio service may cause
293     // the thread to unblock and continue.
294 
295     // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
296     // Log message text may be truncated to less than an
297     // implementation-specific limit (1023 bytes).
298     //
299     // Here, we send the string through android-base/logging.h LOG()
300     // to avoid the size limitation. LOG(FATAL) does an abort whereas
301     // LOG(FATAL_WITHOUT_ABORT) does not abort.
302 
303     static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
304     pid_t tidToAbort = invalidPid;
305     if (snapshotAnalysis.suspectTid != invalidPid) {
306         tidToAbort = snapshotAnalysis.suspectTid;
307     } else if (snapshotAnalysis.timeoutTid != invalidPid) {
308         tidToAbort = snapshotAnalysis.timeoutTid;
309     }
310 
311     LOG(FATAL_WITHOUT_ABORT) << abortMessage;
312     const auto ret = abortTid(tidToAbort);
313     if (ret < 0) {
314         LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
315                        "errno: " << errno << base::ErrnoNumberAsString(errno);
316     }
317 }
318 
319 // Automatically create a TimeCheck class for a class and method.
320 // This is used for Audio HAL support.
makeTimeCheckStatsForClassMethod(std::string_view className,std::string_view methodName)321 mediautils::TimeCheck makeTimeCheckStatsForClassMethod(
322         std::string_view className, std::string_view methodName) {
323     std::shared_ptr<MethodStatistics<std::string>> statistics =
324             mediautils::getStatisticsForClass(className);
325     if (!statistics) return {}; // empty TimeCheck.
326     return mediautils::TimeCheck(
327             FixedString62(className).append("::").append(methodName),
328             [ safeMethodName = FixedString30(methodName),
329               stats = std::move(statistics) ]
330             (bool timeout, float elapsedMs) {
331                     if (timeout) {
332                         ; // ignored, there is no timeout value.
333                     } else {
334                         stats->event(safeMethodName.asStringView(), elapsedMs);
335                     }
336             }, {} /* timeoutDuration */, {} /* secondChanceDuration */, false /* crashOnTimeout */);
337 }
338 
339 }  // namespace android::mediautils
340