1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "TimerThread"
18 
19 #include <optional>
20 #include <sstream>
21 #include <unistd.h>
22 #include <vector>
23 
24 #include <audio_utils/mutex.h>
25 #include <mediautils/MediaUtilsDelayed.h>
26 #include <mediautils/TidWrapper.h>
27 #include <mediautils/TimerThread.h>
28 #include <utils/Log.h>
29 #include <utils/ThreadDefs.h>
30 
31 using namespace std::chrono_literals;
32 
33 namespace android::mediautils {
34 
35 extern std::string formatTime(std::chrono::system_clock::time_point t);
36 extern std::string_view timeSuffix(std::string_view time1, std::string_view time2);
37 
scheduleTask(std::string_view tag,TimerCallback && func,Duration timeoutDuration,Duration secondChanceDuration)38 TimerThread::Handle TimerThread::scheduleTask(
39         std::string_view tag, TimerCallback&& func,
40         Duration timeoutDuration, Duration secondChanceDuration) {
41     const auto now = std::chrono::system_clock::now();
42     auto request = std::make_shared<const Request>(now, now +
43             std::chrono::duration_cast<std::chrono::system_clock::duration>(timeoutDuration),
44             secondChanceDuration, getThreadIdWrapper(), tag);
45     return mMonitorThread.add(std::move(request), std::move(func), timeoutDuration);
46 }
47 
trackTask(std::string_view tag)48 TimerThread::Handle TimerThread::trackTask(std::string_view tag) {
49     const auto now = std::chrono::system_clock::now();
50     auto request = std::make_shared<const Request>(now, now,
51             Duration{} /* secondChanceDuration */, getThreadIdWrapper(), tag);
52     return mNoTimeoutMap.add(std::move(request));
53 }
54 
cancelTask(Handle handle)55 bool TimerThread::cancelTask(Handle handle) {
56     std::shared_ptr<const Request> request = isNoTimeoutHandle(handle) ?
57              mNoTimeoutMap.remove(handle) : mMonitorThread.remove(handle);
58     if (!request) return false;
59     mRetiredQueue.add(std::move(request));
60     return true;
61 }
62 
toString(bool showTimeoutStack) const63 std::string TimerThread::SnapshotAnalysis::toString(bool showTimeoutStack) const {
64     // Note: These request queues are snapshot very close together but
65     // not at "identical" times as we don't use a class-wide lock.
66     std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
67     std::string timeoutStack;
68     std::string blockedStack;
69     std::string mutexWaitChainStack;
70     if (showTimeoutStack && timeoutTid != -1) {
71         timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
72                 .append(std::to_string(timeoutTid)).append(") callstack [\n")
73                 .append(getCallStackStringForTid(timeoutTid)).append("]");
74     }
75 
76     if (suspectTid != -1 && suspectTid != timeoutTid) {
77         blockedStack = std::string("\nblocked(")
78                 .append(std::to_string(suspectTid)).append(")  callstack [\n")
79                 .append(getCallStackStringForTid(suspectTid)).append("]");
80     }
81 
82     if (!mutexWaitChain.empty()) {
83         mutexWaitChainStack.append("\nmutex wait chain [\n");
84         // the wait chain omits the initial timeout tid (which is good as we don't
85         // need to suppress it).
86         for (size_t i = 0; i < mutexWaitChain.size(); ++i) {
87             const auto& [tid, name] = mutexWaitChain[i];
88             mutexWaitChainStack.append("{ tid: ").append(std::to_string(tid))
89                     .append(" (holding ").append(name).append(")");
90             if (tid == timeoutTid) {
91                 mutexWaitChainStack.append(" TIMEOUT_STACK }\n");
92             } else if (tid == suspectTid) {
93                 mutexWaitChainStack.append(" BLOCKED_STACK }\n");
94             } else if (hasMutexCycle && i == mutexWaitChain.size() - 1) {
95                 // for a cycle, the last pid in the chain is repeated.
96                 mutexWaitChainStack.append(" CYCLE_STACK }\n");
97             } else {
98                 mutexWaitChainStack.append(" callstack [\n")
99                         .append(getCallStackStringForTid(tid)).append("] }\n");
100             }
101         }
102         mutexWaitChainStack.append("]");
103     }
104 
105     return std::string("now ")
106             .append(formatTime(std::chrono::system_clock::now()))
107             .append("\nsecondChanceCount ")
108             .append(std::to_string(secondChanceCount))
109             .append(analysisSummary)
110             .append("\ntimeout [ ")
111             .append(requestsToString(timeoutRequests))
112             .append(" ]\npending [ ")
113             .append(requestsToString(pendingRequests))
114             .append(" ]\nretired [ ")
115             .append(requestsToString(retiredRequests))
116             .append(" ]")
117             .append(timeoutStack)
118             .append(blockedStack)
119             .append(mutexWaitChainStack);
120 }
121 
122 // A HAL method is where the substring "Hidl" is in the class name.
123 // The tag should look like: ... Hidl ... :: ...
124 // When the audio HAL is updated to AIDL perhaps we will use instead
125 // a global directory of HAL classes.
126 //
127 // See MethodStatistics.cpp:
128 // mediautils::getStatisticsClassesForModule(METHOD_STATISTICS_MODULE_NAME_AUDIO_HIDL)
129 //
130 /* static */
isRequestFromHal(const std::shared_ptr<const Request> & request)131 bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request) {
132     for (const auto& s : {"Hidl", "Aidl"}) {
133         const auto& tagSV = request->tag.asStringView();
134         const size_t halStrPos = tagSV.find(s);
135         // should be a separator afterwards Hidl/Aidl which indicates the string was in the class.
136         if (halStrPos != std::string::npos && tagSV.find("::", halStrPos) != std::string::npos) {
137             return true;
138         }
139     }
140 
141     return false;
142 }
143 
getSnapshotAnalysis(size_t retiredCount) const144 struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
145     struct SnapshotAnalysis analysis{};
146     // The following snapshot of the TimerThread state will be utilized for
147     // analysis. Note, there is no lock around these calls, so there could be
148     // a state update between them.
149     mTimeoutQueue.copyRequests(analysis.timeoutRequests);
150     mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
151     analysis.pendingRequests = getPendingRequests();
152     analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
153     // No call has timed out, so there is no analysis to be done.
154     if (analysis.timeoutRequests.empty()) {
155         return analysis;
156     }
157 
158     // for now look at last timeout (in our case, the only timeout)
159     const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
160     analysis.timeoutTid = timeout->tid;
161 
162     std::string& description = analysis.description;
163 
164     // audio mutex specific wait chain analysis
165     auto deadlockInfo = audio_utils::mutex::deadlock_detection(analysis.timeoutTid);
166     ALOGD("%s: deadlockInfo: %s", __func__, deadlockInfo.to_string().c_str());
167 
168     if (!deadlockInfo.empty()) {
169         if (!description.empty()) description.append("\n");
170         description.append(deadlockInfo.to_string());
171     }
172 
173     analysis.hasMutexCycle = deadlockInfo.has_cycle;
174     analysis.mutexWaitChain = std::move(deadlockInfo.chain);
175 
176     // no pending requests, we don't attempt to use temporal correlation between a recent call.
177     if (analysis.pendingRequests.empty()) {
178         return analysis;
179     }
180 
181     // pending Requests that are problematic.
182     std::vector<std::shared_ptr<const Request>> pendingExact;
183     std::vector<std::shared_ptr<const Request>> pendingPossible;
184 
185     // We look at pending requests that were scheduled no later than kPendingDuration
186     // after the timeout request. This prevents false matches with calls
187     // that naturally block for a short period of time
188     // such as HAL write() and read().
189     //
190     constexpr Duration kPendingDuration = 1000ms;
191     for (const auto& pending : analysis.pendingRequests) {
192         // If the pending tid is the same as timeout tid, problem identified.
193         if (pending->tid == timeout->tid) {
194             pendingExact.emplace_back(pending);
195             continue;
196         }
197 
198         // if the pending tid is scheduled within time limit
199         if (pending->scheduled - timeout->scheduled < kPendingDuration) {
200             pendingPossible.emplace_back(pending);
201         }
202     }
203 
204     if (!pendingExact.empty()) {
205         const auto& request = pendingExact.front();
206         const bool hal = isRequestFromHal(request);
207 
208         if (hal) {
209             if (!description.empty()) description.append("\n");
210             description.append("Blocked directly due to HAL call: ")
211                 .append(request->toString());
212             analysis.suspectTid = request->tid;
213         }
214     }
215     if (description.empty() && !pendingPossible.empty()) {
216         for (const auto& request : pendingPossible) {
217             const bool hal = isRequestFromHal(request);
218             if (hal) {
219                 // The first blocked call is the most likely one.
220                 // Recent calls might be temporarily blocked
221                 // calls such as write() or read() depending on kDuration.
222                 description = std::string("Blocked possibly due to HAL call: ")
223                     .append(request->toString());
224                 analysis.suspectTid= request->tid;
225             }
226        }
227     }
228     return analysis;
229 }
230 
getPendingRequests() const231 std::vector<std::shared_ptr<const TimerThread::Request>> TimerThread::getPendingRequests() const {
232     constexpr size_t kEstimatedPendingRequests = 8;  // approx 128 byte alloc.
233     std::vector<std::shared_ptr<const Request>> pendingRequests;
234     pendingRequests.reserve(kEstimatedPendingRequests); // preallocate vector out of lock.
235 
236     // following are internally locked calls, which add to our local pendingRequests.
237     mMonitorThread.copyRequests(pendingRequests);
238     mNoTimeoutMap.copyRequests(pendingRequests);
239 
240     // Sort in order of scheduled time.
241     std::sort(pendingRequests.begin(), pendingRequests.end(),
242         [](const std::shared_ptr<const Request>& r1,
243            const std::shared_ptr<const Request>& r2) {
244                return r1->scheduled < r2->scheduled;
245            });
246     return pendingRequests;
247 }
248 
pendingToString() const249 std::string TimerThread::pendingToString() const {
250     return requestsToString(getPendingRequests());
251 }
252 
retiredToString(size_t n) const253 std::string TimerThread::retiredToString(size_t n) const {
254     std::vector<std::shared_ptr<const Request>> retiredRequests;
255     mRetiredQueue.copyRequests(retiredRequests, n);
256 
257     // Dump to string
258     return requestsToString(retiredRequests);
259 }
260 
timeoutToString(size_t n) const261 std::string TimerThread::timeoutToString(size_t n) const {
262     std::vector<std::shared_ptr<const Request>> timeoutRequests;
263     mTimeoutQueue.copyRequests(timeoutRequests, n);
264 
265     // Dump to string
266     return requestsToString(timeoutRequests);
267 }
268 
toString() const269 std::string TimerThread::Request::toString() const {
270     const auto scheduledString = formatTime(scheduled);
271     const auto deadlineString = formatTime(deadline);
272     return std::string(tag)
273         .append(" scheduled ").append(scheduledString)
274         .append(" deadline ").append(timeSuffix(scheduledString, deadlineString))
275         .append(" tid ").append(std::to_string(tid));
276 }
277 
add(std::shared_ptr<const Request> request)278 void TimerThread::RequestQueue::add(std::shared_ptr<const Request> request) {
279     std::lock_guard lg(mRQMutex);
280     mRequestQueue.emplace_back(std::chrono::system_clock::now(), std::move(request));
281     if (mRequestQueue.size() > mRequestQueueMax) {
282         mRequestQueue.pop_front();
283     }
284 }
285 
copyRequests(std::vector<std::shared_ptr<const Request>> & requests,size_t n) const286 void TimerThread::RequestQueue::copyRequests(
287         std::vector<std::shared_ptr<const Request>>& requests, size_t n) const {
288     std::lock_guard lg(mRQMutex);
289     const size_t size = mRequestQueue.size();
290     size_t i = n >=  size ? 0 : size - n;
291     for (; i < size; ++i) {
292         const auto &[time, request] = mRequestQueue[i];
293         requests.emplace_back(request);
294     }
295 }
296 
add(std::shared_ptr<const Request> request)297 TimerThread::Handle TimerThread::NoTimeoutMap::add(std::shared_ptr<const Request> request) {
298     std::lock_guard lg(mNTMutex);
299     // A unique handle is obtained by mNoTimeoutRequests.fetch_add(1),
300     // This need not be under a lock, but we do so anyhow.
301     const Handle handle = getUniqueHandle_l();
302     mMap[handle] = request;
303     return handle;
304 }
305 
remove(Handle handle)306 std::shared_ptr<const TimerThread::Request> TimerThread::NoTimeoutMap::remove(Handle handle) {
307     std::lock_guard lg(mNTMutex);
308     auto it = mMap.find(handle);
309     if (it == mMap.end()) return {};
310     auto request = it->second;
311     mMap.erase(it);
312     return request;
313 }
314 
copyRequests(std::vector<std::shared_ptr<const Request>> & requests) const315 void TimerThread::NoTimeoutMap::copyRequests(
316         std::vector<std::shared_ptr<const Request>>& requests) const {
317     std::lock_guard lg(mNTMutex);
318     for (const auto &[handle, request] : mMap) {
319         requests.emplace_back(request);
320     }
321 }
322 
MonitorThread(RequestQueue & timeoutQueue)323 TimerThread::MonitorThread::MonitorThread(RequestQueue& timeoutQueue)
324         : mTimeoutQueue(timeoutQueue)
325         , mThread([this] { threadFunc(); }) {
326      pthread_setname_np(mThread.native_handle(), "TimerThread");
327      pthread_setschedprio(mThread.native_handle(), PRIORITY_URGENT_AUDIO);
328 }
329 
~MonitorThread()330 TimerThread::MonitorThread::~MonitorThread() {
331     {
332         std::lock_guard _l(mMutex);
333         mShouldExit = true;
334         mCond.notify_all();
335     }
336     mThread.join();
337 }
338 
threadFunc()339 void TimerThread::MonitorThread::threadFunc() {
340     std::unique_lock _l(mMutex);
341     ::android::base::ScopedLockAssertion lock_assertion(mMutex);
342     while (!mShouldExit) {
343         Handle nextDeadline = INVALID_HANDLE;
344         Handle now = INVALID_HANDLE;
345         if (!mMonitorRequests.empty()) {
346             nextDeadline = mMonitorRequests.begin()->first;
347             now = std::chrono::steady_clock::now();
348             if (nextDeadline < now) {
349                 auto node = mMonitorRequests.extract(mMonitorRequests.begin());
350                 // Deadline has expired, handle the request.
351                 auto secondChanceDuration = node.mapped().first->secondChanceDuration;
352                 if (secondChanceDuration.count() != 0) {
353                     // We now apply the second chance duration to find the clock
354                     // monotonic second deadline.  The unique key is then the
355                     // pair<second_deadline, first_deadline>.
356                     //
357                     // The second chance prevents a false timeout should there be
358                     // any clock monotonic advancement during suspend.
359                     auto newHandle = now + secondChanceDuration;
360                     ALOGD("%s: TimeCheck second chance applied for %s",
361                             __func__, node.mapped().first->tag.c_str()); // should be rare event.
362                     mSecondChanceRequests.emplace_hint(mSecondChanceRequests.end(),
363                             std::make_pair(newHandle, nextDeadline),
364                             std::move(node.mapped()));
365                     // increment second chance counter.
366                     mSecondChanceCount.fetch_add(1 /* arg */, std::memory_order_relaxed);
367                 } else {
368                     {
369                         _l.unlock();
370                         // We add Request to retired queue early so that it can be dumped out.
371                         mTimeoutQueue.add(std::move(node.mapped().first));
372                         node.mapped().second(nextDeadline);
373                         // Caution: we don't hold lock when we call TimerCallback,
374                         // but this is the timeout case!  We will crash soon,
375                         // maybe before returning.
376                         // anything left over is released here outside lock.
377                     }
378                     // reacquire the lock - if something was added, we loop immediately to check.
379                     _l.lock();
380                 }
381                 // always process expiring monitor requests first.
382                 continue;
383             }
384         }
385         // now process any second chance requests.
386         if (!mSecondChanceRequests.empty()) {
387             Handle secondDeadline = mSecondChanceRequests.begin()->first.first;
388             if (now == INVALID_HANDLE) now = std::chrono::steady_clock::now();
389             if (secondDeadline < now) {
390                 auto node = mSecondChanceRequests.extract(mSecondChanceRequests.begin());
391                 {
392                     _l.unlock();
393                     // We add Request to retired queue early so that it can be dumped out.
394                     mTimeoutQueue.add(std::move(node.mapped().first));
395                     const Handle originalHandle = node.key().second;
396                     node.mapped().second(originalHandle);
397                     // Caution: we don't hold lock when we call TimerCallback.
398                     // This is benign issue - we permit concurrent operations
399                     // while in the callback to the MonitorQueue.
400                     //
401                     // Anything left over is released here outside lock.
402                 }
403                 // reacquire the lock - if something was added, we loop immediately to check.
404                 _l.lock();
405                 continue;
406             }
407             // update the deadline.
408             if (nextDeadline == INVALID_HANDLE) {
409                 nextDeadline = secondDeadline;
410             } else {
411                 nextDeadline = std::min(nextDeadline, secondDeadline);
412             }
413         }
414         if (nextDeadline != INVALID_HANDLE) {
415             mCond.wait_until(_l, nextDeadline);
416         } else {
417             mCond.wait(_l);
418         }
419     }
420 }
421 
add(std::shared_ptr<const Request> request,TimerCallback && func,Duration timeout)422 TimerThread::Handle TimerThread::MonitorThread::add(
423         std::shared_ptr<const Request> request, TimerCallback&& func, Duration timeout) {
424     std::lock_guard _l(mMutex);
425     const Handle handle = getUniqueHandle_l(timeout);
426     mMonitorRequests.emplace_hint(mMonitorRequests.end(),
427             handle, std::make_pair(std::move(request), std::move(func)));
428     mCond.notify_all();
429     return handle;
430 }
431 
remove(Handle handle)432 std::shared_ptr<const TimerThread::Request> TimerThread::MonitorThread::remove(Handle handle) {
433     std::pair<std::shared_ptr<const Request>, TimerCallback> data;
434     std::unique_lock ul(mMutex);
435     ::android::base::ScopedLockAssertion lock_assertion(mMutex);
436     if (const auto it = mMonitorRequests.find(handle);
437         it != mMonitorRequests.end()) {
438         data = std::move(it->second);
439         mMonitorRequests.erase(it);
440         ul.unlock();  // manually release lock here so func (data.second)
441                       // is released outside of lock.
442         return data.first;  // request
443     }
444 
445     // this check is O(N), but since the second chance requests are ordered
446     // in terms of earliest expiration time, we would expect better than average results.
447     for (auto it = mSecondChanceRequests.begin(); it != mSecondChanceRequests.end(); ++it) {
448         if (it->first.second == handle) {
449             data = std::move(it->second);
450             mSecondChanceRequests.erase(it);
451             ul.unlock();  // manually release lock here so func (data.second)
452                           // is released outside of lock.
453             return data.first; // request
454         }
455     }
456     return {};
457 }
458 
copyRequests(std::vector<std::shared_ptr<const Request>> & requests) const459 void TimerThread::MonitorThread::copyRequests(
460         std::vector<std::shared_ptr<const Request>>& requests) const {
461     std::lock_guard lg(mMutex);
462     for (const auto &[deadline, monitorpair] : mMonitorRequests) {
463         requests.emplace_back(monitorpair.first);
464     }
465     // we combine the second map with the first map - this is
466     // everything that is pending on the monitor thread.
467     // The second map will be older than the first map so this
468     // is in order.
469     for (const auto &[deadline, monitorpair] : mSecondChanceRequests) {
470         requests.emplace_back(monitorpair.first);
471     }
472 }
473 
474 }  // namespace android::mediautils
475