1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "thread_list.h"
18 
19 #include <dirent.h>
20 #include <sys/types.h>
21 #include <unistd.h>
22 
23 #include <sstream>
24 #include <vector>
25 
26 #include "android-base/stringprintf.h"
27 #include "backtrace/BacktraceMap.h"
28 #include "nativehelper/scoped_local_ref.h"
29 #include "nativehelper/scoped_utf_chars.h"
30 
31 #include "base/aborting.h"
32 #include "base/histogram-inl.h"
33 #include "base/mutex-inl.h"
34 #include "base/systrace.h"
35 #include "base/time_utils.h"
36 #include "base/timing_logger.h"
37 #include "debugger.h"
38 #include "gc/collector/concurrent_copying.h"
39 #include "gc/gc_pause_listener.h"
40 #include "gc/heap.h"
41 #include "gc/reference_processor.h"
42 #include "gc_root.h"
43 #include "jni_internal.h"
44 #include "lock_word.h"
45 #include "monitor.h"
46 #include "native_stack_dump.h"
47 #include "scoped_thread_state_change-inl.h"
48 #include "thread.h"
49 #include "trace.h"
50 #include "well_known_classes.h"
51 
52 #if ART_USE_FUTEXES
53 #include "linux/futex.h"
54 #include "sys/syscall.h"
55 #ifndef SYS_futex
56 #define SYS_futex __NR_futex
57 #endif
58 #endif  // ART_USE_FUTEXES
59 
60 namespace art {
61 
62 using android::base::StringPrintf;
63 
64 static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5);
65 // Use 0 since we want to yield to prevent blocking for an unpredictable amount of time.
66 static constexpr useconds_t kThreadSuspendInitialSleepUs = 0;
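// Once an individual thread suspension has yielded for kThreadSuspendMaxYieldUs in total, switch to
// usleep()-based waits, doubling the sleep each iteration up to kThreadSuspendMaxSleepUs.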
67 static constexpr useconds_t kThreadSuspendMaxYieldUs = 3000;
68 static constexpr useconds_t kThreadSuspendMaxSleepUs = 5000;
69 
70 // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for
71 // some history.
72 static constexpr bool kDumpUnattachedThreadNativeStackForSigQuit = true;
73 
74 ThreadList::ThreadList(uint64_t thread_suspend_timeout_ns)
75     : suspend_all_count_(0),
76       debug_suspend_all_count_(0),
77       unregistering_count_(0),
78       suspend_all_historam_("suspend all histogram", 16, 64),
79       long_suspend_(false),
80       shut_down_(false),
81       thread_suspend_timeout_ns_(thread_suspend_timeout_ns),
82       empty_checkpoint_barrier_(new Barrier(0)) {
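  // Sanity-check that the largest possible thread id still encodes to a valid thin lock word.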
83   CHECK(Monitor::IsValidLockWord(LockWord::FromThinLockId(kMaxThreadId, 1, 0U)));
84 }
85 
86 ThreadList::~ThreadList() {
87   CHECK(shut_down_);
88 }
89 
90 void ThreadList::ShutDown() {
91   ScopedTrace trace(__PRETTY_FUNCTION__);
92   // Detach the current thread if necessary. If we failed to start, there might not be any threads.
93   // We need to detach the current thread here in case there's another thread waiting to join with
94   // us.
95   bool contains = false;
96   Thread* self = Thread::Current();
97   {
98     MutexLock mu(self, *Locks::thread_list_lock_);
99     contains = Contains(self);
100   }
101   if (contains) {
102     Runtime::Current()->DetachCurrentThread();
103   }
104   WaitForOtherNonDaemonThreadsToExit();
105   // Disable GC and wait for GC to complete in case there are still daemon threads doing
106   // allocations.
107   gc::Heap* const heap = Runtime::Current()->GetHeap();
108   heap->DisableGCForShutdown();
109   // In case a GC is in progress, wait for it to finish.
110   heap->WaitForGcToComplete(gc::kGcCauseBackground, Thread::Current());
111   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
112   //       Thread::Init.
113   SuspendAllDaemonThreadsForShutdown();
114 
115   shut_down_ = true;
116 }
117 
118 bool ThreadList::Contains(Thread* thread) {
119   return find(list_.begin(), list_.end(), thread) != list_.end();
120 }
121 
122 bool ThreadList::Contains(pid_t tid) {
123   for (const auto& thread : list_) {
124     if (thread->GetTid() == tid) {
125       return true;
126     }
127   }
128   return false;
129 }
130 
131 pid_t ThreadList::GetLockOwner() {
132   return Locks::thread_list_lock_->GetExclusiveOwnerTid();
133 }
134 
135 void ThreadList::DumpNativeStacks(std::ostream& os) {
136   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
137   std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid()));
138   for (const auto& thread : list_) {
139     os << "DUMPING THREAD " << thread->GetTid() << "\n";
140     DumpNativeStack(os, thread->GetTid(), map.get(), "\t");
141     os << "\n";
142   }
143 }
144 
145 void ThreadList::DumpForSigQuit(std::ostream& os) {
146   {
147     ScopedObjectAccess soa(Thread::Current());
148     // Only print if we have samples.
149     if (suspend_all_historam_.SampleSize() > 0) {
150       Histogram<uint64_t>::CumulativeData data;
151       suspend_all_historam_.CreateHistogram(&data);
152       suspend_all_historam_.PrintConfidenceIntervals(os, 0.99, data);  // Dump time to suspend.
153     }
154   }
155   bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit();
156   Dump(os, dump_native_stack);
157   DumpUnattachedThreads(os, dump_native_stack && kDumpUnattachedThreadNativeStackForSigQuit);
158 }
159 
160 static void DumpUnattachedThread(std::ostream& os, pid_t tid, bool dump_native_stack)
161     NO_THREAD_SAFETY_ANALYSIS {
162   // TODO: No thread safety analysis as DumpState with a null thread won't access fields, should
163   // refactor DumpState to avoid skipping analysis.
164   Thread::DumpState(os, nullptr, tid);
165   DumpKernelStack(os, tid, "  kernel: ", false);
166   if (dump_native_stack) {
167     DumpNativeStack(os, tid, nullptr, "  native: ");
168   }
169   os << std::endl;
170 }
171 
172 void ThreadList::DumpUnattachedThreads(std::ostream& os, bool dump_native_stack) {
173   DIR* d = opendir("/proc/self/task");
174   if (!d) {
175     return;
176   }
177 
178   Thread* self = Thread::Current();
179   dirent* e;
180   while ((e = readdir(d)) != nullptr) {
181     char* end;
182     pid_t tid = strtol(e->d_name, &end, 10);
183     if (!*end) {
184       bool contains;
185       {
186         MutexLock mu(self, *Locks::thread_list_lock_);
187         contains = Contains(tid);
188       }
189       if (!contains) {
190         DumpUnattachedThread(os, tid, dump_native_stack);
191       }
192     }
193   }
194   closedir(d);
195 }
196 
197 // Dump checkpoint timeout in milliseconds. Larger amount on the target, since the device could be
198 // overloaded with ANR dumps.
199 static constexpr uint32_t kDumpWaitTimeout = kIsTargetBuild ? 100000 : 20000;
200 
201 // A closure used by Thread::Dump.
202 class DumpCheckpoint FINAL : public Closure {
203  public:
204   DumpCheckpoint(std::ostream* os, bool dump_native_stack)
205       : os_(os),
206         barrier_(0),
207         backtrace_map_(dump_native_stack ? BacktraceMap::Create(getpid()) : nullptr),
208         dump_native_stack_(dump_native_stack) {
209     if (backtrace_map_ != nullptr) {
210       backtrace_map_->SetSuffixesToIgnore(std::vector<std::string> { "oat", "odex" });
211     }
212   }
213 
214   void Run(Thread* thread) OVERRIDE {
215     // Note thread and self may not be equal if thread was already suspended at the point of the
216     // request.
217     Thread* self = Thread::Current();
218     CHECK(self != nullptr);
219     std::ostringstream local_os;
220     {
221       ScopedObjectAccess soa(self);
222       thread->Dump(local_os, dump_native_stack_, backtrace_map_.get());
223     }
224     {
225       // Use the logging lock to ensure serialization when writing to the common ostream.
226       MutexLock mu(self, *Locks::logging_lock_);
227       *os_ << local_os.str() << std::endl;
228     }
229     barrier_.Pass(self);
230   }
231 
232   void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {
233     Thread* self = Thread::Current();
234     ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
235     bool timed_out = barrier_.Increment(self, threads_running_checkpoint, kDumpWaitTimeout);
236     if (timed_out) {
237       // Avoid a recursive abort.
238       LOG((kIsDebugBuild && (gAborting == 0)) ? ::android::base::FATAL : ::android::base::ERROR)
239           << "Unexpected time out during dump checkpoint.";
240     }
241   }
242 
243  private:
244   // The common stream that will accumulate all the dumps.
245   std::ostream* const os_;
246   // The barrier to be passed through and for the requestor to wait upon.
247   Barrier barrier_;
248   // A backtrace map shared by all threads, so the map info isn't reacquired/parsed separately.
249   std::unique_ptr<BacktraceMap> backtrace_map_;
250   // Whether we should dump the native stack.
251   const bool dump_native_stack_;
252 };
253 
254 void ThreadList::Dump(std::ostream& os, bool dump_native_stack) {
255   Thread* self = Thread::Current();
256   {
257     MutexLock mu(self, *Locks::thread_list_lock_);
258     os << "DALVIK THREADS (" << list_.size() << "):\n";
259   }
260   if (self != nullptr) {
261     DumpCheckpoint checkpoint(&os, dump_native_stack);
262     size_t threads_running_checkpoint;
263     {
264       // Use SOA to prevent deadlocks if multiple threads are calling Dump() at the same time.
265       ScopedObjectAccess soa(self);
266       threads_running_checkpoint = RunCheckpoint(&checkpoint);
267     }
268     if (threads_running_checkpoint != 0) {
269       checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
270     }
271   } else {
272     DumpUnattachedThreads(os, dump_native_stack);
273   }
274 }
275 
276 void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2) {
277   MutexLock mu(self, *Locks::thread_list_lock_);
278   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
279   for (const auto& thread : list_) {
280     if (thread != ignore1 && thread != ignore2) {
281       CHECK(thread->IsSuspended())
282             << "\nUnsuspended thread: <<" << *thread << "\n"
283             << "self: <<" << *Thread::Current();
284     }
285   }
286 }
287 
288 #if HAVE_TIMED_RWLOCK
289 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
290 NO_RETURN static void UnsafeLogFatalForThreadSuspendAllTimeout() {
291   // Increment gAborting before doing the thread list dump since we don't want any failures from
292   // AssertThreadSuspensionIsAllowable in cases where thread suspension is not allowed.
293   // See b/69044468.
294   ++gAborting;
295   Runtime* runtime = Runtime::Current();
296   std::ostringstream ss;
297   ss << "Thread suspend timeout\n";
298   Locks::mutator_lock_->Dump(ss);
299   ss << "\n";
300   runtime->GetThreadList()->Dump(ss);
301   --gAborting;
302   LOG(FATAL) << ss.str();
303   exit(0);
304 }
305 #endif
306 
307 // Unlike suspending all threads where we can wait to acquire the mutator_lock_, suspending an
308 // individual thread requires polling. delay_us is the requested sleep duration. If delay_us is 0 then
309 // we use sched_yield instead of calling usleep.
310 // Although there is the possibility, here and elsewhere, that usleep could return -1 and
311 // errno = EINTR, there should be no problem if interrupted, so we do not check.
312 static void ThreadSuspendSleep(useconds_t delay_us) {
313   if (delay_us == 0) {
314     sched_yield();
315   } else {
316     usleep(delay_us);
317   }
318 }
319 
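// Run 'checkpoint_function' on every thread in the list. Runnable threads have the checkpoint
// requested and run it themselves shortly afterwards; threads found suspended have it run on their
// behalf by the calling thread. If 'callback' is non-null it is run while the thread_list_lock_ and
// thread_suspend_count_lock_ are held. Returns the number of threads in the list at request time.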
320 size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback) {
321   Thread* self = Thread::Current();
322   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
323   Locks::thread_list_lock_->AssertNotHeld(self);
324   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
325 
326   std::vector<Thread*> suspended_count_modified_threads;
327   size_t count = 0;
328   {
329     // Call a checkpoint function for each thread; threads which are suspended get their
330     // checkpoint called manually.
331     MutexLock mu(self, *Locks::thread_list_lock_);
332     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
333     count = list_.size();
334     for (const auto& thread : list_) {
335       if (thread != self) {
336         while (true) {
337           if (thread->RequestCheckpoint(checkpoint_function)) {
338             // This thread will run its checkpoint some time in the near future.
339             break;
340           } else {
341             // We are probably suspended, try to make sure that we stay suspended.
342             // If the thread switched back to runnable, the request failed spuriously.
343             if (thread->GetState() == kRunnable) {
344               // Spurious fail, try again.
345               continue;
346             }
347             bool updated = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal);
348             DCHECK(updated);
349             suspended_count_modified_threads.push_back(thread);
350             break;
351           }
352         }
353       }
354     }
355     // Run the callback to be called inside this critical section.
356     if (callback != nullptr) {
357       callback->Run(self);
358     }
359   }
360 
361   // Run the checkpoint on ourself while we wait for threads to suspend.
362   checkpoint_function->Run(self);
363 
364   // Run the checkpoint on the suspended threads.
365   for (const auto& thread : suspended_count_modified_threads) {
366     if (!thread->IsSuspended()) {
367       ScopedTrace trace([&]() {
368         std::ostringstream oss;
369         thread->ShortDump(oss);
370         return std::string("Waiting for suspension of thread ") + oss.str();
371       });
372       // Busy wait until the thread is suspended.
373       const uint64_t start_time = NanoTime();
374       do {
375         ThreadSuspendSleep(kThreadSuspendInitialSleepUs);
376       } while (!thread->IsSuspended());
377       const uint64_t total_delay = NanoTime() - start_time;
378       // Shouldn't need to wait for longer than 1000 microseconds.
379       constexpr uint64_t kLongWaitThreshold = MsToNs(1);
380       if (UNLIKELY(total_delay > kLongWaitThreshold)) {
381         LOG(WARNING) << "Long wait of " << PrettyDuration(total_delay) << " for "
382             << *thread << " suspension!";
383       }
384     }
385     // We know for sure that the thread is suspended at this point.
386     checkpoint_function->Run(thread);
387     {
388       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
389       bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
390       DCHECK(updated);
391     }
392   }
393 
394   {
395     // Imitate ResumeAll: threads may be waiting on Thread::resume_cond_ since we raised their
396     // suspend count. Now that the suspend_count_ is lowered, we must do the broadcast.
397     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
398     Thread::resume_cond_->Broadcast(self);
399   }
400 
401   return count;
402 }
403 
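// Request an empty checkpoint from all runnable threads and wait until every one of them has run it
// (i.e. decremented the empty checkpoint barrier). Threads observed in a suspended state are not
// waited on, since they cannot be in the middle of a mutator heap access.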
404 void ThreadList::RunEmptyCheckpoint() {
405   Thread* self = Thread::Current();
406   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
407   Locks::thread_list_lock_->AssertNotHeld(self);
408   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
409   std::vector<uint32_t> runnable_thread_ids;
410   size_t count = 0;
411   Barrier* barrier = empty_checkpoint_barrier_.get();
412   barrier->Init(self, 0);
413   {
414     MutexLock mu(self, *Locks::thread_list_lock_);
415     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
416     for (Thread* thread : list_) {
417       if (thread != self) {
418         while (true) {
419           if (thread->RequestEmptyCheckpoint()) {
420             // This thread will run an empty checkpoint (decrement the empty checkpoint barrier)
421             // some time in the near future.
422             ++count;
423             if (kIsDebugBuild) {
424               runnable_thread_ids.push_back(thread->GetThreadId());
425             }
426             break;
427           }
428           if (thread->GetState() != kRunnable) {
429             // It's seen suspended; we are done because it must not be in the middle of a mutator
430             // heap access.
431             break;
432           }
433         }
434       }
435     }
436   }
437 
438   // Wake up the threads blocking for weak ref access so that they will respond to the empty
439   // checkpoint request. Otherwise we will hang as they are blocking in the kRunnable state.
440   Runtime::Current()->GetHeap()->GetReferenceProcessor()->BroadcastForSlowPath(self);
441   Runtime::Current()->BroadcastForNewSystemWeaks(/*broadcast_for_checkpoint*/true);
442   {
443     ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
444     uint64_t total_wait_time = 0;
445     bool first_iter = true;
446     while (true) {
447       // Wake up the runnable threads blocked on the mutexes that another thread, which is blocked
448       // on a weak ref access, holds (indirectly blocking for weak ref access through another thread
449       // and a mutex.) This needs to be done periodically because the thread may be preempted
450       // between the CheckEmptyCheckpointFromMutex call and the subsequent futex wait in
451       // Mutex::ExclusiveLock, etc. when the wakeup via WakeupToRespondToEmptyCheckpoint
452       // arrives. This could cause a *very rare* deadlock, if not repeated. Most of the cases are
453       // handled in the first iteration.
454       for (BaseMutex* mutex : Locks::expected_mutexes_on_weak_ref_access_) {
455         mutex->WakeupToRespondToEmptyCheckpoint();
456       }
457       static constexpr uint64_t kEmptyCheckpointPeriodicTimeoutMs = 100;  // 100ms
458       static constexpr uint64_t kEmptyCheckpointTotalTimeoutMs = 600 * 1000;  // 10 minutes.
459       size_t barrier_count = first_iter ? count : 0;
460       first_iter = false;  // Don't add to the barrier count from the second iteration on.
461       bool timed_out = barrier->Increment(self, barrier_count, kEmptyCheckpointPeriodicTimeoutMs);
462       if (!timed_out) {
463         break;  // Success
464       }
465       // This is a very rare case.
466       total_wait_time += kEmptyCheckpointPeriodicTimeoutMs;
467       if (kIsDebugBuild && total_wait_time > kEmptyCheckpointTotalTimeoutMs) {
468         std::ostringstream ss;
469         ss << "Empty checkpoint timeout\n";
470         ss << "Barrier count " << barrier->GetCount(self) << "\n";
471         ss << "Runnable thread IDs";
472         for (uint32_t tid : runnable_thread_ids) {
473           ss << " " << tid;
474         }
475         ss << "\n";
476         Locks::mutator_lock_->Dump(ss);
477         ss << "\n";
478         LOG(FATAL_WITHOUT_ABORT) << ss.str();
479         // Some threads in 'runnable_thread_ids' are probably stuck. Try to dump their stacks.
480         // Avoid using ThreadList::Dump() initially because it is likely to get stuck as well.
481         {
482           ScopedObjectAccess soa(self);
483           MutexLock mu1(self, *Locks::thread_list_lock_);
484           for (Thread* thread : GetList()) {
485             uint32_t tid = thread->GetThreadId();
486             bool is_in_runnable_thread_ids =
487                 std::find(runnable_thread_ids.begin(), runnable_thread_ids.end(), tid) !=
488                 runnable_thread_ids.end();
489             if (is_in_runnable_thread_ids &&
490                 thread->ReadFlag(kEmptyCheckpointRequest)) {
491               // Found a runnable thread that hasn't responded to the empty checkpoint request.
492               // Assume it's stuck and safe to dump its stack.
493               thread->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT),
494                            /*dump_native_stack*/ true,
495                            /*backtrace_map*/ nullptr,
496                            /*force_dump_stack*/ true);
497             }
498           }
499         }
500         LOG(FATAL_WITHOUT_ABORT)
501             << "Dumped runnable threads that haven't responded to empty checkpoint.";
502         // Now use ThreadList::Dump() to dump more threads, noting it may get stuck.
503         Dump(LOG_STREAM(FATAL_WITHOUT_ABORT));
504         LOG(FATAL) << "Dumped all threads.";
505       }
506     }
507   }
508 }
509 
510 // Request that a checkpoint function be run on all active (non-suspended)
511 // threads.  Returns the number of successful requests.
512 size_t ThreadList::RunCheckpointOnRunnableThreads(Closure* checkpoint_function) {
513   Thread* self = Thread::Current();
514   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
515   Locks::thread_list_lock_->AssertNotHeld(self);
516   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
517   CHECK_NE(self->GetState(), kRunnable);
518 
519   size_t count = 0;
520   {
521     // Call a checkpoint function for each non-suspended thread.
522     MutexLock mu(self, *Locks::thread_list_lock_);
523     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
524     for (const auto& thread : list_) {
525       if (thread != self) {
526         if (thread->RequestCheckpoint(checkpoint_function)) {
527           // This thread will run its checkpoint some time in the near future.
528           count++;
529         }
530       }
531     }
532   }
533 
534   // Return the number of threads that will run the checkpoint function.
535   return count;
536 }
537 
538 // A checkpoint/suspend-all hybrid to switch thread roots from
539 // from-space to to-space refs. Used to synchronize threads at a point
540 // to mark the initiation of marking while maintaining the to-space
541 // invariant.
542 size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor,
543                                    Closure* flip_callback,
544                                    gc::collector::GarbageCollector* collector,
545                                    gc::GcPauseListener* pause_listener) {
546   TimingLogger::ScopedTiming split("ThreadListFlip", collector->GetTimings());
547   Thread* self = Thread::Current();
548   Locks::mutator_lock_->AssertNotHeld(self);
549   Locks::thread_list_lock_->AssertNotHeld(self);
550   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
551   CHECK_NE(self->GetState(), kRunnable);
552 
553   collector->GetHeap()->ThreadFlipBegin(self);  // Sync with JNI critical calls.
554 
555   // ThreadFlipBegin happens before we suspend all the threads, so it does not count towards the
556   // pause.
557   const uint64_t suspend_start_time = NanoTime();
558   SuspendAllInternal(self, self, nullptr);
559   if (pause_listener != nullptr) {
560     pause_listener->StartPause();
561   }
562 
563   // Run the flip callback for the collector.
564   Locks::mutator_lock_->ExclusiveLock(self);
565   suspend_all_historam_.AdjustAndAddValue(NanoTime() - suspend_start_time);
566   flip_callback->Run(self);
567   Locks::mutator_lock_->ExclusiveUnlock(self);
568   collector->RegisterPause(NanoTime() - suspend_start_time);
569   if (pause_listener != nullptr) {
570     pause_listener->EndPause();
571   }
572 
573   // Resume runnable threads.
574   size_t runnable_thread_count = 0;
575   std::vector<Thread*> other_threads;
576   {
577     TimingLogger::ScopedTiming split2("ResumeRunnableThreads", collector->GetTimings());
578     MutexLock mu(self, *Locks::thread_list_lock_);
579     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
580     --suspend_all_count_;
581     for (const auto& thread : list_) {
582       // Set the flip function for all threads because Thread::DumpState/DumpJavaStack() (invoked by
583       // a checkpoint) may cause the flip function to be run for a runnable/suspended thread before
584       // a runnable thread runs it for itself or we run it for a suspended thread below.
585       thread->SetFlipFunction(thread_flip_visitor);
586       if (thread == self) {
587         continue;
588       }
589       // Resume early the threads that were runnable but are suspended just for this thread flip or
590       // about to transition from non-runnable (e.g. kNative at the SOA entry in a JNI function) to
591       // runnable (both cases waiting inside Thread::TransitionFromSuspendedToRunnable), or waiting
592       // for the thread flip to end at the JNI critical section entry (kWaitingForGcThreadFlip).
593       ThreadState state = thread->GetState();
594       if ((state == kWaitingForGcThreadFlip || thread->IsTransitioningToRunnable()) &&
595           thread->GetSuspendCount() == 1) {
596         // The thread will resume right after the broadcast.
597         bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
598         DCHECK(updated);
599         ++runnable_thread_count;
600       } else {
601         other_threads.push_back(thread);
602       }
603     }
604     Thread::resume_cond_->Broadcast(self);
605   }
606 
607   collector->GetHeap()->ThreadFlipEnd(self);
608 
609   // Run the closure on the other threads and let them resume.
610   {
611     TimingLogger::ScopedTiming split3("FlipOtherThreads", collector->GetTimings());
612     ReaderMutexLock mu(self, *Locks::mutator_lock_);
613     for (const auto& thread : other_threads) {
614       Closure* flip_func = thread->GetFlipFunction();
615       if (flip_func != nullptr) {
616         flip_func->Run(thread);
617       }
618     }
619     // Run it for self.
620     Closure* flip_func = self->GetFlipFunction();
621     if (flip_func != nullptr) {
622       flip_func->Run(self);
623     }
624   }
625 
626   // Resume other threads.
627   {
628     TimingLogger::ScopedTiming split4("ResumeOtherThreads", collector->GetTimings());
629     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
630     for (const auto& thread : other_threads) {
631       bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
632       DCHECK(updated);
633     }
634     Thread::resume_cond_->Broadcast(self);
635   }
636 
637   return runnable_thread_count + other_threads.size() + 1;  // +1 for self.
638 }
639 
640 void ThreadList::SuspendAll(const char* cause, bool long_suspend) {
641   Thread* self = Thread::Current();
642 
643   if (self != nullptr) {
644     VLOG(threads) << *self << " SuspendAll for " << cause << " starting...";
645   } else {
646     VLOG(threads) << "Thread[null] SuspendAll for " << cause << " starting...";
647   }
648   {
649     ScopedTrace trace("Suspending mutator threads");
650     const uint64_t start_time = NanoTime();
651 
652     SuspendAllInternal(self, self);
653     // All threads are known to have suspended (but a thread may still own the mutator lock)
654     // Make sure this thread grabs exclusive access to the mutator lock and its protected data.
655 #if HAVE_TIMED_RWLOCK
656     while (true) {
657       if (Locks::mutator_lock_->ExclusiveLockWithTimeout(self,
658                                                          NsToMs(thread_suspend_timeout_ns_),
659                                                          0)) {
660         break;
661       } else if (!long_suspend_) {
662         // Reading long_suspend without the mutator lock is slightly racy; in some rare cases this
663         // could result in a thread suspend timeout.
664         // Timeout if we wait more than thread_suspend_timeout_ns_ nanoseconds.
665         UnsafeLogFatalForThreadSuspendAllTimeout();
666       }
667     }
668 #else
669     Locks::mutator_lock_->ExclusiveLock(self);
670 #endif
671 
672     long_suspend_ = long_suspend;
673 
674     const uint64_t end_time = NanoTime();
675     const uint64_t suspend_time = end_time - start_time;
676     suspend_all_historam_.AdjustAndAddValue(suspend_time);
677     if (suspend_time > kLongThreadSuspendThreshold) {
678       LOG(WARNING) << "Suspending all threads took: " << PrettyDuration(suspend_time);
679     }
680 
681     if (kDebugLocking) {
682       // Debug check that all threads are suspended.
683       AssertThreadsAreSuspended(self, self);
684     }
685   }
686   ATRACE_BEGIN((std::string("Mutator threads suspended for ") + cause).c_str());
687 
688   if (self != nullptr) {
689     VLOG(threads) << *self << " SuspendAll complete";
690   } else {
691     VLOG(threads) << "Thread[null] SuspendAll complete";
692   }
693 }
694 
695 // Ensures all threads running Java suspend and that those not running Java don't start.
696 // Debugger thread might be set to kRunnable for a short period of time after the
697 // SuspendAllInternal. This is safe because it will be set back to suspended state before
698 // the SuspendAll returns.
699 void ThreadList::SuspendAllInternal(Thread* self,
700                                     Thread* ignore1,
701                                     Thread* ignore2,
702                                     SuspendReason reason) {
703   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
704   Locks::thread_list_lock_->AssertNotHeld(self);
705   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
706   if (kDebugLocking && self != nullptr) {
707     CHECK_NE(self->GetState(), kRunnable);
708   }
709 
710   // First request that all threads suspend, then wait for them to suspend before
711   // returning. This suspension scheme also relies on other behaviour:
712   // 1. Threads cannot be deleted while they are suspended or have a suspend-
713   //    request flag set - (see Unregister() below).
714   // 2. When threads are created, they are created in a suspended state (actually
715   //    kNative) and will never begin executing Java code without first checking
716   //    the suspend-request flag.
717 
718   // The atomic counter for number of threads that need to pass the barrier.
719   AtomicInteger pending_threads;
720   uint32_t num_ignored = 0;
721   if (ignore1 != nullptr) {
722     ++num_ignored;
723   }
724   if (ignore2 != nullptr && ignore1 != ignore2) {
725     ++num_ignored;
726   }
727   {
728     MutexLock mu(self, *Locks::thread_list_lock_);
729     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
730     // Update global suspend all state for attaching threads.
731     ++suspend_all_count_;
732     if (reason == SuspendReason::kForDebugger) {
733       ++debug_suspend_all_count_;
734     }
735     pending_threads.StoreRelaxed(list_.size() - num_ignored);
736     // Increment everybody's suspend count (except those that should be ignored).
737     for (const auto& thread : list_) {
738       if (thread == ignore1 || thread == ignore2) {
739         continue;
740       }
741       VLOG(threads) << "requesting thread suspend: " << *thread;
742       bool updated = thread->ModifySuspendCount(self, +1, &pending_threads, reason);
743       DCHECK(updated);
744 
745       // Must install the pending_threads counter first, then check thread->IsSuspended() and clear
746       // the counter. Otherwise there's a race with Thread::TransitionFromRunnableToSuspended()
747       // that can lead a thread to miss a call to PassActiveSuspendBarriers().
748       if (thread->IsSuspended()) {
749         // Only clear the counter for the current thread.
750         thread->ClearSuspendBarrier(&pending_threads);
751         pending_threads.FetchAndSubSequentiallyConsistent(1);
752       }
753     }
754   }
755 
756   // Wait for the barrier to be passed by all runnable threads. This wait
757   // is done with a timeout so that we can detect problems.
758 #if ART_USE_FUTEXES
759   timespec wait_timeout;
760   InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(thread_suspend_timeout_ns_), 0, &wait_timeout);
761 #endif
762   const uint64_t start_time = NanoTime();
763   while (true) {
764     int32_t cur_val = pending_threads.LoadRelaxed();
765     if (LIKELY(cur_val > 0)) {
766 #if ART_USE_FUTEXES
767       if (futex(pending_threads.Address(), FUTEX_WAIT, cur_val, &wait_timeout, nullptr, 0) != 0) {
768         // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
769         if ((errno != EAGAIN) && (errno != EINTR)) {
770           if (errno == ETIMEDOUT) {
771             LOG(kIsDebugBuild ? ::android::base::FATAL : ::android::base::ERROR)
772                 << "Timed out waiting for threads to suspend, waited for "
773                 << PrettyDuration(NanoTime() - start_time);
774           } else {
775             PLOG(FATAL) << "futex wait failed for SuspendAllInternal()";
776           }
777         }
778       }  // else re-check pending_threads in the next iteration (this may be a spurious wake-up).
779 #else
780       // Spin wait. This is likely to be slow, but on most architectures ART_USE_FUTEXES is set.
781       UNUSED(start_time);
782 #endif
783     } else {
784       CHECK_EQ(cur_val, 0);
785       break;
786     }
787   }
788 }
789 
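// Undo a previous SuspendAll: release exclusive ownership of the mutator lock, decrement every other
// thread's suspend count and broadcast on Thread::resume_cond_ so suspended threads can wake up.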
790 void ThreadList::ResumeAll() {
791   Thread* self = Thread::Current();
792 
793   if (self != nullptr) {
794     VLOG(threads) << *self << " ResumeAll starting";
795   } else {
796     VLOG(threads) << "Thread[null] ResumeAll starting";
797   }
798 
799   ATRACE_END();
800 
801   ScopedTrace trace("Resuming mutator threads");
802 
803   if (kDebugLocking) {
804     // Debug check that all threads are suspended.
805     AssertThreadsAreSuspended(self, self);
806   }
807 
808   long_suspend_ = false;
809 
810   Locks::mutator_lock_->ExclusiveUnlock(self);
811   {
812     MutexLock mu(self, *Locks::thread_list_lock_);
813     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
814     // Update global suspend all state for attaching threads.
815     --suspend_all_count_;
816     // Decrement the suspend counts for all threads.
817     for (const auto& thread : list_) {
818       if (thread == self) {
819         continue;
820       }
821       bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
822       DCHECK(updated);
823     }
824 
825     // Broadcast a notification to all suspended threads, some or all of
826     // which may choose to wake up.  No need to wait for them.
827     if (self != nullptr) {
828       VLOG(threads) << *self << " ResumeAll waking others";
829     } else {
830       VLOG(threads) << "Thread[null] ResumeAll waking others";
831     }
832     Thread::resume_cond_->Broadcast(self);
833   }
834 
835   if (self != nullptr) {
836     VLOG(threads) << *self << " ResumeAll complete";
837   } else {
838     VLOG(threads) << "Thread[null] ResumeAll complete";
839   }
840 }
841 
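// Resume a single thread that was previously suspended (e.g. via SuspendThreadByPeer or
// SuspendThreadByThreadId). Returns false if the thread is not suspended, is not in the thread list,
// or its suspend count could not be modified.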
842 bool ThreadList::Resume(Thread* thread, SuspendReason reason) {
843   // This assumes there was an ATRACE_BEGIN when we suspended the thread.
844   ATRACE_END();
845 
846   Thread* self = Thread::Current();
847   DCHECK_NE(thread, self);
848   VLOG(threads) << "Resume(" << reinterpret_cast<void*>(thread) << ") starting..." << reason;
849 
850   {
851     // To check Contains.
852     MutexLock mu(self, *Locks::thread_list_lock_);
853     // To check IsSuspended.
854     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
855     if (UNLIKELY(!thread->IsSuspended())) {
856       LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread)
857           << ") thread not suspended";
858       return false;
859     }
860     if (!Contains(thread)) {
861       // We only expect threads within the thread-list to have been suspended; otherwise we can't
862       // stop such threads from deleting themselves.
863       LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread)
864           << ") thread not within thread list";
865       return false;
866     }
867     if (UNLIKELY(!thread->ModifySuspendCount(self, -1, nullptr, reason))) {
868       LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread)
869                  << ") could not modify suspend count.";
870       return false;
871     }
872   }
873 
874   {
875     VLOG(threads) << "Resume(" << reinterpret_cast<void*>(thread) << ") waking others";
876     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
877     Thread::resume_cond_->Broadcast(self);
878   }
879 
880   VLOG(threads) << "Resume(" << reinterpret_cast<void*>(thread) << ") complete";
881   return true;
882 }
883 
884 static void ThreadSuspendByPeerWarning(Thread* self,
885                                        LogSeverity severity,
886                                        const char* message,
887                                        jobject peer) {
888   JNIEnvExt* env = self->GetJniEnv();
889   ScopedLocalRef<jstring>
890       scoped_name_string(env, static_cast<jstring>(env->GetObjectField(
891           peer, WellKnownClasses::java_lang_Thread_name)));
892   ScopedUtfChars scoped_name_chars(env, scoped_name_string.get());
893   if (scoped_name_chars.c_str() == nullptr) {
894       LOG(severity) << message << ": " << peer;
895       env->ExceptionClear();
896   } else {
897       LOG(severity) << message << ": " << peer << ":" << scoped_name_chars.c_str();
898   }
899 }
900 
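// Suspend the thread identified by the java.lang.Thread object 'peer' and loop until it is observed
// in a suspended state. If 'request_suspension' is false the thread is expected to already carry a
// suspend request. Returns the suspended Thread*, or nullptr (setting *timed_out on timeout) if the
// thread could not be found or did not suspend within thread_suspend_timeout_ns_.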
901 Thread* ThreadList::SuspendThreadByPeer(jobject peer,
902                                         bool request_suspension,
903                                         SuspendReason reason,
904                                         bool* timed_out) {
905   const uint64_t start_time = NanoTime();
906   useconds_t sleep_us = kThreadSuspendInitialSleepUs;
907   *timed_out = false;
908   Thread* const self = Thread::Current();
909   Thread* suspended_thread = nullptr;
910   VLOG(threads) << "SuspendThreadByPeer starting";
911   while (true) {
912     Thread* thread;
913     {
914       // Note: this will transition to runnable and potentially suspend. We ensure only one thread
915       // is requesting another suspend, to avoid deadlock, by requiring this function be called
916       // holding Locks::thread_list_suspend_thread_lock_. It's important this thread suspend rather
917       // than request thread suspension, to avoid potential cycles in threads requesting each other
918       // suspend.
919       ScopedObjectAccess soa(self);
920       MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
921       thread = Thread::FromManagedThread(soa, peer);
922       if (thread == nullptr) {
923         if (suspended_thread != nullptr) {
924           MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
925           // If we incremented the suspend count but the thread reset its peer, we need to
926           // re-decrement it since it is shutting down and may deadlock the runtime in
927           // ThreadList::WaitForOtherNonDaemonThreadsToExit.
928           bool updated = suspended_thread->ModifySuspendCount(soa.Self(),
929                                                               -1,
930                                                               nullptr,
931                                                               reason);
932           DCHECK(updated);
933         }
934         ThreadSuspendByPeerWarning(self,
935                                    ::android::base::WARNING,
936                                     "No such thread for suspend",
937                                     peer);
938         return nullptr;
939       }
940       if (!Contains(thread)) {
941         CHECK(suspended_thread == nullptr);
942         VLOG(threads) << "SuspendThreadByPeer failed for unattached thread: "
943             << reinterpret_cast<void*>(thread);
944         return nullptr;
945       }
946       VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread;
947       {
948         MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
949         if (request_suspension) {
950           if (self->GetSuspendCount() > 0) {
951             // We hold the suspend count lock but another thread is trying to suspend us. It's not
952             // safe to try to suspend another thread in case we get a cycle. Start the loop again
953             // which will allow this thread to be suspended.
954             continue;
955           }
956           CHECK(suspended_thread == nullptr);
957           suspended_thread = thread;
958           bool updated = suspended_thread->ModifySuspendCount(self, +1, nullptr, reason);
959           DCHECK(updated);
960           request_suspension = false;
961         } else {
962           // If the caller isn't requesting suspension, a suspension should have already occurred.
963           CHECK_GT(thread->GetSuspendCount(), 0);
964         }
965         // IsSuspended on the current thread will fail as the current thread is changed into
966         // Runnable above. As the suspend count is now raised if this is the current thread
967         // it will self suspend on transition to Runnable, making it hard to work with. It's simpler
968         // to just explicitly handle the current thread in the callers to this code.
969         CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger";
970         // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend
971         // count, or else we've waited and it has self suspended) or is the current thread, we're
972         // done.
973         if (thread->IsSuspended()) {
974           VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread;
975           if (ATRACE_ENABLED()) {
976             std::string name;
977             thread->GetThreadName(name);
978             ATRACE_BEGIN(StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(),
979                                       peer).c_str());
980           }
981           return thread;
982         }
983         const uint64_t total_delay = NanoTime() - start_time;
984         if (total_delay >= thread_suspend_timeout_ns_) {
985           ThreadSuspendByPeerWarning(self,
986                                      ::android::base::FATAL,
987                                      "Thread suspension timed out",
988                                      peer);
989           if (suspended_thread != nullptr) {
990             CHECK_EQ(suspended_thread, thread);
991             bool updated = suspended_thread->ModifySuspendCount(soa.Self(),
992                                                                 -1,
993                                                                 nullptr,
994                                                                 reason);
995             DCHECK(updated);
996           }
997           *timed_out = true;
998           return nullptr;
999         } else if (sleep_us == 0 &&
1000             total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) {
1001           // We have spun for kThreadSuspendMaxYieldUs time; switch to sleeps to prevent
1002           // excessive CPU usage.
1003           sleep_us = kThreadSuspendMaxYieldUs / 2;
1004         }
1005       }
1006       // Release locks and come out of runnable state.
1007     }
1008     VLOG(threads) << "SuspendThreadByPeer waiting to allow thread chance to suspend";
1009     ThreadSuspendSleep(sleep_us);
1010     // This may stay at 0 if sleep_us == 0, but this is WAI since we want to avoid using usleep at
1011     // all if possible. This shouldn't be an issue since time to suspend should always be small.
1012     sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs);
1013   }
1014 }
1015 
1016 static void ThreadSuspendByThreadIdWarning(LogSeverity severity,
1017                                            const char* message,
1018                                            uint32_t thread_id) {
1019   LOG(severity) << StringPrintf("%s: %d", message, thread_id);
1020 }
1021 
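// Same as SuspendThreadByPeer, but the target thread is identified by the runtime thread id returned
// by Thread::GetThreadId() rather than by a Java peer object.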
1022 Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id,
1023                                             SuspendReason reason,
1024                                             bool* timed_out) {
1025   const uint64_t start_time = NanoTime();
1026   useconds_t sleep_us = kThreadSuspendInitialSleepUs;
1027   *timed_out = false;
1028   Thread* suspended_thread = nullptr;
1029   Thread* const self = Thread::Current();
1030   CHECK_NE(thread_id, kInvalidThreadId);
1031   VLOG(threads) << "SuspendThreadByThreadId starting";
1032   while (true) {
1033     {
1034       // Note: this will transition to runnable and potentially suspend. We ensure only one thread
1035       // is requesting another suspend, to avoid deadlock, by requiring this function be called
1036       // holding Locks::thread_list_suspend_thread_lock_. It's important this thread suspend rather
1037       // than request thread suspension, to avoid potential cycles in threads requesting each other
1038       // suspend.
1039       ScopedObjectAccess soa(self);
1040       MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
1041       Thread* thread = nullptr;
1042       for (const auto& it : list_) {
1043         if (it->GetThreadId() == thread_id) {
1044           thread = it;
1045           break;
1046         }
1047       }
1048       if (thread == nullptr) {
1049         CHECK(suspended_thread == nullptr) << "Suspended thread " << suspended_thread
1050             << " no longer in thread list";
1051         // There's a race in inflating a lock and the owner giving up ownership and then dying.
1052         ThreadSuspendByThreadIdWarning(::android::base::WARNING,
1053                                        "No such thread id for suspend",
1054                                        thread_id);
1055         return nullptr;
1056       }
1057       VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread;
1058       DCHECK(Contains(thread));
1059       {
1060         MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1061         if (suspended_thread == nullptr) {
1062           if (self->GetSuspendCount() > 0) {
1063             // We hold the suspend count lock but another thread is trying to suspend us. It's not
1064             // safe to try to suspend another thread in case we get a cycle. Start the loop again
1065             // which will allow this thread to be suspended.
1066             continue;
1067           }
1068           bool updated = thread->ModifySuspendCount(self, +1, nullptr, reason);
1069           DCHECK(updated);
1070           suspended_thread = thread;
1071         } else {
1072           CHECK_EQ(suspended_thread, thread);
1073           // If the caller isn't requesting suspension, a suspension should have already occurred.
1074           CHECK_GT(thread->GetSuspendCount(), 0);
1075         }
1076         // IsSuspended on the current thread will fail as the current thread is changed into
1077         // Runnable above. As the suspend count is now raised if this is the current thread
1078         // it will self suspend on transition to Runnable, making it hard to work with. It's simpler
1079         // to just explicitly handle the current thread in the callers to this code.
1080         CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger";
1081         // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend
1082         // count, or else we've waited and it has self suspended) or is the current thread, we're
1083         // done.
1084         if (thread->IsSuspended()) {
1085           if (ATRACE_ENABLED()) {
1086             std::string name;
1087             thread->GetThreadName(name);
1088             ATRACE_BEGIN(StringPrintf("SuspendThreadByThreadId suspended %s id=%d",
1089                                       name.c_str(), thread_id).c_str());
1090           }
1091           VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread;
1092           return thread;
1093         }
1094         const uint64_t total_delay = NanoTime() - start_time;
1095         if (total_delay >= thread_suspend_timeout_ns_) {
1096           ThreadSuspendByThreadIdWarning(::android::base::WARNING,
1097                                          "Thread suspension timed out",
1098                                          thread_id);
1099           if (suspended_thread != nullptr) {
1100             bool updated = thread->ModifySuspendCount(soa.Self(), -1, nullptr, reason);
1101             DCHECK(updated);
1102           }
1103           *timed_out = true;
1104           return nullptr;
1105         } else if (sleep_us == 0 &&
1106             total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) {
1107           // We have spun for kThreadSuspendMaxYieldUs time; switch to sleeps to prevent
1108           // excessive CPU usage.
1109           sleep_us = kThreadSuspendMaxYieldUs / 2;
1110         }
1111       }
1112       // Release locks and come out of runnable state.
1113     }
1114     VLOG(threads) << "SuspendThreadByThreadId waiting to allow thread chance to suspend";
1115     ThreadSuspendSleep(sleep_us);
1116     sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs);
1117   }
1118 }
1119 
1120 Thread* ThreadList::FindThreadByThreadId(uint32_t thread_id) {
1121   for (const auto& thread : list_) {
1122     if (thread->GetThreadId() == thread_id) {
1123       return thread;
1124     }
1125   }
1126   return nullptr;
1127 }
1128 
1129 void ThreadList::SuspendAllForDebugger() {
1130   Thread* self = Thread::Current();
1131   Thread* debug_thread = Dbg::GetDebugThread();
1132 
1133   VLOG(threads) << *self << " SuspendAllForDebugger starting...";
1134 
1135   SuspendAllInternal(self, self, debug_thread, SuspendReason::kForDebugger);
1136   // Block on the mutator lock until all Runnable threads release their share of access then
1137   // immediately unlock again.
1138 #if HAVE_TIMED_RWLOCK
1139   // Timeout if we wait more than 30 seconds.
1140   if (!Locks::mutator_lock_->ExclusiveLockWithTimeout(self, 30 * 1000, 0)) {
1141     UnsafeLogFatalForThreadSuspendAllTimeout();
1142   } else {
1143     Locks::mutator_lock_->ExclusiveUnlock(self);
1144   }
1145 #else
1146   Locks::mutator_lock_->ExclusiveLock(self);
1147   Locks::mutator_lock_->ExclusiveUnlock(self);
1148 #endif
1149   // Disabled for the following race condition:
1150   // Thread 1 calls SuspendAllForDebugger, gets preempted after pulsing the mutator lock.
1151   // Thread 2 calls SuspendAll and SetStateUnsafe (perhaps from Dbg::Disconnected).
1152   // Thread 1 fails assertion that all threads are suspended due to thread 2 being in a runnable
1153   // state (from SetStateUnsafe).
1154   // AssertThreadsAreSuspended(self, self, debug_thread);
1155 
1156   VLOG(threads) << *self << " SuspendAllForDebugger complete";
1157 }
1158 
1159 void ThreadList::SuspendSelfForDebugger() {
1160   Thread* const self = Thread::Current();
1161   self->SetReadyForDebugInvoke(true);
1162 
1163   // The debugger thread must not suspend itself due to debugger activity!
1164   Thread* debug_thread = Dbg::GetDebugThread();
1165   CHECK(self != debug_thread);
1166   CHECK_NE(self->GetState(), kRunnable);
1167   Locks::mutator_lock_->AssertNotHeld(self);
1168 
1169   // The debugger may have detached while we were executing an invoke request. In that case, we
1170   // must not suspend ourself.
1171   DebugInvokeReq* pReq = self->GetInvokeReq();
1172   const bool skip_thread_suspension = (pReq != nullptr && !Dbg::IsDebuggerActive());
1173   if (!skip_thread_suspension) {
1174     // Collisions with other suspends aren't really interesting. We want
1175     // to ensure that we're the only one fiddling with the suspend count
1176     // though.
1177     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
1178     bool updated = self->ModifySuspendCount(self, +1, nullptr, SuspendReason::kForDebugger);
1179     DCHECK(updated);
1180     CHECK_GT(self->GetSuspendCount(), 0);
1181 
1182     VLOG(threads) << *self << " self-suspending (debugger)";
1183   } else {
1184     // We must no longer be subject to debugger suspension.
1185     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
1186     CHECK_EQ(self->GetDebugSuspendCount(), 0) << "Debugger detached without resuming us";
1187 
1188     VLOG(threads) << *self << " not self-suspending because debugger detached during invoke";
1189   }
1190 
1191   // If the debugger requested an invoke, we need to send the reply and clear the request.
1192   if (pReq != nullptr) {
1193     Dbg::FinishInvokeMethod(pReq);
1194     self->ClearDebugInvokeReq();
1195     pReq = nullptr;  // object has been deleted, clear it for safety.
1196   }
1197 
1198   // Tell JDWP that we've completed suspension. The JDWP thread can't
1199   // tell us to resume before we're fully asleep because we hold the
1200   // suspend count lock.
1201   Dbg::ClearWaitForEventThread();
1202 
1203   {
1204     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
1205     while (self->GetSuspendCount() != 0) {
1206       Thread::resume_cond_->Wait(self);
1207       if (self->GetSuspendCount() != 0) {
1208         // The condition was signaled but we're still suspended. This
1209         // can happen when we suspend then resume all threads to
1210         // update instrumentation or compute monitor info. This can
1211         // also happen if the debugger lets go while a SIGQUIT thread
1212         // dump event is pending (assuming SignalCatcher was resumed for
1213         // just long enough to try to grab the thread-suspend lock).
1214         VLOG(jdwp) << *self << " still suspended after undo "
1215                    << "(suspend count=" << self->GetSuspendCount() << ", "
1216                    << "debug suspend count=" << self->GetDebugSuspendCount() << ")";
1217       }
1218     }
1219     CHECK_EQ(self->GetSuspendCount(), 0);
1220   }
1221 
1222   self->SetReadyForDebugInvoke(false);
1223   VLOG(threads) << *self << " self-reviving (debugger)";
1224 }
1225 
1226 void ThreadList::ResumeAllForDebugger() {
1227   Thread* self = Thread::Current();
1228   Thread* debug_thread = Dbg::GetDebugThread();
1229 
1230   VLOG(threads) << *self << " ResumeAllForDebugger starting...";
1231 
1232   // Threads can't resume if we exclusively hold the mutator lock.
1233   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
1234 
1235   {
1236     MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
1237     {
1238       MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1239       // Update global suspend all state for attaching threads.
1240       DCHECK_GE(suspend_all_count_, debug_suspend_all_count_);
1241       if (debug_suspend_all_count_ > 0) {
1242         --suspend_all_count_;
1243         --debug_suspend_all_count_;
1244       } else {
1245         // We've been asked to resume all threads without being asked to
1246         // suspend them all before. That may happen if a debugger tries
1247         // to resume some suspended threads (with suspend count == 1)
1248         // at once with a VirtualMachine.Resume command. Let's print a
1249         // warning.
1250         LOG(WARNING) << "Debugger attempted to resume all threads without "
1251                      << "having suspended them all before.";
1252       }
1253       // Decrement everybody's suspend count (except our own).
1254       for (const auto& thread : list_) {
1255         if (thread == self || thread == debug_thread) {
1256           continue;
1257         }
1258         if (thread->GetDebugSuspendCount() == 0) {
1259           // This thread may have been individually resumed with ThreadReference.Resume.
1260           continue;
1261         }
1262         VLOG(threads) << "requesting thread resume: " << *thread;
1263         bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kForDebugger);
1264         DCHECK(updated);
1265       }
1266     }
1267   }
1268 
1269   {
1270     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
1271     Thread::resume_cond_->Broadcast(self);
1272   }
1273 
1274   VLOG(threads) << *self << " ResumeAllForDebugger complete";
1275 }
1276 
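// Clears all debugger-induced suspensions in one go (typically when the debugger detaches):
// resets the debugger suspend-all bookkeeping, removes each thread's entire debugger suspend
// count, and wakes the suspended threads.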
void ThreadList::UndoDebuggerSuspensions() {
  Thread* self = Thread::Current();

  VLOG(threads) << *self << " UndoDebuggerSuspensions starting";

  {
    MutexLock mu(self, *Locks::thread_list_lock_);
    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
    // Update global suspend all state for attaching threads.
    suspend_all_count_ -= debug_suspend_all_count_;
    debug_suspend_all_count_ = 0;
    // Update running threads.
    for (const auto& thread : list_) {
      if (thread == self || thread->GetDebugSuspendCount() == 0) {
        continue;
      }
      bool suspended = thread->ModifySuspendCount(self,
                                                  -thread->GetDebugSuspendCount(),
                                                  nullptr,
                                                  SuspendReason::kForDebugger);
      DCHECK(suspended);
    }
  }

  {
    MutexLock mu(self, *Locks::thread_suspend_count_lock_);
    Thread::resume_cond_->Broadcast(self);
  }

  VLOG(threads) << "UndoDebuggerSuspensions(" << *self << ") complete";
}

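// Blocks during runtime shutdown until the calling thread is the only non-daemon thread left on
// the list and no thread is still in the middle of unregistering.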
void ThreadList::WaitForOtherNonDaemonThreadsToExit() {
  ScopedTrace trace(__PRETTY_FUNCTION__);
  Thread* self = Thread::Current();
  Locks::mutator_lock_->AssertNotHeld(self);
  while (true) {
    {
      // No more threads can be born after we start to shut down.
      MutexLock mu(self, *Locks::runtime_shutdown_lock_);
      CHECK(Runtime::Current()->IsShuttingDownLocked());
      CHECK_EQ(Runtime::Current()->NumberOfThreadsBeingBorn(), 0U);
    }
    MutexLock mu(self, *Locks::thread_list_lock_);
    // Also wait for any threads that are unregistering to finish. This is required so that no
    // threads access the thread list after it is deleted. TODO: This may not work for user daemon
    // threads since they could unregister at the wrong time.
    bool done = unregistering_count_ == 0;
    if (done) {
      for (const auto& thread : list_) {
        if (thread != self && !thread->IsDaemon()) {
          done = false;
          break;
        }
      }
    }
    if (done) {
      break;
    }
    // Wait for another thread to exit before re-checking.
    Locks::thread_exit_cond_->Wait(self);
  }
}

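// Requests suspension of all remaining (daemon) threads and points their JNIEnvs at the runtime
// shutdown JNI function table, then polls for up to two seconds for the daemons to leave the
// runnable state, logging a warning if they do not.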
void ThreadList::SuspendAllDaemonThreadsForShutdown() {
  ScopedTrace trace(__PRETTY_FUNCTION__);
  Thread* self = Thread::Current();
  size_t daemons_left = 0;
  {
    // Tell all the daemons it's time to suspend.
    MutexLock mu(self, *Locks::thread_list_lock_);
    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
    for (const auto& thread : list_) {
      // This is only run after all non-daemon threads have exited, so the remainder should all be
      // daemons.
      CHECK(thread->IsDaemon()) << *thread;
      if (thread != self) {
        bool updated = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal);
        DCHECK(updated);
        ++daemons_left;
      }
      // We are shutting down the runtime, so set the JNI functions of all the JNIEnvs to be the
      // sleep-forever one.
      thread->GetJniEnv()->SetFunctionsToRuntimeShutdownFunctions();
    }
  }
  // If we have any daemons left, wait 200ms to ensure they are not stuck in a place where they
  // are about to access runtime state but are not in a runnable state. Examples: Monitor code
  // or waking up from a condition variable. TODO: Try and see if there is a better way to wait
  // for daemon threads to be in a blocked state.
  if (daemons_left > 0) {
    static constexpr size_t kDaemonSleepTime = 200 * 1000;
    usleep(kDaemonSleepTime);
  }
  // Give the threads a chance to suspend, complaining if they're slow.
  bool have_complained = false;
  static constexpr size_t kTimeoutMicroseconds = 2000 * 1000;
  static constexpr size_t kSleepMicroseconds = 1000;
  for (size_t i = 0; i < kTimeoutMicroseconds / kSleepMicroseconds; ++i) {
    bool all_suspended = true;
    {
      MutexLock mu(self, *Locks::thread_list_lock_);
      for (const auto& thread : list_) {
        if (thread != self && thread->GetState() == kRunnable) {
          if (!have_complained) {
            LOG(WARNING) << "daemon thread not yet suspended: " << *thread;
            have_complained = true;
          }
          all_suspended = false;
        }
      }
    }
    if (all_suspended) {
      return;
    }
    usleep(kSleepMicroseconds);
  }
  LOG(WARNING) << "timed out suspending all daemon threads";
}

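// Adds the calling thread to the thread list. The new thread's suspend count is raised to match
// any SuspendAll (and debugger suspend-all) requests already in flight, and, when read barriers
// are in use, its GC state is initialized from the concurrent copying collector.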
void ThreadList::Register(Thread* self) {
  DCHECK_EQ(self, Thread::Current());
  CHECK(!shut_down_);

  if (VLOG_IS_ON(threads)) {
    std::ostringstream oss;
    self->ShortDump(oss);  // We don't hold the mutator_lock_ yet and so cannot call Dump.
    LOG(INFO) << "ThreadList::Register() " << *self << "\n" << oss.str();
  }

  // Atomically add self to the thread list and make its thread_suspend_count_ reflect ongoing
  // SuspendAll requests.
  MutexLock mu(self, *Locks::thread_list_lock_);
  MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
  CHECK_GE(suspend_all_count_, debug_suspend_all_count_);
  // Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While
  // this isn't particularly efficient, the suspend counts are most commonly 0 or 1.
  for (int delta = debug_suspend_all_count_; delta > 0; delta--) {
    bool updated = self->ModifySuspendCount(self, +1, nullptr, SuspendReason::kForDebugger);
    DCHECK(updated);
  }
  for (int delta = suspend_all_count_ - debug_suspend_all_count_; delta > 0; delta--) {
    bool updated = self->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal);
    DCHECK(updated);
  }
  CHECK(!Contains(self));
  list_.push_back(self);
  if (kUseReadBarrier) {
    gc::collector::ConcurrentCopying* const cc =
        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector();
    // Initialize according to the state of the CC collector.
    self->SetIsGcMarkingAndUpdateEntrypoints(cc->IsMarking());
    if (cc->IsUsingReadBarrierEntrypoints()) {
      self->SetReadBarrierEntrypoints();
    }
    self->SetWeakRefAccessEnabled(cc->IsWeakRefAccessEnabled());
  }
}

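// Removes the calling thread from the thread list and deletes it. Destroy() runs first (it may
// still call into managed code), removal is retried while a suspend request is pending, and the
// thin lock id and TLS slot are released only after the Thread object has been deleted.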
void ThreadList::Unregister(Thread* self) {
  DCHECK_EQ(self, Thread::Current());
  CHECK_NE(self->GetState(), kRunnable);
  Locks::mutator_lock_->AssertNotHeld(self);

  VLOG(threads) << "ThreadList::Unregister() " << *self;

  {
    MutexLock mu(self, *Locks::thread_list_lock_);
    ++unregistering_count_;
  }

  // Any time-consuming destruction, plus anything that can call back into managed code or
  // suspend and so on, must happen at this point, and not in ~Thread. The self->Destroy is what
  // causes the threads to join. It is important to do this after incrementing unregistering_count_
  // since we want the runtime to wait for the daemon threads to exit before deleting the thread
  // list.
  self->Destroy();

  // If tracing, remember thread id and name before thread exits.
  Trace::StoreExitingThreadInfo(self);

  uint32_t thin_lock_id = self->GetThreadId();
  while (true) {
    // Remove and delete the Thread* while holding the thread_list_lock_ and
    // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
    // Note: deliberately not using MutexLock that could hold a stale self pointer.
    MutexLock mu(self, *Locks::thread_list_lock_);
    if (!Contains(self)) {
      std::string thread_name;
      self->GetThreadName(thread_name);
      std::ostringstream os;
      DumpNativeStack(os, GetTid(), nullptr, "  native: ", nullptr);
      LOG(ERROR) << "Request to unregister unattached thread " << thread_name << "\n" << os.str();
      break;
    } else {
      MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
      if (!self->IsSuspended()) {
        list_.remove(self);
        break;
      }
    }
    // We failed to remove the thread due to a suspend request; loop and try again.
  }
  delete self;

  // Release the thread ID after the thread is finished and deleted to avoid cases where we can
  // temporarily have multiple threads with the same thread id. When this occurs, it causes
  // problems in FindThreadByThreadId / SuspendThreadByThreadId.
  ReleaseThreadId(nullptr, thin_lock_id);

  // Clear the TLS data, so that the underlying native thread is recognizably detached.
  // (It may wish to reattach later.)
#ifdef ART_TARGET_ANDROID
  __get_tls()[TLS_SLOT_ART_THREAD_SELF] = nullptr;
#else
  CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, nullptr), "detach self");
#endif

  // Signal that a thread just detached.
  MutexLock mu(nullptr, *Locks::thread_list_lock_);
  --unregistering_count_;
  Locks::thread_exit_cond_->Broadcast(nullptr);
}

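// Applies |callback| to every thread on the list. The loop reads list_ without taking any lock,
// so the caller is expected to hold thread_list_lock_. Illustrative use, with a hypothetical
// callback:
//
//   static void CountThread(Thread* thread ATTRIBUTE_UNUSED, void* arg) {
//     ++*reinterpret_cast<size_t*>(arg);
//   }
//   ...
//   size_t count = 0;
//   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
//   Runtime::Current()->GetThreadList()->ForEach(CountThread, &count);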
void ThreadList::ForEach(void (*callback)(Thread*, void*), void* context) {
  for (const auto& thread : list_) {
    callback(thread, context);
  }
}

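// Visits the roots of threads that are already suspended (plus the current thread). Each visited
// thread is pinned with a temporary +1 suspend count so it cannot resume mid-visit; threads that
// are still running are skipped, and all counts are restored afterwards.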
void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) {
  Thread* const self = Thread::Current();
  std::vector<Thread*> threads_to_visit;

  // Tell threads to suspend and copy them into the list.
  {
    MutexLock mu(self, *Locks::thread_list_lock_);
    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
    for (Thread* thread : list_) {
      bool suspended = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal);
      DCHECK(suspended);
      if (thread == self || thread->IsSuspended()) {
        threads_to_visit.push_back(thread);
      } else {
        bool resumed = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
        DCHECK(resumed);
      }
    }
  }

  // Visit roots without holding thread_list_lock_ and thread_suspend_count_lock_ to prevent lock
  // order violations.
  for (Thread* thread : threads_to_visit) {
    thread->VisitRoots(visitor, kVisitRootFlagAllRoots);
  }

  // Restore suspend counts.
  {
    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
    for (Thread* thread : threads_to_visit) {
      bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
      DCHECK(updated);
    }
  }
}

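// Visits the roots of every registered thread, holding thread_list_lock_ for the duration.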
void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
  for (const auto& thread : list_) {
    thread->VisitRoots(visitor, flags);
  }
}

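// Allocates a thin-lock / thread id from the allocated_ids_ bitmap. Ids are 1-based because zero
// is reserved to mean "invalid"; running out of ids is fatal.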
uint32_t ThreadList::AllocThreadId(Thread* self) {
  MutexLock mu(self, *Locks::allocated_thread_ids_lock_);
  for (size_t i = 0; i < allocated_ids_.size(); ++i) {
    if (!allocated_ids_[i]) {
      allocated_ids_.set(i);
      return i + 1;  // Zero is reserved to mean "invalid".
    }
  }
  LOG(FATAL) << "Out of internal thread ids";
  return 0;
}

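// Returns a previously allocated thread id to the pool. |self| may be null; Unregister releases
// the id after the Thread object has already been deleted.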
void ThreadList::ReleaseThreadId(Thread* self, uint32_t id) {
  MutexLock mu(self, *Locks::allocated_thread_ids_lock_);
  --id;  // Zero is reserved to mean "invalid".
  DCHECK(allocated_ids_[id]) << id;
  allocated_ids_.reset(id);
}

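// RAII helper: the constructor suspends all threads and the destructor resumes them, so a
// suspend-all region cannot be left open on an early return. Illustrative use (the cause string
// is arbitrary):
//
//   {
//     ScopedSuspendAll ssa(__FUNCTION__, /* long_suspend= */ false);
//     // All other threads are suspended here.
//   }  // Threads resume when |ssa| goes out of scope.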
ScopedSuspendAll::ScopedSuspendAll(const char* cause, bool long_suspend) {
  Runtime::Current()->GetThreadList()->SuspendAll(cause, long_suspend);
}

ScopedSuspendAll::~ScopedSuspendAll() {
  Runtime::Current()->GetThreadList()->ResumeAll();
}

}  // namespace art