1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_PROFILING_PERF_UNWINDING_H_
18 #define SRC_PROFILING_PERF_UNWINDING_H_
19 
20 #include <condition_variable>
21 #include <map>
22 #include <thread>
23 
24 #include <linux/perf_event.h>
25 #include <stdint.h>
26 
27 #include <unwindstack/Error.h>
28 
29 #include "perfetto/base/flat_set.h"
30 #include "perfetto/base/logging.h"
31 #include "perfetto/ext/base/optional.h"
32 #include "perfetto/ext/base/thread_checker.h"
33 #include "perfetto/ext/base/unix_task_runner.h"
34 #include "perfetto/ext/tracing/core/basic_types.h"
35 #include "src/kallsyms/kernel_symbol_map.h"
36 #include "src/kallsyms/lazy_kernel_symbolizer.h"
37 #include "src/profiling/common/unwind_support.h"
38 #include "src/profiling/perf/common_types.h"
39 #include "src/profiling/perf/unwind_queue.h"
40 
41 namespace perfetto {
42 namespace profiling {
43 
44 constexpr static uint32_t kUnwindQueueCapacity = 1024;
45 
46 // Unwinds callstacks based on the sampled stack and register state (see
47 // |ParsedSample|). Has a single unwinding ring queue, shared across
48 // all data sources.
49 //
50 // Samples cannot be unwound without having /proc/<pid>/{maps,mem} file
51 // descriptors for that process. This lookup can be asynchronous (e.g. on
52 // Android), so the unwinder might have to wait before it can process (or
53 // discard) some of the enqueued samples. To avoid blocking the entire queue,
54 // the unwinder is allowed to process the entries out of order.
55 //
56 // Besides the queue, all interactions between the unwinder and the rest of the
57 // producer logic are through posted tasks.
58 //
59 // As unwinding times are long-tailed (example measurements: median <1ms,
60 // worst-case ~1000ms), the unwinder runs on a dedicated thread to avoid
61 // starving the rest of the producer's work (including IPC and consumption of
62 // records from the kernel ring buffers).
63 //
64 // This class should not be instantiated directly, use the |UnwinderHandle|
65 // below instead.
66 //
67 // TODO(rsavitski): while the inputs to the unwinder are batched as a result of
68 // the reader posting a wakeup only after consuming a batch of kernel samples,
69 // the Unwinder might be staggering wakeups for the producer thread by posting a
70 // task every time a sample has been unwound. Evaluate how bad these wakeups are
71 // in practice, and consider also implementing a batching strategy for the
72 // unwinder->serialization handoff (which isn't very latency-sensitive).
73 class Unwinder {
74  public:
75   friend class UnwinderHandle;
76 
77   // Callbacks from the unwinder to the primary producer thread.
78   class Delegate {
79    public:
80     virtual void PostEmitSample(DataSourceInstanceID ds_id,
81                                 CompletedSample sample) = 0;
82     virtual void PostEmitUnwinderSkippedSample(DataSourceInstanceID ds_id,
83                                                ParsedSample sample) = 0;
84     virtual void PostFinishDataSourceStop(DataSourceInstanceID ds_id) = 0;
85 
86     virtual ~Delegate();
87   };
88 
~Unwinder()89   ~Unwinder() { PERFETTO_DCHECK_THREAD(thread_checker_); }
90 
91   void PostStartDataSource(DataSourceInstanceID ds_id, bool kernel_frames);
92   void PostAdoptProcDescriptors(DataSourceInstanceID ds_id,
93                                 pid_t pid,
94                                 base::ScopedFile maps_fd,
95                                 base::ScopedFile mem_fd);
96   void PostRecordTimedOutProcDescriptors(DataSourceInstanceID ds_id, pid_t pid);
97   void PostProcessQueue();
98   void PostInitiateDataSourceStop(DataSourceInstanceID ds_id);
99   void PostPurgeDataSource(DataSourceInstanceID ds_id);
100 
101   void PostClearCachedStatePeriodic(DataSourceInstanceID ds_id,
102                                     uint32_t period_ms);
103 
unwind_queue()104   UnwindQueue<UnwindEntry, kUnwindQueueCapacity>& unwind_queue() {
105     return unwind_queue_;
106   }
107 
GetEnqueuedFootprint()108   uint64_t GetEnqueuedFootprint() {
109     uint64_t freed =
110         footprint_tracker_.stack_bytes_freed.load(std::memory_order_acquire);
111     uint64_t allocated = footprint_tracker_.stack_bytes_allocated.load(
112         std::memory_order_relaxed);
113 
114     // overflow not a concern in practice
115     PERFETTO_DCHECK(allocated >= freed);
116     return allocated - freed;
117   }
118 
IncrementEnqueuedFootprint(uint64_t increment)119   void IncrementEnqueuedFootprint(uint64_t increment) {
120     footprint_tracker_.stack_bytes_allocated.fetch_add(
121         increment, std::memory_order_relaxed);
122   }
123 
124  private:
125   struct ProcessState {
126     enum class Status {
127       kResolving,  // unwinder waiting on proc-fds for the process
128       kResolved,   // proc-fds available, can unwind samples
129       kExpired     // proc-fd lookup timed out, will discard samples
130     };
131 
132     Status status = Status::kResolving;
133     // Present iff status == kResolved.
134     base::Optional<UnwindingMetadata> unwind_state;
135     // Used to distinguish first-time unwinding attempts for a process, for
136     // logging purposes.
137     bool attempted_unwinding = false;
138   };
139 
140   struct DataSourceState {
141     enum class Status { kActive, kShuttingDown };
142 
143     Status status = Status::kActive;
144     std::map<pid_t, ProcessState> process_states;
145   };
146 
147   // Accounting for how much heap memory is attached to the enqueued samples at
148   // a given time. Read by the main thread, mutated by both threads.
149   // We track just the heap allocated for the sampled stacks, as it dominates
150   // the per-sample heap use.
151   struct QueueFootprintTracker {
152     std::atomic<uint64_t> stack_bytes_allocated;
153     std::atomic<uint64_t> stack_bytes_freed;
154   };
155 
156   // Must be instantiated via the |UnwinderHandle|.
157   Unwinder(Delegate* delegate, base::UnixTaskRunner* task_runner);
158 
159   // Marks the data source as valid and active at the unwinding stage.
160   // Initializes kernel address symbolization if needed.
161   void StartDataSource(DataSourceInstanceID ds_id, bool kernel_frames);
162 
163   void AdoptProcDescriptors(DataSourceInstanceID ds_id,
164                             pid_t pid,
165                             base::ScopedFile maps_fd,
166                             base::ScopedFile mem_fd);
167   void RecordTimedOutProcDescriptors(DataSourceInstanceID ds_id, pid_t pid);
168 
169   // Primary task. Processes the enqueued samples using
170   // |ConsumeAndUnwindReadySamples|, and re-evaluates data source state.
171   void ProcessQueue();
172 
173   // Processes the enqueued samples for which all unwinding inputs are ready.
174   // Returns the set of data source instances which still have samples pending
175   // (i.e. waiting on the proc-fds).
176   base::FlatSet<DataSourceInstanceID> ConsumeAndUnwindReadySamples();
177 
178   CompletedSample UnwindSample(const ParsedSample& sample,
179                                UnwindingMetadata* unwind_state,
180                                bool pid_unwound_before);
181 
182   // Returns a list of symbolized kernel frames in the sample (if any).
183   std::vector<unwindstack::FrameData> SymbolizeKernelCallchain(
184       const ParsedSample& sample);
185 
186   // Marks the data source as shutting down at the unwinding stage. It is known
187   // that no new samples for this source will be pushed into the queue, but we
188   // need to delay the unwinder state teardown until all previously-enqueued
189   // samples for this source are processed.
190   void InitiateDataSourceStop(DataSourceInstanceID ds_id);
191 
192   // Tears down unwinding state for the data source without any outstanding
193   // samples, and informs the service that it can continue the shutdown
194   // sequence.
195   void FinishDataSourceStop(DataSourceInstanceID ds_id);
196 
197   // Immediately destroys the data source state, used for abrupt stops.
198   void PurgeDataSource(DataSourceInstanceID ds_id);
199 
DecrementEnqueuedFootprint(uint64_t decrement)200   void DecrementEnqueuedFootprint(uint64_t decrement) {
201     footprint_tracker_.stack_bytes_freed.fetch_add(decrement,
202                                                    std::memory_order_relaxed);
203   }
204 
205   // Clears the parsed maps for all previously-sampled processes, and resets the
206   // libunwindstack cache. This has the effect of deallocating the cached Elf
207   // objects within libunwindstack, which take up non-trivial amounts of memory.
208   //
209   // There are two reasons for having this operation:
210   // * over a longer trace, it's desireable to drop heavy state for processes
211   //   that haven't been sampled recently.
212   // * since libunwindstack's cache is not bounded, it'll tend towards having
213   //   state for all processes that are targeted by the profiling config.
214   //   Clearing the cache periodically helps keep its footprint closer to the
215   //   actual working set (NB: which might still be arbitrarily big, depending
216   //   on the profiling config).
217   //
218   // After this function completes, the next unwind for each process will
219   // therefore incur a guaranteed maps reparse.
220   //
221   // Unwinding for concurrent data sources will *not* be directly affected at
222   // the time of writing, as the non-cleared parsed maps will keep the cached
223   // Elf objects alive through shared_ptrs.
224   //
225   // Note that this operation is heavy in terms of cpu%, and should therefore
226   // be called only for profiling configs that require it.
227   //
228   // TODO(rsavitski): dropping the full parsed maps is somewhat excessive, could
229   // instead clear just the |MapInfo.elf| shared_ptr, but that's considered too
230   // brittle as it's an implementation detail of libunwindstack.
231   // TODO(rsavitski): improve libunwindstack cache's architecture (it is still
232   // worth having at the moment to speed up unwinds across map reparses).
233   void ClearCachedStatePeriodic(DataSourceInstanceID ds_id, uint32_t period_ms);
234 
235   void ResetAndEnableUnwindstackCache();
236 
237   base::UnixTaskRunner* const task_runner_;
238   Delegate* const delegate_;
239   UnwindQueue<UnwindEntry, kUnwindQueueCapacity> unwind_queue_;
240   QueueFootprintTracker footprint_tracker_;
241   std::map<DataSourceInstanceID, DataSourceState> data_sources_;
242   LazyKernelSymbolizer kernel_symbolizer_;
243 
244   PERFETTO_THREAD_CHECKER(thread_checker_)
245 };
246 
247 // Owning resource handle for an |Unwinder| with a dedicated task thread.
248 // Ensures that the |Unwinder| is constructed and destructed on the task thread.
249 // TODO(rsavitski): update base::ThreadTaskRunner to allow for this pattern of
250 // owned state, and consolidate.
251 class UnwinderHandle {
252  public:
UnwinderHandle(Unwinder::Delegate * delegate)253   explicit UnwinderHandle(Unwinder::Delegate* delegate) {
254     std::mutex init_lock;
255     std::condition_variable init_cv;
256 
257     std::function<void(base::UnixTaskRunner*, Unwinder*)> initializer =
258         [this, &init_lock, &init_cv](base::UnixTaskRunner* task_runner,
259                                      Unwinder* unwinder) {
260           std::lock_guard<std::mutex> lock(init_lock);
261           task_runner_ = task_runner;
262           unwinder_ = unwinder;
263           // Notify while still holding the lock, as init_cv ceases to exist as
264           // soon as the main thread observes a non-null task_runner_, and it
265           // can wake up spuriously (i.e. before the notify if we had unlocked
266           // before notifying).
267           init_cv.notify_one();
268         };
269 
270     thread_ = std::thread(&UnwinderHandle::RunTaskThread, this,
271                           std::move(initializer), delegate);
272 
273     std::unique_lock<std::mutex> lock(init_lock);
274     init_cv.wait(lock, [this] { return !!task_runner_ && !!unwinder_; });
275   }
276 
~UnwinderHandle()277   ~UnwinderHandle() {
278     if (task_runner_) {
279       PERFETTO_CHECK(!task_runner_->QuitCalled());
280       task_runner_->Quit();
281 
282       PERFETTO_DCHECK(thread_.joinable());
283     }
284     if (thread_.joinable())
285       thread_.join();
286   }
287 
288   Unwinder* operator->() { return unwinder_; }
289 
290  private:
RunTaskThread(std::function<void (base::UnixTaskRunner *,Unwinder *)> initializer,Unwinder::Delegate * delegate)291   void RunTaskThread(
292       std::function<void(base::UnixTaskRunner*, Unwinder*)> initializer,
293       Unwinder::Delegate* delegate) {
294     base::UnixTaskRunner task_runner;
295     Unwinder unwinder(delegate, &task_runner);
296     task_runner.PostTask(
297         std::bind(std::move(initializer), &task_runner, &unwinder));
298     task_runner.Run();
299   }
300 
301   std::thread thread_;
302   base::UnixTaskRunner* task_runner_ = nullptr;
303   Unwinder* unwinder_ = nullptr;
304 };
305 
306 }  // namespace profiling
307 }  // namespace perfetto
308 
309 #endif  // SRC_PROFILING_PERF_UNWINDING_H_
310