/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/profiling/perf/event_reader.h"

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include "perfetto/ext/base/utils.h"
#include "src/profiling/perf/regs_parsing.h"

namespace perfetto {
namespace profiling {

namespace {

template <typename T>
const char* ReadValue(T* value_out, const char* ptr) {
  memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
  return ptr + sizeof(T);
}

template <typename T>
const char* ReadValues(T* out, const char* ptr, size_t num_values) {
  size_t sz = sizeof(T) * num_values;
  memcpy(out, reinterpret_cast<const void*>(ptr), sz);
  return ptr + sz;
}

bool IsPowerOfTwo(size_t v) {
  return (v != 0 && ((v & (v - 1)) == 0));
}

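// There is no libc wrapper for perf_event_open(2), so the raw syscall is
// invoked directly.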
static int perf_event_open(perf_event_attr* attr,
                           pid_t pid,
                           int cpu,
                           int group_fd,
                           unsigned long flags) {
  return static_cast<int>(
      syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
}

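// Opens the event for all processes on a single cpu (pid = -1 with an
// explicit cpu, as described in perf_event_open(2)).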
base::ScopedFile PerfEventOpen(uint32_t cpu,
                               perf_event_attr* perf_attr,
                               int group_fd = -1) {
  base::ScopedFile perf_fd{perf_event_open(perf_attr, /*pid=*/-1,
                                           static_cast<int>(cpu), group_fd,
                                           PERF_FLAG_FD_CLOEXEC)};
  return perf_fd;
}

// If counting tracepoints, set an event filter if requested.
bool MaybeApplyTracepointFilter(int fd, const PerfCounter& event) {
  if (event.type == PERF_TYPE_TRACEPOINT &&
      !event.tracepoint.filter().empty()) {
    if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
              event.tracepoint.filter().c_str()) != 0) {
      PERFETTO_PLOG("Failed ioctl to set event filter");
      return false;
    }
  }
  return true;
}

}  // namespace

PerfRingBuffer::PerfRingBuffer(PerfRingBuffer&& other) noexcept
    : metadata_page_(other.metadata_page_),
      mmap_sz_(other.mmap_sz_),
      data_buf_(other.data_buf_),
      data_buf_sz_(other.data_buf_sz_) {
  other.metadata_page_ = nullptr;
  other.mmap_sz_ = 0;
  other.data_buf_ = nullptr;
  other.data_buf_sz_ = 0;
}

PerfRingBuffer& PerfRingBuffer::operator=(PerfRingBuffer&& other) noexcept {
  if (this == &other)
    return *this;

  this->~PerfRingBuffer();
  new (this) PerfRingBuffer(std::move(other));
  return *this;
}

PerfRingBuffer::~PerfRingBuffer() {
  if (!valid())
    return;

  if (munmap(reinterpret_cast<void*>(metadata_page_), mmap_sz_) != 0)
    PERFETTO_PLOG("failed munmap");
}

base::Optional<PerfRingBuffer> PerfRingBuffer::Allocate(
    int perf_fd,
    size_t data_page_count) {
  // perf_event_open requires the ring buffer to be a power of two in size.
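  // A power-of-two data area also lets read/write offsets be wrapped with a
  // simple bitmask (see ReadRecordNonconsuming).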
  PERFETTO_DCHECK(IsPowerOfTwo(data_page_count));

  PerfRingBuffer ret;

  // mmap request is one page larger than the buffer size (for the metadata).
  ret.data_buf_sz_ = data_page_count * base::kPageSize;
  ret.mmap_sz_ = ret.data_buf_sz_ + base::kPageSize;

  // If PROT_WRITE, kernel won't overwrite unread samples.
  void* mmap_addr = mmap(nullptr, ret.mmap_sz_, PROT_READ | PROT_WRITE,
                         MAP_SHARED, perf_fd, 0);
  if (mmap_addr == MAP_FAILED) {
    PERFETTO_PLOG("failed mmap");
    return base::nullopt;
  }

  // Expected layout is [ metadata page ] [ data pages ... ]
  ret.metadata_page_ = reinterpret_cast<perf_event_mmap_page*>(mmap_addr);
  ret.data_buf_ = reinterpret_cast<char*>(mmap_addr) + base::kPageSize;
  PERFETTO_CHECK(ret.metadata_page_->data_offset == base::kPageSize);
  PERFETTO_CHECK(ret.metadata_page_->data_size == ret.data_buf_sz_);

  return base::make_optional(std::move(ret));
}

// See |perf_output_put_handle| for the necessary synchronization between the
// kernel and this userspace thread (which are using the same shared memory, but
// might be on different cores).
// TODO(rsavitski): is there false sharing between |data_tail| and |data_head|?
// Is there an argument for maintaining our own copy of |data_tail| instead of
// reloading it?
char* PerfRingBuffer::ReadRecordNonconsuming() {
  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "");

  PERFETTO_DCHECK(valid());

  // |data_tail| is written only by this userspace thread, so we can safely read
  // it without any synchronization.
  uint64_t read_offset = metadata_page_->data_tail;

  // |data_head| is written by the kernel, perform an acquiring load such that
  // the payload reads below are ordered after this load.
  uint64_t write_offset =
      reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_head)
          ->load(std::memory_order_acquire);

  PERFETTO_DCHECK(read_offset <= write_offset);
  if (write_offset == read_offset)
    return nullptr;  // no new data

  size_t read_pos = static_cast<size_t>(read_offset & (data_buf_sz_ - 1));

  // event header (64 bits) guaranteed to be contiguous
  PERFETTO_DCHECK(read_pos <= data_buf_sz_ - sizeof(perf_event_header));
  PERFETTO_DCHECK(0 == reinterpret_cast<size_t>(data_buf_ + read_pos) %
                           alignof(perf_event_header));

  perf_event_header* evt_header =
      reinterpret_cast<perf_event_header*>(data_buf_ + read_pos);
  uint16_t evt_size = evt_header->size;

  // event wrapped - reconstruct it, and return a pointer to the buffer
  if (read_pos + evt_size > data_buf_sz_) {
    PERFETTO_DCHECK(read_pos + evt_size !=
                    ((read_pos + evt_size) & (data_buf_sz_ - 1)));
    PERFETTO_DLOG("PerfRingBuffer: returning reconstructed event");

    size_t prefix_sz = data_buf_sz_ - read_pos;
    memcpy(&reconstructed_record_[0], data_buf_ + read_pos, prefix_sz);
    memcpy(&reconstructed_record_[0] + prefix_sz, data_buf_,
           evt_size - prefix_sz);
    return &reconstructed_record_[0];
  } else {
    // usual case - contiguous sample
    PERFETTO_DCHECK(read_pos + evt_size ==
                    ((read_pos + evt_size) & (data_buf_sz_ - 1)));

    return data_buf_ + read_pos;
  }
}

void PerfRingBuffer::Consume(size_t bytes) {
  PERFETTO_DCHECK(valid());

  // Advance |data_tail|, which is written only by this thread. The store of the
  // updated value needs to have release semantics such that the preceding
  // payload reads are ordered before it. The reader in this case is the kernel,
  // which reads |data_tail| to calculate the available ring buffer capacity
  // before trying to store a new record.
  uint64_t updated_tail = metadata_page_->data_tail + bytes;
  reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_tail)
      ->store(updated_tail, std::memory_order_release);
}

EventReader::EventReader(uint32_t cpu,
                         perf_event_attr event_attr,
                         base::ScopedFile perf_fd,
                         PerfRingBuffer ring_buffer)
    : cpu_(cpu),
      event_attr_(event_attr),
      perf_fd_(std::move(perf_fd)),
      ring_buffer_(std::move(ring_buffer)) {}

EventReader& EventReader::operator=(EventReader&& other) noexcept {
  if (this == &other)
    return *this;

  this->~EventReader();
  new (this) EventReader(std::move(other));
  return *this;
}

base::Optional<EventReader> EventReader::ConfigureEvents(
    uint32_t cpu,
    const EventConfig& event_cfg) {
  auto leader_fd = PerfEventOpen(cpu, event_cfg.perf_attr());
  if (!leader_fd) {
    PERFETTO_PLOG("Failed perf_event_open");
    return base::nullopt;
  }
  if (!MaybeApplyTracepointFilter(leader_fd.get(), event_cfg.timebase_event()))
    return base::nullopt;

  auto ring_buffer =
      PerfRingBuffer::Allocate(leader_fd.get(), event_cfg.ring_buffer_pages());
  if (!ring_buffer.has_value()) {
    return base::nullopt;
  }

  return base::make_optional<EventReader>(cpu, *event_cfg.perf_attr(),
                                          std::move(leader_fd),
                                          std::move(ring_buffer.value()));
}

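// Reads from the ring buffer until a PERF_RECORD_SAMPLE is found (parsed and
// returned), consuming and skipping over any other record types encountered.
// Returns nullopt once the reader has caught up with the kernel writer.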
base::Optional<ParsedSample> EventReader::ReadUntilSample(
    std::function<void(uint64_t)> records_lost_callback) {
  for (;;) {
    char* event = ring_buffer_.ReadRecordNonconsuming();
    if (!event)
      return base::nullopt;  // caught up with the writer

    auto* event_hdr = reinterpret_cast<const perf_event_header*>(event);

    if (event_hdr->type == PERF_RECORD_SAMPLE) {
      ParsedSample sample = ParseSampleRecord(cpu_, event);
      ring_buffer_.Consume(event_hdr->size);
      return base::make_optional(std::move(sample));
    }

    if (event_hdr->type == PERF_RECORD_LOST) {
      /*
       * struct {
       *   struct perf_event_header header;
       *   u64 id;
       *   u64 lost;
       *   struct sample_id sample_id;
       * };
       */
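      // Skip over the |id| field to read the |lost| count.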
      uint64_t records_lost = *reinterpret_cast<const uint64_t*>(
          event + sizeof(perf_event_header) + sizeof(uint64_t));

      records_lost_callback(records_lost);
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    // Kernel had to throttle irqs.
    if (event_hdr->type == PERF_RECORD_THROTTLE ||
        event_hdr->type == PERF_RECORD_UNTHROTTLE) {
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    PERFETTO_DFATAL_OR_ELOG("Unsupported event type [%zu]",
                            static_cast<size_t>(event_hdr->type));
    ring_buffer_.Consume(event_hdr->size);
  }
}

// Generally, samples can belong to any cpu (which can be recorded with
// PERF_SAMPLE_CPU). However, this producer uses only cpu-scoped events,
// therefore it is already known.
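// For the sample_type bits handled below, the record payload is laid out as
// follows (each field present only if the corresponding bit is set; the READ
// field assumes a single counter with the default read_format):
//   struct {
//     struct perf_event_header header;
//     u32 pid, tid;                               // PERF_SAMPLE_TID
//     u64 time;                                   // PERF_SAMPLE_TIME
//     u64 value;                                  // PERF_SAMPLE_READ
//     u64 nr; u64 ips[nr];                        // PERF_SAMPLE_CALLCHAIN
//     u64 abi; u64 regs[weight(mask)];            // PERF_SAMPLE_REGS_USER
//     u64 size; char data[size]; u64 dyn_size;    // PERF_SAMPLE_STACK_USER
//   };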
ParsedSample EventReader::ParseSampleRecord(uint32_t cpu,
                                            const char* record_start) {
  if (event_attr_.sample_type &
      (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_STACK_USER |
                 PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN |
                 PERF_SAMPLE_READ))) {
    PERFETTO_FATAL("Unsupported sampling option");
  }

  auto* event_hdr = reinterpret_cast<const perf_event_header*>(record_start);
  size_t sample_size = event_hdr->size;

  ParsedSample sample = {};
  sample.common.cpu = cpu;
  sample.common.cpu_mode = event_hdr->misc & PERF_RECORD_MISC_CPUMODE_MASK;

  // Parse the payload, which consists of concatenated data for each
  // |attr.sample_type| flag.
  const char* parse_pos = record_start + sizeof(perf_event_header);

  if (event_attr_.sample_type & PERF_SAMPLE_TID) {
    uint32_t pid = 0;
    uint32_t tid = 0;
    parse_pos = ReadValue(&pid, parse_pos);
    parse_pos = ReadValue(&tid, parse_pos);
    sample.common.pid = static_cast<pid_t>(pid);
    sample.common.tid = static_cast<pid_t>(tid);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_TIME) {
    parse_pos = ReadValue(&sample.common.timestamp, parse_pos);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_READ) {
    parse_pos = ReadValue(&sample.common.timebase_count, parse_pos);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_CALLCHAIN) {
    uint64_t chain_len = 0;
    parse_pos = ReadValue(&chain_len, parse_pos);
    sample.kernel_ips.resize(static_cast<size_t>(chain_len));
    parse_pos = ReadValues<uint64_t>(sample.kernel_ips.data(), parse_pos,
                                     static_cast<size_t>(chain_len));
  }

  if (event_attr_.sample_type & PERF_SAMPLE_REGS_USER) {
    // Can be empty, e.g. if we sampled a kernel thread.
    sample.regs = ReadPerfUserRegsData(&parse_pos);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_STACK_USER) {
    // Maximum possible sampled stack size for this sample. Can be lower than
    // the requested size if there wasn't enough room in the sample (which is
    // limited to 64k).
    uint64_t max_stack_size;
    parse_pos = ReadValue(&max_stack_size, parse_pos);

    const char* stack_start = parse_pos;
    parse_pos += max_stack_size;  // skip to dyn_size

    // Payload written conditionally, e.g. kernel threads don't have a
    // user stack.
    if (max_stack_size > 0) {
      uint64_t filled_stack_size;
      parse_pos = ReadValue(&filled_stack_size, parse_pos);
      PERFETTO_DLOG("sampled stack size: %" PRIu64 " / %" PRIu64 "",
                    filled_stack_size, max_stack_size);

      // copy stack bytes into a vector
      size_t payload_sz = static_cast<size_t>(filled_stack_size);
      sample.stack.resize(payload_sz);
      memcpy(sample.stack.data(), stack_start, payload_sz);

      // remember whether the stack sample is (most likely) truncated
      sample.stack_maxed = (filled_stack_size == max_stack_size);
    }
  }

  PERFETTO_CHECK(parse_pos == record_start + sample_size);
  return sample;
}

void EventReader::EnableEvents() {
  int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_ENABLE);
  PERFETTO_CHECK(ret == 0);
}

void EventReader::DisableEvents() {
  int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_DISABLE);
  PERFETTO_CHECK(ret == 0);
}

}  // namespace profiling
}  // namespace perfetto