1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/ftrace_reader/cpu_reader.h"
18 
19 #include <signal.h>
20 
21 #include <dirent.h>
22 #include <map>
23 #include <queue>
24 #include <string>
25 #include <utility>
26 
27 #include "perfetto/base/build_config.h"
28 #include "perfetto/base/logging.h"
29 #include "perfetto/base/utils.h"
30 #include "src/ftrace_reader/proto_translation_table.h"
31 
32 #include "perfetto/trace/ftrace/ftrace_event.pbzero.h"
33 #include "perfetto/trace/ftrace/ftrace_event_bundle.pbzero.h"
34 
35 namespace perfetto {
36 
37 namespace {
38 
ReadIntoString(const uint8_t * start,const uint8_t * end,uint32_t field_id,protozero::Message * out)39 bool ReadIntoString(const uint8_t* start,
40                     const uint8_t* end,
41                     uint32_t field_id,
42                     protozero::Message* out) {
43   for (const uint8_t* c = start; c < end; c++) {
44     if (*c != '\0')
45       continue;
46     out->AppendBytes(field_id, reinterpret_cast<const char*>(start),
47                      static_cast<uintptr_t>(c - start));
48     return true;
49   }
50   return false;
51 }
52 
53 using BundleHandle =
54     protozero::MessageHandle<protos::pbzero::FtraceEventBundle>;
55 
BuildEnabledVector(const ProtoTranslationTable & table,const std::set<std::string> & names)56 const std::vector<bool> BuildEnabledVector(const ProtoTranslationTable& table,
57                                            const std::set<std::string>& names) {
58   std::vector<bool> enabled(table.largest_id() + 1);
59   for (const std::string& name : names) {
60     const Event* event = table.GetEventByName(name);
61     if (!event)
62       continue;
63     enabled[event->ftrace_event_id] = true;
64   }
65   return enabled;
66 }
67 
SetBlocking(int fd,bool is_blocking)68 void SetBlocking(int fd, bool is_blocking) {
69   int flags = fcntl(fd, F_GETFL, 0);
70   flags = (is_blocking) ? (flags & ~O_NONBLOCK) : (flags | O_NONBLOCK);
71   PERFETTO_CHECK(fcntl(fd, F_SETFL, flags) == 0);
72 }
73 
74 // For further documentation of these constants see the kernel source:
75 // linux/include/linux/ring_buffer.h
76 // Some information about the values of these constants are exposed to user
77 // space at: /sys/kernel/debug/tracing/events/header_event
78 constexpr uint32_t kTypeDataTypeLengthMax = 28;
79 constexpr uint32_t kTypePadding = 29;
80 constexpr uint32_t kTypeTimeExtend = 30;
81 constexpr uint32_t kTypeTimeStamp = 31;
82 
83 struct PageHeader {
84   uint64_t timestamp;
85   uint64_t size;
86   uint64_t overwrite;
87 };
88 
89 struct EventHeader {
90   uint32_t type_or_length : 5;
91   uint32_t time_delta : 27;
92 };
93 
94 struct TimeStamp {
95   uint64_t tv_nsec;
96   uint64_t tv_sec;
97 };
98 
99 }  // namespace
100 
EventFilter(const ProtoTranslationTable & table,std::set<std::string> names)101 EventFilter::EventFilter(const ProtoTranslationTable& table,
102                          std::set<std::string> names)
103     : enabled_ids_(BuildEnabledVector(table, names)),
104       enabled_names_(std::move(names)) {}
105 EventFilter::~EventFilter() = default;
106 
CpuReader(const ProtoTranslationTable * table,size_t cpu,base::ScopedFile fd,std::function<void ()> on_data_available)107 CpuReader::CpuReader(const ProtoTranslationTable* table,
108                      size_t cpu,
109                      base::ScopedFile fd,
110                      std::function<void()> on_data_available)
111     : table_(table), cpu_(cpu), trace_fd_(std::move(fd)) {
112   int pipe_fds[2];
113   PERFETTO_CHECK(pipe(&pipe_fds[0]) == 0);
114   staging_read_fd_.reset(pipe_fds[0]);
115   staging_write_fd_.reset(pipe_fds[1]);
116 
117   // Make reads from the raw pipe blocking so that splice() can sleep.
118   PERFETTO_CHECK(trace_fd_);
119   SetBlocking(*trace_fd_, true);
120 
121   // Reads from the staging pipe are always non-blocking.
122   SetBlocking(*staging_read_fd_, false);
123 
124   // Note: O_NONBLOCK seems to be ignored by splice() on the target pipe. The
125   // blocking vs non-blocking behavior is controlled solely by the
126   // SPLICE_F_NONBLOCK flag passed to splice().
127   SetBlocking(*staging_write_fd_, false);
128 
129   // We need a non-default SIGPIPE handler to make it so that the blocking
130   // splice() is woken up when the ~CpuReader() dtor destroys the pipes.
131   // Just masking out the signal would cause an implicit syscall restart and
132   // hence make the join() in the dtor unreliable.
133   struct sigaction current_act = {};
134   PERFETTO_CHECK(sigaction(SIGPIPE, nullptr, &current_act) == 0);
135 #pragma GCC diagnostic push
136 #if defined(__clang__)
137 #pragma GCC diagnostic ignored "-Wdisabled-macro-expansion"
138 #endif
139   if (current_act.sa_handler == SIG_DFL || current_act.sa_handler == SIG_IGN) {
140     struct sigaction act = {};
141     act.sa_sigaction = [](int, siginfo_t*, void*) {};
142     PERFETTO_CHECK(sigaction(SIGPIPE, &act, nullptr) == 0);
143   }
144 #pragma GCC diagnostic pop
145 
146   worker_thread_ =
147       std::thread(std::bind(&RunWorkerThread, cpu_, *trace_fd_,
148                             *staging_write_fd_, on_data_available, &exiting_));
149 }
150 
~CpuReader()151 CpuReader::~CpuReader() {
152   // The kernel's splice implementation for the trace pipe doesn't generate a
153   // SIGPIPE if the output pipe is closed (b/73807072). Instead, the call to
154   // close() on the pipe hangs forever. To work around this, we first close the
155   // trace fd (which prevents another splice from starting), raise SIGPIPE and
156   // wait for the worker to exit (i.e., to guarantee no splice is in progress)
157   // and only then close the staging pipe.
158   exiting_ = true;
159   trace_fd_.reset();
160   pthread_kill(worker_thread_.native_handle(), SIGPIPE);
161   worker_thread_.join();
162 }
163 
164 // static
RunWorkerThread(size_t cpu,int trace_fd,int staging_write_fd,const std::function<void ()> & on_data_available,std::atomic<bool> * exiting)165 void CpuReader::RunWorkerThread(size_t cpu,
166                                 int trace_fd,
167                                 int staging_write_fd,
168                                 const std::function<void()>& on_data_available,
169                                 std::atomic<bool>* exiting) {
170 #if PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \
171     PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
172   // This thread is responsible for moving data from the trace pipe into the
173   // staging pipe at least one page at a time. This is done using the splice(2)
174   // system call, which unlike poll/select makes it possible to block until at
175   // least a full page of data is ready to be read. The downside is that as the
176   // call is blocking we need a dedicated thread for each trace pipe (i.e.,
177   // CPU).
178   char thread_name[16];
179   snprintf(thread_name, sizeof(thread_name), "traced_probes%zu", cpu);
180   pthread_setname_np(pthread_self(), thread_name);
181 
182   while (true) {
183     // First do a blocking splice which sleeps until there is at least one
184     // page of data available and enough space to write it into the staging
185     // pipe.
186     ssize_t splice_res = splice(trace_fd, nullptr, staging_write_fd, nullptr,
187                                 base::kPageSize, SPLICE_F_MOVE);
188     if (splice_res < 0) {
189       // The kernel ftrace code has its own splice() implementation that can
190       // occasionally fail with transient errors not reported in man 2 splice.
191       // Just try again if we see these.
192       if (errno == ENOMEM || errno == EBUSY || (errno == EINTR && !*exiting)) {
193         PERFETTO_DPLOG("Transient splice failure -- retrying");
194         usleep(100 * 1000);
195         continue;
196       }
197       PERFETTO_DPLOG("Stopping CPUReader loop for CPU %zd.", cpu);
198       PERFETTO_DCHECK(errno == EPIPE || errno == EINTR || errno == EBADF);
199       break;  // ~CpuReader is waiting to join this thread.
200     }
201 
202     // Then do as many non-blocking splices as we can. This moves any full
203     // pages from the trace pipe into the staging pipe as long as there is
204     // data in the former and space in the latter.
205     while (true) {
206       splice_res = splice(trace_fd, nullptr, staging_write_fd, nullptr,
207                           base::kPageSize, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
208       if (splice_res < 0) {
209         if (errno != EAGAIN && errno != ENOMEM && errno != EBUSY)
210           PERFETTO_PLOG("splice");
211         break;
212       }
213     }
214 
215     // This callback will block until we are allowed to read more data.
216     on_data_available();
217   }
218 #else
219   base::ignore_result(cpu);
220   base::ignore_result(trace_fd);
221   base::ignore_result(staging_write_fd);
222   base::ignore_result(on_data_available);
223   PERFETTO_ELOG("Supported only on Linux/Android");
224 #endif
225 }
226 
Drain(const std::array<const EventFilter *,kMaxSinks> & filters,const std::array<BundleHandle,kMaxSinks> & bundles,const std::array<FtraceMetadata *,kMaxSinks> & metadatas)227 bool CpuReader::Drain(const std::array<const EventFilter*, kMaxSinks>& filters,
228                       const std::array<BundleHandle, kMaxSinks>& bundles,
229                       const std::array<FtraceMetadata*, kMaxSinks>& metadatas) {
230   PERFETTO_DCHECK_THREAD(thread_checker_);
231   while (true) {
232     uint8_t* buffer = GetBuffer();
233     long bytes =
234         PERFETTO_EINTR(read(*staging_read_fd_, buffer, base::kPageSize));
235     if (bytes == -1 && errno == EAGAIN)
236       break;
237     PERFETTO_CHECK(static_cast<size_t>(bytes) == base::kPageSize);
238 
239     size_t evt_size = 0;
240     for (size_t i = 0; i < kMaxSinks; i++) {
241       if (!filters[i])
242         break;
243       evt_size =
244           ParsePage(buffer, filters[i], &*bundles[i], table_, metadatas[i]);
245       PERFETTO_DCHECK(evt_size);
246     }
247   }
248 
249   for (size_t i = 0; i < kMaxSinks; i++) {
250     if (!filters[i])
251       break;
252     bundles[i]->set_cpu(static_cast<uint32_t>(cpu_));
253     bundles[i]->set_overwrite_count(metadatas[i]->overwrite_count);
254   }
255 
256   return true;
257 }
258 
GetBuffer()259 uint8_t* CpuReader::GetBuffer() {
260   PERFETTO_DCHECK_THREAD(thread_checker_);
261   // TODO(primiano): Guard against overflows, like BufferedFrameDeserializer.
262   if (!buffer_)
263     buffer_ = std::unique_ptr<uint8_t[]>(new uint8_t[base::kPageSize]);
264   return buffer_.get();
265 }
266 
267 // The structure of a raw trace buffer page is as follows:
268 // First a page header:
269 //   8 bytes of timestamp
270 //   8 bytes of page length TODO(hjd): other fields also defined here?
271 // // TODO(hjd): Document rest of format.
272 // Some information about the layout of the page header is available in user
273 // space at: /sys/kernel/debug/tracing/events/header_event
274 // This method is deliberately static so it can be tested independently.
ParsePage(const uint8_t * ptr,const EventFilter * filter,protos::pbzero::FtraceEventBundle * bundle,const ProtoTranslationTable * table,FtraceMetadata * metadata)275 size_t CpuReader::ParsePage(const uint8_t* ptr,
276                             const EventFilter* filter,
277                             protos::pbzero::FtraceEventBundle* bundle,
278                             const ProtoTranslationTable* table,
279                             FtraceMetadata* metadata) {
280   const uint8_t* const start_of_page = ptr;
281   const uint8_t* const end_of_page = ptr + base::kPageSize;
282 
283   // TODO(hjd): Read this format dynamically?
284   PageHeader page_header;
285   if (!ReadAndAdvance<uint64_t>(&ptr, end_of_page, &page_header.timestamp))
286     return 0;
287 
288   // Temporary workaroud to make this work on ARM32 and ARM64 devices.
289   if (sizeof(void*) == 8) {
290     uint64_t overwrite_and_size;
291     if (!ReadAndAdvance<uint64_t>(&ptr, end_of_page, &overwrite_and_size))
292       return 0;
293 
294     page_header.size = (overwrite_and_size & 0x000000000000ffffull) >> 0;
295     page_header.overwrite = (overwrite_and_size & 0x00000000ff000000ull) >> 24;
296   } else if (sizeof(void*) == 4) {
297     uint32_t overwrite_and_size;
298     if (!ReadAndAdvance<uint32_t>(&ptr, end_of_page, &overwrite_and_size))
299       return 0;
300 
301     page_header.size = (overwrite_and_size & 0x000000000000ffffull) >> 0;
302     page_header.overwrite = (overwrite_and_size & 0x00000000ff000000ull) >> 24;
303   } else {
304     PERFETTO_CHECK(false);
305   }
306 
307   metadata->overwrite_count = static_cast<uint32_t>(page_header.overwrite);
308 
309   const uint8_t* const end = ptr + page_header.size;
310   if (end > end_of_page)
311     return 0;
312 
313   uint64_t timestamp = page_header.timestamp;
314 
315   while (ptr < end) {
316     EventHeader event_header;
317     if (!ReadAndAdvance(&ptr, end, &event_header))
318       return 0;
319 
320     timestamp += event_header.time_delta;
321 
322     switch (event_header.type_or_length) {
323       case kTypePadding: {
324         // Left over page padding or discarded event.
325         if (event_header.time_delta == 0) {
326           // Not clear what the correct behaviour is in this case.
327           PERFETTO_DCHECK(false);
328           return 0;
329         }
330         uint32_t length;
331         if (!ReadAndAdvance<uint32_t>(&ptr, end, &length))
332           return 0;
333         ptr += length;
334         break;
335       }
336       case kTypeTimeExtend: {
337         // Extend the time delta.
338         uint32_t time_delta_ext;
339         if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
340           return 0;
341         // See https://goo.gl/CFBu5x
342         timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
343         break;
344       }
345       case kTypeTimeStamp: {
346         // Sync time stamp with external clock.
347         TimeStamp time_stamp;
348         if (!ReadAndAdvance<TimeStamp>(&ptr, end, &time_stamp))
349           return 0;
350         // Not implemented in the kernel, nothing should generate this.
351         PERFETTO_DCHECK(false);
352         break;
353       }
354       // Data record:
355       default: {
356         PERFETTO_CHECK(event_header.type_or_length <= kTypeDataTypeLengthMax);
357         // type_or_length is <=28 so it represents the length of a data record.
358         // if == 0, this is an extended record and the size of the record is
359         // stored in the first uint32_t word in the payload.
360         // See Kernel's include/linux/ring_buffer.h
361         uint32_t event_size;
362         if (event_header.type_or_length == 0) {
363           if (!ReadAndAdvance<uint32_t>(&ptr, end, &event_size))
364             return 0;
365           // Size includes the size field itself.
366           if (event_size < 4)
367             return 0;
368           event_size -= 4;
369         } else {
370           event_size = 4 * event_header.type_or_length;
371         }
372         const uint8_t* start = ptr;
373         const uint8_t* next = ptr + event_size;
374 
375         if (next > end)
376           return 0;
377 
378         uint16_t ftrace_event_id;
379         if (!ReadAndAdvance<uint16_t>(&ptr, end, &ftrace_event_id))
380           return 0;
381         if (filter->IsEventEnabled(ftrace_event_id)) {
382           protos::pbzero::FtraceEvent* event = bundle->add_event();
383           event->set_timestamp(timestamp);
384           if (!ParseEvent(ftrace_event_id, start, next, table, event, metadata))
385             return 0;
386         }
387 
388         // Jump to next event.
389         ptr = next;
390       }
391     }
392   }
393   return static_cast<size_t>(ptr - start_of_page);
394 }
395 
396 // |start| is the start of the current event.
397 // |end| is the end of the buffer.
ParseEvent(uint16_t ftrace_event_id,const uint8_t * start,const uint8_t * end,const ProtoTranslationTable * table,protozero::Message * message,FtraceMetadata * metadata)398 bool CpuReader::ParseEvent(uint16_t ftrace_event_id,
399                            const uint8_t* start,
400                            const uint8_t* end,
401                            const ProtoTranslationTable* table,
402                            protozero::Message* message,
403                            FtraceMetadata* metadata) {
404   PERFETTO_DCHECK(start < end);
405   const size_t length = static_cast<size_t>(end - start);
406 
407   // TODO(hjd): Rework to work even if the event is unknown.
408   const Event& info = *table->GetEventById(ftrace_event_id);
409 
410   // TODO(hjd): Test truncated events.
411   // If the end of the buffer is before the end of the event give up.
412   if (info.size > length) {
413     PERFETTO_DCHECK(false);
414     return false;
415   }
416 
417   bool success = true;
418   for (const Field& field : table->common_fields())
419     success &= ParseField(field, start, end, message, metadata);
420 
421   protozero::Message* nested =
422       message->BeginNestedMessage<protozero::Message>(info.proto_field_id);
423 
424   for (const Field& field : info.fields)
425     success &= ParseField(field, start, end, nested, metadata);
426 
427   // This finalizes |nested| automatically.
428   message->Finalize();
429   metadata->FinishEvent();
430   return success;
431 }
432 
433 // Caller must guarantee that the field fits in the range,
434 // explicitly: start + field.ftrace_offset + field.ftrace_size <= end
435 // The only exception is fields with strategy = kCStringToString
436 // where the total size isn't known up front. In this case ParseField
437 // will check the string terminates in the bounds and won't read past |end|.
ParseField(const Field & field,const uint8_t * start,const uint8_t * end,protozero::Message * message,FtraceMetadata * metadata)438 bool CpuReader::ParseField(const Field& field,
439                            const uint8_t* start,
440                            const uint8_t* end,
441                            protozero::Message* message,
442                            FtraceMetadata* metadata) {
443   PERFETTO_DCHECK(start + field.ftrace_offset + field.ftrace_size <= end);
444   const uint8_t* field_start = start + field.ftrace_offset;
445   uint32_t field_id = field.proto_field_id;
446 
447   switch (field.strategy) {
448     case kUint8ToUint32:
449       ReadIntoVarInt<uint8_t>(field_start, field_id, message);
450       return true;
451     case kUint16ToUint32:
452       ReadIntoVarInt<uint16_t>(field_start, field_id, message);
453       return true;
454     case kUint32ToUint32:
455     case kUint32ToUint64:
456       ReadIntoVarInt<uint32_t>(field_start, field_id, message);
457       return true;
458     case kUint64ToUint64:
459       ReadIntoVarInt<uint64_t>(field_start, field_id, message);
460       return true;
461     case kInt8ToInt32:
462       ReadIntoVarInt<int8_t>(field_start, field_id, message);
463       return true;
464     case kInt16ToInt32:
465       ReadIntoVarInt<int16_t>(field_start, field_id, message);
466       return true;
467     case kInt32ToInt32:
468     case kInt32ToInt64:
469       ReadIntoVarInt<int32_t>(field_start, field_id, message);
470       return true;
471     case kInt64ToInt64:
472       ReadIntoVarInt<int64_t>(field_start, field_id, message);
473       return true;
474     case kFixedCStringToString:
475       // TODO(hjd): Add AppendMaxLength string to protozero.
476       return ReadIntoString(field_start, field_start + field.ftrace_size,
477                             field_id, message);
478     case kCStringToString:
479       // TODO(hjd): Kernel-dive to check this how size:0 char fields work.
480       return ReadIntoString(field_start, end, field.proto_field_id, message);
481     case kStringPtrToString:
482       // TODO(hjd): Figure out how to read these.
483       return true;
484     case kBoolToUint32:
485       ReadIntoVarInt<uint32_t>(field_start, field_id, message);
486       return true;
487     case kInode32ToUint64:
488       ReadInode<uint32_t>(field_start, field_id, message, metadata);
489       return true;
490     case kInode64ToUint64:
491       ReadInode<uint64_t>(field_start, field_id, message, metadata);
492       return true;
493     case kPid32ToInt32:
494       ReadPid(field_start, field_id, message, metadata);
495       return true;
496     case kCommonPid32ToInt32:
497       ReadCommonPid(field_start, field_id, message, metadata);
498       return true;
499     case kDevId32ToUint64:
500       ReadDevId<uint32_t>(field_start, field_id, message, metadata);
501       return true;
502     case kDevId64ToUint64:
503       ReadDevId<uint64_t>(field_start, field_id, message, metadata);
504       return true;
505   }
506   // Not reached, for gcc.
507   PERFETTO_CHECK(false);
508   return false;
509 }
510 
511 }  // namespace perfetto
512