1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/traced/probes/ftrace/cpu_reader.h"
18 
19 #include <dirent.h>
20 #include <fcntl.h>
21 #include <signal.h>
22 
23 #include <algorithm>
24 #include <utility>
25 
26 #include "perfetto/base/build_config.h"
27 #include "perfetto/base/logging.h"
28 #include "perfetto/ext/base/metatrace.h"
29 #include "perfetto/ext/base/optional.h"
30 #include "perfetto/ext/base/utils.h"
31 #include "perfetto/ext/tracing/core/trace_writer.h"
32 #include "src/kallsyms/kernel_symbol_map.h"
33 #include "src/kallsyms/lazy_kernel_symbolizer.h"
34 #include "src/traced/probes/ftrace/ftrace_config_muxer.h"
35 #include "src/traced/probes/ftrace/ftrace_controller.h"
36 #include "src/traced/probes/ftrace/ftrace_data_source.h"
37 #include "src/traced/probes/ftrace/proto_translation_table.h"
38 
39 #include "protos/perfetto/trace/ftrace/ftrace_event.pbzero.h"
40 #include "protos/perfetto/trace/ftrace/ftrace_event_bundle.pbzero.h"
41 #include "protos/perfetto/trace/ftrace/generic.pbzero.h"
42 #include "protos/perfetto/trace/interned_data/interned_data.pbzero.h"
43 #include "protos/perfetto/trace/profiling/profile_common.pbzero.h"
44 #include "protos/perfetto/trace/trace_packet.pbzero.h"
45 
46 namespace perfetto {
47 namespace {
48 
// Upper bound on the number of unique strings the compact_sched buffer may
// intern before the reader flushes it. Flushing resets the interning state,
// which keeps subsequent lookups cheap.
// The cap is approximate: it is only evaluated at tracing page boundaries.
// TODO(rsavitski): consider making part of compact_sched config.
constexpr size_t kCompactSchedInternerThreshold{64};
54 
// Meaning of the 5-bit |type_or_length| field of a ring buffer event header.
// Values up to and including kTypeDataTypeLengthMax describe a data record;
// the three values above it mark padding, time-extend and absolute timestamp
// records respectively.
// These constants are documented in the kernel source:
// linux/include/linux/ring_buffer.h
// Some information about their values is exposed to user space at:
// /sys/kernel/debug/tracing/events/header_event
constexpr uint32_t kTypeDataTypeLengthMax{28};
constexpr uint32_t kTypePadding{29};
constexpr uint32_t kTypeTimeExtend{30};
constexpr uint32_t kTypeTimeStamp{31};
63 
// Header word that precedes every record in the payload of a raw ftrace page.
// The page parser fills this struct by reading sizeof(EventHeader) raw bytes
// from the page, so it must occupy exactly one 32-bit word.
// NOTE(review): the bit-field order relies on the compiler allocating
// bit-fields starting from the least significant bit — confirm for any new
// target ABI.
struct EventHeader {
  // Record type (padding / time extend / timestamp) or, for data records,
  // the payload length in 4-byte words (0 means the length follows in the
  // payload).
  uint32_t type_or_length : 5;
  // Timestamp delta relative to the previous record (low 27 bits).
  uint32_t time_delta : 27;
};
static_assert(sizeof(EventHeader) == sizeof(uint32_t),
              "EventHeader must be exactly one 32-bit word");
68 
// Pair of 64-bit counters: a nanosecond part and a second part.
// NOTE(review): nothing in the visible part of this file reads or writes this
// struct; presumably it mirrors a kernel timestamp layout — confirm before
// relying on the field order.
struct TimeStamp {
  uint64_t tv_nsec;
  uint64_t tv_sec;
};
73 
ReadIntoString(const uint8_t * start,const uint8_t * end,uint32_t field_id,protozero::Message * out)74 bool ReadIntoString(const uint8_t* start,
75                     const uint8_t* end,
76                     uint32_t field_id,
77                     protozero::Message* out) {
78   for (const uint8_t* c = start; c < end; c++) {
79     if (*c != '\0')
80       continue;
81     out->AppendBytes(field_id, reinterpret_cast<const char*>(start),
82                      static_cast<uintptr_t>(c - start));
83     return true;
84   }
85   return false;
86 }
87 
ReadDataLoc(const uint8_t * start,const uint8_t * field_start,const uint8_t * end,const Field & field,protozero::Message * message)88 bool ReadDataLoc(const uint8_t* start,
89                  const uint8_t* field_start,
90                  const uint8_t* end,
91                  const Field& field,
92                  protozero::Message* message) {
93   PERFETTO_DCHECK(field.ftrace_size == 4);
94   // See
95   // https://github.com/torvalds/linux/blob/master/include/trace/trace_events.h
96   uint32_t data = 0;
97   const uint8_t* ptr = field_start;
98   if (!CpuReader::ReadAndAdvance(&ptr, end, &data)) {
99     PERFETTO_DFATAL("Buffer overflowed.");
100     return false;
101   }
102 
103   const uint16_t offset = data & 0xffff;
104   const uint16_t len = (data >> 16) & 0xffff;
105   const uint8_t* const string_start = start + offset;
106   const uint8_t* const string_end = string_start + len;
107   if (string_start <= start || string_end > end) {
108     PERFETTO_DFATAL("Buffer overflowed.");
109     return false;
110   }
111   ReadIntoString(string_start, string_end, field.proto_field_id, message);
112   return true;
113 }
114 
// Returns a T assembled from the sizeof(T) bytes located at |ptr|.
// The bytes are copied with memcpy instead of dereferencing a casted pointer.
// The caller must guarantee that sizeof(T) bytes are readable at |ptr|.
template <typename T>
T ReadValue(const uint8_t* ptr) {
  T value;
  memcpy(&value, static_cast<const void*>(ptr), sizeof(value));
  return value;
}
121 
122 // Reads a signed ftrace value as an int64_t, sign extending if necessary.
ReadSignedFtraceValue(const uint8_t * ptr,FtraceFieldType ftrace_type)123 static int64_t ReadSignedFtraceValue(const uint8_t* ptr,
124                                      FtraceFieldType ftrace_type) {
125   if (ftrace_type == kFtraceInt32) {
126     int32_t value;
127     memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
128     return int64_t(value);
129   }
130   if (ftrace_type == kFtraceInt64) {
131     int64_t value;
132     memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
133     return value;
134   }
135   PERFETTO_FATAL("unexpected ftrace type");
136 }
137 
// Switches |fd| between blocking (|is_blocking| == true) and non-blocking
// mode by clearing or setting O_NONBLOCK in its file status flags.
// Returns true on success, false if the flags could not be read or written.
bool SetBlocking(int fd, bool is_blocking) {
  const int flags = fcntl(fd, F_GETFL, 0);
  // F_GETFL reports failure as -1. Do not derive a new flag set from the
  // error value and push it back with F_SETFL.
  if (flags == -1)
    return false;
  const int new_flags =
      is_blocking ? (flags & ~O_NONBLOCK) : (flags | O_NONBLOCK);
  return fcntl(fd, F_SETFL, new_flags) == 0;
}
143 
144 }  // namespace
145 
146 using protos::pbzero::GenericFtraceEvent;
147 
// Creates the reader for the ftrace buffer of cpu number |cpu|.
// |table| and |symbolizer| are stored as raw pointers (not owned here);
// |trace_fd| is moved into the reader.
// Aborts (PERFETTO_CHECK) if |trace_fd| is invalid or cannot be switched to
// non-blocking mode.
CpuReader::CpuReader(size_t cpu,
                     const ProtoTranslationTable* table,
                     LazyKernelSymbolizer* symbolizer,
                     base::ScopedFile trace_fd)
    : cpu_(cpu),
      table_(table),
      symbolizer_(symbolizer),
      trace_fd_(std::move(trace_fd)) {
  PERFETTO_CHECK(trace_fd_);
  // Reads must never block: the batch reader treats EAGAIN as "no data yet".
  PERFETTO_CHECK(SetBlocking(*trace_fd_, false));
}
159 
// Defaulted out of line: no teardown beyond destroying the members.
// NOTE(review): presumably the |trace_fd_| ScopedFile closes the descriptor
// when destroyed — confirm against base::ScopedFile.
CpuReader::~CpuReader() = default;
161 
ReadCycle(uint8_t * parsing_buf,size_t parsing_buf_size_pages,size_t max_pages,const std::set<FtraceDataSource * > & started_data_sources)162 size_t CpuReader::ReadCycle(
163     uint8_t* parsing_buf,
164     size_t parsing_buf_size_pages,
165     size_t max_pages,
166     const std::set<FtraceDataSource*>& started_data_sources) {
167   PERFETTO_DCHECK(max_pages > 0 && parsing_buf_size_pages > 0);
168   metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
169                              metatrace::FTRACE_CPU_READ_CYCLE);
170 
171   // Work in batches to keep cache locality, and limit memory usage.
172   size_t batch_pages = std::min(parsing_buf_size_pages, max_pages);
173   size_t total_pages_read = 0;
174   for (bool is_first_batch = true;; is_first_batch = false) {
175     size_t pages_read = ReadAndProcessBatch(
176         parsing_buf, batch_pages, is_first_batch, started_data_sources);
177 
178     PERFETTO_DCHECK(pages_read <= batch_pages);
179     total_pages_read += pages_read;
180 
181     // Check whether we've caught up to the writer, or possibly giving up on
182     // this attempt due to some error.
183     if (pages_read != batch_pages)
184       break;
185     // Check if we've hit the limit of work for this cycle.
186     if (total_pages_read >= max_pages)
187       break;
188   }
189   PERFETTO_METATRACE_COUNTER(TAG_FTRACE, FTRACE_PAGES_DRAINED,
190                              total_pages_read);
191   return total_pages_read;
192 }
193 
194 // metatrace note: mark the reading phase as FTRACE_CPU_READ_BATCH, but let the
195 // parsing time be implied (by the difference between the caller's span, and
196 // this reading span). Makes it easier to estimate the read/parse ratio when
197 // looking at the trace in the UI.
ReadAndProcessBatch(uint8_t * parsing_buf,size_t max_pages,bool first_batch_in_cycle,const std::set<FtraceDataSource * > & started_data_sources)198 size_t CpuReader::ReadAndProcessBatch(
199     uint8_t* parsing_buf,
200     size_t max_pages,
201     bool first_batch_in_cycle,
202     const std::set<FtraceDataSource*>& started_data_sources) {
203   size_t pages_read = 0;
204   {
205     metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
206                                metatrace::FTRACE_CPU_READ_BATCH);
207     for (; pages_read < max_pages;) {
208       uint8_t* curr_page = parsing_buf + (pages_read * base::kPageSize);
209       ssize_t res =
210           PERFETTO_EINTR(read(*trace_fd_, curr_page, base::kPageSize));
211       if (res < 0) {
212         // Expected errors:
213         // EAGAIN: no data (since we're in non-blocking mode).
214         // ENONMEM, EBUSY: temporary ftrace failures (they happen).
215         // ENODEV: the cpu is offline (b/145583318).
216         if (errno != EAGAIN && errno != ENOMEM && errno != EBUSY &&
217             errno != ENODEV) {
218           PERFETTO_PLOG("Unexpected error on raw ftrace read");
219         }
220         break;  // stop reading regardless of errno
221       }
222 
223       // As long as all of our reads are for a single page, the kernel should
224       // return exactly a well-formed raw ftrace page (if not in the steady
225       // state of reading out fully-written pages, the kernel will construct
226       // pages as necessary, copying over events and zero-filling at the end).
227       // A sub-page read() is therefore not expected in practice (unless
228       // there's a concurrent reader requesting less than a page?). Crash if
229       // encountering this situation. Kernel source pointer: see usage of
230       // |info->read| within |tracing_buffers_read|.
231       if (res == 0) {
232         // Very rare, but possible. Stop for now, should recover.
233         PERFETTO_DLOG("[cpu%zu]: 0-sized read from ftrace pipe.", cpu_);
234         break;
235       }
236       PERFETTO_CHECK(res == static_cast<ssize_t>(base::kPageSize));
237 
238       pages_read += 1;
239 
240       // Compare the amount of ftrace data read against an empirical threshold
241       // to make an educated guess on whether we should read more. To figure
242       // out the amount of ftrace data, we need to parse the page header (since
243       // the read always returns a page, zero-filled at the end). If we read
244       // fewer bytes than the threshold, it means that we caught up with the
245       // write pointer and we started consuming ftrace events in real-time.
246       // This cannot be just 4096 because it needs to account for
247       // fragmentation, i.e. for the fact that the last trace event didn't fit
248       // in the current page and hence the current page was terminated
249       // prematurely.
250       static constexpr size_t kRoughlyAPage = base::kPageSize - 512;
251       const uint8_t* scratch_ptr = curr_page;
252       base::Optional<PageHeader> hdr =
253           ParsePageHeader(&scratch_ptr, table_->page_header_size_len());
254       PERFETTO_DCHECK(hdr && hdr->size > 0 && hdr->size <= base::kPageSize);
255       if (!hdr.has_value()) {
256         PERFETTO_ELOG("[cpu%zu]: can't parse page header", cpu_);
257         break;
258       }
259       // Note that the first read after starting the read cycle being small is
260       // normal. It means that we're given the remainder of events from a
261       // page that we've partially consumed during the last read of the previous
262       // cycle (having caught up to the writer).
263       if (hdr->size < kRoughlyAPage &&
264           !(first_batch_in_cycle && pages_read == 1)) {
265         break;
266       }
267     }
268   }  // end of metatrace::FTRACE_CPU_READ_BATCH
269 
270   // Parse the pages and write to the trace for all relevant data
271   // sources.
272   if (pages_read == 0)
273     return pages_read;
274 
275   for (FtraceDataSource* data_source : started_data_sources) {
276     bool success = ProcessPagesForDataSource(
277         data_source->trace_writer(), data_source->mutable_metadata(), cpu_,
278         data_source->parsing_config(), parsing_buf, pages_read, table_,
279         symbolizer_);
280     PERFETTO_CHECK(success);
281   }
282 
283   return pages_read;
284 }
285 
286 // static
ProcessPagesForDataSource(TraceWriter * trace_writer,FtraceMetadata * metadata,size_t cpu,const FtraceDataSourceConfig * ds_config,const uint8_t * parsing_buf,const size_t pages_read,const ProtoTranslationTable * table,LazyKernelSymbolizer * symbolizer)287 bool CpuReader::ProcessPagesForDataSource(
288     TraceWriter* trace_writer,
289     FtraceMetadata* metadata,
290     size_t cpu,
291     const FtraceDataSourceConfig* ds_config,
292     const uint8_t* parsing_buf,
293     const size_t pages_read,
294     const ProtoTranslationTable* table,
295     LazyKernelSymbolizer* symbolizer) {
296   // Allocate the buffer for compact scheduler events (which will be unused if
297   // the compact option isn't enabled).
298   CompactSchedBuffer compact_sched;
299   bool compact_sched_enabled = ds_config->compact_sched.enabled;
300 
301   TraceWriter::TracePacketHandle packet;
302   protos::pbzero::FtraceEventBundle* bundle = nullptr;
303 
304   // This function is called after the contents of a FtraceBundle are written.
305   auto finalize_cur_packet = [&] {
306     PERFETTO_DCHECK(packet);
307     if (compact_sched_enabled)
308       compact_sched.WriteAndReset(bundle);
309 
310     bundle->Finalize();
311     bundle = nullptr;
312 
313     // Write the kernel symbol index (mangled address) -> name table.
314     // |metadata| is shared across all cpus, is distinct per |data_source| (i.e.
315     // tracing session) and is cleared after each FtraceController::ReadTick().
316     // const size_t kaddrs_size = metadata->kernel_addrs.size();
317     if (ds_config->symbolize_ksyms) {
318       // Symbol indexes are assigned mononically as |kernel_addrs.size()|,
319       // starting from index 1 (no symbol has index 0). Here we remember the
320       // size() (which is also == the highest value in |kernel_addrs|) at the
321       // beginning and only write newer indexes bigger than that.
322       uint32_t max_index_at_start = metadata->last_kernel_addr_index_written;
323       PERFETTO_DCHECK(max_index_at_start <= metadata->kernel_addrs.size());
324       protos::pbzero::InternedData* interned_data = nullptr;
325       auto* ksyms_map = symbolizer->GetOrCreateKernelSymbolMap();
326       bool wrote_at_least_one_symbol = false;
327       for (const FtraceMetadata::KernelAddr& kaddr : metadata->kernel_addrs) {
328         if (kaddr.index <= max_index_at_start)
329           continue;
330         std::string sym_name = ksyms_map->Lookup(kaddr.addr);
331         if (sym_name.empty()) {
332           // Lookup failed. This can genuinely happen in many occasions. E.g.,
333           // workqueue_execute_start has two pointers: one is a pointer to a
334           // function (which we expect to be symbolized), the other (|work|) is
335           // a pointer to a heap struct, which is unsymbolizable, even when
336           // using the textual ftrace endpoint.
337           continue;
338         }
339 
340         if (!interned_data) {
341           // If this is the very first write, clear the start of the sequence
342           // so the trace processor knows that all previous indexes can be
343           // discarded and that the mapping is restarting.
344           // In most cases this occurs with cpu==0. But if cpu0 is idle, this
345           // will happen with the first CPU that has any ftrace data.
346           if (max_index_at_start == 0) {
347             packet->set_sequence_flags(
348                 protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
349           }
350           interned_data = packet->set_interned_data();
351         }
352         auto* interned_sym = interned_data->add_kernel_symbols();
353         interned_sym->set_iid(kaddr.index);
354         interned_sym->set_str(sym_name);
355         wrote_at_least_one_symbol = true;
356       }
357 
358       auto max_it_at_end = static_cast<uint32_t>(metadata->kernel_addrs.size());
359 
360       // Rationale for the if (wrote_at_least_one_symbol) check: in rare cases,
361       // all symbols seen in a ProcessPagesForDataSource() call can fail the
362       // ksyms_map->Lookup(). If that happens we don't want to bump the
363       // last_kernel_addr_index_written watermark, as that would cause the next
364       // call to NOT emit the SEQ_INCREMENTAL_STATE_CLEARED.
365       if (wrote_at_least_one_symbol)
366         metadata->last_kernel_addr_index_written = max_it_at_end;
367     }
368 
369     packet->Finalize();
370   };  // finalize_cur_packet().
371 
372   auto start_new_packet = [&](bool lost_events) {
373     if (packet)
374       finalize_cur_packet();
375     packet = trace_writer->NewTracePacket();
376     bundle = packet->set_ftrace_events();
377     // Note: The fastpath in proto_trace_parser.cc speculates on the fact
378     // that the cpu field is the first field of the proto message. If this
379     // changes, change proto_trace_parser.cc accordingly.
380     bundle->set_cpu(static_cast<uint32_t>(cpu));
381     if (lost_events)
382       bundle->set_lost_events(true);
383   };
384 
385   start_new_packet(/*lost_events=*/false);
386   for (size_t i = 0; i < pages_read; i++) {
387     const uint8_t* curr_page = parsing_buf + (i * base::kPageSize);
388     const uint8_t* curr_page_end = curr_page + base::kPageSize;
389     const uint8_t* parse_pos = curr_page;
390     base::Optional<PageHeader> page_header =
391         ParsePageHeader(&parse_pos, table->page_header_size_len());
392 
393     if (!page_header.has_value() || page_header->size == 0 ||
394         parse_pos >= curr_page_end ||
395         parse_pos + page_header->size > curr_page_end) {
396       PERFETTO_DFATAL("invalid page header");
397       return false;
398     }
399 
400     // Start a new bundle if either:
401     // * The page we're about to read indicates that there was a kernel ring
402     //   buffer overrun since our last read from that per-cpu buffer. We have
403     //   a single |lost_events| field per bundle, so start a new packet.
404     // * The compact_sched buffer is holding more unique interned strings than
405     //   a threshold. We need to flush the compact buffer to make the
406     //   interning lookups cheap again.
407     bool interner_past_threshold =
408         compact_sched_enabled &&
409         compact_sched.interner().interned_comms_size() >
410             kCompactSchedInternerThreshold;
411 
412     if (page_header->lost_events || interner_past_threshold)
413       start_new_packet(page_header->lost_events);
414 
415     size_t evt_size =
416         ParsePagePayload(parse_pos, &page_header.value(), table, ds_config,
417                          &compact_sched, bundle, metadata);
418 
419     // TODO(rsavitski): propagate error to trace processor in release builds.
420     // (FtraceMetadata -> FtraceStats in trace).
421     PERFETTO_DCHECK(evt_size == page_header->size);
422   }
423   finalize_cur_packet();
424 
425   return true;
426 }
427 
428 // A page header consists of:
429 // * timestamp: 8 bytes
430 // * commit: 8 bytes on 64 bit, 4 bytes on 32 bit kernels
431 //
432 // The kernel reports this at /sys/kernel/debug/tracing/events/header_page.
433 //
434 // |commit|'s bottom bits represent the length of the payload following this
435 // header. The top bits have been repurposed as a bitset of flags pertaining to
436 // data loss. We look only at the "there has been some data lost" flag
437 // (RB_MISSED_EVENTS), and ignore the relatively tricky "appended the precise
438 // lost events count past the end of the valid data, as there was room to do so"
439 // flag (RB_MISSED_STORED).
440 //
441 // static
ParsePageHeader(const uint8_t ** ptr,uint16_t page_header_size_len)442 base::Optional<CpuReader::PageHeader> CpuReader::ParsePageHeader(
443     const uint8_t** ptr,
444     uint16_t page_header_size_len) {
445   // Mask for the data length portion of the |commit| field. Note that the
446   // kernel implementation never explicitly defines the boundary (beyond using
447   // bits 30 and 31 as flags), but 27 bits are mentioned as sufficient in the
448   // original commit message, and is the constant used by trace-cmd.
449   constexpr static uint64_t kDataSizeMask = (1ull << 27) - 1;
450   // If set, indicates that the relevant cpu has lost events since the last read
451   // (clearing the bit internally).
452   constexpr static uint64_t kMissedEventsFlag = (1ull << 31);
453 
454   const uint8_t* end_of_page = *ptr + base::kPageSize;
455   PageHeader page_header;
456   if (!CpuReader::ReadAndAdvance<uint64_t>(ptr, end_of_page,
457                                            &page_header.timestamp))
458     return base::nullopt;
459 
460   uint32_t size_and_flags;
461 
462   // On little endian, we can just read a uint32_t and reject the rest of the
463   // number later.
464   if (!CpuReader::ReadAndAdvance<uint32_t>(
465           ptr, end_of_page, base::AssumeLittleEndian(&size_and_flags)))
466     return base::nullopt;
467 
468   page_header.size = size_and_flags & kDataSizeMask;
469   page_header.lost_events = bool(size_and_flags & kMissedEventsFlag);
470   PERFETTO_DCHECK(page_header.size <= base::kPageSize);
471 
472   // Reject rest of the number, if applicable. On 32-bit, size_bytes - 4 will
473   // evaluate to 0 and this will be a no-op. On 64-bit, this will advance by 4
474   // bytes.
475   PERFETTO_DCHECK(page_header_size_len >= 4);
476   *ptr += page_header_size_len - 4;
477 
478   return base::make_optional(page_header);
479 }
480 
481 // A raw ftrace buffer page consists of a header followed by a sequence of
482 // binary ftrace events. See |ParsePageHeader| for the format of the earlier.
483 //
484 // This method is deliberately static so it can be tested independently.
ParsePagePayload(const uint8_t * start_of_payload,const PageHeader * page_header,const ProtoTranslationTable * table,const FtraceDataSourceConfig * ds_config,CompactSchedBuffer * compact_sched_buffer,FtraceEventBundle * bundle,FtraceMetadata * metadata)485 size_t CpuReader::ParsePagePayload(const uint8_t* start_of_payload,
486                                    const PageHeader* page_header,
487                                    const ProtoTranslationTable* table,
488                                    const FtraceDataSourceConfig* ds_config,
489                                    CompactSchedBuffer* compact_sched_buffer,
490                                    FtraceEventBundle* bundle,
491                                    FtraceMetadata* metadata) {
492   const uint8_t* ptr = start_of_payload;
493   const uint8_t* const end = ptr + page_header->size;
494 
495   uint64_t timestamp = page_header->timestamp;
496 
497   while (ptr < end) {
498     EventHeader event_header;
499     if (!ReadAndAdvance(&ptr, end, &event_header))
500       return 0;
501 
502     timestamp += event_header.time_delta;
503 
504     switch (event_header.type_or_length) {
505       case kTypePadding: {
506         // Left over page padding or discarded event.
507         if (event_header.time_delta == 0) {
508           // Not clear what the correct behaviour is in this case.
509           PERFETTO_DFATAL("Empty padding event.");
510           return 0;
511         }
512         uint32_t length = 0;
513         if (!ReadAndAdvance<uint32_t>(&ptr, end, &length))
514           return 0;
515         // length includes itself (4 bytes)
516         if (length < 4)
517           return 0;
518         ptr += length - 4;
519         break;
520       }
521       case kTypeTimeExtend: {
522         // Extend the time delta.
523         uint32_t time_delta_ext = 0;
524         if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
525           return 0;
526         timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
527         break;
528       }
529       case kTypeTimeStamp: {
530         // Absolute timestamp. This was historically partially implemented, but
531         // not written. Kernels 4.17+ reimplemented this record, changing its
532         // size in the process. We assume the newer layout. Parsed the same as
533         // kTypeTimeExtend, except that the timestamp is interpreted as an
534         // absolute, instead of a delta on top of the previous state.
535         timestamp = event_header.time_delta;
536         uint32_t time_delta_ext = 0;
537         if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
538           return 0;
539         timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
540         break;
541       }
542       // Data record:
543       default: {
544         PERFETTO_CHECK(event_header.type_or_length <= kTypeDataTypeLengthMax);
545         // type_or_length is <=28 so it represents the length of a data
546         // record. if == 0, this is an extended record and the size of the
547         // record is stored in the first uint32_t word in the payload. See
548         // Kernel's include/linux/ring_buffer.h
549         uint32_t event_size = 0;
550         if (event_header.type_or_length == 0) {
551           if (!ReadAndAdvance<uint32_t>(&ptr, end, &event_size))
552             return 0;
553           // Size includes the size field itself.
554           if (event_size < 4)
555             return 0;
556           event_size -= 4;
557         } else {
558           event_size = 4 * event_header.type_or_length;
559         }
560         const uint8_t* start = ptr;
561         const uint8_t* next = ptr + event_size;
562 
563         if (next > end)
564           return 0;
565 
566         uint16_t ftrace_event_id;
567         if (!ReadAndAdvance<uint16_t>(&ptr, end, &ftrace_event_id))
568           return 0;
569 
570         if (ds_config->event_filter.IsEventEnabled(ftrace_event_id)) {
571           // Special-cased handling of some scheduler events when compact format
572           // is enabled.
573           bool compact_sched_enabled = ds_config->compact_sched.enabled;
574           const CompactSchedSwitchFormat& sched_switch_format =
575               table->compact_sched_format().sched_switch;
576           const CompactSchedWakingFormat& sched_waking_format =
577               table->compact_sched_format().sched_waking;
578 
579           // compact sched_switch
580           if (compact_sched_enabled &&
581               ftrace_event_id == sched_switch_format.event_id) {
582             if (event_size < sched_switch_format.size)
583               return 0;
584 
585             ParseSchedSwitchCompact(start, timestamp, &sched_switch_format,
586                                     compact_sched_buffer, metadata);
587 
588             // compact sched_waking
589           } else if (compact_sched_enabled &&
590                      ftrace_event_id == sched_waking_format.event_id) {
591             if (event_size < sched_waking_format.size)
592               return 0;
593 
594             ParseSchedWakingCompact(start, timestamp, &sched_waking_format,
595                                     compact_sched_buffer, metadata);
596 
597           } else {
598             // Common case: parse all other types of enabled events.
599             protos::pbzero::FtraceEvent* event = bundle->add_event();
600             event->set_timestamp(timestamp);
601             if (!ParseEvent(ftrace_event_id, start, next, table, event,
602                             metadata))
603               return 0;
604           }
605         }
606 
607         // Jump to next event.
608         ptr = next;
609       }
610     }
611   }
612   return static_cast<size_t>(ptr - start_of_payload);
613 }
614 
615 // |start| is the start of the current event.
616 // |end| is the end of the buffer.
ParseEvent(uint16_t ftrace_event_id,const uint8_t * start,const uint8_t * end,const ProtoTranslationTable * table,protozero::Message * message,FtraceMetadata * metadata)617 bool CpuReader::ParseEvent(uint16_t ftrace_event_id,
618                            const uint8_t* start,
619                            const uint8_t* end,
620                            const ProtoTranslationTable* table,
621                            protozero::Message* message,
622                            FtraceMetadata* metadata) {
623   PERFETTO_DCHECK(start < end);
624   const size_t length = static_cast<size_t>(end - start);
625 
626   // TODO(hjd): Rework to work even if the event is unknown.
627   const Event& info = *table->GetEventById(ftrace_event_id);
628 
629   // TODO(hjd): Test truncated events.
630   // If the end of the buffer is before the end of the event give up.
631   if (info.size > length) {
632     PERFETTO_DFATAL("Buffer overflowed.");
633     return false;
634   }
635 
636   bool success = true;
637   for (const Field& field : table->common_fields())
638     success &= ParseField(field, start, end, table, message, metadata);
639 
640   protozero::Message* nested =
641       message->BeginNestedMessage<protozero::Message>(info.proto_field_id);
642 
643   // Parse generic event.
644   if (PERFETTO_UNLIKELY(info.proto_field_id ==
645                         protos::pbzero::FtraceEvent::kGenericFieldNumber)) {
646     nested->AppendString(GenericFtraceEvent::kEventNameFieldNumber, info.name);
647     for (const Field& field : info.fields) {
648       auto generic_field = nested->BeginNestedMessage<protozero::Message>(
649           GenericFtraceEvent::kFieldFieldNumber);
650       // TODO(hjd): Avoid outputting field names every time.
651       generic_field->AppendString(GenericFtraceEvent::Field::kNameFieldNumber,
652                                   field.ftrace_name);
653       success &= ParseField(field, start, end, table, generic_field, metadata);
654     }
655   } else {  // Parse all other events.
656     for (const Field& field : info.fields) {
657       success &= ParseField(field, start, end, table, nested, metadata);
658     }
659   }
660 
661   if (PERFETTO_UNLIKELY(info.proto_field_id ==
662                         protos::pbzero::FtraceEvent::kTaskRenameFieldNumber)) {
663     // For task renames, we want to store that the pid was renamed. We use the
664     // common pid to reduce code complexity as in all the cases we care about,
665     // the common pid is the same as the renamed pid (the pid inside the event).
666     PERFETTO_DCHECK(metadata->last_seen_common_pid);
667     metadata->AddRenamePid(metadata->last_seen_common_pid);
668   }
669 
670   // This finalizes |nested| and |proto_field| automatically.
671   message->Finalize();
672   metadata->FinishEvent();
673   return success;
674 }
675 
676 // Caller must guarantee that the field fits in the range,
677 // explicitly: start + field.ftrace_offset + field.ftrace_size <= end
678 // The only exception is fields with strategy = kCStringToString
679 // where the total size isn't known up front. In this case ParseField
680 // will check the string terminates in the bounds and won't read past |end|.
ParseField(const Field & field,const uint8_t * start,const uint8_t * end,const ProtoTranslationTable * table,protozero::Message * message,FtraceMetadata * metadata)681 bool CpuReader::ParseField(const Field& field,
682                            const uint8_t* start,
683                            const uint8_t* end,
684                            const ProtoTranslationTable* table,
685                            protozero::Message* message,
686                            FtraceMetadata* metadata) {
687   PERFETTO_DCHECK(start + field.ftrace_offset + field.ftrace_size <= end);
688   const uint8_t* field_start = start + field.ftrace_offset;
689   uint32_t field_id = field.proto_field_id;
690 
691   switch (field.strategy) {
692     case kUint8ToUint32:
693     case kUint8ToUint64:
694       ReadIntoVarInt<uint8_t>(field_start, field_id, message);
695       return true;
696     case kUint16ToUint32:
697     case kUint16ToUint64:
698       ReadIntoVarInt<uint16_t>(field_start, field_id, message);
699       return true;
700     case kUint32ToUint32:
701     case kUint32ToUint64:
702       ReadIntoVarInt<uint32_t>(field_start, field_id, message);
703       return true;
704     case kUint64ToUint64:
705       ReadIntoVarInt<uint64_t>(field_start, field_id, message);
706       return true;
707     case kInt8ToInt32:
708     case kInt8ToInt64:
709       ReadIntoVarInt<int8_t>(field_start, field_id, message);
710       return true;
711     case kInt16ToInt32:
712     case kInt16ToInt64:
713       ReadIntoVarInt<int16_t>(field_start, field_id, message);
714       return true;
715     case kInt32ToInt32:
716     case kInt32ToInt64:
717       ReadIntoVarInt<int32_t>(field_start, field_id, message);
718       return true;
719     case kInt64ToInt64:
720       ReadIntoVarInt<int64_t>(field_start, field_id, message);
721       return true;
722     case kFixedCStringToString:
723       // TODO(hjd): Add AppendMaxLength string to protozero.
724       return ReadIntoString(field_start, field_start + field.ftrace_size,
725                             field_id, message);
726     case kCStringToString:
727       // TODO(hjd): Kernel-dive to check this how size:0 char fields work.
728       return ReadIntoString(field_start, end, field_id, message);
729     case kStringPtrToString: {
730       uint64_t n = 0;
731       // The ftrace field may be 8 or 4 bytes and we need to copy it into the
732       // bottom of n. In the unlikely case where the field is >8 bytes we
733       // should avoid making things worse by corrupting the stack but we
734       // don't need to handle it correctly.
735       size_t size = std::min<size_t>(field.ftrace_size, sizeof(n));
736       memcpy(base::AssumeLittleEndian(&n),
737              reinterpret_cast<const void*>(field_start), size);
738       // Look up the adddress in the printk format map and write it into the
739       // proto.
740       base::StringView name = table->LookupTraceString(n);
741       message->AppendBytes(field_id, name.begin(), name.size());
742       return true;
743     }
744     case kDataLocToString:
745       return ReadDataLoc(start, field_start, end, field, message);
746     case kBoolToUint32:
747     case kBoolToUint64:
748       ReadIntoVarInt<uint8_t>(field_start, field_id, message);
749       return true;
750     case kInode32ToUint64:
751       ReadInode<uint32_t>(field_start, field_id, message, metadata);
752       return true;
753     case kInode64ToUint64:
754       ReadInode<uint64_t>(field_start, field_id, message, metadata);
755       return true;
756     case kPid32ToInt32:
757     case kPid32ToInt64:
758       ReadPid(field_start, field_id, message, metadata);
759       return true;
760     case kCommonPid32ToInt32:
761     case kCommonPid32ToInt64:
762       ReadCommonPid(field_start, field_id, message, metadata);
763       return true;
764     case kDevId32ToUint64:
765       ReadDevId<uint32_t>(field_start, field_id, message, metadata);
766       return true;
767     case kDevId64ToUint64:
768       ReadDevId<uint64_t>(field_start, field_id, message, metadata);
769       return true;
770     case kFtraceSymAddr64ToUint64:
771       ReadSymbolAddr<uint64_t>(field_start, field_id, message, metadata);
772       return true;
773     case kInvalidTranslationStrategy:
774       break;
775   }
776   PERFETTO_FATAL("Unexpected translation strategy");
777 }
778 
779 // Parse a sched_switch event according to pre-validated format, and buffer the
780 // individual fields in the current compact batch. See the code populating
781 // |CompactSchedSwitchFormat| for the assumptions made around the format, which
782 // this code is closely tied to.
783 // static
ParseSchedSwitchCompact(const uint8_t * start,uint64_t timestamp,const CompactSchedSwitchFormat * format,CompactSchedBuffer * compact_buf,FtraceMetadata * metadata)784 void CpuReader::ParseSchedSwitchCompact(const uint8_t* start,
785                                         uint64_t timestamp,
786                                         const CompactSchedSwitchFormat* format,
787                                         CompactSchedBuffer* compact_buf,
788                                         FtraceMetadata* metadata) {
789   compact_buf->sched_switch().AppendTimestamp(timestamp);
790 
791   int32_t next_pid = ReadValue<int32_t>(start + format->next_pid_offset);
792   compact_buf->sched_switch().next_pid().Append(next_pid);
793   metadata->AddPid(next_pid);
794 
795   int32_t next_prio = ReadValue<int32_t>(start + format->next_prio_offset);
796   compact_buf->sched_switch().next_prio().Append(next_prio);
797 
798   // Varint encoding of int32 and int64 is the same, so treat the value as
799   // int64 after reading.
800   int64_t prev_state = ReadSignedFtraceValue(start + format->prev_state_offset,
801                                              format->prev_state_type);
802   compact_buf->sched_switch().prev_state().Append(prev_state);
803 
804   // next_comm
805   const char* comm_ptr =
806       reinterpret_cast<const char*>(start + format->next_comm_offset);
807   size_t iid = compact_buf->interner().InternComm(comm_ptr);
808   compact_buf->sched_switch().next_comm_index().Append(iid);
809 }
810 
811 // static
ParseSchedWakingCompact(const uint8_t * start,uint64_t timestamp,const CompactSchedWakingFormat * format,CompactSchedBuffer * compact_buf,FtraceMetadata * metadata)812 void CpuReader::ParseSchedWakingCompact(const uint8_t* start,
813                                         uint64_t timestamp,
814                                         const CompactSchedWakingFormat* format,
815                                         CompactSchedBuffer* compact_buf,
816                                         FtraceMetadata* metadata) {
817   compact_buf->sched_waking().AppendTimestamp(timestamp);
818 
819   int32_t pid = ReadValue<int32_t>(start + format->pid_offset);
820   compact_buf->sched_waking().pid().Append(pid);
821   metadata->AddPid(pid);
822 
823   int32_t target_cpu = ReadValue<int32_t>(start + format->target_cpu_offset);
824   compact_buf->sched_waking().target_cpu().Append(target_cpu);
825 
826   int32_t prio = ReadValue<int32_t>(start + format->prio_offset);
827   compact_buf->sched_waking().prio().Append(prio);
828 
829   // comm
830   const char* comm_ptr =
831       reinterpret_cast<const char*>(start + format->comm_offset);
832   size_t iid = compact_buf->interner().InternComm(comm_ptr);
833   compact_buf->sched_waking().comm_index().Append(iid);
834 }
835 
836 }  // namespace perfetto
837