1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/protozero/filtering/filter_util.h"
18 
19 #include <algorithm>
20 #include <map>
21 #include <memory>
22 #include <set>
23 
24 #include <google/protobuf/compiler/importer.h>
25 
26 #include "perfetto/base/build_config.h"
27 #include "perfetto/ext/base/file_utils.h"
28 #include "perfetto/ext/base/getopt.h"
29 #include "perfetto/ext/base/string_utils.h"
30 #include "perfetto/ext/base/version.h"
31 #include "perfetto/protozero/proto_utils.h"
32 #include "src/protozero/filtering/filter_bytecode_generator.h"
33 
34 namespace protozero {
35 
36 namespace {
37 
38 class MultiFileErrorCollectorImpl
39     : public google::protobuf::compiler::MultiFileErrorCollector {
40  public:
41   ~MultiFileErrorCollectorImpl() override;
42   void AddError(const std::string&, int, int, const std::string&) override;
43   void AddWarning(const std::string&, int, int, const std::string&) override;
44 };
45 
46 MultiFileErrorCollectorImpl::~MultiFileErrorCollectorImpl() = default;
47 
AddError(const std::string & filename,int line,int column,const std::string & message)48 void MultiFileErrorCollectorImpl::AddError(const std::string& filename,
49                                            int line,
50                                            int column,
51                                            const std::string& message) {
52   PERFETTO_ELOG("Error %s %d:%d: %s", filename.c_str(), line, column,
53                 message.c_str());
54 }
55 
AddWarning(const std::string & filename,int line,int column,const std::string & message)56 void MultiFileErrorCollectorImpl::AddWarning(const std::string& filename,
57                                              int line,
58                                              int column,
59                                              const std::string& message) {
60   PERFETTO_ELOG("Warning %s %d:%d: %s", filename.c_str(), line, column,
61                 message.c_str());
62 }
63 
64 }  // namespace
65 
66 FilterUtil::FilterUtil() = default;
67 FilterUtil::~FilterUtil() = default;
68 
LoadMessageDefinition(const std::string & proto_file,const std::string & root_message,const std::string & proto_dir_path)69 bool FilterUtil::LoadMessageDefinition(const std::string& proto_file,
70                                        const std::string& root_message,
71                                        const std::string& proto_dir_path) {
72   // The protobuf compiler doesn't like backslashes and prints an error like:
73   // Error C:\it7mjanpw3\perfetto-a16500 -1:0: Backslashes, consecutive slashes,
74   // ".", or ".." are not allowed in the virtual path.
75   // Given that C:\foo\bar is a legit path on windows, fix it at this level
76   // because the problem is really the protobuf compiler being too picky.
77   static auto normalize_for_win = [](const std::string& path) {
78 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
79     return perfetto::base::ReplaceAll(path, "\\", "/");
80 #else
81     return path;
82 #endif
83   };
84 
85   google::protobuf::compiler::DiskSourceTree dst;
86 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
87   // If the path is absolute, maps "C:/" -> "C:/" (without hardcoding 'C').
88   if (proto_file.size() > 3 && proto_file[1] == ':') {
89     char win_drive[4];
90     sprintf(win_drive, "%c:/", proto_file[0]);
91     dst.MapPath(win_drive, win_drive);
92   }
93 #endif
94   dst.MapPath("/", "/");  // We might still need this on Win under cygwin.
95   dst.MapPath("", normalize_for_win(proto_dir_path));
96   MultiFileErrorCollectorImpl mfe;
97   google::protobuf::compiler::Importer importer(&dst, &mfe);
98   const google::protobuf::FileDescriptor* root_file =
99       importer.Import(normalize_for_win(proto_file));
100   const google::protobuf::Descriptor* root_msg = nullptr;
101   if (!root_message.empty()) {
102     root_msg = importer.pool()->FindMessageTypeByName(root_message);
103   } else if (root_file->message_type_count() > 0) {
104     // The user didn't specfy the root type. Pick the first type in the file,
105     // most times it's the right guess.
106     root_msg = root_file->message_type(0);
107     if (root_msg)
108       PERFETTO_LOG(
109           "The guessed root message name is \"%s\". Pass -r com.MyName to "
110           "override",
111           root_msg->full_name().c_str());
112   }
113 
114   if (!root_msg) {
115     PERFETTO_ELOG("Could not find the root message \"%s\" in %s",
116                   root_message.c_str(), proto_file.c_str());
117     return false;
118   }
119 
120   // |descriptors_by_full_name| is passed by argument rather than being a member
121   // field so that we don't risk leaving it out of sync (and depending on it in
122   // future without realizing) when performing the Dedupe() pass.
123   DescriptorsByNameMap descriptors_by_full_name;
124   ParseProtoDescriptor(root_msg, &descriptors_by_full_name);
125   return true;
126 }
127 
128 // Generates a Message object for the given libprotobuf message descriptor.
129 // Recurses as needed into nested fields.
ParseProtoDescriptor(const google::protobuf::Descriptor * proto,DescriptorsByNameMap * descriptors_by_full_name)130 FilterUtil::Message* FilterUtil::ParseProtoDescriptor(
131     const google::protobuf::Descriptor* proto,
132     DescriptorsByNameMap* descriptors_by_full_name) {
133   auto descr_it = descriptors_by_full_name->find(proto->full_name());
134   if (descr_it != descriptors_by_full_name->end())
135     return descr_it->second;
136 
137   descriptors_.emplace_back();
138   Message* msg = &descriptors_.back();
139   msg->full_name = proto->full_name();
140   (*descriptors_by_full_name)[msg->full_name] = msg;
141   for (int i = 0; i < proto->field_count(); ++i) {
142     const auto* proto_field = proto->field(i);
143     const uint32_t field_id = static_cast<uint32_t>(proto_field->number());
144     PERFETTO_CHECK(msg->fields.count(field_id) == 0);
145     auto& field = msg->fields[field_id];
146     field.name = proto_field->name();
147     field.type = proto_field->type_name();
148     if (proto_field->message_type()) {
149       msg->has_nested_fields = true;
150       // Recurse.
151       field.nested_type = ParseProtoDescriptor(proto_field->message_type(),
152                                                descriptors_by_full_name);
153     }
154   }
155   return msg;
156 }
157 
Dedupe()158 void FilterUtil::Dedupe() {
159   std::map<std::string /*identity*/, Message*> index;
160 
161   std::map<Message*, Message*> dupe_graph;  // K,V: K shall be duped against V.
162 
163   // As a first pass, generate an |identity| string for each leaf message. The
164   // identity is simply the comma-separated stringification of its field ids.
165   // If another message with the same identity exists, add an edge to the graph.
166   const size_t initial_count = descriptors_.size();
167   size_t field_count = 0;
168   for (auto& descr : descriptors_) {
169     if (descr.has_nested_fields)
170       continue;  // Dedupe only leaf messages without nested fields.
171     std::string identity;
172     for (const auto& id_and_field : descr.fields)
173       identity.append(std::to_string(id_and_field.first) + ",");
174     auto it_and_inserted = index.emplace(identity, &descr);
175     if (!it_and_inserted.second) {
176       // insertion failed, a dupe exists already.
177       Message* dupe_against = it_and_inserted.first->second;
178       dupe_graph.emplace(&descr, dupe_against);
179     }
180   }
181 
182   // Now apply de-duplications by re-directing the nested_type pointer to the
183   // equivalent descriptors that have the same set of allowed field ids.
184   std::set<Message*> referenced_descriptors;
185   referenced_descriptors.emplace(&descriptors_.front());  // The root.
186   for (auto& descr : descriptors_) {
187     for (auto& id_and_field : descr.fields) {
188       Message* target = id_and_field.second.nested_type;
189       if (!target)
190         continue;  // Only try to dedupe nested types.
191       auto it = dupe_graph.find(target);
192       if (it == dupe_graph.end()) {
193         referenced_descriptors.emplace(target);
194         continue;
195       }
196       ++field_count;
197       // Replace with the dupe.
198       id_and_field.second.nested_type = it->second;
199     }  // for (nested_fields).
200   }    // for (descriptors_).
201 
202   // Remove unreferenced descriptors. We should much rather crash in the case of
203   // a logic bug rathern than trying to use them but don't emit them.
204   size_t removed_count = 0;
205   for (auto it = descriptors_.begin(); it != descriptors_.end();) {
206     if (referenced_descriptors.count(&*it)) {
207       ++it;
208     } else {
209       ++removed_count;
210       it = descriptors_.erase(it);
211     }
212   }
213   PERFETTO_LOG(
214       "Deduplication removed %zu duped descriptors out of %zu descriptors from "
215       "%zu fields",
216       removed_count, initial_count, field_count);
217 }
218 
219 // Prints the list of messages and fields in a diff-friendly text format.
PrintAsText()220 void FilterUtil::PrintAsText() {
221   using perfetto::base::StripPrefix;
222   const std::string& root_name = descriptors_.front().full_name;
223   std::string root_prefix = root_name.substr(0, root_name.rfind('.'));
224   if (!root_prefix.empty())
225     root_prefix.append(".");
226 
227   for (const auto& descr : descriptors_) {
228     for (const auto& id_and_field : descr.fields) {
229       const uint32_t field_id = id_and_field.first;
230       const auto& field = id_and_field.second;
231       const Message* nested_type = id_and_field.second.nested_type;
232       auto stripped_name = StripPrefix(descr.full_name, root_prefix);
233       std::string stripped_nested =
234           nested_type ? StripPrefix(nested_type->full_name, root_prefix) : "";
235       printf("%-60s %3u %-8s %-32s %s\n", stripped_name.c_str(), field_id,
236              field.type.c_str(), field.name.c_str(), stripped_nested.c_str());
237     }
238   }
239 }
240 
GenerateFilterBytecode()241 std::string FilterUtil::GenerateFilterBytecode() {
242   protozero::FilterBytecodeGenerator bytecode_gen;
243 
244   // Assign indexes to descriptors, simply by counting them in order;
245   std::map<Message*, uint32_t> descr_to_idx;
246   for (auto& descr : descriptors_)
247     descr_to_idx[&descr] = static_cast<uint32_t>(descr_to_idx.size());
248 
249   for (auto& descr : descriptors_) {
250     for (auto it = descr.fields.begin(); it != descr.fields.end();) {
251       uint32_t field_id = it->first;
252       const Message::Field& field = it->second;
253       if (field.nested_type) {
254         // Append the index of the target submessage.
255         PERFETTO_CHECK(descr_to_idx.count(field.nested_type));
256         uint32_t nested_msg_index = descr_to_idx[field.nested_type];
257         bytecode_gen.AddNestedField(field_id, nested_msg_index);
258         ++it;
259         continue;
260       }
261       // Simple field. Lookahead to see if we have a range of contiguous simple
262       // fields.
263       for (uint32_t range_len = 1;; ++range_len) {
264         ++it;
265         if (it != descr.fields.end() && it->first == field_id + range_len &&
266             it->second.is_simple()) {
267           continue;
268         }
269         // At this point it points to either the end() of the vector or a
270         // non-contiguous or non-simple field (which will be picked up by the
271         // next iteration).
272         if (range_len == 1) {
273           bytecode_gen.AddSimpleField(field_id);
274         } else {
275           bytecode_gen.AddSimpleFieldRange(field_id, range_len);
276         }
277         break;
278       }  // for (range_len)
279     }    // for (descr.fields)
280     bytecode_gen.EndMessage();
281   }  // for (descriptors)
282   return bytecode_gen.Serialize();
283 }
284 
LookupField(const std::string & varint_encoded_path)285 std::string FilterUtil::LookupField(const std::string& varint_encoded_path) {
286   const uint8_t* ptr =
287       reinterpret_cast<const uint8_t*>(varint_encoded_path.data());
288   const uint8_t* const end = ptr + varint_encoded_path.size();
289 
290   std::vector<uint32_t> fields;
291   while (ptr < end) {
292     uint64_t varint;
293     const uint8_t* next = proto_utils::ParseVarInt(ptr, end, &varint);
294     PERFETTO_CHECK(next != ptr);
295     fields.emplace_back(static_cast<uint32_t>(varint));
296     ptr = next;
297   }
298   return LookupField(fields.data(), fields.size());
299 }
300 
LookupField(const uint32_t * field_ids,size_t num_fields)301 std::string FilterUtil::LookupField(const uint32_t* field_ids,
302                                     size_t num_fields) {
303   const Message* msg = descriptors_.empty() ? nullptr : &descriptors_.front();
304   std::string res;
305   for (size_t i = 0; i < num_fields; ++i) {
306     const uint32_t field_id = field_ids[i];
307     const Message::Field* field = nullptr;
308     if (msg) {
309       auto it = msg->fields.find(field_id);
310       field = it == msg->fields.end() ? nullptr : &it->second;
311     }
312     res.append(".");
313     if (field) {
314       res.append(field->name);
315       msg = field->nested_type;
316     } else {
317       res.append(std::to_string(field_id));
318     }
319   }
320   return res;
321 }
322 
323 }  // namespace protozero
324