1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
18 #define SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include <vector>
24 
25 namespace protozero {
26 
27 // Loads the proto-encoded bytecode in memory and allows fast lookups for tuples
28 // (msg_index, field_id) to tell if a given field should be allowed or not and,
29 // in the case of nested fields, what is the next message index to recurse into.
30 // This class does two things:
31 // 1. Expands the array of varint from the proto into a vector<uint32_t>. This
32 //    is to avoid performing varint decoding on every lookup, at the cost of
33 //    some extra memory (2KB-4KB). Note that the expanded vector is not just a
34 //    1:1 copy of the proto one (more below). This is to avoid O(Fields) linear
35 //    lookup complexity.
36 // 2. Creates an index of offsets to remember the start word for each message.
37 //    This is so we can jump to O(1) to the N-th message when recursing into a
38 //    nested fields, without having to scan and find the (N-1)-th END_OF_MESSAGE
39 //    marker.
40 // Overall lookups are O(1) for field ids < 128 (kDirectlyIndexLimit) and O(N),
41 // with N being the number of allowed field ranges for other fields.
42 // See comments around |word_| below for the structure of the word vector.
43 class FilterBytecodeParser {
44  public:
45   // Result of a Query() operation
46   struct QueryResult {
47     bool allowed;  // Whether the field is allowed at all or no.
48 
49     // If |allowed|==true && simple_field()==false, this tells the message index
50     // of the nested field that should be used when recursing in the parser.
51     uint32_t nested_msg_index;
52 
53     // If |allowed|==true, specifies if the field is of a simple type (varint,
54     // fixed32/64, string or byte) or a nested field that needs recursion.
55     // In the latter case the caller is expected to use |nested_msg_index| for
56     // the next Query() calls.
simple_fieldQueryResult57     bool simple_field() const { return nested_msg_index == kSimpleField; }
58   };
59 
60   // Loads a filter. The filter data consists of a sequence of varints which
61   // contains the filter opcodes and a final checksum.
62   bool Load(const void* filter_data, size_t len);
63 
64   // Checks wheter a given field is allowed or not.
65   // msg_index = 0 is the index of the root message, where all queries should
66   // start from (typically perfetto.protos.Trace).
67   QueryResult Query(uint32_t msg_index, uint32_t field_id);
68 
69   void Reset();
set_suppress_logs_for_fuzzer(bool x)70   void set_suppress_logs_for_fuzzer(bool x) { suppress_logs_for_fuzzer_ = x; }
71 
72  private:
73   static constexpr uint32_t kDirectlyIndexLimit = 128;
74   static constexpr uint32_t kAllowed = 1u << 31u;
75   static constexpr uint32_t kSimpleField = 0x7fffffff;
76 
77   bool LoadInternal(const uint8_t* filter_data, size_t len);
78 
79   // The state of all fields for all messages is stored in one contiguous array.
80   // This is to avoid memory fragmentation and allocator overhead.
81   // We expect a high number of messages (hundreds), but each message is small.
82   // For each message we store two sets of uint32:
83   // 1. A set of "directly indexed" fields, for field ids < 128.
84   // 2. The remainder is a set of ranges.
85   // So each message descriptor consists of a sequence of words as follows:
86   //
87   // [0] -> how many directly indexed fields are stored next (up to 128)
88   //
89   // [1..N] -> One word per field id (See "field state" below).
90   //
91   // [N + 1] -> Start of field id range 1
92   // [N + 2] -> End of field id range 1 (exclusive, STL-style).
93   // [N + 3] -> Field state for fields in range 1 (below)
94   //
95   // [N + 4] -> Start of field id range 2
96   // [N + 5] -> End of field id range 2 (exclusive, STL-style).
97   // [N + 6] -> Field state for fields in range 2 (below)
98 
99   // The "field state" word is as follows:
100   // Bit 31: 0 if the field is disallowed, 1 if allowed.
101   //         Only directly indexed fields can be 0 (it doesn't make sense to add
102   //         a range and then say "btw it's NOT allowed".. don't add it then.
103   //         0 is only used for filling gaps in the directly indexed bucket.
104   // Bits [30..0] (only when MSB == allowed):
105   //  0x7fffffff: The field is "simple" (varint, fixed32/64, string, bytes) and
106   //      can be directly passed through in output. No recursion is needed.
107   //  [0, 7ffffffe]: The field is a nested submessage. The value is the index
108   //     that must be passed as first argument to the next Query() calls.
109   //     Note that the message index is purely a monotonic counter in the
110   //     filter bytecode, has no proto-equivalent match (unlike field ids).
111   std::vector<uint32_t> words_;
112 
113   // One entry for each message index stored in the filter plus a sentinel at
114   // the end. Maps each message index to the offset in |words_| where the the
115   // Nth message start.
116   // message_offset_.size() - 2 == the max message id that can be parsed.
117   std::vector<uint32_t> message_offset_;
118 
119   bool suppress_logs_for_fuzzer_ = false;
120 };
121 
122 }  // namespace protozero
123 
124 #endif  // SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
125