1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_
18 #define SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_
19 
20 #include <stdint.h>
21 
22 #include "perfetto/base/compiler.h"
23 #include "perfetto/base/logging.h"
24 #include "perfetto/protozero/proto_utils.h"
25 
26 namespace protozero {
27 
28 // A helper class for schema-less tokenizing of protobuf messages.
29 // This class takes a stream of proto-encoded bytes, pushed one by one in input
30 // via Push(octet), and returns a stream of tokens (each Push() call can return
31 // 0 or 1 token).
32 // A "token" contains metadata about a field, specifically: its ID, its wire
33 // type and:
34 //  - For varint and fixed32/64 fields: its payload.
35 //  - For string and bytes fields: the length of its payload.
36 //    In this case the caller is supposed to "eat" those N bytes before calling
37 //    Push() again.
38 // Note that this class cannot differentiate between a string/bytes field or
39 // a submessage, because they are encoded in the same way. The caller is
40 // supposed to know whether a field can be recursed into by just keep calling
41 // Push() or is a string that should be skipped.
42 // This is inline to allow the compiler to see through the Push method and
43 // avoid a function call for each byte.
44 class MessageTokenizer {
45  public:
46   struct Token {
47     uint32_t field_id;  // 0 == not valid.
48     proto_utils::ProtoWireType type;
49 
50     // For kLengthDelimited, |value| represent the length of the payload.
51     uint64_t value;
52 
validToken53     inline bool valid() const { return field_id != 0; }
54     bool operator==(const Token& o) const {
55       return field_id == o.field_id && type == o.type && value == o.value;
56     }
57   };
58 
59   // Pushes a byte in input and returns a token, only when getting to the last
60   // byte of each field. Specifically:
61   // - For varint and fixed32 fields, the Token is returned after the last byte
62   //   of the numeric payload is pushed.
63   // - For length-delimited fields, this returns after the last byte of the
64   //   length is pushed (i.e. right before the payload starts). The caller is
65   //   expected to either skip the next |value| bytes (in the case of a string
66   //   or bytes fields) or keep calling Push, in the case of a submessage.
Push(uint8_t octet)67   inline Token Push(uint8_t octet) {
68     using protozero::proto_utils::ProtoWireType;
69 
70     // Parsing a fixed32/64 field is the only case where we don't have to do
71     // any varint decoding. This is why this block is before the remaining
72     // switch statement below (all the rest is a varint).
73     if (PERFETTO_UNLIKELY(state_ == kFixedIntValue)) {
74       PERFETTO_DCHECK(fixed_int_bits_ == 32 || fixed_int_bits_ == 64);
75       fixed_int_value_ |= static_cast<uint64_t>(octet) << fixed_int_shift_;
76       fixed_int_shift_ += 8;
77       if (fixed_int_shift_ < fixed_int_bits_)
78         return Token{};  // Intermediate byte of a fixed32/64.
79       auto wire_type = fixed_int_bits_ == 32 ? ProtoWireType::kFixed32
80                                              : ProtoWireType::kFixed64;
81       uint64_t fixed_int_value = fixed_int_value_;
82       fixed_int_value_ = fixed_int_shift_ = fixed_int_bits_ = 0;
83       state_ = kFieldPreamble;
84       return Token{field_id_, wire_type, fixed_int_value};
85     }
86 
87     // At this point either we are: (i) parsing a field preamble; (ii) parsing a
88     // varint field paylod; (iii) parsing the length of a length-delimited
89     // field. In all cases, we need to decode a varint before proceeding.
90     varint_ |= static_cast<uint64_t>(octet & 0x7F) << varint_shift_;
91     if (octet & 0x80) {
92       varint_shift_ += 7;
93       if (PERFETTO_UNLIKELY(varint_shift_ >= 64)) {
94         varint_shift_ = 0;
95         state_ = kInvalidVarInt;
96       }
97       return Token{};  // Still parsing a varint.
98     }
99 
100     uint64_t varint = varint_;
101     varint_ = 0;
102     varint_shift_ = 0;
103 
104     switch (state_) {
105       case kFieldPreamble: {
106         auto field_type = static_cast<uint32_t>(varint & 7u);  // 7 = 0..0111
107         field_id_ = static_cast<uint32_t>(varint >> 3);
108 
109         // The field type is legit, now check it's well formed and within
110         // boundaries.
111         if (field_type == static_cast<uint32_t>(ProtoWireType::kVarInt)) {
112           state_ = kVarIntValue;
113         } else if (field_type ==
114                        static_cast<uint32_t>(ProtoWireType::kFixed32) ||
115                    field_type ==
116                        static_cast<uint32_t>(ProtoWireType::kFixed64)) {
117           state_ = kFixedIntValue;
118           fixed_int_shift_ = 0;
119           fixed_int_value_ = 0;
120           fixed_int_bits_ =
121               field_type == static_cast<uint32_t>(ProtoWireType::kFixed32) ? 32
122                                                                            : 64;
123         } else if (field_type ==
124                    static_cast<uint32_t>(ProtoWireType::kLengthDelimited)) {
125           state_ = kLenDelimited;
126         } else {
127           state_ = kInvalidFieldType;
128         }
129         return Token{};
130       }
131 
132       case kVarIntValue: {
133         // Return the varint field payload and go back to the next field.
134         state_ = kFieldPreamble;
135         return Token{field_id_, ProtoWireType::kVarInt, varint};
136       }
137 
138       case kLenDelimited: {
139         const auto payload_len = varint;
140         if (payload_len > protozero::proto_utils::kMaxMessageLength) {
141           state_ = kMessageTooBig;
142           return Token{};
143         }
144         state_ = kFieldPreamble;
145         // At this point the caller is expected to consume the next
146         // |payload_len| bytes.
147         return Token{field_id_, ProtoWireType::kLengthDelimited, payload_len};
148       }
149 
150       case kFixedIntValue:
151         // Unreacheable because of the if before the switch.
152         PERFETTO_DCHECK(false);
153         break;
154 
155       // Unrecoverable error states.
156       case kInvalidFieldType:
157       case kMessageTooBig:
158       case kInvalidVarInt:
159         break;
160     }  // switch(state_)
161 
162     return Token{};  // Keep GCC happy.
163   }
164 
165   // Returns true if the tokenizer FSM has reached quiescence (i.e. if we are
166   // NOT in the middle of parsing a field).
idle()167   bool idle() const {
168     return state_ == kFieldPreamble && varint_shift_ == 0 &&
169            fixed_int_shift_ == 0;
170   }
171 
172   // Only for reporting parser errors in the trace.
state()173   uint32_t state() const { return static_cast<uint32_t>(state_); }
174 
175  private:
176   enum State {
177     kFieldPreamble = 0,  // Parsing the varint for the field preamble.
178     kVarIntValue = 1,    // Parsing the payload of a varint field.
179     kFixedIntValue = 2,  // Parsing the payload of a fixed32/64 field.
180     kLenDelimited = 3,   // Parsing the length of a length-delimited field.
181 
182     // Unrecoverable error states:
183     kInvalidFieldType = 4,  // Encountered an invalid field type.
184     kMessageTooBig = 5,     // Size of the length delimited message was too big.
185     kInvalidVarInt = 6,     // Varint larger than 64 bits.
186   };
187 
188   State state_ = kFieldPreamble;
189   uint32_t field_id_ = 0;
190   uint64_t varint_ = 0;
191   uint32_t varint_shift_ = 0;
192   uint32_t fixed_int_shift_ = 0;
193   uint32_t fixed_int_bits_ = 0;
194   uint64_t fixed_int_value_ = 0;
195 };
196 
197 }  // namespace protozero
198 
199 #endif  // SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_
200