1 /*
2 ** Internal-only definitions for the decoder.
3 */
4
5 #ifndef UPB_DECODER_INT_H_
6 #define UPB_DECODER_INT_H_
7
8 #include "upb/def.h"
9 #include "upb/handlers.h"
10 #include "upb/pb/decoder.h"
11 #include "upb/sink.h"
12 #include "upb/table.int.h"
13
14 #include "upb/port_def.inc"
15
16 /* Opcode definitions. The canonical meaning of each opcode is its
17 * implementation in the interpreter (the JIT is written to match this).
18 *
19 * All instructions have the opcode in the low byte.
20 * Instruction format for most instructions is:
21 *
22 * +-------------------+--------+
23 * | arg (24) | op (8) |
24 * +-------------------+--------+
25 *
26 * Exceptions are indicated below. A few opcodes are multi-word. */
27 typedef enum {
28 /* Opcodes 1-8, 13, 15-18 parse their respective descriptor types.
29 * Arg for all of these is the upb selector for this field. */
30 #define T(type) OP_PARSE_ ## type = UPB_DESCRIPTOR_TYPE_ ## type
31 T(DOUBLE), T(FLOAT), T(INT64), T(UINT64), T(INT32), T(FIXED64), T(FIXED32),
32 T(BOOL), T(UINT32), T(SFIXED32), T(SFIXED64), T(SINT32), T(SINT64),
33 #undef T
34 OP_STARTMSG = 9, /* No arg. */
35 OP_ENDMSG = 10, /* No arg. */
36 OP_STARTSEQ = 11,
37 OP_ENDSEQ = 12,
38 OP_STARTSUBMSG = 14,
39 OP_ENDSUBMSG = 19,
40 OP_STARTSTR = 20,
41 OP_STRING = 21,
42 OP_ENDSTR = 22,
43
44 OP_PUSHTAGDELIM = 23, /* No arg. */
45 OP_PUSHLENDELIM = 24, /* No arg. */
46 OP_POP = 25, /* No arg. */
47 OP_SETDELIM = 26, /* No arg. */
48 OP_SETBIGGROUPNUM = 27, /* two words:
49 * | unused (24) | opc (8) |
50 * | groupnum (32) | */
51 OP_CHECKDELIM = 28,
52 OP_CALL = 29,
53 OP_RET = 30,
54 OP_BRANCH = 31,
55
56 /* Different opcodes depending on how many bytes expected. */
57 OP_TAG1 = 32, /* | match tag (16) | jump target (8) | opc (8) | */
58 OP_TAG2 = 33, /* | match tag (16) | jump target (8) | opc (8) | */
59 OP_TAGN = 34, /* three words: */
60 /* | unused (16) | jump target(8) | opc (8) | */
61 /* | match tag 1 (32) | */
62 /* | match tag 2 (32) | */
63
64 OP_SETDISPATCH = 35, /* N words: */
65 /* | unused (24) | opc | */
66 /* | upb_inttable* (32 or 64) | */
67
68 OP_DISPATCH = 36, /* No arg. */
69
70 OP_HALT = 37 /* No arg. */
71 } opcode;
72
73 #define OP_MAX OP_HALT
74
getop(uint32_t instr)75 UPB_INLINE opcode getop(uint32_t instr) { return (opcode)(instr & 0xff); }
76
/* Definition of struct upb_pbcodecache.  Bundles an arena, a handler cache
 * (dest), two boolean settings and the table of method groups. */
struct upb_pbcodecache {
  upb_arena *arena;
  upb_handlercache *dest;
  /* NOTE(review): presumably gates use of the JIT -- confirm against the code
   * that reads this flag. */
  bool allow_jit;
  /* NOTE(review): presumably selects lazy (on-demand) compilation -- confirm
   * against the code that reads this flag. */
  bool lazy;

  /* Map of upb_msgdef -> mgroup. */
  upb_inttable groups;
};
86
/* Method group; represents a set of decoder methods that had their code
 * emitted together.  Immutable once created. */
typedef struct {
  /* Maps upb_msgdef/upb_handlers -> upb_pbdecodermethod.  Owned by us.
   *
   * Ideally this would be on pbcodecache (if we were actually caching code).
   * Right now we don't actually cache anything, which is wasteful. */
  upb_inttable methods;

  /* The bytecode for our methods, if any exists.  Owned by us.
   * Instructions are 32-bit words.
   * NOTE(review): bytecode_end presumably points one past the last word --
   * confirm against the code that emits the bytecode. */
  uint32_t *bytecode;
  uint32_t *bytecode_end;
} mgroup;
100
/* The maximum depth to which submessages can be nested.  Matches proto2's
 * limit.  This specifies the size of the decoder's statically-sized array and
 * therefore setting it high will cause the upb::pb::Decoder object to be
 * larger.
 *
 * If necessary we can add a runtime-settable property to Decoder that allows
 * this to be larger than the compile-time setting, but this would add
 * complexity, particularly since we would have to decide how/if to give users
 * the ability to set a custom memory allocation function. */
#define UPB_DECODER_MAX_NESTING 64
110
/* Internal-only struct used by the decoder: the element type of the decoder's
 * internal stack (the stack/top/limit members of struct upb_pbdecoder). */
typedef struct {
  /* Space optimization note: we store two pointers here that the JIT
   * doesn't need at all; the upb_handlers* inside the sink and
   * the dispatch table pointer.  We can optimize so that the JIT uses
   * smaller stack frames than the interpreter.  The only thing we need
   * to guarantee is that the fallback routines can find end_ofs. */
  upb_sink sink;

  /* The absolute stream offset of the end-of-frame delimiter.
   * Non-delimited frames (groups and non-packed repeated fields) reuse the
   * delimiter of their parent, even though the frame may not end there.
   *
   * NOTE: the JIT stores a slightly different value here for non-top frames.
   * It stores the value relative to the end of the enclosed message.  But the
   * top frame is still stored the same way, which is important for ensuring
   * that calls from the JIT into C work correctly. */
  uint64_t end_ofs;
  /* NOTE(review): presumably a position in the bytecode associated with this
   * frame -- confirm against the interpreter. */
  const uint32_t *base;

  /* 0 indicates a length-delimited field.
   * A positive number indicates a known group.
   * A negative number indicates an unknown group. */
  int32_t groupnum;
  upb_inttable *dispatch;  /* Not used by the JIT. */
} upb_pbdecoder_frame;
137
/* Definition of struct upb_pbdecodermethod: one decoder method, i.e. the
 * location of its code, the group it belongs to, the handler used to invoke
 * it, the destination handlers it is bound to and its dispatch table. */
struct upb_pbdecodermethod {
  /* While compiling, the base is relative in "ofs", after compiling it is
   * absolute in "ptr". */
  union {
    uint32_t ofs;     /* PC offset of method. */
    void *ptr;        /* Pointer to bytecode or machine code for this method. */
  } code_base;

  /* The decoder method group to which this method belongs. */
  const mgroup *group;

  /* Whether this method is native code or bytecode. */
  bool is_native_;

  /* The handler one calls to invoke this method. */
  upb_byteshandler input_handler_;

  /* The destination handlers this method is bound to.  We own a ref. */
  const upb_handlers *dest_handlers_;

  /* Dispatch table -- used by both bytecode decoder and JIT when encountering a
   * field number that wasn't the one we were expecting to see.  See the
   * dispatch table layout comment above upb_pbdecoder_packdispatch() in this
   * header for the layout of this table. */
  upb_inttable dispatch;
};
163
164 struct upb_pbdecoder {
165 upb_arena *arena;
166
167 /* Our input sink. */
168 upb_bytessink input_;
169
170 /* The decoder method we are parsing with (owned). */
171 const upb_pbdecodermethod *method_;
172
173 size_t call_len;
174 const uint32_t *pc, *last;
175
176 /* Current input buffer and its stream offset. */
177 const char *buf, *ptr, *end, *checkpoint;
178
179 /* End of the delimited region, relative to ptr, NULL if not in this buf. */
180 const char *delim_end;
181
182 /* End of the delimited region, relative to ptr, end if not in this buf. */
183 const char *data_end;
184
185 /* Overall stream offset of "buf." */
186 uint64_t bufstart_ofs;
187
188 /* Buffer for residual bytes not parsed from the previous buffer. */
189 char residual[UPB_DECODER_MAX_RESIDUAL_BYTES];
190 char *residual_end;
191
192 /* Bytes of data that should be discarded from the input beore we start
193 * parsing again. We set this when we internally determine that we can
194 * safely skip the next N bytes, but this region extends past the current
195 * user buffer. */
196 size_t skip;
197
198 /* Stores the user buffer passed to our decode function. */
199 const char *buf_param;
200 size_t size_param;
201 const upb_bufhandle *handle;
202
203 /* Our internal stack. */
204 upb_pbdecoder_frame *stack, *top, *limit;
205 const uint32_t **callstack;
206 size_t stack_size;
207
208 upb_status *status;
209 };
210
/* Decoder entry points; used as handlers.  Each receives its context through
 * an opaque closure pointer. */
void *upb_pbdecoder_startbc(void *closure, const void *pc, size_t size_hint);
size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf,
                            size_t size, const upb_bufhandle *handle);
bool upb_pbdecoder_end(void *closure, const void *handler_data);

/* Decoder-internal functions that the JIT calls to handle fallback paths.
 * NOTE(review): the int32_t results presumably follow the DECODE_* return
 * scheme documented alongside DECODE_OK in this header (that scheme is stated
 * for the int32_t-returning functions in decoder.c) -- confirm. */
int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf,
                             size_t size, const upb_bufhandle *handle);
size_t upb_pbdecoder_suspend(upb_pbdecoder *d);
int32_t upb_pbdecoder_skipunknown(upb_pbdecoder *d, int32_t fieldnum,
                                  uint8_t wire_type);
int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d, uint64_t expected);
int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d, uint64_t *u64);
int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32);
int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64);
void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg);

/* Error messages that are shared between the bytecode and JIT decoders.
 * Declared here only; the definitions live in another translation unit. */
extern const char *kPbDecoderStackOverflow;
extern const char *kPbDecoderSubmessageTooLong;

/* Access to decoderplan members needed by the decoder.
 * NOTE(review): presumably returns a printable name for opcode `op` --
 * confirm against the definition. */
const char *upb_pbdecoder_getopname(unsigned int op);
235
/* A special label that means "do field dispatch for this message and branch to
 * wherever that takes you." */
#define LABEL_DISPATCH 0

/* A special slot in the dispatch table that stores the epilogue (ENDMSG and/or
 * RET) for branching to when we find an appropriate ENDGROUP tag. */
#define DISPATCH_ENDMSG 0

/* Sentinel meaning "no wire type".  It's important to use this invalid wire
 * type instead of 0 (which is a valid wire type).  The value fits in the
 * uint8_t wire-type parameters used by the dispatch packing helpers in this
 * header. */
#define NO_WIRE_TYPE 0xff
247
/* Dispatch table values are packed into 64 bits as:
 *   [field number] -> [ 48-bit offset ][ 8-bit wt2 ][ 8-bit wt1 ]
 *
 * Lookup: when wt1 matches, jump to the 48-bit offset.  When wt2 matches,
 * look up (UPB_MAX_FIELDNUMBER + fieldnum) and jump there instead.
 *
 * Two wire types are needed for packed/non-packed compatibility: a primitive
 * repeated field can use either wire type and be valid.  Keying the table on
 * fieldnum+wiretype would also work, but the table would be 8x sparser.
 *
 * Keeping both wire types in the primary value lets us quickly rule out the
 * second wire type without a separate lookup (this case is less common than
 * an unknown field).
 *
 * upb_pbdecoder_packdispatch() builds such a value.  Any bits of `ofs` above
 * the low 48 are shifted out and lost. */
UPB_INLINE uint64_t upb_pbdecoder_packdispatch(uint64_t ofs, uint8_t wt1,
                                               uint8_t wt2) {
  uint64_t packed = ofs << 16;
  packed |= (uint64_t)wt2 << 8;
  packed |= wt1;
  return packed;
}
265
/* Splits a packed dispatch table value into its three fields: the low byte
 * goes to *wt1, the next byte to *wt2 and the remaining upper 48 bits to
 * *ofs.  All three output pointers are written unconditionally. */
UPB_INLINE void upb_pbdecoder_unpackdispatch(uint64_t dispatch, uint64_t *ofs,
                                             uint8_t *wt1, uint8_t *wt2) {
  const uint8_t primary_wt = (uint8_t)(dispatch & 0xff);
  const uint8_t secondary_wt = (uint8_t)((dispatch >> 8) & 0xff);
  const uint64_t offset = dispatch >> 16;

  *wt1 = primary_wt;
  *wt2 = secondary_wt;
  *ofs = offset;
}
272
/* All of the functions in decoder.c that return int32_t return values according
 * to the following scheme:
 *   1. negative values indicate a return code from the following list.
 *   2. non-negative values indicate that error or end of buffer was hit, and
 *      that the decode function should immediately return the given value
 *      (the decoder state has already been suspended and is ready to be
 *      resumed).
 *
 * The constants are parenthesized so that they expand safely inside any
 * surrounding expression. */
#define DECODE_OK (-1)
#define DECODE_MISMATCH (-2)  /* Used only from checktag_slow(). */
#define DECODE_ENDGROUP (-3)  /* Used only from checkunknown(). */

/* Evaluates `x` exactly once and, if the result is non-negative, returns it
 * from the enclosing function (which must therefore return a type compatible
 * with int32_t).  Negative results (the DECODE_* codes) fall through.
 *
 * Wrapped in do { } while (0) so that "CHECK_RETURN(x);" is a single statement
 * and can be used as the unbraced body of an if/else.  The argument is
 * parenthesized, and the temporary has an unusual name so it is unlikely to
 * shadow a variable that `x` refers to. */
#define CHECK_RETURN(x)                                   \
  do {                                                    \
    int32_t check_return_val_ = (x);                      \
    if (check_return_val_ >= 0) return check_return_val_; \
  } while (0)
285
286 #include "upb/port_undef.inc"
287
288 #endif /* UPB_DECODER_INT_H_ */
289