1 /*
2 ** Internal-only definitions for the decoder.
3 */
4 
5 #ifndef UPB_DECODER_INT_H_
6 #define UPB_DECODER_INT_H_
7 
8 #include "upb/def.h"
9 #include "upb/handlers.h"
10 #include "upb/pb/decoder.h"
11 #include "upb/sink.h"
12 #include "upb/table.int.h"
13 
14 #include "upb/port_def.inc"
15 
16 /* Opcode definitions.  The canonical meaning of each opcode is its
17  * implementation in the interpreter (the JIT is written to match this).
18  *
19  * All instructions have the opcode in the low byte.
20  * Instruction format for most instructions is:
21  *
22  * +-------------------+--------+
23  * |     arg (24)      | op (8) |
24  * +-------------------+--------+
25  *
26  * Exceptions are indicated below.  A few opcodes are multi-word. */
27 typedef enum {
28   /* Opcodes 1-8, 13, 15-18 parse their respective descriptor types.
29    * Arg for all of these is the upb selector for this field. */
30 #define T(type) OP_PARSE_ ## type = UPB_DESCRIPTOR_TYPE_ ## type
31   T(DOUBLE), T(FLOAT), T(INT64), T(UINT64), T(INT32), T(FIXED64), T(FIXED32),
32   T(BOOL), T(UINT32), T(SFIXED32), T(SFIXED64), T(SINT32), T(SINT64),
33 #undef T
34   OP_STARTMSG       = 9,   /* No arg. */
35   OP_ENDMSG         = 10,  /* No arg. */
36   OP_STARTSEQ       = 11,
37   OP_ENDSEQ         = 12,
38   OP_STARTSUBMSG    = 14,
39   OP_ENDSUBMSG      = 19,
40   OP_STARTSTR       = 20,
41   OP_STRING         = 21,
42   OP_ENDSTR         = 22,
43 
44   OP_PUSHTAGDELIM   = 23,  /* No arg. */
45   OP_PUSHLENDELIM   = 24,  /* No arg. */
46   OP_POP            = 25,  /* No arg. */
47   OP_SETDELIM       = 26,  /* No arg. */
48   OP_SETBIGGROUPNUM = 27,  /* two words:
49                             *   | unused (24)     | opc (8) |
50                             *   |        groupnum (32)      | */
51   OP_CHECKDELIM     = 28,
52   OP_CALL           = 29,
53   OP_RET            = 30,
54   OP_BRANCH         = 31,
55 
56   /* Different opcodes depending on how many bytes expected. */
57   OP_TAG1           = 32,  /* | match tag (16) | jump target (8) | opc (8) | */
58   OP_TAG2           = 33,  /* | match tag (16) | jump target (8) | opc (8) | */
59   OP_TAGN           = 34,  /* three words: */
60                            /*   | unused (16) | jump target(8) | opc (8) | */
61                            /*   |           match tag 1 (32)             | */
62                            /*   |           match tag 2 (32)             | */
63 
64   OP_SETDISPATCH    = 35,  /* N words: */
65                            /*   | unused (24)         | opc | */
66                            /*   | upb_inttable* (32 or 64)  | */
67 
68   OP_DISPATCH       = 36,  /* No arg. */
69 
70   OP_HALT           = 37   /* No arg. */
71 } opcode;
72 
73 #define OP_MAX OP_HALT
74 
getop(uint32_t instr)75 UPB_INLINE opcode getop(uint32_t instr) { return (opcode)(instr & 0xff); }
76 
77 struct upb_pbcodecache {
78   upb_arena *arena;
79   upb_handlercache *dest;
80   bool allow_jit;
81   bool lazy;
82 
83   /* Map of upb_msgdef -> mgroup. */
84   upb_inttable groups;
85 };
86 
87 /* Method group; represents a set of decoder methods that had their code
88  * emitted together.  Immutable once created.  */
89 typedef struct {
90   /* Maps upb_msgdef/upb_handlers -> upb_pbdecodermethod.  Owned by us.
91    *
92    * Ideally this would be on pbcodecache (if we were actually caching code).
93    * Right now we don't actually cache anything, which is wasteful. */
94   upb_inttable methods;
95 
96   /* The bytecode for our methods, if any exists.  Owned by us. */
97   uint32_t *bytecode;
98   uint32_t *bytecode_end;
99 } mgroup;
100 
/* The maximum that any submessages can be nested.  Matches proto2's limit.
 * This specifies the size of the decoder's statically-sized array and therefore
 * setting it high will cause the upb::pb::Decoder object to be larger.
 *
 * If necessary we can add a runtime-settable property to Decoder that allows
 * this to be larger than the compile-time setting, but this would add
 * complexity, particularly since we would have to decide how/if to give users
 * the ability to set a custom memory allocation function. */
#define UPB_DECODER_MAX_NESTING 64
110 
111 /* Internal-only struct used by the decoder. */
112 typedef struct {
113   /* Space optimization note: we store two pointers here that the JIT
114    * doesn't need at all; the upb_handlers* inside the sink and
115    * the dispatch table pointer.  We can optimze so that the JIT uses
116    * smaller stack frames than the interpreter.  The only thing we need
117    * to guarantee is that the fallback routines can find end_ofs. */
118   upb_sink sink;
119 
120   /* The absolute stream offset of the end-of-frame delimiter.
121    * Non-delimited frames (groups and non-packed repeated fields) reuse the
122    * delimiter of their parent, even though the frame may not end there.
123    *
124    * NOTE: the JIT stores a slightly different value here for non-top frames.
125    * It stores the value relative to the end of the enclosed message.  But the
126    * top frame is still stored the same way, which is important for ensuring
127    * that calls from the JIT into C work correctly. */
128   uint64_t end_ofs;
129   const uint32_t *base;
130 
131   /* 0 indicates a length-delimited field.
132    * A positive number indicates a known group.
133    * A negative number indicates an unknown group. */
134   int32_t groupnum;
135   upb_inttable *dispatch;  /* Not used by the JIT. */
136 } upb_pbdecoder_frame;
137 
138 struct upb_pbdecodermethod {
139   /* While compiling, the base is relative in "ofs", after compiling it is
140    * absolute in "ptr". */
141   union {
142     uint32_t ofs;     /* PC offset of method. */
143     void *ptr;        /* Pointer to bytecode or machine code for this method. */
144   } code_base;
145 
146   /* The decoder method group to which this method belongs. */
147   const mgroup *group;
148 
149   /* Whether this method is native code or bytecode. */
150   bool is_native_;
151 
152   /* The handler one calls to invoke this method. */
153   upb_byteshandler input_handler_;
154 
155   /* The destination handlers this method is bound to.  We own a ref. */
156   const upb_handlers *dest_handlers_;
157 
158   /* Dispatch table -- used by both bytecode decoder and JIT when encountering a
159    * field number that wasn't the one we were expecting to see.  See
160    * decoder.int.h for the layout of this table. */
161   upb_inttable dispatch;
162 };
163 
164 struct upb_pbdecoder {
165   upb_arena *arena;
166 
167   /* Our input sink. */
168   upb_bytessink input_;
169 
170   /* The decoder method we are parsing with (owned). */
171   const upb_pbdecodermethod *method_;
172 
173   size_t call_len;
174   const uint32_t *pc, *last;
175 
176   /* Current input buffer and its stream offset. */
177   const char *buf, *ptr, *end, *checkpoint;
178 
179   /* End of the delimited region, relative to ptr, NULL if not in this buf. */
180   const char *delim_end;
181 
182   /* End of the delimited region, relative to ptr, end if not in this buf. */
183   const char *data_end;
184 
185   /* Overall stream offset of "buf." */
186   uint64_t bufstart_ofs;
187 
188   /* Buffer for residual bytes not parsed from the previous buffer. */
189   char residual[UPB_DECODER_MAX_RESIDUAL_BYTES];
190   char *residual_end;
191 
192   /* Bytes of data that should be discarded from the input beore we start
193    * parsing again.  We set this when we internally determine that we can
194    * safely skip the next N bytes, but this region extends past the current
195    * user buffer. */
196   size_t skip;
197 
198   /* Stores the user buffer passed to our decode function. */
199   const char *buf_param;
200   size_t size_param;
201   const upb_bufhandle *handle;
202 
203   /* Our internal stack. */
204   upb_pbdecoder_frame *stack, *top, *limit;
205   const uint32_t **callstack;
206   size_t stack_size;
207 
208   upb_status *status;
209 };
210 
211 /* Decoder entry points; used as handlers. */
212 void *upb_pbdecoder_startbc(void *closure, const void *pc, size_t size_hint);
213 size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf,
214                             size_t size, const upb_bufhandle *handle);
215 bool upb_pbdecoder_end(void *closure, const void *handler_data);
216 
217 /* Decoder-internal functions that the JIT calls to handle fallback paths. */
218 int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf,
219                              size_t size, const upb_bufhandle *handle);
220 size_t upb_pbdecoder_suspend(upb_pbdecoder *d);
221 int32_t upb_pbdecoder_skipunknown(upb_pbdecoder *d, int32_t fieldnum,
222                                   uint8_t wire_type);
223 int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d, uint64_t expected);
224 int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d, uint64_t *u64);
225 int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32);
226 int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64);
227 void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg);
228 
229 /* Error messages that are shared between the bytecode and JIT decoders. */
230 extern const char *kPbDecoderStackOverflow;
231 extern const char *kPbDecoderSubmessageTooLong;
232 
233 /* Access to decoderplan members needed by the decoder. */
234 const char *upb_pbdecoder_getopname(unsigned int op);
235 
/* A special label that means "do field dispatch for this message and branch to
 * wherever that takes you." */
#define LABEL_DISPATCH 0

/* A special slot in the dispatch table that stores the epilogue (ENDMSG and/or
 * RET) for branching to when we find an appropriate ENDGROUP tag. */
#define DISPATCH_ENDMSG 0

/* It's important to use this invalid wire type instead of 0 (which is a valid
 * wire type). */
#define NO_WIRE_TYPE 0xff
247 
/* The dispatch table layout is:
 *   [field number] -> [ 48-bit offset ][ 8-bit wt2 ][ 8-bit wt1 ]
 *
 * If wt1 matches, jump to the 48-bit offset.  If wt2 matches, lookup
 * (UPB_MAX_FIELDNUMBER + fieldnum) and jump there.
 *
 * We need two wire types because of packed/non-packed compatibility.  A
 * primitive repeated field can use either wire type and be valid.  While we
 * could key the table on fieldnum+wiretype, the table would be 8x sparser.
 *
 * Storing two wire types in the primary value allows us to quickly rule out
 * the second wire type without needing to do a separate lookup (this case is
 * less common than an unknown field). */

/* Builds a dispatch table value: `wt1` goes in bits 0-7, `wt2` in bits 8-15
 * and `ofs` in bits 16-63.  Any bits of `ofs` above the low 48 are shifted
 * out and lost.  The wire-type bytes are widened to uint64_t before shifting
 * so the expression never depends on the width of int. */
UPB_INLINE uint64_t upb_pbdecoder_packdispatch(uint64_t ofs, uint8_t wt1,
                                               uint8_t wt2) {
  return (ofs << 16) | ((uint64_t)wt2 << 8) | (uint64_t)wt1;
}
265 
/* Splits a dispatch table value into its three fields: bits 0-7 are stored
 * in *wt1, bits 8-15 in *wt2 and bits 16-63 in *ofs.  All three output
 * pointers are written unconditionally, so none of them may be NULL. */
UPB_INLINE void upb_pbdecoder_unpackdispatch(uint64_t dispatch, uint64_t *ofs,
                                             uint8_t *wt1, uint8_t *wt2) {
  *wt1 = (uint8_t)(dispatch & 0xffu);
  *wt2 = (uint8_t)((dispatch >> 8) & 0xffu);
  *ofs = dispatch >> 16;
}
272 
/* All of the functions in decoder.c that return int32_t return values according
 * to the following scheme:
 *   1. negative values indicate a return code from the following list.
 *   2. non-negative values indicate that error or end of buffer was hit, and
 *      that the decode function should immediately return the given value
 *      (the decoder state has already been suspended and is ready to be
 *      resumed). */
#define DECODE_OK (-1)
#define DECODE_MISMATCH (-2)  /* Used only from checktag_slow(). */
#define DECODE_ENDGROUP (-3)  /* Used only from checkunknown(). */

/* Evaluates `x` exactly once and, if the result is non-negative (case 2
 * above), returns it from the enclosing function; negative return codes fall
 * through.  Only usable inside a function whose return type can hold an
 * int32_t.  The do/while(0) wrapper makes the macro behave as a single
 * statement, so it must be followed by a semicolon and is safe in an unbraced
 * if/else. */
#define CHECK_RETURN(x)       \
  do {                        \
    int32_t ret = (x);        \
    if (ret >= 0) return ret; \
  } while (0)
285 
286 #include "upb/port_undef.inc"
287 
288 #endif  /* UPB_DECODER_INT_H_ */
289