1 // Copyright 2005-2008 Google Inc. All Rights Reserved.
2 // Author: jrm@google.com (Jim Meehan)
3 
4 #include <google/protobuf/stubs/common.h>
5 
6 namespace google {
7 namespace protobuf {
8 namespace internal {
9 
// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
struct RemapEntry {
  uint8 delete_bytes;   // Number of source bytes to delete, 0..255
  uint8 add_bytes;      // Number of replacement bytes to add, 0..255
  uint16 bytes_offset;  // Offset of the replacement bytes in remap_string
};
18 
// Exit type codes for state tables. All but the first get stuffed into
// unsigned one-byte (uint8) table entries. The first is only generated by
// executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone. The scanner treats any table entry
// >= kExitIllegalStructure as an exit code and anything smaller as the
// number of the next state.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,  // 240
  kExitOK,                // 241
  kExitReject,            // 242
  kExitReplace1,          // 243
  kExitReplace2,          // 244
  kExitReplace3,          // 245
  kExitReplace21,         // 246
  kExitReplace31,         // 247
  kExitReplace32,         // 248
  kExitReplaceOffset1,    // 249
  kExitReplaceOffset2,    // 250
  kExitReplace1S0,        // 251
  kExitSpecial,           // 252
  kExitDoAgain,           // 253
  kExitRejectAlt,         // 254
  kExitNone               // 255
} ExitReason;
42 
43 
// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
typedef struct {
  const uint32 state0;            // Byte offset of the initial state in state_table
  const uint32 state0_size;       // Length in bytes of the initial state
  const uint32 total_size;        // Size of state_table (2304 = 9 * 256 for the table below)
  const int max_expand;           // Not read by the scanners in this file
  const int entry_shift;          // Next-state number is shifted left by this to index state_table
  const int bytes_per_entry;      // Not read by the scanners in this file
  const uint32 losub;             // Subtracted from each 4-byte group in the fast range check
  const uint32 hiadd;             // Added to each 4-byte group in the fast range check
  const uint8* state_table;       // State blocks: next-state numbers and exit codes
  const RemapEntry* remap_base;   // Replacement descriptors (see RemapEntry)
  const uint8* remap_string;      // Replacement bytes referenced by remap_base
  const uint8* fast_state;        // Indexed by byte value; 0 means the fast scan may skip the byte
} UTF8StateMachineObj;

// A scan-only table uses the same layout as a full state machine.
typedef UTF8StateMachineObj UTF8ScanObj;
68 
// Three-character aliases for the exit codes so that the rows of the state
// table below line up in columns. The table in this file only uses X__ and
// RJ_; all of the aliases are #undef'ed again after the table object.
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
83 
//  Entire table has 9 state blocks of 256 entries each
//  (9 * 256 = 2304 one-byte entries). These constants fill the scalar
//  fields of utf8acceptnonsurrogates_obj below, in declaration order.
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;     // state[0]
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;  // full-byte subscripts
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
// With these two values the fast 8-byte range check in UTF8GenericScan flags
// groups containing a byte below 0x20 or at/above 0x80.
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
93 
// State table: 9 blocks of 256 entries, one entry per input byte value.
// An entry below kExitIllegalStructure is the number of the next state;
// X__ (illegal structure) and RJ_ (reject) are exit codes. Each row below
// covers 16 consecutive byte values, each group of four rows covers 0x40.
static const uint8 utf8acceptnonsurrogates[] = {
// state[0] 0x000000 Byte 1
//   0x00..0x7F stay in state 0; 0x80..0xBF, 0xC0, 0xC1 and 0xF5..0xFF are
//   illegal; 0xC2..0xDF -> 1; 0xE0 -> 2; 0xE1..0xEC, 0xEE, 0xEF -> 3;
//   0xED -> 7; 0xF0 -> 4; 0xF1..0xF3 -> 5; 0xF4 -> 6.
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,

  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
  4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[1] 0x000080 Byte 2 of 2
//   0x80..0xBF -> 0 (character complete); everything else is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[2] 0x000000 Byte 2 of 3
//   Entered after lead byte 0xE0: only 0xA0..0xBF -> 1; the rest is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[3] 0x001000 Byte 2 of 3
//   0x80..0xBF -> 1; everything else is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[4] 0x000000 Byte 2 of 4
//   Entered after lead byte 0xF0: only 0x90..0xBF -> 3; the rest is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[5] 0x040000 Byte 2 of 4
//   0x80..0xBF -> 3; everything else is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[6] 0x100000 Byte 2 of 4
//   Entered after lead byte 0xF4: only 0x80..0x8F -> 3; the rest is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[7] 0x00d000 Byte 2 of 3
//   Entered after lead byte 0xED: 0x80..0x9F -> 1; 0xA0..0xBF -> 8 (the
//   surrogate range, rejected in state 8); everything else is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

// state[8] 0x00d800 Byte 3 of 3
//   0x80..0xBF -> RJ_ (well-formed but rejected); everything else is illegal.
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,

RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,

X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
};
284 
// Remap base[0] = (del, add, string_offset)
// Single all-zero placeholder entry; the state table above contains no
// replacement exit codes.
static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
{0, 0, 0} };
288 
// Remap string[0]
// Single zero byte; the only remap_base entry adds zero bytes.
static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
0 };
292 
// Fast-scan classification, indexed by byte value: 0 for 0x00..0x7F and
// 1 for 0x80..0xFF. UTF8GenericScan skips bytes whose entry is 0 without
// consulting the state table and drops to the byte-at-a-time scan on the
// first byte whose entry is nonzero.
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
};
314 
// The scan object handed to the scanners by IsStructurallyValidUTF8.
// The initializer is positional, so the order must match the field order
// of UTF8StateMachineObj.
static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
  utf8acceptnonsurrogates_STATE0,         // state0
  utf8acceptnonsurrogates_STATE0_SIZE,    // state0_size
  utf8acceptnonsurrogates_TOTAL_SIZE,     // total_size
  utf8acceptnonsurrogates_MAX_EXPAND_X4,  // max_expand
  utf8acceptnonsurrogates_SHIFT,          // entry_shift
  utf8acceptnonsurrogates_BYTES,          // bytes_per_entry
  utf8acceptnonsurrogates_LOSUB,          // losub
  utf8acceptnonsurrogates_HIADD,          // hiadd
  utf8acceptnonsurrogates,                // state_table
  utf8acceptnonsurrogates_remap_base,     // remap_base
  utf8acceptnonsurrogates_remap_string,   // remap_string
  utf8acceptnonsurrogates_fast            // fast_state
};
329 
330 
// The table aliases are only needed while the state table is being defined;
// remove them so the short names do not leak into the code below.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
345 
346 // Return true if current Tbl pointer is within state0 range
347 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)348 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
349   const uint8* Tbl0 = &st->state_table[st->state0];
350   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
351 }
352 
353 // Scan a UTF-8 string based on state table.
354 // Always scan complete UTF-8 characters
355 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)356 int UTF8GenericScan(const UTF8ScanObj* st,
357                     const char * str,
358                     int str_length,
359                     int* bytes_consumed) {
360   *bytes_consumed = 0;
361   if (str_length == 0) return kExitOK;
362 
363   int eshift = st->entry_shift;
364   const uint8* isrc = reinterpret_cast<const uint8*>(str);
365   const uint8* src = isrc;
366   const uint8* srclimit = isrc + str_length;
367   const uint8* srclimit8 = srclimit - 7;
368   const uint8* Tbl_0 = &st->state_table[st->state0];
369 
370  DoAgain:
371   // Do state-table scan
372   int e = 0;
373   uint8 c;
374   const uint8* Tbl2 = &st->fast_state[0];
375   const uint32 losub = st->losub;
376   const uint32 hiadd = st->hiadd;
377   // Check initial few bytes one at a time until 8-byte aligned
378   //----------------------------
379   while ((((uintptr_t)src & 0x07) != 0) &&
380          (src < srclimit) &&
381          Tbl2[src[0]] == 0) {
382     src++;
383   }
384   if (((uintptr_t)src & 0x07) == 0) {
385     // Do fast for groups of 8 identity bytes.
386     // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
387     // including slowing slightly on cr/lf/ht
388     //----------------------------
389     while (src < srclimit8) {
390       uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
391       uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
392       src += 8;
393       // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
394       uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
395                     (s4567 - losub) | (s4567 + hiadd);
396       if ((temp & 0x80808080) != 0) {
397         // We typically end up here on cr/lf/ht; src was incremented
398         int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
399                     (Tbl2[src[-6]] | Tbl2[src[-5]]);
400         if (e0123 != 0) {
401           src -= 8;
402           break;
403         }    // Exit on Non-interchange
404         e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
405                 (Tbl2[src[-2]] | Tbl2[src[-1]]);
406         if (e0123 != 0) {
407           src -= 4;
408           break;
409         }    // Exit on Non-interchange
410         // Else OK, go around again
411       }
412     }
413   }
414   //----------------------------
415 
416   // Byte-at-a-time scan
417   //----------------------------
418   const uint8* Tbl = Tbl_0;
419   while (src < srclimit) {
420     c = *src;
421     e = Tbl[c];
422     src++;
423     if (e >= kExitIllegalStructure) {break;}
424     Tbl = &Tbl_0[e << eshift];
425   }
426   //----------------------------
427 
428 
429   // Exit posibilities:
430   //  Some exit code, !state0, back up over last char
431   //  Some exit code, state0, back up one byte exactly
432   //  source consumed, !state0, back up over partial char
433   //  source consumed, state0, exit OK
434   // For illegal byte in state0, avoid backup up over PREVIOUS char
435   // For truncated last char, back up to beginning of it
436 
437   if (e >= kExitIllegalStructure) {
438     // Back up over exactly one byte of rejected/illegal UTF-8 character
439     src--;
440     // Back up more if needed
441     if (!InStateZero(st, Tbl)) {
442       do {
443         src--;
444       } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
445     }
446   } else if (!InStateZero(st, Tbl)) {
447     // Back up over truncated UTF-8 character
448     e = kExitIllegalStructure;
449     do {
450       src--;
451     } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
452   } else {
453     // Normal termination, source fully consumed
454     e = kExitOK;
455   }
456 
457   if (e == kExitDoAgain) {
458     // Loop back up to the fast scan
459     goto DoAgain;
460   }
461 
462   *bytes_consumed = src - isrc;
463   return e;
464 }
465 
UTF8GenericScanFastAscii(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)466 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
467                     const char * str,
468                     int str_length,
469                     int* bytes_consumed) {
470   *bytes_consumed = 0;
471   if (str_length == 0) return kExitOK;
472 
473   const uint8* isrc =  reinterpret_cast<const uint8*>(str);
474   const uint8* src = isrc;
475   const uint8* srclimit = isrc + str_length;
476   const uint8* srclimit8 = srclimit - 7;
477   int n;
478   int rest_consumed;
479   int exit_reason;
480   do {
481     // Check initial few bytes one at a time until 8-byte aligned
482     while ((((uintptr_t)src & 0x07) != 0) &&
483            (src < srclimit) && (src[0] < 0x80)) {
484       src++;
485     }
486     if (((uintptr_t)src & 0x07) == 0) {
487       while ((src < srclimit8) &&
488              (((reinterpret_cast<const uint32*>(src)[0] |
489                 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
490         src += 8;
491       }
492     }
493     while ((src < srclimit) && (src[0] < 0x80)) {
494       src++;
495     }
496     // Run state table on the rest
497     n = src - isrc;
498     exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
499     src += rest_consumed;
500   } while ( exit_reason == kExitDoAgain );
501 
502   *bytes_consumed = src - isrc;
503   return exit_reason;
504 }
505 
// Hack:  On some compilers the static tables are initialized at startup.
//   We can't use them until they are initialized.  However, some Protocol
//   Buffer parsing happens at static init time and may try to validate
//   UTF-8 strings.  Since UTF-8 validation is only used for debugging
//   anyway, we simply always return success if initialization hasn't
//   occurred yet.
namespace {

// False until init_detector below has been constructed.
// IsStructurallyValidUTF8 checks this flag before touching the tables.
bool module_initialized_ = false;

// Its constructor does nothing but set module_initialized_.
struct InitDetector {
  InitDetector() {
    module_initialized_ = true;
  }
};
// Constructing this object during this file's static initialization is what
// flips the flag.
InitDetector init_detector;

}  // namespace
524 
IsStructurallyValidUTF8(const char * buf,int len)525 bool IsStructurallyValidUTF8(const char* buf, int len) {
526   if (!module_initialized_) return true;
527 
528   int bytes_consumed = 0;
529   UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
530                            buf, len, &bytes_consumed);
531   return (bytes_consumed == len);
532 }
533 
534 }  // namespace internal
535 }  // namespace protobuf
536 }  // namespace google
537