1 // Copyright 2005-2008 Google Inc. All Rights Reserved.
2 // Author: jrm@google.com (Jim Meehan)
3 
4 #include <google/protobuf/stubs/common.h>
5 
6 #include <google/protobuf/stubs/stringpiece.h>
7 
8 namespace google {
9 namespace protobuf {
10 namespace internal {
11 
12 // These four-byte entries compactly encode how many bytes 0..255 to delete
13 // in making a string replacement, how many bytes to add 0..255, and the offset
14 // 0..64k-1 of the replacement string in remap_string.
15 struct RemapEntry {
16   uint8 delete_bytes;
17   uint8 add_bytes;
18   uint16 bytes_offset;
19 };
20 
21 // Exit type codes for state tables. All but the first get stuffed into
22 // signed one-byte entries. The first is only generated by executable code.
23 // To distinguish from next-state entries, these must be contiguous and
24 // all <= kExitNone
25 typedef enum {
26   kExitDstSpaceFull = 239,
27   kExitIllegalStructure,  // 240
28   kExitOK,                // 241
29   kExitReject,            // ...
30   kExitReplace1,
31   kExitReplace2,
32   kExitReplace3,
33   kExitReplace21,
34   kExitReplace31,
35   kExitReplace32,
36   kExitReplaceOffset1,
37   kExitReplaceOffset2,
38   kExitReplace1S0,
39   kExitSpecial,
40   kExitDoAgain,
41   kExitRejectAlt,
42   kExitNone               // 255
43 } ExitReason;
44 
45 
46 // This struct represents one entire state table. The three initialized byte
47 // areas are state_table, remap_base, and remap_string. state0 and state0_size
48 // give the byte offset and length within state_table of the initial state --
49 // table lookups are expected to start and end in this state, but for
50 // truncated UTF-8 strings, may end in a different state. These allow a quick
51 // test for that condition. entry_shift is 8 for tables subscripted by a full
52 // byte value and 6 for space-optimized tables subscripted by only six
53 // significant bits in UTF-8 continuation bytes.
54 typedef struct {
55   const uint32 state0;
56   const uint32 state0_size;
57   const uint32 total_size;
58   const int max_expand;
59   const int entry_shift;
60   const int bytes_per_entry;
61   const uint32 losub;
62   const uint32 hiadd;
63   const uint8* state_table;
64   const RemapEntry* remap_base;
65   const uint8* remap_string;
66   const uint8* fast_state;
67 } UTF8StateMachineObj;
68 
69 typedef UTF8StateMachineObj UTF8ScanObj;
70 
71 #define X__ (kExitIllegalStructure)
72 #define RJ_ (kExitReject)
73 #define S1_ (kExitReplace1)
74 #define S2_ (kExitReplace2)
75 #define S3_ (kExitReplace3)
76 #define S21 (kExitReplace21)
77 #define S31 (kExitReplace31)
78 #define S32 (kExitReplace32)
79 #define T1_ (kExitReplaceOffset1)
80 #define T2_ (kExitReplaceOffset2)
81 #define S11 (kExitReplace1S0)
82 #define SP_ (kExitSpecial)
83 #define D__ (kExitDoAgain)
84 #define RJA (kExitRejectAlt)
85 
86 //  Entire table has 9 state blocks of 256 entries each
87 static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;     // state[0]
88 static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
89 static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
90 static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
91 static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
92 static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
93 static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
94 static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
95 
96 static const uint8 utf8acceptnonsurrogates[] = {
97 // state[0] 0x000000 Byte 1
98   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
99   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
100   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
101   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
102 
103   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
104   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
105   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
106   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
107 
108 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
109 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
110 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
111 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
112 
113 X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
114   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
115   2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
116   4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
117 
118 // state[1] 0x000080 Byte 2 of 2
119 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
120 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
121 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
122 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
123 
124 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
125 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
126 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
127 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
128 
129   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
130   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
131   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
132   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
133 
134 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
135 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
136 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
137 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
138 
139 // state[2] 0x000000 Byte 2 of 3
140 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
141 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
142 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
143 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
144 
145 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
146 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
147 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
148 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
149 
150 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
151 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
152   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
153   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
154 
155 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
156 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
157 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
158 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
159 
160 // state[3] 0x001000 Byte 2 of 3
161 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
162 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
163 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
164 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
165 
166 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
167 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
168 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
169 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
170 
171   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
172   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
173   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
174   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
175 
176 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
177 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
178 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
179 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
180 
181 // state[4] 0x000000 Byte 2 of 4
182 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
183 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
184 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
185 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
186 
187 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
188 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
189 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
190 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
191 
192 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
193   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
194   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
195   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
196 
197 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
198 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
199 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
200 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
201 
202 // state[5] 0x040000 Byte 2 of 4
203 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
204 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
205 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
206 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
207 
208 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
209 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
210 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
211 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
212 
213   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
214   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
215   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
216   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
217 
218 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
219 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
220 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
221 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
222 
223 // state[6] 0x100000 Byte 2 of 4
224 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
225 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
226 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
227 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
228 
229 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
230 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
231 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
232 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
233 
234   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
235 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
236 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
237 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
238 
239 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
240 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
241 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
242 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
243 
244 // state[7] 0x00d000 Byte 2 of 3
245 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
246 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
247 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
248 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
249 
250 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
251 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
252 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
253 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
254 
255   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
256   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
257   8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
258   8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
259 
260 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
261 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
262 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
263 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
264 
265 // state[8] 0x00d800 Byte 3 of 3
266 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
267 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
268 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
269 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
270 
271 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
272 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
273 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
274 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
275 
276 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
279 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
280 
281 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
282 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
283 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
284 X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
285 };
286 
287 // Remap base[0] = (del, add, string_offset)
288 static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
289 {0, 0, 0} };
290 
291 // Remap string[0]
292 static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
293 0 };
294 
295 static const unsigned char utf8acceptnonsurrogates_fast[256] = {
296 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
297 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
298 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
299 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
300 
301 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
302 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
303 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
304 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
305 
306 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
307 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
308 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
309 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
310 
311 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
312 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
313 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
314 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
315 };
316 
317 static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
318   utf8acceptnonsurrogates_STATE0,
319   utf8acceptnonsurrogates_STATE0_SIZE,
320   utf8acceptnonsurrogates_TOTAL_SIZE,
321   utf8acceptnonsurrogates_MAX_EXPAND_X4,
322   utf8acceptnonsurrogates_SHIFT,
323   utf8acceptnonsurrogates_BYTES,
324   utf8acceptnonsurrogates_LOSUB,
325   utf8acceptnonsurrogates_HIADD,
326   utf8acceptnonsurrogates,
327   utf8acceptnonsurrogates_remap_base,
328   utf8acceptnonsurrogates_remap_string,
329   utf8acceptnonsurrogates_fast
330 };
331 
332 
333 #undef X__
334 #undef RJ_
335 #undef S1_
336 #undef S2_
337 #undef S3_
338 #undef S21
339 #undef S31
340 #undef S32
341 #undef T1_
342 #undef T2_
343 #undef S11
344 #undef SP_
345 #undef D__
346 #undef RJA
347 
348 // Return true if current Tbl pointer is within state0 range
349 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)350 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
351   const uint8* Tbl0 = &st->state_table[st->state0];
352   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
353 }
354 
355 // Scan a UTF-8 string based on state table.
356 // Always scan complete UTF-8 characters
357 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)358 int UTF8GenericScan(const UTF8ScanObj* st,
359                     const char * str,
360                     int str_length,
361                     int* bytes_consumed) {
362   *bytes_consumed = 0;
363   if (str_length == 0) return kExitOK;
364 
365   int eshift = st->entry_shift;
366   const uint8* isrc = reinterpret_cast<const uint8*>(str);
367   const uint8* src = isrc;
368   const uint8* srclimit = isrc + str_length;
369   const uint8* srclimit8 = srclimit - 7;
370   const uint8* Tbl_0 = &st->state_table[st->state0];
371 
372  DoAgain:
373   // Do state-table scan
374   int e = 0;
375   uint8 c;
376   const uint8* Tbl2 = &st->fast_state[0];
377   const uint32 losub = st->losub;
378   const uint32 hiadd = st->hiadd;
379   // Check initial few bytes one at a time until 8-byte aligned
380   //----------------------------
381   while ((((uintptr_t)src & 0x07) != 0) &&
382          (src < srclimit) &&
383          Tbl2[src[0]] == 0) {
384     src++;
385   }
386   if (((uintptr_t)src & 0x07) == 0) {
387     // Do fast for groups of 8 identity bytes.
388     // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
389     // including slowing slightly on cr/lf/ht
390     //----------------------------
391     while (src < srclimit8) {
392       uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
393       uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
394       src += 8;
395       // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
396       uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
397                     (s4567 - losub) | (s4567 + hiadd);
398       if ((temp & 0x80808080) != 0) {
399         // We typically end up here on cr/lf/ht; src was incremented
400         int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
401                     (Tbl2[src[-6]] | Tbl2[src[-5]]);
402         if (e0123 != 0) {
403           src -= 8;
404           break;
405         }    // Exit on Non-interchange
406         e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
407                 (Tbl2[src[-2]] | Tbl2[src[-1]]);
408         if (e0123 != 0) {
409           src -= 4;
410           break;
411         }    // Exit on Non-interchange
412         // Else OK, go around again
413       }
414     }
415   }
416   //----------------------------
417 
418   // Byte-at-a-time scan
419   //----------------------------
420   const uint8* Tbl = Tbl_0;
421   while (src < srclimit) {
422     c = *src;
423     e = Tbl[c];
424     src++;
425     if (e >= kExitIllegalStructure) {break;}
426     Tbl = &Tbl_0[e << eshift];
427   }
428   //----------------------------
429 
430 
431   // Exit posibilities:
432   //  Some exit code, !state0, back up over last char
433   //  Some exit code, state0, back up one byte exactly
434   //  source consumed, !state0, back up over partial char
435   //  source consumed, state0, exit OK
436   // For illegal byte in state0, avoid backup up over PREVIOUS char
437   // For truncated last char, back up to beginning of it
438 
439   if (e >= kExitIllegalStructure) {
440     // Back up over exactly one byte of rejected/illegal UTF-8 character
441     src--;
442     // Back up more if needed
443     if (!InStateZero(st, Tbl)) {
444       do {
445         src--;
446       } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
447     }
448   } else if (!InStateZero(st, Tbl)) {
449     // Back up over truncated UTF-8 character
450     e = kExitIllegalStructure;
451     do {
452       src--;
453     } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
454   } else {
455     // Normal termination, source fully consumed
456     e = kExitOK;
457   }
458 
459   if (e == kExitDoAgain) {
460     // Loop back up to the fast scan
461     goto DoAgain;
462   }
463 
464   *bytes_consumed = src - isrc;
465   return e;
466 }
467 
UTF8GenericScanFastAscii(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)468 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
469                     const char * str,
470                     int str_length,
471                     int* bytes_consumed) {
472   *bytes_consumed = 0;
473   if (str_length == 0) return kExitOK;
474 
475   const uint8* isrc =  reinterpret_cast<const uint8*>(str);
476   const uint8* src = isrc;
477   const uint8* srclimit = isrc + str_length;
478   const uint8* srclimit8 = srclimit - 7;
479   int n;
480   int rest_consumed;
481   int exit_reason;
482   do {
483     // Check initial few bytes one at a time until 8-byte aligned
484     while ((((uintptr_t)src & 0x07) != 0) &&
485            (src < srclimit) && (src[0] < 0x80)) {
486       src++;
487     }
488     if (((uintptr_t)src & 0x07) == 0) {
489       while ((src < srclimit8) &&
490              (((reinterpret_cast<const uint32*>(src)[0] |
491                 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
492         src += 8;
493       }
494     }
495     while ((src < srclimit) && (src[0] < 0x80)) {
496       src++;
497     }
498     // Run state table on the rest
499     n = src - isrc;
500     exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
501     src += rest_consumed;
502   } while ( exit_reason == kExitDoAgain );
503 
504   *bytes_consumed = src - isrc;
505   return exit_reason;
506 }
507 
508 // Hack:  On some compilers the static tables are initialized at startup.
509 //   We can't use them until they are initialized.  However, some Protocol
510 //   Buffer parsing happens at static init time and may try to validate
511 //   UTF-8 strings.  Since UTF-8 validation is only used for debugging
512 //   anyway, we simply always return success if initialization hasn't
513 //   occurred yet.
514 namespace {
515 
516 bool module_initialized_ = false;
517 
518 struct InitDetector {
InitDetectorgoogle::protobuf::internal::__anon326f57750311::InitDetector519   InitDetector() {
520     module_initialized_ = true;
521   }
522 };
523 InitDetector init_detector;
524 
525 }  // namespace
526 
IsStructurallyValidUTF8(const char * buf,int len)527 bool IsStructurallyValidUTF8(const char* buf, int len) {
528   if (!module_initialized_) return true;
529 
530   int bytes_consumed = 0;
531   UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
532                            buf, len, &bytes_consumed);
533   return (bytes_consumed == len);
534 }
535 
UTF8SpnStructurallyValid(const StringPiece & str)536 int UTF8SpnStructurallyValid(const StringPiece& str) {
537   if (!module_initialized_) return str.size();
538 
539   int bytes_consumed = 0;
540   UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
541                            str.data(), str.size(), &bytes_consumed);
542   return bytes_consumed;
543 }
544 
545 // Coerce UTF-8 byte string in src_str to be
546 // a structurally-valid equal-length string by selectively
547 // overwriting illegal bytes with replace_char (typically blank).
548 // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
549 // src_str is read-only. If any overwriting is needed, a modified byte string
550 // is created in idst, length isrclen.
551 //
552 // Returns pointer to output buffer, isrc if no changes were made,
553 //  or idst if some bytes were changed.
554 //
555 // Fast case: all is structurally valid and no byte copying is done.
556 //
UTF8CoerceToStructurallyValid(const StringPiece & src_str,char * idst,const char replace_char)557 char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
558                                     char* idst,
559                                     const char replace_char) {
560   const char* isrc = src_str.data();
561   const int len = src_str.length();
562   int n = UTF8SpnStructurallyValid(src_str);
563   if (n == len) {               // Normal case -- all is cool, return
564     return const_cast<char*>(isrc);
565   } else {                      // Unusual case -- copy w/o bad bytes
566     const char* src = isrc;
567     const char* srclimit = isrc + len;
568     char* dst = idst;
569     memmove(dst, src, n);       // Copy initial good chunk
570     src += n;
571     dst += n;
572     while (src < srclimit) {    // src points to bogus byte or is off the end
573       dst[0] = replace_char;                    // replace one bad byte
574       src++;
575       dst++;
576       StringPiece str2(src, srclimit - src);
577       n = UTF8SpnStructurallyValid(str2);       // scan the remainder
578       memmove(dst, src, n);                     // copy next good chunk
579       src += n;
580       dst += n;
581     }
582   }
583   return idst;
584 }
585 
586 }  // namespace internal
587 }  // namespace protobuf
588 }  // namespace google
589