1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "unittest.h"
16 #include "rapidjson/filereadstream.h"
17 #include "rapidjson/filewritestream.h"
18 #include "rapidjson/encodedstream.h"
19 #include "rapidjson/stringbuffer.h"
20
21 using namespace rapidjson;
22
23 // Verification of encoders/decoders with Hoehrmann's UTF8 decoder
24
25 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
// Flat table of code point ranges, one per Unicode block, stored as consecutive
// (first, last) pairs. Both bounds are inclusive: the tests below iterate
// `codepoint <= range[1]` and advance two entries at a time. A single 0xFFFFFFFF
// entry terminates the table. The surrogate blocks are commented out, so the
// tests never attempt to encode a surrogate code point.
static const unsigned kCodepointRanges[] = {
    0x0000, 0x007F, // Basic Latin
    0x0080, 0x00FF, // Latin-1 Supplement
    0x0100, 0x017F, // Latin Extended-A
    0x0180, 0x024F, // Latin Extended-B
    0x0250, 0x02AF, // IPA Extensions
    0x02B0, 0x02FF, // Spacing Modifier Letters
    0x0300, 0x036F, // Combining Diacritical Marks
    0x0370, 0x03FF, // Greek and Coptic
    0x0400, 0x04FF, // Cyrillic
    0x0500, 0x052F, // Cyrillic Supplement
    0x0530, 0x058F, // Armenian
    0x0590, 0x05FF, // Hebrew
    0x0600, 0x06FF, // Arabic
    0x0700, 0x074F, // Syriac
    0x0750, 0x077F, // Arabic Supplement
    0x0780, 0x07BF, // Thaana
    0x07C0, 0x07FF, // NKo
    0x0800, 0x083F, // Samaritan
    0x0840, 0x085F, // Mandaic
    0x0900, 0x097F, // Devanagari
    0x0980, 0x09FF, // Bengali
    0x0A00, 0x0A7F, // Gurmukhi
    0x0A80, 0x0AFF, // Gujarati
    0x0B00, 0x0B7F, // Oriya
    0x0B80, 0x0BFF, // Tamil
    0x0C00, 0x0C7F, // Telugu
    0x0C80, 0x0CFF, // Kannada
    0x0D00, 0x0D7F, // Malayalam
    0x0D80, 0x0DFF, // Sinhala
    0x0E00, 0x0E7F, // Thai
    0x0E80, 0x0EFF, // Lao
    0x0F00, 0x0FFF, // Tibetan
    0x1000, 0x109F, // Myanmar
    0x10A0, 0x10FF, // Georgian
    0x1100, 0x11FF, // Hangul Jamo
    0x1200, 0x137F, // Ethiopic
    0x1380, 0x139F, // Ethiopic Supplement
    0x13A0, 0x13FF, // Cherokee
    0x1400, 0x167F, // Unified Canadian Aboriginal Syllabics
    0x1680, 0x169F, // Ogham
    0x16A0, 0x16FF, // Runic
    0x1700, 0x171F, // Tagalog
    0x1720, 0x173F, // Hanunoo
    0x1740, 0x175F, // Buhid
    0x1760, 0x177F, // Tagbanwa
    0x1780, 0x17FF, // Khmer
    0x1800, 0x18AF, // Mongolian
    0x18B0, 0x18FF, // Unified Canadian Aboriginal Syllabics Extended
    0x1900, 0x194F, // Limbu
    0x1950, 0x197F, // Tai Le
    0x1980, 0x19DF, // New Tai Lue
    0x19E0, 0x19FF, // Khmer Symbols
    0x1A00, 0x1A1F, // Buginese
    0x1A20, 0x1AAF, // Tai Tham
    0x1B00, 0x1B7F, // Balinese
    0x1B80, 0x1BBF, // Sundanese
    0x1BC0, 0x1BFF, // Batak
    0x1C00, 0x1C4F, // Lepcha
    0x1C50, 0x1C7F, // Ol Chiki
    0x1CD0, 0x1CFF, // Vedic Extensions
    0x1D00, 0x1D7F, // Phonetic Extensions
    0x1D80, 0x1DBF, // Phonetic Extensions Supplement
    0x1DC0, 0x1DFF, // Combining Diacritical Marks Supplement
    0x1E00, 0x1EFF, // Latin Extended Additional
    0x1F00, 0x1FFF, // Greek Extended
    0x2000, 0x206F, // General Punctuation
    0x2070, 0x209F, // Superscripts and Subscripts
    0x20A0, 0x20CF, // Currency Symbols
    0x20D0, 0x20FF, // Combining Diacritical Marks for Symbols
    0x2100, 0x214F, // Letterlike Symbols
    0x2150, 0x218F, // Number Forms
    0x2190, 0x21FF, // Arrows
    0x2200, 0x22FF, // Mathematical Operators
    0x2300, 0x23FF, // Miscellaneous Technical
    0x2400, 0x243F, // Control Pictures
    0x2440, 0x245F, // Optical Character Recognition
    0x2460, 0x24FF, // Enclosed Alphanumerics
    0x2500, 0x257F, // Box Drawing
    0x2580, 0x259F, // Block Elements
    0x25A0, 0x25FF, // Geometric Shapes
    0x2600, 0x26FF, // Miscellaneous Symbols
    0x2700, 0x27BF, // Dingbats
    0x27C0, 0x27EF, // Miscellaneous Mathematical Symbols-A
    0x27F0, 0x27FF, // Supplemental Arrows-A
    0x2800, 0x28FF, // Braille Patterns
    0x2900, 0x297F, // Supplemental Arrows-B
    0x2980, 0x29FF, // Miscellaneous Mathematical Symbols-B
    0x2A00, 0x2AFF, // Supplemental Mathematical Operators
    0x2B00, 0x2BFF, // Miscellaneous Symbols and Arrows
    0x2C00, 0x2C5F, // Glagolitic
    0x2C60, 0x2C7F, // Latin Extended-C
    0x2C80, 0x2CFF, // Coptic
    0x2D00, 0x2D2F, // Georgian Supplement
    0x2D30, 0x2D7F, // Tifinagh
    0x2D80, 0x2DDF, // Ethiopic Extended
    0x2DE0, 0x2DFF, // Cyrillic Extended-A
    0x2E00, 0x2E7F, // Supplemental Punctuation
    0x2E80, 0x2EFF, // CJK Radicals Supplement
    0x2F00, 0x2FDF, // Kangxi Radicals
    0x2FF0, 0x2FFF, // Ideographic Description Characters
    0x3000, 0x303F, // CJK Symbols and Punctuation
    0x3040, 0x309F, // Hiragana
    0x30A0, 0x30FF, // Katakana
    0x3100, 0x312F, // Bopomofo
    0x3130, 0x318F, // Hangul Compatibility Jamo
    0x3190, 0x319F, // Kanbun
    0x31A0, 0x31BF, // Bopomofo Extended
    0x31C0, 0x31EF, // CJK Strokes
    0x31F0, 0x31FF, // Katakana Phonetic Extensions
    0x3200, 0x32FF, // Enclosed CJK Letters and Months
    0x3300, 0x33FF, // CJK Compatibility
    0x3400, 0x4DBF, // CJK Unified Ideographs Extension A
    0x4DC0, 0x4DFF, // Yijing Hexagram Symbols
    0x4E00, 0x9FFF, // CJK Unified Ideographs
    0xA000, 0xA48F, // Yi Syllables
    0xA490, 0xA4CF, // Yi Radicals
    0xA4D0, 0xA4FF, // Lisu
    0xA500, 0xA63F, // Vai
    0xA640, 0xA69F, // Cyrillic Extended-B
    0xA6A0, 0xA6FF, // Bamum
    0xA700, 0xA71F, // Modifier Tone Letters
    0xA720, 0xA7FF, // Latin Extended-D
    0xA800, 0xA82F, // Syloti Nagri
    0xA830, 0xA83F, // Common Indic Number Forms
    0xA840, 0xA87F, // Phags-pa
    0xA880, 0xA8DF, // Saurashtra
    0xA8E0, 0xA8FF, // Devanagari Extended
    0xA900, 0xA92F, // Kayah Li
    0xA930, 0xA95F, // Rejang
    0xA960, 0xA97F, // Hangul Jamo Extended-A
    0xA980, 0xA9DF, // Javanese
    0xAA00, 0xAA5F, // Cham
    0xAA60, 0xAA7F, // Myanmar Extended-A
    0xAA80, 0xAADF, // Tai Viet
    0xAB00, 0xAB2F, // Ethiopic Extended-A
    0xABC0, 0xABFF, // Meetei Mayek
    0xAC00, 0xD7AF, // Hangul Syllables
    0xD7B0, 0xD7FF, // Hangul Jamo Extended-B
    //0xD800, 0xDB7F, // High Surrogates
    //0xDB80, 0xDBFF, // High Private Use Surrogates
    //0xDC00, 0xDFFF, // Low Surrogates
    0xE000, 0xF8FF, // Private Use Area
    0xF900, 0xFAFF, // CJK Compatibility Ideographs
    0xFB00, 0xFB4F, // Alphabetic Presentation Forms
    0xFB50, 0xFDFF, // Arabic Presentation Forms-A
    0xFE00, 0xFE0F, // Variation Selectors
    0xFE10, 0xFE1F, // Vertical Forms
    0xFE20, 0xFE2F, // Combining Half Marks
    0xFE30, 0xFE4F, // CJK Compatibility Forms
    0xFE50, 0xFE6F, // Small Form Variants
    0xFE70, 0xFEFF, // Arabic Presentation Forms-B
    0xFF00, 0xFFEF, // Halfwidth and Fullwidth Forms
    0xFFF0, 0xFFFF, // Specials
    0x10000, 0x1007F, // Linear B Syllabary
    0x10080, 0x100FF, // Linear B Ideograms
    0x10100, 0x1013F, // Aegean Numbers
    0x10140, 0x1018F, // Ancient Greek Numbers
    0x10190, 0x101CF, // Ancient Symbols
    0x101D0, 0x101FF, // Phaistos Disc
    0x10280, 0x1029F, // Lycian
    0x102A0, 0x102DF, // Carian
    0x10300, 0x1032F, // Old Italic
    0x10330, 0x1034F, // Gothic
    0x10380, 0x1039F, // Ugaritic
    0x103A0, 0x103DF, // Old Persian
    0x10400, 0x1044F, // Deseret
    0x10450, 0x1047F, // Shavian
    0x10480, 0x104AF, // Osmanya
    0x10800, 0x1083F, // Cypriot Syllabary
    0x10840, 0x1085F, // Imperial Aramaic
    0x10900, 0x1091F, // Phoenician
    0x10920, 0x1093F, // Lydian
    0x10A00, 0x10A5F, // Kharoshthi
    0x10A60, 0x10A7F, // Old South Arabian
    0x10B00, 0x10B3F, // Avestan
    0x10B40, 0x10B5F, // Inscriptional Parthian
    0x10B60, 0x10B7F, // Inscriptional Pahlavi
    0x10C00, 0x10C4F, // Old Turkic
    0x10E60, 0x10E7F, // Rumi Numeral Symbols
    0x11000, 0x1107F, // Brahmi
    0x11080, 0x110CF, // Kaithi
    0x12000, 0x123FF, // Cuneiform
    0x12400, 0x1247F, // Cuneiform Numbers and Punctuation
    0x13000, 0x1342F, // Egyptian Hieroglyphs
    0x16800, 0x16A3F, // Bamum Supplement
    0x1B000, 0x1B0FF, // Kana Supplement
    0x1D000, 0x1D0FF, // Byzantine Musical Symbols
    0x1D100, 0x1D1FF, // Musical Symbols
    0x1D200, 0x1D24F, // Ancient Greek Musical Notation
    0x1D300, 0x1D35F, // Tai Xuan Jing Symbols
    0x1D360, 0x1D37F, // Counting Rod Numerals
    0x1D400, 0x1D7FF, // Mathematical Alphanumeric Symbols
    0x1F000, 0x1F02F, // Mahjong Tiles
    0x1F030, 0x1F09F, // Domino Tiles
    0x1F0A0, 0x1F0FF, // Playing Cards
    0x1F100, 0x1F1FF, // Enclosed Alphanumeric Supplement
    0x1F200, 0x1F2FF, // Enclosed Ideographic Supplement
    0x1F300, 0x1F5FF, // Miscellaneous Symbols And Pictographs
    0x1F600, 0x1F64F, // Emoticons
    0x1F680, 0x1F6FF, // Transport And Map Symbols
    0x1F700, 0x1F77F, // Alchemical Symbols
    0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B
    0x2A700, 0x2B73F, // CJK Unified Ideographs Extension C
    0x2B740, 0x2B81F, // CJK Unified Ideographs Extension D
    0x2F800, 0x2FA1F, // CJK Compatibility Ideographs Supplement
    0xE0000, 0xE007F, // Tags
    0xE0100, 0xE01EF, // Variation Selectors Supplement
    0xF0000, 0xFFFFF, // Supplementary Private Use Area-A
    0x100000, 0x10FFFF, // Supplementary Private Use Area-B
    0xFFFFFFFF // End-of-table sentinel checked by the range loops
};
238
239 // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
240 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
241
// States of the decoding automaton. A state value is also the offset of that
// state's row inside the transition part of utf8d (see decode()).
// UTF8_ACCEPT: no sequence is pending; decode() returning it means a code point
// has just been completed.
#define UTF8_ACCEPT 0u
// UTF8_REJECT: error state. Its row in the transition table maps every character
// class back to 12, so once entered it is never left.
#define UTF8_REJECT 12u
244
// Lookup table driving decode(): 256 byte-to-class entries followed by the state
// transition table (9 rows of 12 entries, one row per state, one column per class).
static const unsigned char utf8d[] = {
    // The first part of the table (256 entries) maps each byte to a character
    // class. Working with classes instead of raw bytes reduces the size of the
    // transition table, and the class is also used to build the bitmask that
    // extracts the payload bits of a leading byte.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // It is indexed as 256 + state + class; states are multiples of 12.
    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};
265
decode(unsigned * state,unsigned * codep,unsigned byte)266 static unsigned inline decode(unsigned* state, unsigned* codep, unsigned byte) {
267 unsigned type = utf8d[byte];
268
269 *codep = (*state != UTF8_ACCEPT) ?
270 (byte & 0x3fu) | (*codep << 6) :
271 (0xff >> type) & (byte);
272
273 *state = utf8d[256 + *state + type];
274 return *state;
275 }
276
277 //static bool IsUTF8(unsigned char* s) {
278 // unsigned codepoint, state = 0;
279 //
280 // while (*s)
281 // decode(&state, &codepoint, *s++);
282 //
283 // return state == UTF8_ACCEPT;
284 //}
285
// Encodes every code point listed in kCodepointRanges with UTF8<> and checks the
// result three ways: against the reference decode() automaton, by decoding it
// again with UTF8<>::Decode, and by passing it through UTF8<>::Validate.
TEST(EncodingsTest, UTF8) {
    StringBuffer os, os2;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF8<>::Encode(os, codepoint);
            const char* encodedStr = os.GetString();

            // Decode with Hoehrmann
            {
                unsigned decodedCodepoint = 0;
                unsigned state = 0;

                unsigned decodedCount = 0;
                for (const char* s = encodedStr; *s; ++s) {
                    if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s))) {
                        EXPECT_EQ(codepoint, decodedCodepoint);
                        decodedCount++;
                    }
                }

                // The scan above stops at the first NUL byte, so U+0000 yields no
                // decoded code point. The braces are deliberate: EXPECT_EQ expands
                // to an if/else, and an unbraced outer `if` draws an ambiguous-else
                // warning from some compilers.
                if (*encodedStr) {
                    EXPECT_EQ(1u, decodedCount); // Should only contain one code point
                }

                EXPECT_EQ(UTF8_ACCEPT, state);
                if (UTF8_ACCEPT != state)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Decode
            {
                StringStream is(encodedStr);
                // Initialised so that the checks below never read an indeterminate
                // value should Decode fail before storing a result.
                unsigned decodedCodepoint = 0;
                bool result = UTF8<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                StringStream is(encodedStr);
                os2.Clear();
                bool result = UTF8<>::Validate(is, os2);
                EXPECT_TRUE(result);
                // Validate copies what it reads into os2, so the copy must match.
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
336
// Encodes every code point listed in kCodepointRanges with UTF16<> and checks the
// result three ways: against a reference UTF-16 string built from the decode()
// automaton, by decoding it again with UTF16<>::Decode, and by passing it through
// UTF16<>::Validate.
TEST(EncodingsTest, UTF16) {
    GenericStringBuffer<UTF16<> > os, os2;
    GenericStringBuffer<UTF8<> > utf8os;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF16<>::Encode(os, codepoint);
            const UTF16<>::Ch* encodedStr = os.GetString();

            // Encode with Hoehrmann's code
            if (codepoint != 0) { // cannot handle U+0000 (NUL-terminated scan below)
                // encode with UTF8<> first
                utf8os.Clear();
                UTF8<>::Encode(utf8os, codepoint);

                // transcode from UTF8 to UTF16 with Hoehrmann's code
                unsigned decodedCodepoint = 0;
                unsigned state = 0;
                for (const char* s = utf8os.GetString(); *s; ++s) {
                    if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s)))
                        break;
                }

                // The loop also ends when it reaches the terminator; make sure it
                // stopped because a complete code point was decoded, so that a
                // truncated sequence is reported here rather than as a bare string
                // mismatch further down.
                EXPECT_EQ(UTF8_ACCEPT, state);

                // Room for at most a surrogate pair plus the terminator.
                UTF16<>::Ch buffer[3];
                UTF16<>::Ch* p = buffer;
                if (codepoint <= 0xFFFF)
                    *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
                else {
                    // Encode code points above U+FFFF as surrogate pair.
                    // 0xD7C0 is 0xD800 - (0x10000 >> 10), folding the 0x10000
                    // offset into the high surrogate base.
                    *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
                    *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
                }
                *p = '\0';

                EXPECT_EQ(0, StrCmp(buffer, encodedStr));
            }

            // Decode
            {
                GenericStringStream<UTF16<> > is(encodedStr);
                // Initialised so that the checks below never read an indeterminate
                // value should Decode fail before storing a result.
                unsigned decodedCodepoint = 0;
                bool result = UTF16<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                GenericStringStream<UTF16<> > is(encodedStr);
                os2.Clear();
                bool result = UTF16<>::Validate(is, os2);
                EXPECT_TRUE(result);
                // Validate copies what it reads into os2, so the copy must match.
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
396
// Encodes every code point listed in kCodepointRanges with UTF32<>, then checks
// that UTF32<>::Decode returns the same code point and that UTF32<>::Validate
// accepts the encoded string and reproduces it unchanged.
TEST(EncodingsTest, UTF32) {
    GenericStringBuffer<UTF32<> > os, os2;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF32<>::Encode(os, codepoint);
            const UTF32<>::Ch* encodedStr = os.GetString();

            // Decode
            {
                GenericStringStream<UTF32<> > is(encodedStr);
                // Initialised so that the checks below never read an indeterminate
                // value should Decode fail before storing a result.
                unsigned decodedCodepoint = 0;
                bool result = UTF32<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                GenericStringStream<UTF32<> > is(encodedStr);
                os2.Clear();
                bool result = UTF32<>::Validate(is, os2);
                EXPECT_TRUE(result);
                // Validate copies what it reads into os2, so the copy must match.
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
427