1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1998-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /*
7 * File utf8tst.c
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 07/24/2000 Madhu Creation
13 *******************************************************************************
14 */
15
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20
21 /* lenient UTF-8 ------------------------------------------------------------ */
22
23 /*
24 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
25 * code points with their "natural" encoding.
26 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
27 * single surrogates.
28 *
29 * This is not conformant with UTF-8.
30 *
31 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
32 * the macros below do not attempt to assemble such pairs.
33 */
34
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte at s[i] into c and post-increments i. For a byte >= 0x80:
 *   - if it is a lead byte (U8_IS_LEAD), the rest of the sequence is decoded by
 *     utf8_nextCharSafeBody(), which also advances i; the final argument -2 is
 *     presumably the "lenient" mode described above (confirm against the ICU
 *     declaration of utf8_nextCharSafeBody);
 *   - otherwise c is set to U_SENTINEL.
 *
 * s      byte string (any pointer expression)
 * i      modifiable index lvalue; evaluated more than once
 * length string length, cast to int32_t before being passed on
 * c      modifiable code point lvalue; evaluated more than once
 *
 * Every macro argument is parenthesized at its point of use so that compound
 * expressions (for example "p+offset") bind as the caller intended.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
45
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte at s[i] into c. For a byte >= 0x80:
 *   - if it is in 0x80..0xbf, the sequence is decoded backwards by
 *     utf8_prevCharSafeBody(), which also moves i; the final argument -2 is
 *     presumably the "lenient" mode described above (confirm against the ICU
 *     declaration of utf8_prevCharSafeBody);
 *   - otherwise (a byte above 0xbf) c is set to U_SENTINEL.
 *
 * s     byte string (any pointer expression)
 * start lower bound index handed to utf8_prevCharSafeBody()
 * i     modifiable index lvalue; evaluated more than once
 * c     modifiable code point lvalue; evaluated more than once
 *
 * Every macro argument is parenthesized at its point of use so that compound
 * expressions bind as the caller intended.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
56
57 /* -------------------------------------------------------------------------- */
58
59 static void printUChars(const uint8_t *uchars, int16_t len);
60
61 static void TestCodeUnitValues(void);
62 static void TestCharLength(void);
63 static void TestGetChar(void);
64 static void TestNextPrevChar(void);
65 static void TestNulTerminated(void);
66 static void TestNextPrevNonCharacters(void);
67 static void TestNextPrevCharUnsafe(void);
68 static void TestFwdBack(void);
69 static void TestFwdBackUnsafe(void);
70 static void TestSetChar(void);
71 static void TestSetCharUnsafe(void);
72 static void TestAppendChar(void);
73 static void TestAppend(void);
74 static void TestSurrogates(void);
75
76 void addUTF8Test(TestNode** root);
77
78 void
addUTF8Test(TestNode ** root)79 addUTF8Test(TestNode** root)
80 {
81 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
82 addTest(root, &TestCharLength, "utf8tst/TestCharLength");
83 addTest(root, &TestGetChar, "utf8tst/TestGetChar");
84 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar");
85 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated");
86 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters");
87 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe");
88 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack");
89 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
90 addTest(root, &TestSetChar, "utf8tst/TestSetChar");
91 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
92 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
93 addTest(root, &TestAppend, "utf8tst/TestAppend");
94 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
95 }
96
TestCodeUnitValues()97 static void TestCodeUnitValues()
98 {
99 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
100
101 int16_t i;
102 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
103 uint8_t c=codeunit[i];
104 log_verbose("Testing code unit value of %x\n", c);
105 if(i<4){
106 if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
107 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
108 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
109 }
110 } else if(i< 8){
111 if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
112 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
113 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
114 }
115 } else if(i< 12){
116 if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
117 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
118 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
119 }
120 }
121 }
122 }
123
TestCharLength()124 static void TestCharLength()
125 {
126 static const uint32_t codepoint[]={
127 1, 0x0061,
128 1, 0x007f,
129 2, 0x016f,
130 2, 0x07ff,
131 3, 0x0865,
132 3, 0x20ac,
133 4, 0x20402,
134 4, 0x23456,
135 4, 0x24506,
136 4, 0x20402,
137 4, 0x10402,
138 3, 0xd7ff,
139 3, 0xe000,
140
141 };
142
143 int16_t i;
144 UBool multiple;
145 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
146 UChar32 c=codepoint[i+1];
147 if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
148 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
149 }else{
150 log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c));
151 }
152 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
153 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
154 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
155 }
156 }
157 }
158
TestGetChar()159 static void TestGetChar()
160 {
161 static const uint8_t input[]={
162 /* code unit,*/
163 0x61,
164 0x7f,
165 0xe4,
166 0xba,
167 0x8c,
168 0xF0,
169 0x90,
170 0x90,
171 0x81,
172 0xc0,
173 0x65,
174 0x31,
175 0x9a,
176 0xc9
177 };
178 static const UChar32 result[]={
179 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
180 0x61, 0x61, 0x61,
181 0x7f, 0x7f, 0x7f,
182 0x4e8c, 0x4e8c, 0x4e8c,
183 0x4e8c, 0x4e8c, 0x4e8c ,
184 0x4e8c, 0x4e8c, 0x4e8c,
185 0x10401, 0x10401, 0x10401 ,
186 0x10401, 0x10401, 0x10401 ,
187 0x10401, 0x10401, 0x10401 ,
188 0x10401, 0x10401, 0x10401,
189 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
190 0x65, 0x65, 0x65,
191 0x31, 0x31, 0x31,
192 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
193 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
194 };
195 uint16_t i=0;
196 UChar32 c, expected;
197 uint32_t offset=0;
198
199 for(offset=0; offset<sizeof(input); offset++) {
200 if (offset < sizeof(input) - 1) {
201 UTF8_GET_CHAR_UNSAFE(input, offset, c);
202 if(c != result[i]){
203 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
204
205 }
206
207 U8_GET_UNSAFE(input, offset, c);
208 if(c != result[i]){
209 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
210
211 }
212 }
213
214 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
215 expected=result[i+1];
216 if(c != expected){
217 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
218 }
219
220 U8_GET(input, 0, offset, sizeof(input), c);
221 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
222 if(c != expected){
223 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
224 }
225
226 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
227 if(expected<0) { expected=0xfffd; }
228 if(c != expected){
229 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
230 }
231
232 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
233 if(c != result[i+2]){
234 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
235 }
236
237 i=(uint16_t)(i+3);
238 }
239 }
240
TestNextPrevChar()241 static void TestNextPrevChar() {
242 static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
243 static const UChar32 result[]={
244 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
245 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
246 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
247 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
248 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
249 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
250 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
251 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
252 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
253 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
254 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
255 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
256 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
257 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
258 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
259 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
260 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
261 };
262 static const int32_t movedOffset[]={
263 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
264 1, 1, 1, 15, 15, 15,
265 5, 5, 5, 14, 14 , 14,
266 3, 3, 3, 9, 13, 13,
267 4, 4, 4, 9, 12, 12,
268 5, 5, 5, 9, 11, 11,
269 7, 7, 7, 10, 10, 10,
270 7, 7, 7, 9, 9, 9,
271 8, 9, 9, 7, 7, 7,
272 9, 9, 9, 7, 7, 7,
273 11, 10, 10, 5, 5, 5,
274 11, 11, 11, 5, 5, 5,
275 12, 12, 12, 1, 1, 1,
276 13, 13, 13, 1, 1, 1,
277 14, 14, 14, 1, 1, 1,
278 14, 15, 15, 1, 1, 1,
279 14, 16, 16, 0, 0, 0,
280 };
281 /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
282
283 UChar32 c, expected;
284 uint32_t i=0;
285 uint32_t offset=0;
286 int32_t setOffset=0;
287 for(offset=0; offset<sizeof(input); offset++){
288 setOffset=offset;
289 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
290 if(setOffset != movedOffset[i+1]){
291 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
292 offset, movedOffset[i+1], setOffset);
293 }
294 expected=result[i+1];
295 if(c != expected){
296 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
297 }
298
299 setOffset=offset;
300 U8_NEXT(input, setOffset, sizeof(input), c);
301 if(setOffset != movedOffset[i+1]){
302 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
303 offset, movedOffset[i+1], setOffset);
304 }
305 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
306 if(c != expected){
307 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
308 }
309
310 setOffset=offset;
311 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
312 if(setOffset != movedOffset[i+1]){
313 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
314 offset, movedOffset[i+1], setOffset);
315 }
316 if(expected<0) { expected=0xfffd; }
317 if(c != expected){
318 log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
319 }
320
321 setOffset=offset;
322 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
323 if(setOffset != movedOffset[i+1]){
324 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
325 offset, movedOffset[i+2], setOffset);
326 }
327 if(c != result[i+2]){
328 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
329 }
330
331 i=i+6;
332 }
333
334 i=0;
335 for(offset=sizeof(input); offset > 0; --offset){
336 setOffset=offset;
337 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
338 if(setOffset != movedOffset[i+4]){
339 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
340 offset, movedOffset[i+4], setOffset);
341 }
342 expected=result[i+4];
343 if(c != expected){
344 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
345 }
346
347 setOffset=offset;
348 U8_PREV(input, 0, setOffset, c);
349 if(setOffset != movedOffset[i+4]){
350 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
351 offset, movedOffset[i+4], setOffset);
352 }
353 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
354 if(c != expected){
355 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
356 }
357
358 setOffset=offset;
359 U8_PREV_OR_FFFD(input, 0, setOffset, c);
360 if(setOffset != movedOffset[i+4]){
361 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
362 offset, movedOffset[i+4], setOffset);
363 }
364 if(expected<0) { expected=0xfffd; }
365 if(c != expected){
366 log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
367 }
368
369 setOffset=offset;
370 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
371 if(setOffset != movedOffset[i+5]){
372 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
373 offset, movedOffset[i+5], setOffset);
374 }
375 if(c != result[i+5]){
376 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
377 }
378
379 i=i+6;
380 }
381 }
382
383 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()384 static void TestNulTerminated() {
385 static const uint8_t input[]={
386 /* 0 */ 0x61,
387 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
388 /* 5 */ 0xc0, 0x80,
389 /* 7 */ 0xdf, 0x80,
390 /* 9 */ 0xc2,
391 /* 10 */ 0x62,
392 /* 11 */ 0xfd, 0xbe,
393 /* 13 */ 0xe0, 0xa0, 0x80,
394 /* 16 */ 0xe2, 0x82, 0xac,
395 /* 19 */ 0xf0, 0x90, 0x90,
396 /* 22 */ 0x00
397 /* 23 */
398 };
399 static const UChar32 result[]={
400 0x61,
401 0x10401,
402 U_SENTINEL,
403 0x7c0,
404 U_SENTINEL,
405 0x62,
406 U_SENTINEL,
407 0x800,
408 0x20ac,
409 U_SENTINEL,
410 0
411 };
412
413 UChar32 c, c2, expected;
414 int32_t i0, i=0, j, k, expectedIndex;
415 int32_t cpIndex=0;
416 do {
417 i0=i;
418 U8_NEXT(input, i, -1, c);
419 expected=result[cpIndex];
420 if(c!=expected) {
421 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
422 }
423 j=i0;
424 U8_NEXT_OR_FFFD(input, j, -1, c);
425 if(expected<0) { expected=0xfffd; }
426 if(c!=expected) {
427 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
428 }
429 if(j!=i) {
430 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
431 }
432 j=i0;
433 U8_FWD_1(input, j, -1);
434 if(j!=i) {
435 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
436 }
437 ++cpIndex;
438 /*
439 * Move by this many code points from the start.
440 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
441 */
442 expectedIndex= (c==0) ? i-1 : i;
443 k=0;
444 U8_FWD_N(input, k, -1, cpIndex);
445 if(k!=expectedIndex) {
446 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
447 }
448 } while(c!=0);
449
450 i=0;
451 do {
452 j=i0=i;
453 U8_NEXT(input, i, -1, c);
454 do {
455 U8_GET(input, 0, j, -1, c2);
456 if(c2!=c) {
457 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
458 }
459 U8_GET_OR_FFFD(input, 0, j, -1, c2);
460 expected= (c>=0) ? c : 0xfffd;
461 if(c2!=expected) {
462 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
463 }
464 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
465 k=j+1;
466 U8_SET_CP_LIMIT(input, 0, k, -1);
467 if(k!=i) {
468 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
469 }
470 } while(++j<i);
471 } while(c!=0);
472 }
473
TestNextPrevNonCharacters()474 static void TestNextPrevNonCharacters() {
475 /* test non-characters */
476 static const uint8_t nonChars[]={
477 0xef, 0xb7, 0x90, /* U+fdd0 */
478 0xef, 0xbf, 0xbf, /* U+feff */
479 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
480 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
481 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
482 };
483
484 UChar32 ch;
485 int32_t idx;
486
487 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
488 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
489 if(!U_IS_UNICODE_NONCHAR(ch)) {
490 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
491 }
492 }
493 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
494 U8_PREV(nonChars, 0, idx, ch);
495 if(!U_IS_UNICODE_NONCHAR(ch)) {
496 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
497 }
498 }
499 }
500
TestNextPrevCharUnsafe()501 static void TestNextPrevCharUnsafe() {
502 /*
503 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
504 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
505 */
506 static const uint8_t input[]={
507 0x61,
508 0xf0, 0x90, 0x90, 0x81,
509 0xc0, 0x80, /* non-shortest form */
510 0xe2, 0x82, 0xac,
511 0xc2, 0xa1,
512 0xf4, 0x8f, 0xbf, 0xbf,
513 0x00
514 };
515 static const UChar32 codePoints[]={
516 0x61,
517 0x10401,
518 0,
519 0x20ac,
520 0xa1,
521 0x10ffff,
522 0
523 };
524
525 UChar32 c;
526 int32_t i;
527 uint32_t offset;
528 for(i=0, offset=0; offset<sizeof(input); ++i) {
529 UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
530 if(c != codePoints[i]){
531 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
532 offset, codePoints[i], c);
533 }
534 }
535 for(i=0, offset=0; offset<sizeof(input); ++i) {
536 U8_NEXT_UNSAFE(input, offset, c);
537 if(c != codePoints[i]){
538 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
539 offset, codePoints[i], c);
540 }
541 }
542
543 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
544 UTF8_PREV_CHAR_UNSAFE(input, offset, c);
545 if(c != codePoints[i]){
546 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
547 offset, codePoints[i], c);
548 }
549 }
550 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
551 U8_PREV_UNSAFE(input, offset, c);
552 if(c != codePoints[i]){
553 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
554 offset, codePoints[i], c);
555 }
556 }
557 }
558
TestFwdBack()559 static void TestFwdBack() {
560 static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
561 static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
562 static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
563
564 static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
565 static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
566 static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0};
567
568 uint32_t offsafe=0;
569
570 uint32_t i=0;
571 while(offsafe < sizeof(input)){
572 UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
573 if(offsafe != fwd_safe[i]){
574 log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
575 }
576 i++;
577 }
578
579 i=0;
580 while(offsafe < sizeof(input)){
581 U8_FWD_1(input, offsafe, sizeof(input));
582 if(offsafe != fwd_safe[i]){
583 log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
584 }
585 i++;
586 }
587
588 i=0;
589 offsafe=sizeof(input);
590 while(offsafe > 0){
591 UTF8_BACK_1_SAFE(input, 0, offsafe);
592 if(offsafe != back_safe[i]){
593 log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
594 }
595 i++;
596 }
597
598 i=0;
599 offsafe=sizeof(input);
600 while(offsafe > 0){
601 U8_BACK_1(input, 0, offsafe);
602 if(offsafe != back_safe[i]){
603 log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
604 }
605 i++;
606 }
607
608 offsafe=0;
609 for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
610 UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
611 if(offsafe != fwd_N_safe[i]){
612 log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
613 }
614
615 }
616
617 offsafe=0;
618 for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
619 U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
620 if(offsafe != fwd_N_safe[i]){
621 log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
622 }
623
624 }
625
626 offsafe=sizeof(input);
627 for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
628 UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
629 if(offsafe != back_N_safe[i]){
630 log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
631 }
632 }
633
634 offsafe=sizeof(input);
635 for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
636 U8_BACK_N(input, 0, offsafe, Nvalue[i]);
637 if(offsafe != back_N_safe[i]){
638 log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
639 }
640 }
641 }
642
TestFwdBackUnsafe()643 static void TestFwdBackUnsafe() {
644 /*
645 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
646 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
647 */
648 static const uint8_t input[]={
649 0x61,
650 0xf0, 0x90, 0x90, 0x81,
651 0xc0, 0x80, /* non-shortest form */
652 0xe2, 0x82, 0xac,
653 0xc2, 0xa1,
654 0xf4, 0x8f, 0xbf, 0xbf,
655 0x00
656 };
657 static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
658
659 int32_t offset;
660 int32_t i;
661 for(i=1, offset=0; offset<UPRV_LENGTHOF(input); ++i) {
662 UTF8_FWD_1_UNSAFE(input, offset);
663 if(offset != boundaries[i]){
664 log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
665 }
666 }
667 for(i=1, offset=0; offset<UPRV_LENGTHOF(input); ++i) {
668 U8_FWD_1_UNSAFE(input, offset);
669 if(offset != boundaries[i]){
670 log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
671 }
672 }
673
674 for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
675 UTF8_BACK_1_UNSAFE(input, offset);
676 if(offset != boundaries[i]){
677 log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
678 }
679 }
680 for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
681 U8_BACK_1_UNSAFE(input, offset);
682 if(offset != boundaries[i]){
683 log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
684 }
685 }
686
687 for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
688 offset=0;
689 UTF8_FWD_N_UNSAFE(input, offset, i);
690 if(offset != boundaries[i]) {
691 log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
692 }
693 }
694 for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
695 offset=0;
696 U8_FWD_N_UNSAFE(input, offset, i);
697 if(offset != boundaries[i]) {
698 log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
699 }
700 }
701
702 for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
703 int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
704 offset=UPRV_LENGTHOF(input);
705 UTF8_BACK_N_UNSAFE(input, offset, i);
706 if(offset != boundaries[j]) {
707 log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
708 }
709 }
710 for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
711 int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
712 offset=UPRV_LENGTHOF(input);
713 U8_BACK_N_UNSAFE(input, offset, i);
714 if(offset != boundaries[j]) {
715 log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
716 }
717 }
718 }
719
TestSetChar()720 static void TestSetChar() {
721 static const uint8_t input[]
722 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
723 static const int16_t start_safe[]
724 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
725 static const int16_t limit_safe[]
726 = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
727
728 uint32_t i=0;
729 int32_t offset=0, setOffset=0;
730 for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
731 if (offset<UPRV_LENGTHOF(input)){
732 setOffset=offset;
733 UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
734 if(setOffset != start_safe[i]){
735 log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
736 }
737
738 setOffset=offset;
739 U8_SET_CP_START(input, 0, setOffset);
740 if(setOffset != start_safe[i]){
741 log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
742 }
743 }
744
745 setOffset=offset;
746 UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
747 if(setOffset != limit_safe[i]){
748 log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
749 }
750
751 setOffset=offset;
752 U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
753 if(setOffset != limit_safe[i]){
754 log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
755 }
756
757 i++;
758 }
759 }
760
TestSetCharUnsafe()761 static void TestSetCharUnsafe() {
762 static const uint8_t input[]
763 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
764 static const int16_t start_unsafe[]
765 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
766 static const int16_t limit_unsafe[]
767 = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };
768
769 uint32_t i=0;
770 int32_t offset=0, setOffset=0;
771 for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
772 if (offset<UPRV_LENGTHOF(input)){
773 setOffset=offset;
774 UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
775 if(setOffset != start_unsafe[i]){
776 log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
777 }
778
779 setOffset=offset;
780 U8_SET_CP_START_UNSAFE(input, setOffset);
781 if(setOffset != start_unsafe[i]){
782 log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
783 }
784 }
785
786 if (offset != 0) { /* Can't have it go off the end of the array */
787 setOffset=offset;
788 UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
789 if(setOffset != limit_unsafe[i]){
790 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
791 }
792
793 setOffset=offset;
794 U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
795 if(setOffset != limit_unsafe[i]){
796 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
797 }
798 }
799
800 i++;
801 }
802 }
803
TestAppendChar()804 static void TestAppendChar(){
805 static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
806 static const uint32_t test[]={
807 /* append-position(unsafe), CHAR to be appended */
808 0, 0x10401,
809 2, 0x0028,
810 2, 0x007f,
811 3, 0xd801,
812 1, 0x20402,
813 8, 0x10401,
814 5, 0xc0,
815 5, 0xc1,
816 5, 0xfd,
817 6, 0x80,
818 6, 0x81,
819 6, 0xbf,
820 7, 0xfe,
821
822 /* append-position(safe), CHAR to be appended */
823 0, 0x10401,
824 2, 0x0028,
825 3, 0x7f,
826 3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
827 1, 0x20402,
828 9, 0x10401,
829 5, 0xc0,
830 5, 0xc1,
831 5, 0xfd,
832 6, 0x80,
833 6, 0x81,
834 6, 0xbf,
835 7, 0xfe,
836
837 };
838 static const uint16_t movedOffset[]={
839 /* offset-moved-to(unsafe) */
840 4, /*for append-pos: 0 , CHAR 0x10401*/
841 3,
842 3,
843 6,
844 5,
845 12,
846 7,
847 7,
848 7,
849 8,
850 8,
851 8,
852 9,
853
854 /* offset-moved-to(safe) */
855 4, /*for append-pos: 0, CHAR 0x10401*/
856 3,
857 4,
858 6,
859 5,
860 11,
861 7,
862 7,
863 7,
864 8,
865 8,
866 8,
867 9,
868
869 };
870
871 static const uint8_t result[][11]={
872 /*unsafe*/
873 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
874 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
875 {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
876 {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
877 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
878 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
879
880 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
881 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
882 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
883
884 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
885 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
886 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
887
888 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
889 /*safe*/
890 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
891 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
892 {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
893 {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
894 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
895 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/
896
897 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
898 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
899 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
900
901 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
902 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
903 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
904
905 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
906
907 };
908 uint16_t i, count=0;
909 uint8_t str[12];
910 uint32_t offset;
911 /* UChar32 c=0;*/
912 uint16_t size=UPRV_LENGTHOF(s);
913 for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
914 uprv_memcpy(str, s, size);
915 offset=test[i];
916 if(count<13){
917 UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
918 if(offset != movedOffset[count]){
919 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
920 count, movedOffset[count], offset);
921
922 }
923 if(uprv_memcmp(str, result[count], size) !=0){
924 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
925 printUChars(result[count], size);
926 log_err("\nGot: ");
927 printUChars(str, size);
928 log_err("\n");
929 }
930 }else{
931 UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
932 if(offset != movedOffset[count]){
933 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
934 count, movedOffset[count], offset);
935
936 }
937 if(uprv_memcmp(str, result[count], size) !=0){
938 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
939 printUChars(result[count], size);
940 log_err("\nGot: ");
941 printUChars(str, size);
942 log_err("\n");
943 }
944 /*call the API instead of MACRO
945 uprv_memcpy(str, s, size);
946 offset=test[i];
947 c=test[i+1];
948 if((uint32_t)(c)<=0x7f) {
949 (str)[(offset)++]=(uint8_t)(c);
950 } else {
951 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
952 }
953 if(offset != movedOffset[count]){
954 log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
955 count, movedOffset[count], offset);
956
957 }
958 if(uprv_memcmp(str, result[count], size) !=0){
959 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
960 printUChars(result[count], size);
961 printf("\nGot: ");
962 printUChars(str, size);
963 printf("\n");
964 }
965 */
966 }
967 count++;
968 }
969
970
971 }
972
TestAppend()973 static void TestAppend() {
974 static const UChar32 codePoints[]={
975 0x61, 0xdf, 0x901, 0x3040,
976 0xac00, 0xd800, 0xdbff, 0xdcde,
977 0xdffd, 0xe000, 0xffff, 0x10000,
978 0x12345, 0xe0021, 0x10ffff, 0x110000,
979 0x234567, 0x7fffffff, -1, -1000,
980 0, 0x400
981 };
982 static const uint8_t expectUnsafe[]={
983 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
984 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
985 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
986 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
987 /* none from this line */
988 0, 0xd0, 0x80
989 }, expectSafe[]={
990 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
991 0xea, 0xb0, 0x80, /* no surrogates */
992 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
993 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
994 /* none from this line */
995 0, 0xd0, 0x80
996 };
997
998 uint8_t buffer[100];
999 UChar32 c;
1000 int32_t i, length;
1001 UBool isError, expectIsError, wrongIsError;
1002
1003 length=0;
1004 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1005 c=codePoints[i];
1006 if(c<0 || 0x10ffff<c) {
1007 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1008 }
1009
1010 U8_APPEND_UNSAFE(buffer, length, c);
1011 }
1012 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1013 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1014 }
1015
1016 length=0;
1017 wrongIsError=FALSE;
1018 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1019 c=codePoints[i];
1020 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1021 isError=FALSE;
1022
1023 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1024 wrongIsError|= isError!=expectIsError;
1025 }
1026 if(wrongIsError) {
1027 log_err("U8_APPEND did not set isError correctly\n");
1028 }
1029 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1030 log_err("U8_APPEND did not generate the expected output\n");
1031 }
1032 }
1033
1034 static void
TestSurrogates()1035 TestSurrogates() {
1036 static const uint8_t b[]={
1037 0xc3, 0x9f, /* 00DF */
1038 0xed, 0x9f, 0xbf, /* D7FF */
1039 0xed, 0xa0, 0x81, /* D801 */
1040 0xed, 0xbf, 0xbe, /* DFFE */
1041 0xee, 0x80, 0x80, /* E000 */
1042 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1043 };
1044 static const UChar32 cp[]={
1045 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1046 };
1047
1048 UChar32 cu, cs, cl;
1049 int32_t i, j, k, iu, is, il, length;
1050
1051 k=0; /* index into cp[] */
1052 length=UPRV_LENGTHOF(b);
1053 for(i=0; i<length;) {
1054 j=i;
1055 U8_NEXT_UNSAFE(b, j, cu);
1056 iu=j;
1057
1058 j=i;
1059 U8_NEXT(b, j, length, cs);
1060 is=j;
1061
1062 j=i;
1063 L8_NEXT(b, j, length, cl);
1064 il=j;
1065
1066 if(cu!=cp[k]) {
1067 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1068 }
1069
1070 /* U8_NEXT() returns <0 for surrogate code points */
1071 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1072 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1073 }
1074
1075 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1076 if(cl!=cu) {
1077 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1078 }
1079
1080 if(is!=iu || il!=iu) {
1081 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1082 }
1083
1084 ++k; /* next code point */
1085 i=iu; /* advance by one UTF-8 sequence */
1086 }
1087
1088 while(i>0) {
1089 --k; /* previous code point */
1090
1091 j=i;
1092 U8_PREV_UNSAFE(b, j, cu);
1093 iu=j;
1094
1095 j=i;
1096 U8_PREV(b, 0, j, cs);
1097 is=j;
1098
1099 j=i;
1100 L8_PREV(b, 0, j, cl);
1101 il=j;
1102
1103 if(cu!=cp[k]) {
1104 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1105 }
1106
1107 /* U8_PREV() returns <0 for surrogate code points */
1108 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1109 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1110 }
1111
1112 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1113 if(cl!=cu) {
1114 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1115 }
1116
1117 if(is!=iu || il !=iu) {
1118 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1119 }
1120
1121 i=iu; /* go back by one UTF-8 sequence */
1122 }
1123 }
1124
/*
 * Writes the first len bytes of uchars through log_err(), each formatted
 * as "0x%02x " (two hex digits followed by a space). No newline is added.
 * Nothing is written when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t pos=0;
    while(pos<len){
        log_err("0x%02x ", uchars[pos]);
        ++pos;
    }
}
1131