1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1998-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /*
7 * File utf8tst.c
8 *
9 * Modification History:
10 *
11 *   Date          Name        Description
12 *   07/24/2000    Madhu       Creation
13 *******************************************************************************
14 */
15 
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20 
21 /* lenient UTF-8 ------------------------------------------------------------ */
22 
23 /*
24  * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
25  * code points with their "natural" encoding.
26  * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
27  * single surrogates.
28  *
29  * This is not conformant with UTF-8.
30  *
31  * Supplementary code points may be encoded as pairs of 3-byte sequences, but
32  * the macros below do not attempt to assemble such pairs.
33  */
34 
/*
 * L8_NEXT(s, i, length, c): lenient forward read.
 *
 * Stores the byte s[i] in c and post-increments i. A byte below 0x80 is
 * the result as is. A byte for which U8_IS_LEAD() is true is handed to
 * utf8_nextCharSafeBody() together with &i, so that function may move i
 * further; its final argument is -2 (presumably selecting the lenient
 * handling described above -- confirm against utf8.h). Any other byte
 * yields U_SENTINEL.
 *
 * i and c must be lvalues; s, i and c are evaluated more than once.
 * Every macro argument is parenthesized so that expression arguments
 * bind as the caller intended.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
45 
/*
 * L8_PREV(s, start, i, c): lenient backward read.
 *
 * Pre-decrements i and stores the byte s[i] in c. A byte below 0x80 is
 * the result as is. A byte in 0x80..0xbf is handed to
 * utf8_prevCharSafeBody() together with start and &i, so that function may
 * move i further back; its final argument is -2 (presumably selecting the
 * lenient handling -- confirm against utf8.h). A byte above 0xbf yields
 * U_SENTINEL.
 *
 * i and c must be lvalues; s, i and c are evaluated more than once.
 * Every macro argument is parenthesized so that expression arguments
 * bind as the caller intended.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
56 
57 /* -------------------------------------------------------------------------- */
58 
59 static void printUChars(const uint8_t *uchars, int16_t len);
60 
61 static void TestCodeUnitValues(void);
62 static void TestCharLength(void);
63 static void TestGetChar(void);
64 static void TestNextPrevChar(void);
65 static void TestNulTerminated(void);
66 static void TestNextPrevNonCharacters(void);
67 static void TestNextPrevCharUnsafe(void);
68 static void TestFwdBack(void);
69 static void TestFwdBackUnsafe(void);
70 static void TestSetChar(void);
71 static void TestSetCharUnsafe(void);
72 static void TestAppendChar(void);
73 static void TestAppend(void);
74 static void TestSurrogates(void);
75 
76 void addUTF8Test(TestNode** root);
77 
78 void
addUTF8Test(TestNode ** root)79 addUTF8Test(TestNode** root)
80 {
81     addTest(root, &TestCodeUnitValues,          "utf8tst/TestCodeUnitValues");
82     addTest(root, &TestCharLength,              "utf8tst/TestCharLength");
83     addTest(root, &TestGetChar,                 "utf8tst/TestGetChar");
84     addTest(root, &TestNextPrevChar,            "utf8tst/TestNextPrevChar");
85     addTest(root, &TestNulTerminated,           "utf8tst/TestNulTerminated");
86     addTest(root, &TestNextPrevNonCharacters,   "utf8tst/TestNextPrevNonCharacters");
87     addTest(root, &TestNextPrevCharUnsafe,      "utf8tst/TestNextPrevCharUnsafe");
88     addTest(root, &TestFwdBack,                 "utf8tst/TestFwdBack");
89     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
90     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
91     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
92     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
93     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
94     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
95 }
96 
/*
 * Checks the single/lead/trail byte classification macros, in both the
 * UTF8_ and the U8_ spelling, on twelve sample bytes: table entries 0..3
 * must classify as single bytes, 4..7 as lead bytes and 8..11 as trail
 * bytes, and each byte must belong to exactly that one class.
 */
static void TestCodeUnitValues(void)
{
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t pos;
    for(pos=0; pos<UPRV_LENGTHOF(codeunit); ++pos) {
        const uint8_t unit=codeunit[pos];
        /* Evaluate each classification macro once per byte. */
        const int oldSingle=UTF8_IS_SINGLE(unit);
        const int oldLead=UTF8_IS_LEAD(unit);
        const int oldTrail=UTF8_IS_TRAIL(unit);
        const int newSingle=U8_IS_SINGLE(unit);
        const int newLead=U8_IS_LEAD(unit);
        const int newTrail=U8_IS_TRAIL(unit);

        log_verbose("Testing code unit value of %x\n", unit);

        if(pos<4) {
            if(!(oldSingle && !oldLead && !oldTrail && newSingle && !newLead && !newTrail)) {
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        } else if(pos<8) {
            if(!(oldLead && !oldSingle && !oldTrail && newLead && !newSingle && !newTrail)) {
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        } else if(pos<12) {
            if(!(oldTrail && !oldSingle && !oldLead && newTrail && !newSingle && !newLead)) {
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        }
    }
}
123 
TestCharLength()124 static void TestCharLength()
125 {
126     static const uint32_t codepoint[]={
127         1, 0x0061,
128         1, 0x007f,
129         2, 0x016f,
130         2, 0x07ff,
131         3, 0x0865,
132         3, 0x20ac,
133         4, 0x20402,
134         4, 0x23456,
135         4, 0x24506,
136         4, 0x20402,
137         4, 0x10402,
138         3, 0xd7ff,
139         3, 0xe000,
140 
141     };
142 
143     int16_t i;
144     UBool multiple;
145     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
146         UChar32 c=codepoint[i+1];
147         if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
148               log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
149         }else{
150               log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c));
151         }
152         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
153         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
154               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
155         }
156     }
157 }
158 
TestGetChar()159 static void TestGetChar()
160 {
161     static const uint8_t input[]={
162     /*  code unit,*/
163         0x61,
164         0x7f,
165         0xe4,
166         0xba,
167         0x8c,
168         0xF0,
169         0x90,
170         0x90,
171         0x81,
172         0xc0,
173         0x65,
174         0x31,
175         0x9a,
176         0xc9
177     };
178     static const UChar32 result[]={
179     /*  codepoint-unsafe, codepoint-safe(not strict)  codepoint-safe(strict) */
180         0x61,             0x61,                       0x61,
181         0x7f,             0x7f,                       0x7f,
182         0x4e8c,           0x4e8c,                     0x4e8c,
183         0x4e8c,           0x4e8c,                     0x4e8c ,
184         0x4e8c,           0x4e8c,                     0x4e8c,
185         0x10401,          0x10401,                    0x10401 ,
186         0x10401,          0x10401,                    0x10401 ,
187         0x10401,          0x10401,                    0x10401 ,
188         0x10401,          0x10401,                    0x10401,
189         0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
190         0x65,             0x65,                       0x65,
191         0x31,             0x31,                       0x31,
192         0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
193         0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
194     };
195     uint16_t i=0;
196     UChar32 c, expected;
197     uint32_t offset=0;
198 
199     for(offset=0; offset<sizeof(input); offset++) {
200         if (offset < sizeof(input) - 1) {
201             UTF8_GET_CHAR_UNSAFE(input, offset, c);
202             if(c != result[i]){
203                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
204 
205             }
206 
207             U8_GET_UNSAFE(input, offset, c);
208             if(c != result[i]){
209                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
210 
211             }
212         }
213 
214         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
215         expected=result[i+1];
216         if(c != expected){
217             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
218         }
219 
220         U8_GET(input, 0, offset, sizeof(input), c);
221         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
222         if(c != expected){
223             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
224         }
225 
226         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
227         if(expected<0) { expected=0xfffd; }
228         if(c != expected){
229             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
230         }
231 
232         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
233         if(c != result[i+2]){
234             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
235         }
236 
237         i=(uint16_t)(i+3);
238     }
239 }
240 
TestNextPrevChar()241 static void TestNextPrevChar() {
242     static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
243     static const UChar32 result[]={
244     /*  next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns        prev_safe_s */
245         0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
246         0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
247         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
248         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
249         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
250         0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
251         0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
252         0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
253         0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
254         0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
255         0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
256         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
257         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
258         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
259         0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
260         0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
261     };
262     static const int32_t movedOffset[]={
263     /*  next_unsafe   next_safe_ns next_safe_s       prev_unsafe   prev_safe_ns      prev_safe_s */
264         1,            1,           1,                15,           15,               15,
265         5,            5,           5,                14,           14 ,              14,
266         3,            3,           3,                9,            13,               13,
267         4,            4,           4,                9,            12,               12,
268         5,            5,           5,                9,            11,               11,
269         7,            7,           7,                10,           10,               10,
270         7,            7,           7,                9,            9,                9,
271         8,            9,           9,                7,            7,                7,
272         9,            9,           9,                7,            7,                7,
273         11,           10,          10,               5,            5,                5,
274         11,           11,          11,               5,            5,                5,
275         12,           12,          12,               1,            1,                1,
276         13,           13,          13,               1,            1,                1,
277         14,           14,          14,               1,            1,                1,
278         14,           15,          15,               1,            1,                1,
279         14,           16,          16,               0,            0,                0,
280     };
281     /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
282 
283     UChar32 c, expected;
284     uint32_t i=0;
285     uint32_t offset=0;
286     int32_t setOffset=0;
287     for(offset=0; offset<sizeof(input); offset++){
288          setOffset=offset;
289          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
290          if(setOffset != movedOffset[i+1]){
291              log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
292                  offset, movedOffset[i+1], setOffset);
293          }
294         expected=result[i+1];
295         if(c != expected){
296             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
297         }
298 
299          setOffset=offset;
300          U8_NEXT(input, setOffset, sizeof(input), c);
301          if(setOffset != movedOffset[i+1]){
302              log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
303                  offset, movedOffset[i+1], setOffset);
304          }
305         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
306         if(c != expected){
307             log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
308         }
309 
310         setOffset=offset;
311         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
312         if(setOffset != movedOffset[i+1]){
313             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
314                 offset, movedOffset[i+1], setOffset);
315         }
316         if(expected<0) { expected=0xfffd; }
317         if(c != expected){
318             log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
319         }
320 
321          setOffset=offset;
322          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
323          if(setOffset != movedOffset[i+1]){
324              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
325                  offset, movedOffset[i+2], setOffset);
326          }
327          if(c != result[i+2]){
328              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
329          }
330 
331          i=i+6;
332     }
333 
334     i=0;
335     for(offset=sizeof(input); offset > 0; --offset){
336          setOffset=offset;
337          UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
338          if(setOffset != movedOffset[i+4]){
339              log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
340                  offset, movedOffset[i+4], setOffset);
341          }
342         expected=result[i+4];
343         if(c != expected){
344             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
345         }
346 
347          setOffset=offset;
348          U8_PREV(input, 0, setOffset, c);
349          if(setOffset != movedOffset[i+4]){
350              log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
351                  offset, movedOffset[i+4], setOffset);
352          }
353         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
354         if(c != expected){
355             log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
356         }
357 
358         setOffset=offset;
359         U8_PREV_OR_FFFD(input, 0, setOffset, c);
360         if(setOffset != movedOffset[i+4]){
361             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
362                 offset, movedOffset[i+4], setOffset);
363         }
364         if(expected<0) { expected=0xfffd; }
365         if(c != expected){
366             log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
367         }
368 
369          setOffset=offset;
370          UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
371          if(setOffset != movedOffset[i+5]){
372              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
373                  offset, movedOffset[i+5], setOffset);
374          }
375          if(c != result[i+5]){
376              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
377          }
378 
379          i=i+6;
380     }
381 }
382 
383 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()384 static void TestNulTerminated() {
385     static const uint8_t input[]={
386         /*  0 */  0x61,
387         /*  1 */  0xf0, 0x90, 0x90, 0x81,
388         /*  5 */  0xc0, 0x80,
389         /*  7 */  0xdf, 0x80,
390         /*  9 */  0xc2,
391         /* 10 */  0x62,
392         /* 11 */  0xfd, 0xbe,
393         /* 13 */  0xe0, 0xa0, 0x80,
394         /* 16 */  0xe2, 0x82, 0xac,
395         /* 19 */  0xf0, 0x90, 0x90,
396         /* 22 */  0x00
397         /* 23 */
398     };
399     static const UChar32 result[]={
400         0x61,
401         0x10401,
402         U_SENTINEL,
403         0x7c0,
404         U_SENTINEL,
405         0x62,
406         U_SENTINEL,
407         0x800,
408         0x20ac,
409         U_SENTINEL,
410         0
411     };
412 
413     UChar32 c, c2, expected;
414     int32_t i0, i=0, j, k, expectedIndex;
415     int32_t cpIndex=0;
416     do {
417         i0=i;
418         U8_NEXT(input, i, -1, c);
419         expected=result[cpIndex];
420         if(c!=expected) {
421             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
422         }
423         j=i0;
424         U8_NEXT_OR_FFFD(input, j, -1, c);
425         if(expected<0) { expected=0xfffd; }
426         if(c!=expected) {
427             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
428         }
429         if(j!=i) {
430             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
431         }
432         j=i0;
433         U8_FWD_1(input, j, -1);
434         if(j!=i) {
435             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
436         }
437         ++cpIndex;
438         /*
439          * Move by this many code points from the start.
440          * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
441          */
442         expectedIndex= (c==0) ? i-1 : i;
443         k=0;
444         U8_FWD_N(input, k, -1, cpIndex);
445         if(k!=expectedIndex) {
446             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
447         }
448     } while(c!=0);
449 
450     i=0;
451     do {
452         j=i0=i;
453         U8_NEXT(input, i, -1, c);
454         do {
455             U8_GET(input, 0, j, -1, c2);
456             if(c2!=c) {
457                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
458             }
459             U8_GET_OR_FFFD(input, 0, j, -1, c2);
460             expected= (c>=0) ? c : 0xfffd;
461             if(c2!=expected) {
462                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
463             }
464             /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
465             k=j+1;
466             U8_SET_CP_LIMIT(input, 0, k, -1);
467             if(k!=i) {
468                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
469             }
470         } while(++j<i);
471     } while(c!=0);
472 }
473 
TestNextPrevNonCharacters()474 static void TestNextPrevNonCharacters() {
475     /* test non-characters */
476     static const uint8_t nonChars[]={
477         0xef, 0xb7, 0x90,       /* U+fdd0 */
478         0xef, 0xbf, 0xbf,       /* U+feff */
479         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
480         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
481         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
482     };
483 
484     UChar32 ch;
485     int32_t idx;
486 
487     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
488         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
489         if(!U_IS_UNICODE_NONCHAR(ch)) {
490             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
491         }
492     }
493     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
494         U8_PREV(nonChars, 0, idx, ch);
495         if(!U_IS_UNICODE_NONCHAR(ch)) {
496             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
497         }
498     }
499 }
500 
TestNextPrevCharUnsafe()501 static void TestNextPrevCharUnsafe() {
502     /*
503      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
504      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
505      */
506     static const uint8_t input[]={
507         0x61,
508         0xf0, 0x90, 0x90, 0x81,
509         0xc0, 0x80,  /* non-shortest form */
510         0xe2, 0x82, 0xac,
511         0xc2, 0xa1,
512         0xf4, 0x8f, 0xbf, 0xbf,
513         0x00
514     };
515     static const UChar32 codePoints[]={
516         0x61,
517         0x10401,
518         0,
519         0x20ac,
520         0xa1,
521         0x10ffff,
522         0
523     };
524 
525     UChar32 c;
526     int32_t i;
527     uint32_t offset;
528     for(i=0, offset=0; offset<sizeof(input); ++i) {
529         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
530         if(c != codePoints[i]){
531             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
532                     offset, codePoints[i], c);
533         }
534     }
535     for(i=0, offset=0; offset<sizeof(input); ++i) {
536         U8_NEXT_UNSAFE(input, offset, c);
537         if(c != codePoints[i]){
538             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
539                     offset, codePoints[i], c);
540         }
541     }
542 
543     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
544          UTF8_PREV_CHAR_UNSAFE(input, offset, c);
545          if(c != codePoints[i]){
546              log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
547                      offset, codePoints[i], c);
548          }
549     }
550     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
551          U8_PREV_UNSAFE(input, offset, c);
552          if(c != codePoints[i]){
553              log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
554                      offset, codePoints[i], c);
555          }
556     }
557 }
558 
/*
 * Checks the safe skip macros on a partly ill-formed byte string:
 * single steps forward (UTF8_FWD_1_SAFE, U8_FWD_1) and backward
 * (UTF8_BACK_1_SAFE, U8_BACK_1) against fwd_safe[] / back_safe[], and
 * multi-steps (UTF8_FWD_N_SAFE, U8_FWD_N, UTF8_BACK_N_SAFE, U8_BACK_N)
 * with the step counts in Nvalue[] against fwd_N_safe[] / back_N_safe[].
 *
 * The offset is reset to 0 before the U8_FWD_1 loop; without the reset the
 * offset is already at the end of input after the UTF8_FWD_1_SAFE loop and
 * the U8_FWD_1 loop body would never run.
 */
static void TestFwdBack(void) {
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /* Start over from the beginning so that U8_FWD_1 is actually exercised. */
    i=0;
    offsafe=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* Multi-step skips are cumulative: the offset carries over between iterations. */
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    /* offsafe is 32-bit; cast it for the %ld conversion. */
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }
}
642 
/*
 * Checks the unsafe skip macros against the code point boundaries of
 * input listed in boundaries[]: single steps forward and backward
 * (UTF8_/U8_ FWD_1 and BACK_1 _UNSAFE), then skips of 0..7 code points
 * from the start (FWD_N) and from the end (BACK_N).
 */
static void TestFwdBackUnsafe(void) {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;
    int32_t idx;

    /* One step forward at a time; boundaries[1] is the first expected stop. */
    idx=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[idx]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
        ++idx;
    }
    idx=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[idx]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
        ++idx;
    }

    /* One step backward at a time; the second-to-last boundary is the first expected stop. */
    idx=UPRV_LENGTHOF(boundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[idx]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
        --idx;
    }
    idx=UPRV_LENGTHOF(boundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[idx]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
        --idx;
    }

    /* Skip idx code points from the start; this must land on boundaries[idx]. */
    for(idx=0; idx<UPRV_LENGTHOF(boundaries); ++idx) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, idx);
        if(pos != boundaries[idx]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
    }
    for(idx=0; idx<UPRV_LENGTHOF(boundaries); ++idx) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, idx);
        if(pos != boundaries[idx]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[idx], pos);
        }
    }

    /* Skip idx code points back from the end; the expected stop is counted from the last boundary. */
    for(idx=0; idx<UPRV_LENGTHOF(boundaries); ++idx) {
        int32_t mirror=UPRV_LENGTHOF(boundaries)-1-idx;
        pos=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, idx);
        if(pos != boundaries[mirror]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirror], pos);
        }
    }
    for(idx=0; idx<UPRV_LENGTHOF(boundaries); ++idx) {
        int32_t mirror=UPRV_LENGTHOF(boundaries)-1-idx;
        pos=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, idx);
        if(pos != boundaries[mirror]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirror], pos);
        }
    }
}
719 
/*
 * Checks the safe boundary-adjustment macros at every offset of input,
 * including the offset one past the end: UTF8_SET_CHAR_START_SAFE and
 * U8_SET_CP_START against start_safe[] (only for offsets inside input),
 * UTF8_SET_CHAR_LIMIT_SAFE and U8_SET_CP_LIMIT against limit_safe[].
 *
 * Arguments printed with %ld are cast to long; the variables themselves
 * are 32-bit and 16-bit, which would be a varargs type mismatch where long
 * is wider than int.
 */
static void TestSetChar(void) {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;          /* index into the expectation tables; advances with offset */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* The start macros are only exercised for offsets inside the string. */
        if (offset<UPRV_LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }

        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}
760 
/*
 * Tests the unchecked macros that move an offset to a code point boundary:
 * UTF8_SET_CHAR_START_UNSAFE / U8_SET_CP_START_UNSAFE and
 * UTF8_SET_CHAR_LIMIT_UNSAFE / U8_SET_CP_LIMIT_UNSAFE.
 *
 * Every offset from 0 up to and including the length of input[] is tried.
 * The expected adjusted offset is start_unsafe[offset] for the START macros
 * (offsets inside the array only) and limit_unsafe[offset] for the LIMIT
 * macros (all offsets except 0).
 */
static void TestSetCharUnsafe(void) {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    const int32_t length=UPRV_LENGTHOF(input);
    int32_t offset, setOffset;

    for(offset=0; offset<=length; offset++){
        if(offset<length){
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* The casts make the arguments match the %ld conversions. */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        /*
         * Offset 0 is skipped so that the unchecked LIMIT macros cannot go
         * outside the array (presumably they inspect the byte before the
         * offset - confirm against the macro definitions).
         * As a result limit_unsafe[0] is never read.
         */
        if (offset != 0) {
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}
803 
/*
 * Tests UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE.
 *
 * For each test case the template string s[] is copied into a scratch
 * buffer, one value is written at the given position, and both the resulting
 * offset (movedOffset[]) and the first 11 bytes of the buffer (result[]) are
 * compared with the expected values.
 * The three tables are parallel: the first UNSAFE_CASE_COUNT cases use the
 * unsafe macro, the remaining ones use the safe macro.
 */
static void TestAppendChar(void){
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
          4,              /*for append-pos: 0 , CHAR 0x10401*/
          3,
          3,
          6,
          5,
          12,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    /* offset-moved-to(safe) */
          4,              /*for append-pos: 0, CHAR  0x10401*/
          3,
          4,
          6,
          5,
          11,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* Number of leading cases in each table that use the unsafe macro. */
    enum { UNSAFE_CASE_COUNT=13 };

    uint16_t i, count=0;
    /*
     * One byte longer than s[]: the unsafe case that appends at position 8
     * is expected to move the offset to 12. Only the first `size` bytes are
     * compared afterwards.
     */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);

    /* test[] holds (position, code point) pairs, so step by two. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<UNSAFE_CASE_COUNT){
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                /* offset is unsigned 32-bit; cast so it matches %d. */
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:      ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:     ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
}
972 
TestAppend()973 static void TestAppend() {
974     static const UChar32 codePoints[]={
975         0x61, 0xdf, 0x901, 0x3040,
976         0xac00, 0xd800, 0xdbff, 0xdcde,
977         0xdffd, 0xe000, 0xffff, 0x10000,
978         0x12345, 0xe0021, 0x10ffff, 0x110000,
979         0x234567, 0x7fffffff, -1, -1000,
980         0, 0x400
981     };
982     static const uint8_t expectUnsafe[]={
983         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
984         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
985         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
986         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
987         /* none from this line */
988         0,  0xd0, 0x80
989     }, expectSafe[]={
990         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
991         0xea, 0xb0, 0x80,  /* no surrogates */
992         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
993         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
994         /* none from this line */
995         0,  0xd0, 0x80
996     };
997 
998     uint8_t buffer[100];
999     UChar32 c;
1000     int32_t i, length;
1001     UBool isError, expectIsError, wrongIsError;
1002 
1003     length=0;
1004     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1005         c=codePoints[i];
1006         if(c<0 || 0x10ffff<c) {
1007             continue; /* skip non-code points for U8_APPEND_UNSAFE */
1008         }
1009 
1010         U8_APPEND_UNSAFE(buffer, length, c);
1011     }
1012     if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1013         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1014     }
1015 
1016     length=0;
1017     wrongIsError=FALSE;
1018     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1019         c=codePoints[i];
1020         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1021         isError=FALSE;
1022 
1023         U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1024         wrongIsError|= isError!=expectIsError;
1025     }
1026     if(wrongIsError) {
1027         log_err("U8_APPEND did not set isError correctly\n");
1028     }
1029     if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1030         log_err("U8_APPEND did not generate the expected output\n");
1031     }
1032 }
1033 
1034 static void
TestSurrogates()1035 TestSurrogates() {
1036     static const uint8_t b[]={
1037         0xc3, 0x9f,             /*  00DF */
1038         0xed, 0x9f, 0xbf,       /*  D7FF */
1039         0xed, 0xa0, 0x81,       /*  D801 */
1040         0xed, 0xbf, 0xbe,       /*  DFFE */
1041         0xee, 0x80, 0x80,       /*  E000 */
1042         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
1043     };
1044     static const UChar32 cp[]={
1045         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1046     };
1047 
1048     UChar32 cu, cs, cl;
1049     int32_t i, j, k, iu, is, il, length;
1050 
1051     k=0; /* index into cp[] */
1052     length=UPRV_LENGTHOF(b);
1053     for(i=0; i<length;) {
1054         j=i;
1055         U8_NEXT_UNSAFE(b, j, cu);
1056         iu=j;
1057 
1058         j=i;
1059         U8_NEXT(b, j, length, cs);
1060         is=j;
1061 
1062         j=i;
1063         L8_NEXT(b, j, length, cl);
1064         il=j;
1065 
1066         if(cu!=cp[k]) {
1067             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1068         }
1069 
1070         /* U8_NEXT() returns <0 for surrogate code points */
1071         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1072             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1073         }
1074 
1075         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1076         if(cl!=cu) {
1077             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1078         }
1079 
1080         if(is!=iu || il!=iu) {
1081             log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1082         }
1083 
1084         ++k;    /* next code point */
1085         i=iu;   /* advance by one UTF-8 sequence */
1086     }
1087 
1088     while(i>0) {
1089         --k; /* previous code point */
1090 
1091         j=i;
1092         U8_PREV_UNSAFE(b, j, cu);
1093         iu=j;
1094 
1095         j=i;
1096         U8_PREV(b, 0, j, cs);
1097         is=j;
1098 
1099         j=i;
1100         L8_PREV(b, 0, j, cl);
1101         il=j;
1102 
1103         if(cu!=cp[k]) {
1104             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1105         }
1106 
1107         /* U8_PREV() returns <0 for surrogate code points */
1108         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1109             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1110         }
1111 
1112         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1113         if(cl!=cu) {
1114             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1115         }
1116 
1117         if(is!=iu || il !=iu) {
1118             log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1119         }
1120 
1121         i=iu;   /* go back by one UTF-8 sequence */
1122     }
1123 }
1124 
/*
 * Writes len bytes starting at uchars to the error log, each formatted as
 * "0x%02x " (two hex digits followed by a space). Nothing is written when
 * len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t remaining;
    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *uchars);
        ++uchars;
    }
}
1131