// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1998-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/*
* File utf8tst.c
*
* Modification History:
*
*   Date          Name        Description
*   07/24/2000    Madhu       Creation
*******************************************************************************
*/
17 
18 #include "unicode/utypes.h"
19 #include "unicode/utf8.h"
20 #include "unicode/utf_old.h"
21 #include "cmemory.h"
22 #include "cintltst.h"
23 
/* lenient UTF-8 ------------------------------------------------------------ */

/*
 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
 * code points with their "natural" encoding.
 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
 * single surrogates.
 *
 * This is not conformant with UTF-8.
 *
 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
 * the macros below do not attempt to assemble such pairs.
 */
37 
/*
 * L8_NEXT(s, i, length, c): lenient forward read.
 * Stores the byte s[i] in c and post-increments i. Bytes below 0x80 are
 * returned unchanged. For a byte that U8_IS_LEAD() accepts, decoding is
 * delegated to utf8_nextCharSafeBody() with -2 as its last argument
 * (presumably the "lenient" mode selector; confirm against that function).
 * Any other byte >=0x80 yields U_SENTINEL.
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
48 
/*
 * L8_PREV(s, start, i, c): lenient backward read.
 * Pre-decrements i and stores the byte s[i] in c. Bytes below 0x80 are
 * returned unchanged. For a byte in 0x80..0xbf, decoding is delegated to
 * utf8_prevCharSafeBody() with -2 as its last argument (presumably the
 * "lenient" mode selector; confirm against that function).
 * Any byte above 0xbf yields U_SENTINEL.
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
59 
/* -------------------------------------------------------------------------- */
61 
// Obsolete macros from obsolete unicode/utf_old.h, for some old test data.
// Each one is defined here only when no earlier definition is visible,
// so that the expected-result tables below still compile.
#ifndef UTF8_ERROR_VALUE_1
#   define UTF8_ERROR_VALUE_1 0x15
#endif
#ifndef UTF8_ERROR_VALUE_2
#   define UTF8_ERROR_VALUE_2 0x9f
#endif
#ifndef UTF_ERROR_VALUE
#   define UTF_ERROR_VALUE 0xffff
#endif
// True for values whose low 16 bits are 0xfffe or 0xffff, and for the two
// UTF-8 error values above.
#ifndef UTF_IS_ERROR
#   define UTF_IS_ERROR(c) \
        (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
#endif
76 
77 #if !U_HIDE_OBSOLETE_UTF_OLD_H
printUChars(const uint8_t * uchars,int16_t len)78 static void printUChars(const uint8_t *uchars, int16_t len){
79     int16_t i=0;
80     for(i=0; i<len; i++){
81         log_err("0x%02x ", *(uchars+i));
82     }
83 }
84 #endif
85 
86 static void TestCodeUnitValues(void);
87 static void TestCharLength(void);
88 static void TestGetChar(void);
89 static void TestNextPrevChar(void);
90 static void TestNulTerminated(void);
91 static void TestNextPrevNonCharacters(void);
92 static void TestNextPrevCharUnsafe(void);
93 static void TestFwdBack(void);
94 static void TestFwdBackUnsafe(void);
95 static void TestSetChar(void);
96 static void TestSetCharUnsafe(void);
97 static void TestTruncateIfIncomplete(void);
98 static void TestAppendChar(void);
99 static void TestAppend(void);
100 static void TestSurrogates(void);
101 
102 void addUTF8Test(TestNode** root);
103 
104 void
addUTF8Test(TestNode ** root)105 addUTF8Test(TestNode** root)
106 {
107     addTest(root, &TestCodeUnitValues,          "utf8tst/TestCodeUnitValues");
108     addTest(root, &TestCharLength,              "utf8tst/TestCharLength");
109     addTest(root, &TestGetChar,                 "utf8tst/TestGetChar");
110     addTest(root, &TestNextPrevChar,            "utf8tst/TestNextPrevChar");
111     addTest(root, &TestNulTerminated,           "utf8tst/TestNulTerminated");
112     addTest(root, &TestNextPrevNonCharacters,   "utf8tst/TestNextPrevNonCharacters");
113     addTest(root, &TestNextPrevCharUnsafe,      "utf8tst/TestNextPrevCharUnsafe");
114     addTest(root, &TestFwdBack,                 "utf8tst/TestFwdBack");
115     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
116     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
117     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
118     addTest(root, &TestTruncateIfIncomplete,    "utf8tst/TestTruncateIfIncomplete");
119     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
120     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
121     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
122 }
123 
TestCodeUnitValues()124 static void TestCodeUnitValues()
125 {
126     static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
127 
128     int16_t i;
129     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
130         uint8_t c=codeunit[i];
131         log_verbose("Testing code unit value of %x\n", c);
132         if(i<4){
133             if(
134 #if !U_HIDE_OBSOLETE_UTF_OLD_H
135                     !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||
136 #endif
137                     !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {
138                 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
139                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
140             }
141         } else if(i< 8){
142             if(
143 #if !U_HIDE_OBSOLETE_UTF_OLD_H
144                     !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||
145 #endif
146                     !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {
147                 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
148                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
149             }
150         } else if(i< 12){
151             if(
152 #if !U_HIDE_OBSOLETE_UTF_OLD_H
153                     !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||
154 #endif
155                     !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
156                 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
157                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
158             }
159         }
160     }
161 }
162 
TestCharLength()163 static void TestCharLength()
164 {
165     static const uint32_t codepoint[]={
166         1, 0x0061,
167         1, 0x007f,
168         2, 0x016f,
169         2, 0x07ff,
170         3, 0x0865,
171         3, 0x20ac,
172         4, 0x20402,
173         4, 0x23456,
174         4, 0x24506,
175         4, 0x20402,
176         4, 0x10402,
177         3, 0xd7ff,
178         3, 0xe000,
179 
180     };
181 
182     int16_t i;
183 #if !U_HIDE_OBSOLETE_UTF_OLD_H
184     UBool multiple;
185 #endif
186     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
187         UChar32 c=codepoint[i+1];
188         if(
189 #if !U_HIDE_OBSOLETE_UTF_OLD_H
190                 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
191 #endif
192                 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
193             log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
194         }else{
195               log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
196         }
197 #if !U_HIDE_OBSOLETE_UTF_OLD_H
198         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
199         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
200               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
201         }
202 #endif
203     }
204 }
205 
TestGetChar()206 static void TestGetChar()
207 {
208     static const uint8_t input[]={
209     /*  code unit,*/
210         0x61,
211         0x7f,
212         0xe4,
213         0xba,
214         0x8c,
215         0xF0,
216         0x90,
217         0x90,
218         0x81,
219         0xc0,
220         0x65,
221         0x31,
222         0x9a,
223         0xc9
224     };
225     static const UChar32 result[]={
226     /*  codepoint-unsafe, codepoint-safe(not strict)  codepoint-safe(strict) */
227         0x61,             0x61,                       0x61,
228         0x7f,             0x7f,                       0x7f,
229         0x4e8c,           0x4e8c,                     0x4e8c,
230         0x4e8c,           0x4e8c,                     0x4e8c ,
231         0x4e8c,           0x4e8c,                     0x4e8c,
232         0x10401,          0x10401,                    0x10401 ,
233         0x10401,          0x10401,                    0x10401 ,
234         0x10401,          0x10401,                    0x10401 ,
235         0x10401,          0x10401,                    0x10401,
236         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
237         0x65,             0x65,                       0x65,
238         0x31,             0x31,                       0x31,
239         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
240         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
241     };
242     uint16_t i=0;
243     UChar32 c, expected;
244     uint32_t offset=0;
245 
246     for(offset=0; offset<sizeof(input); offset++) {
247         expected = result[i];
248         if (expected >= 0 && offset < sizeof(input) - 1) {
249 #if !U_HIDE_OBSOLETE_UTF_OLD_H
250             UTF8_GET_CHAR_UNSAFE(input, offset, c);
251             if(c != expected) {
252                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
253                         offset, expected, c);
254 
255             }
256 #endif
257             U8_GET_UNSAFE(input, offset, c);
258             if(c != expected) {
259                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
260                         offset, expected, c);
261 
262             }
263         }
264         expected=result[i+1];
265 #if !U_HIDE_OBSOLETE_UTF_OLD_H
266         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
267         if(c != expected){
268             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
269         }
270 #endif
271         U8_GET(input, 0, offset, sizeof(input), c);
272         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
273         if(c != expected){
274             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
275         }
276 
277         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
278         if(expected<0) { expected=0xfffd; }
279         if(c != expected){
280             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
281         }
282 #if !U_HIDE_OBSOLETE_UTF_OLD_H
283         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
284         if(c != result[i+2]){
285             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
286         }
287 #endif
288         i=(uint16_t)(i+3);
289     }
290 }
291 
TestNextPrevChar()292 static void TestNextPrevChar() {
293     static const uint8_t input[]={
294         0x61,
295         0xf0, 0x90, 0x90, 0x81,
296         0xc0, 0x80,  // non-shortest form
297         0xf3, 0xbe,  // truncated
298         0xc2,  // truncated
299         0x61,
300         0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence
301         0x00
302     };
303     static const UChar32 result[]={
304     /*  next_safe_ns        next_safe_s          prev_safe_ns        prev_safe_s */
305         0x0061,             0x0061,              0x0000,             0x0000,
306         0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
307         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
308         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
309         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
310         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,
311         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
312         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
313         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
314         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
315         0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
316         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,
317         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
318         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
319         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
320         0x0000,             0x0000,              0x0061,             0x0061
321     };
322     static const int32_t movedOffset[]={
323     /*  next_safe    prev_safe_s */
324         1,           15,
325         5,           14,
326         3,           13,
327         4,           12,
328         5,           11,
329         6,           10,
330         7,           9,
331         9,           7,
332         9,           7,
333         10,          6,
334         11,          5,
335         12,          1,
336         13,          1,
337         14,          1,
338         15,          1,
339         16,          0,
340     };
341 
342     UChar32 c, expected;
343     uint32_t i=0, j=0;
344     uint32_t offset=0;
345     int32_t setOffset=0;
346     for(offset=0; offset<sizeof(input); offset++){
347         expected=result[i];  // next_safe_ns
348 #if !U_HIDE_OBSOLETE_UTF_OLD_H
349         setOffset=offset;
350         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
351         if(setOffset != movedOffset[j]) {
352             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353                 offset, movedOffset[j], setOffset);
354         }
355         if(c != expected) {
356             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
357         }
358 #endif
359         setOffset=offset;
360         U8_NEXT(input, setOffset, sizeof(input), c);
361         if(setOffset != movedOffset[j]) {
362             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
363                 offset, movedOffset[j], setOffset);
364         }
365         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
366         if(c != expected) {
367             log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
368         }
369 
370         setOffset=offset;
371         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
372         if(setOffset != movedOffset[j]) {
373             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
374                 offset, movedOffset[j], setOffset);
375         }
376         if(expected<0) { expected=0xfffd; }
377         if(c != expected) {
378             log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
379         }
380 #if !U_HIDE_OBSOLETE_UTF_OLD_H
381         setOffset=offset;
382         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
383         if(setOffset != movedOffset[j]) {
384             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
385                 offset, movedOffset[j], setOffset);
386         }
387         expected=result[i+1];  // next_safe_s
388         if(c != expected) {
389             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
390                     offset, expected, c);
391         }
392 #endif
393         i=i+4;
394         j=j+2;
395     }
396 
397     i=j=0;
398     for(offset=sizeof(input); offset > 0; --offset){
399         expected=result[i+2];  // prev_safe_ns
400 #if !U_HIDE_OBSOLETE_UTF_OLD_H
401         setOffset=offset;
402         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
403         if(setOffset != movedOffset[j+1]) {
404             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
405                 offset, movedOffset[j+1], setOffset);
406         }
407         if(c != expected) {
408             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
409         }
410 #endif
411         setOffset=offset;
412         U8_PREV(input, 0, setOffset, c);
413         if(setOffset != movedOffset[j+1]) {
414             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
415                 offset, movedOffset[j+1], setOffset);
416         }
417         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
418         if(c != expected) {
419             log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
420         }
421 
422         setOffset=offset;
423         U8_PREV_OR_FFFD(input, 0, setOffset, c);
424         if(setOffset != movedOffset[j+1]) {
425             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
426                 offset, movedOffset[j+1], setOffset);
427         }
428         if(expected<0) { expected=0xfffd; }
429         if(c != expected) {
430             log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
431         }
432 #if !U_HIDE_OBSOLETE_UTF_OLD_H
433         setOffset=offset;
434         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
435         if(setOffset != movedOffset[j+1]) {
436             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
437                 offset, movedOffset[j+1], setOffset);
438         }
439         expected=result[i+3];  // prev_safe_s
440         if(c != expected) {
441             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
442                     offset, expected, c);
443         }
444 #endif
445         i=i+4;
446         j=j+2;
447     }
448 }
449 
450 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()451 static void TestNulTerminated() {
452     static const uint8_t input[]={
453         /*  0 */  0x61,
454         /*  1 */  0xf0, 0x90, 0x90, 0x81,
455         /*  5 */  0xc0,
456         /*  6 */  0x80,
457         /*  7 */  0xdf, 0x80,
458         /*  9 */  0xc2,
459         /* 10 */  0x62,
460         /* 11 */  0xfd,
461         /* 12 */  0xbe,
462         /* 13 */  0xe0, 0xa0, 0x80,
463         /* 16 */  0xe2, 0x82, 0xac,
464         /* 19 */  0xf0, 0x90, 0x90,
465         /* 22 */  0x00
466         /* 23 */
467     };
468     static const UChar32 result[]={
469         0x61,
470         0x10401,
471         U_SENTINEL,  // C0 not a lead byte
472         U_SENTINEL,  // 80
473         0x7c0,
474         U_SENTINEL,  // C2
475         0x62,
476         U_SENTINEL,  // FD not a lead byte
477         U_SENTINEL,  // BE
478         0x800,
479         0x20ac,
480         U_SENTINEL,  // truncated F0 90 90
481         0
482     };
483 
484     UChar32 c, c2, expected;
485     int32_t i0, i=0, j, k, expectedIndex;
486     int32_t cpIndex=0;
487     do {
488         i0=i;
489         U8_NEXT(input, i, -1, c);
490         expected=result[cpIndex];
491         if(c!=expected) {
492             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
493         }
494         j=i0;
495         U8_NEXT_OR_FFFD(input, j, -1, c);
496         if(expected<0) { expected=0xfffd; }
497         if(c!=expected) {
498             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
499         }
500         if(j!=i) {
501             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
502         }
503         j=i0;
504         U8_FWD_1(input, j, -1);
505         if(j!=i) {
506             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
507         }
508         ++cpIndex;
509         /*
510          * Move by this many code points from the start.
511          * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
512          */
513         expectedIndex= (c==0) ? i-1 : i;
514         k=0;
515         U8_FWD_N(input, k, -1, cpIndex);
516         if(k!=expectedIndex) {
517             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
518         }
519     } while(c!=0);
520 
521     i=0;
522     do {
523         j=i0=i;
524         U8_NEXT(input, i, -1, c);
525         do {
526             U8_GET(input, 0, j, -1, c2);
527             if(c2!=c) {
528                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
529             }
530             U8_GET_OR_FFFD(input, 0, j, -1, c2);
531             expected= (c>=0) ? c : 0xfffd;
532             if(c2!=expected) {
533                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
534             }
535             /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
536             k=j+1;
537             U8_SET_CP_LIMIT(input, 0, k, -1);
538             if(k!=i) {
539                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
540             }
541         } while(++j<i);
542     } while(c!=0);
543 }
544 
TestNextPrevNonCharacters()545 static void TestNextPrevNonCharacters() {
546     /* test non-characters */
547     static const uint8_t nonChars[]={
548         0xef, 0xb7, 0x90,       /* U+fdd0 */
549         0xef, 0xbf, 0xbf,       /* U+feff */
550         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
551         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
552         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
553     };
554 
555     UChar32 ch;
556     int32_t idx;
557 
558     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
559         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
560         if(!U_IS_UNICODE_NONCHAR(ch)) {
561             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
562         }
563     }
564     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
565         U8_PREV(nonChars, 0, idx, ch);
566         if(!U_IS_UNICODE_NONCHAR(ch)) {
567             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
568         }
569     }
570 #if !U_HIDE_OBSOLETE_UTF_OLD_H
571     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
572         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
573         UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
574         if(ch!=expected) {
575             log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
576         }
577     }
578     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
579         UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
580         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
581         if(ch!=expected) {
582             log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
583         }
584     }
585 #endif
586 }
587 
TestNextPrevCharUnsafe()588 static void TestNextPrevCharUnsafe() {
589     /*
590      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
591      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
592      */
593     static const uint8_t input[]={
594         0x61,
595         0xf0, 0x90, 0x90, 0x81,
596         0xc0, 0x80,  /* non-shortest form */
597         0xe2, 0x82, 0xac,
598         0xc2, 0xa1,
599         0xf4, 0x8f, 0xbf, 0xbf,
600         0x00
601     };
602     static const UChar32 codePoints[]={
603         0x61,
604         0x10401,
605         -1,
606         0x20ac,
607         0xa1,
608         0x10ffff,
609         0
610     };
611 
612     UChar32 c, expected;
613     int32_t i;
614     uint32_t offset;
615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
616     for(i=0, offset=0; offset<sizeof(input); ++i) {
617         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
618         expected = codePoints[i];
619         if(expected >= 0 && c != expected) {
620             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
621                     offset, expected, c);
622         }
623         if(offset==6) {
624             // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
625             // while the new one skips C0 80 together.
626             ++offset;
627         }
628     }
629 #endif
630     for(i=0, offset=0; offset<sizeof(input); ++i) {
631         U8_NEXT_UNSAFE(input, offset, c);
632         expected = codePoints[i];
633         if(expected >= 0 && c != expected) {
634             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
635                     offset, expected, c);
636         }
637     }
638 #if !U_HIDE_OBSOLETE_UTF_OLD_H
639     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
640         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
641         expected = codePoints[i];
642         if(expected >= 0 && c != expected) {
643             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
644                     offset, expected, c);
645         }
646     }
647 #endif
648     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
649         U8_PREV_UNSAFE(input, offset, c);
650         expected = codePoints[i];
651         if(expected >= 0 && c != expected) {
652             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
653                     offset, expected, c);
654         }
655     }
656 }
657 
TestFwdBack()658 static void TestFwdBack() {
659     static const uint8_t input[]={
660         0x61,
661         0xF0, 0x90, 0x90, 0x81,
662         0xff,
663         0x62,
664         0xc0,
665         0x80,
666         0x7f,
667         0x8f,
668         0xc0,
669         0x63,
670         0x81,
671         0x90,
672         0x90,
673         0xF0,
674         0x00
675     };
676     static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
677     static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
678 
679     static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
680     static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
681     static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};
682 
683     uint32_t offsafe=0;
684 
685     uint32_t i=0;
686 #if !U_HIDE_OBSOLETE_UTF_OLD_H
687     while(offsafe < sizeof(input)){
688         UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
689         if(offsafe != fwd_safe[i]){
690             log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
691         }
692         i++;
693     }
694 #endif
695     offsafe=0;
696     i=0;
697     while(offsafe < sizeof(input)){
698         U8_FWD_1(input, offsafe, sizeof(input));
699         if(offsafe != fwd_safe[i]){
700             log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
701         }
702         i++;
703     }
704 #if !U_HIDE_OBSOLETE_UTF_OLD_H
705     i=0;
706     offsafe=sizeof(input);
707     while(offsafe > 0){
708         UTF8_BACK_1_SAFE(input, 0,  offsafe);
709         if(offsafe != back_safe[i]){
710             log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
711         }
712         i++;
713     }
714 #endif
715     i=0;
716     offsafe=sizeof(input);
717     while(offsafe > 0){
718         U8_BACK_1(input, 0,  offsafe);
719         if(offsafe != back_safe[i]){
720             log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
721         }
722         i++;
723     }
724 #if !U_HIDE_OBSOLETE_UTF_OLD_H
725     offsafe=0;
726     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
727         UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
728         if(offsafe != fwd_N_safe[i]){
729             log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
730         }
731 
732     }
733 #endif
734     offsafe=0;
735     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
736         U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
737         if(offsafe != fwd_N_safe[i]){
738             log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
739         }
740 
741     }
742 #if !U_HIDE_OBSOLETE_UTF_OLD_H
743     offsafe=sizeof(input);
744     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
745         UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
746         if(offsafe != back_N_safe[i]){
747             log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
748         }
749     }
750 #endif
751     offsafe=sizeof(input);
752     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
753         U8_BACK_N(input, 0, offsafe, Nvalue[i]);
754         if(offsafe != back_N_safe[i]){
755             log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
756         }
757     }
758 }
759 
760 /**
761 * Ticket #13636 - Visual Studio 2017 has problems optimizing this function.
762 * As a workaround, we will turn off optimization just for this function on VS2017 and above.
763 */
764 #if defined(_MSC_VER) && (_MSC_VER > 1900)
765 #pragma optimize( "", off )
766 #endif
767 
/*
 * Exercises the unchecked stepping macros U8_FWD_1_UNSAFE, U8_BACK_1_UNSAFE,
 * U8_FWD_N_UNSAFE and U8_BACK_N_UNSAFE (plus, unless U_HIDE_OBSOLETE_UTF_OLD_H
 * is set, their UTF8_ predecessors) and compares the offsets they produce with
 * the tables below. Every mismatch is reported through log_err().
 */
static void TestFwdBackUnsafe() {
    /*
     * The text is (mostly) well-formed UTF-8 and is only probed at code point
     * boundaries: the behavior of the _UNSAFE macros on ill-formed strings
     * is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // forward unsafe skips only C0
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // backward unsafe skips C0 80 together
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    const int32_t inputLength=UPRV_LENGTHOF(input);
    const int32_t fwdCount=UPRV_LENGTHOF(boundaries);
    const int32_t backCount=UPRV_LENGTHOF(backBoundaries);
    int32_t pos;    /* current byte offset into input[] */
    int32_t step;   /* index into a boundary table, or the step count for the _N macros */
    int32_t want;   /* expected offset taken from the boundary table */

    /* Single steps forward from the start; boundaries[0] is the starting offset itself. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=1;
    pos=0;
    while(pos<inputLength) {
        UTF8_FWD_1_UNSAFE(input, pos);
        want=boundaries[step];
        if(pos != want){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        ++step;
    }
#endif
    step=1;
    pos=0;
    while(pos<inputLength) {
        U8_FWD_1_UNSAFE(input, pos);
        want=boundaries[step];
        if(pos != want){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        ++step;
    }

    /* Single steps backward from the end; the last table entry is the starting offset itself. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=backCount-2;
    pos=inputLength;
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        want=backBoundaries[step];
        if(pos != want){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        --step;
    }
#endif
    step=backCount-2;
    pos=inputLength;
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        want=backBoundaries[step];
        if(pos != want){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        --step;
    }

    /* Jump forward by 0, 1, 2, ... code points, always restarting at offset 0. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<fwdCount; ++step) {
        want=boundaries[step];
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != want) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
    }
#endif
    for(step=0; step<fwdCount; ++step) {
        want=boundaries[step];
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != want) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
    }

    /* Jump backward by 0, 1, 2, ... code points, always restarting at the end of input[]. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<backCount; ++step) {
        want=backBoundaries[backCount-1-step];
        pos=inputLength;
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != want) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
    }
#endif
    for(step=0; step<backCount; ++step) {
        want=backBoundaries[backCount-1-step];
        pos=inputLength;
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != want) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
    }
}
852 
853 /**
854 * Ticket #13636 - Turn optimization back on.
855 */
856 #if defined(_MSC_VER) && (_MSC_VER > 1900)
857 #pragma optimize( "", on )
858 #endif
859 
/*
 * Checks the bounds-checked boundary-adjustment macros U8_SET_CP_START and
 * U8_SET_CP_LIMIT (plus, unless U_HIDE_OBSOLETE_UTF_OLD_H is set, the obsolete
 * UTF8_SET_CHAR_START_SAFE and UTF8_SET_CHAR_LIMIT_SAFE) at every offset of
 * input[], which contains bytes that do not form valid UTF-8 sequences.
 *
 * start_safe[k] / limit_safe[k] hold the offset expected after adjusting
 * offset k. The START macros are only tried for offsets inside the array,
 * the LIMIT macros also for offset == length. Mismatches go to log_err().
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                /* The format uses %ld, so every argument is passed as long. */
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
    }
}
901 
/*
 * Checks the unchecked boundary-adjustment macros U8_SET_CP_START_UNSAFE and
 * U8_SET_CP_LIMIT_UNSAFE (plus, unless U_HIDE_OBSOLETE_UTF_OLD_H is set, the
 * obsolete UTF8_SET_CHAR_START_UNSAFE and UTF8_SET_CHAR_LIMIT_UNSAFE).
 *
 * start_unsafe[k] / limit_unsafe[k] hold the offset expected after adjusting
 * offset k. The START macros are tried for offsets inside input[] only; the
 * LIMIT macros for every offset except 0, up to and including the length.
 * Mismatches are reported through log_err().
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* The format uses %ld, so every argument is passed as long. */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        /*
         * Offset 0 is skipped so that the unchecked LIMIT macros cannot run
         * outside the array (presumably before its start, if they inspect
         * the preceding byte -- see the macro definitions in utf8.h).
         */
        if (offset != 0) {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}
946 
/*
 * Table-driven check of U8_TRUNCATE_IF_INCOMPLETE(): for each NUL-terminated
 * byte string the macro is applied to the full strlen() length and the
 * adjusted length is compared with the expected one. Failures are reported
 * through log_err().
 */
static void TestTruncateIfIncomplete() {
    // How this differs from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() never inspects s[length].
    // So when the last byte is a lead byte, this macro truncates even if the
    // byte at the input index could not continue a valid sequence
    // (including the case where that byte is not a trail byte at all).
    // When the last byte is a trail byte, both macros behave the same.
    static const struct {
        const char *text;
        int32_t expectedLength;
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    const int32_t caseCount = UPRV_LENGTHOF(cases);
    int32_t idx = 0;
    while (idx < caseCount) {
        const char *bytes = cases[idx].text;
        const int32_t fullLength = (int32_t)strlen(bytes);
        int32_t adjusted = fullLength;   /* modified in place by the macro */
        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, adjusted);
        if (adjusted != cases[idx].expectedLength) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)idx, (int)fullLength, (int)cases[idx].expectedLength, (int)adjusted);
        }
        ++idx;
    }
}
1004 
/*
 * Checks the obsolete UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE
 * macros; the whole body is compiled out when U_HIDE_OBSOLETE_UTF_OLD_H is set.
 *
 * test[] holds (append position, code point) pairs. The first 13 pairs are
 * written with the UNSAFE macro, the remaining ones with the SAFE macro.
 * For pair number k, movedOffset[k] is the expected offset after the append
 * and result[k] the expected contents of the first 11 bytes of the buffer.
 * Mismatches are reported through log_err() and printUChars().
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
          4,              /*for append-pos: 0 , CHAR 0x10401*/
          3,
          3,
          6,
          5,
          12,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    /* offset-moved-to(safe) */
          4,              /*for append-pos: 0, CHAR  0x10401*/
          3,
          4,
          6,
          5,
          11,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* Number of leading (position, code point) pairs in test[] that use the UNSAFE macro. */
    const uint16_t unsafeCaseCount=13;
    uint16_t i, count=0;
    /*
     * One byte longer than s[]: the unsafe append of the 4-byte U+10401 at
     * position 8 writes through index 11 (see movedOffset: 12).
     */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);
    /* i walks test[] two entries at a time; count is the matching pair number. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);   /* start every case from a fresh copy of s[] */
        offset=test[i];
        if(count<unsafeCaseCount){
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:      ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:     ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
#endif
}
1173 
TestAppend()1174 static void TestAppend() {
1175     static const UChar32 codePoints[]={
1176         0x61, 0xdf, 0x901, 0x3040,
1177         0xac00, 0xd800, 0xdbff, 0xdcde,
1178         0xdffd, 0xe000, 0xffff, 0x10000,
1179         0x12345, 0xe0021, 0x10ffff, 0x110000,
1180         0x234567, 0x7fffffff, -1, -1000,
1181         0, 0x400
1182     };
1183     static const uint8_t expectUnsafe[]={
1184         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1185         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
1186         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1187         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1188         /* none from this line */
1189         0,  0xd0, 0x80
1190     }, expectSafe[]={
1191         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1192         0xea, 0xb0, 0x80,  /* no surrogates */
1193         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1194         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1195         /* none from this line */
1196         0,  0xd0, 0x80
1197     };
1198 
1199     uint8_t buffer[100];
1200     UChar32 c;
1201     int32_t i, length;
1202     UBool isError, expectIsError, wrongIsError;
1203 
1204     length=0;
1205     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1206         c=codePoints[i];
1207         if(c<0 || 0x10ffff<c) {
1208             continue; /* skip non-code points for U8_APPEND_UNSAFE */
1209         }
1210 
1211         U8_APPEND_UNSAFE(buffer, length, c);
1212     }
1213     if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1214         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1215     }
1216 
1217     length=0;
1218     wrongIsError=FALSE;
1219     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1220         c=codePoints[i];
1221         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1222         isError=FALSE;
1223 
1224         U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1225         wrongIsError|= isError!=expectIsError;
1226     }
1227     if(wrongIsError) {
1228         log_err("U8_APPEND did not set isError correctly\n");
1229     }
1230     if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1231         log_err("U8_APPEND did not generate the expected output\n");
1232     }
1233 }
1234 
1235 static void
TestSurrogates()1236 TestSurrogates() {
1237     static const uint8_t b[]={
1238         0xc3, 0x9f,             /*  00DF */
1239         0xed, 0x9f, 0xbf,       /*  D7FF */
1240         0xed, 0xa0, 0x81,       /*  D801 */
1241         0xed, 0xbf, 0xbe,       /*  DFFE */
1242         0xee, 0x80, 0x80,       /*  E000 */
1243         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
1244     };
1245     static const UChar32 cp[]={
1246         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1247     };
1248 
1249     UChar32 cu, cs, cl;
1250     int32_t i, j, k, iu, is, il, length;
1251 
1252     k=0; /* index into cp[] */
1253     length=UPRV_LENGTHOF(b);
1254     for(i=0; i<length;) {
1255         j=i;
1256         U8_NEXT_UNSAFE(b, j, cu);
1257         iu=j;
1258 
1259         j=i;
1260         U8_NEXT(b, j, length, cs);
1261         is=j;
1262 
1263         j=i;
1264         L8_NEXT(b, j, length, cl);
1265         il=j;
1266 
1267         if(cu!=cp[k]) {
1268             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1269         }
1270 
1271         /* U8_NEXT() returns <0 for surrogate code points */
1272         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1273             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1274         }
1275 
1276         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1277         if(cl!=cu) {
1278             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1279         }
1280 
1281         // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1282         if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1283             log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1284         }
1285         if(il!=iu) {
1286             log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1287         }
1288 
1289         ++k;    /* next code point */
1290         i=iu;   /* advance by one UTF-8 sequence */
1291     }
1292 
1293     while(i>0) {
1294         --k; /* previous code point */
1295 
1296         j=i;
1297         U8_PREV_UNSAFE(b, j, cu);
1298         iu=j;
1299 
1300         j=i;
1301         U8_PREV(b, 0, j, cs);
1302         is=j;
1303 
1304         j=i;
1305         L8_PREV(b, 0, j, cl);
1306         il=j;
1307 
1308         if(cu!=cp[k]) {
1309             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1310         }
1311 
1312         /* U8_PREV() returns <0 for surrogate code points */
1313         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1314             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1315         }
1316 
1317         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1318         if(cl!=cu) {
1319             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1320         }
1321 
1322         // U8_PREV() skips only the last byte of a surrogate byte sequence.
1323         if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1324             log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1325         }
1326         if(il !=iu) {
1327             log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1328         }
1329 
1330         i=iu;   /* go back by one UTF-8 sequence */
1331     }
1332 }
1333