1 /*
2  * Copyright 2001-2004 Unicode, Inc.
3  *
4  * Disclaimer
5  *
6  * This source code is provided as is by Unicode, Inc. No claims are
7  * made as to fitness for any particular purpose. No warranties of any
8  * kind are expressed or implied. The recipient agrees to determine
9  * applicability of information provided. If this file has been
10  * purchased on magnetic or optical media from Unicode, Inc., the
11  * sole remedy for any claim will be exchange of defective media
12  * within 90 days of receipt.
13  *
14  * Limitations on Rights to Redistribute This Code
15  *
16  * Unicode, Inc. hereby grants the right to freely use the information
17  * supplied in this file in the creation of products supporting the
18  * Unicode Standard, and to make copies of this file in any form
19  * for internal or external distribution as long as this notice
20  * remains attached.
21  *
22  * harness.c
23  *
24  * This is a test harness for "ConvertUTF.c".  Compile this
25  * and run without arguments.  It will exhaustively test
26  * the conversion routines, and print a few lines of diagnostic
27  * output.  You don't need to compile ConvertUTF.c itself,
28  * since it gets #included here along with the header.
29  * Example of a compile line:
30  *
31  *		$	gcc -g harness.c -o harness
32  *
33  * Rev History: Rick McGowan, new file April 2001.
34  * Sept 19, 2002: Corrected error on line 234:  utf16_buf[2] becomes utf16_result[2]
35  * 	per report from Iain Murray.
36  * July 3, 2003: Updated printout message.
37  * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
38  *	illegal surrogate use in UTF-8, per report from Frank Tang.
39  *
40  */
41 
42 #define CVTUTF_DEBUG 1
43 
#include <stdio.h>
#include <stdlib.h>
#include "ConvertUTF.c"
46 
47 /* ---------------------------------------------------------------------
48 	test01 - Spot check a few legal & illegal UTF-8 values only.
49 	This is not an exhaustive test, just a brief one that was
50 	used to develop the "isLegalUTF8" routine.
51 
52 	Legal UTF-8 sequences are:
53 
54 	1st----	2nd----	3rd----	4th----	Codepoints---
55 
56 	00-7F				  0000-  007F
57 	C2-DF	80-BF			  0080-  07FF
58 	E0	A0-BF	80-BF		  0800-  0FFF
59 	E1-EC   80-BF   80-BF             1000-  CFFF
60 	ED      80-9F   80-BF             D000-  D7FF
61 	EE-EF   80-BF   80-BF             E000-  FFFF
62 	F0	90-BF	80-BF	80-BF	 10000- 3FFFF
63 	F1-F3	80-BF	80-BF	80-BF	 40000- FFFFF
64 	F4	80-8F	80-BF	80-BF	100000-10FFFF
65 
66    --------------------------------------------------------------------- */
67 
68 
69 struct utf8_test {
70     Boolean utf8_legal;	/* is legal sequence? */
71     int utf8_len;	/* length of sequence */
72     unsigned char utf8_seq[5];	/* the sequence */
73 };
74 
75 struct utf8_test utf8_testData[] = {
76     { 1,	1,	{ 0x7A, 0x00, 0x00, 0x00, 0x00 }},	/* 0 */
77     { 1,	2,	{ 0xC2, 0xAC, 0x00, 0x00, 0x00 }},	/* 1 */
78     { 1,	2,	{ 0xDF, 0xB2, 0x00, 0x00, 0x00 }},	/* 2 */
79     { 1,	3,	{ 0xE0, 0xA1, 0x81, 0x00, 0x00 }},	/* 3 */
80     { 1,	3,	{ 0xE1, 0xAC, 0x90, 0x00, 0x00 }},	/* 4 */
81     { 1,	3,	{ 0xF0, 0x93, 0xB2, 0xA1, 0x00 }},	/* 5 */
82     { 1,	4,	{ 0xF1, 0x87, 0x9A, 0xB0, 0x00 }},	/* 6 */
83     { 1,	4,	{ 0xF3, 0x88, 0x9B, 0xAD, 0x00 }},	/* 7 */
84     { 1,	4,	{ 0xF4, 0x82, 0x89, 0x8F, 0x00 }},	/* 8 */
85 
86     { 0,	3,	{ 0x82, 0x00, 0x00, 0x00, 0x00 }},	/* 9 */
87     { 0,	2,	{ 0xF8, 0xAC, 0x00, 0x00, 0x00 }},	/* 10 */
88     { 0,	2,	{ 0xE1, 0xFC, 0xFF, 0x00, 0x00 }},	/* 11 */
89     { 0,	3,	{ 0xC2, 0xFC, 0x00, 0x00, 0x00 }},	/* 12 */
90     { 0,	3,	{ 0xE1, 0xC2, 0x81, 0x00, 0x00 }},	/* 13 */
91     { 0,	2,	{ 0xC2, 0xC1, 0x00, 0x00, 0x00 }},	/* 14 */
92     { 0,	2,	{ 0xC0, 0xAF, 0x00, 0x00, 0x00 }},	/* 15 */
93     { 0,	3,	{ 0xE0, 0x9F, 0x80, 0x00, 0x00 }},	/* 16 */
94     { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC1, 0x00 }},	/* 17 */
95 
96     { 1,	3,	{ 0xED, 0x9F, 0xBF, 0x00, 0x00 }},	/* 18 */
97     { 1,	3,	{ 0xEE, 0x80, 0x80, 0x00, 0x00 }},	/* 19 */
98     { 0,	3,	{ 0xED, 0xA0, 0x80, 0x00, 0x00 }},	/* 20 */
99     { 0,	3,	{ 0xED, 0xBF, 0xBF, 0x00, 0x00 }},	/* 21 */
100 
101 /* for all > 21 use "short" buffer lengths to detect over-run */
102     { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC3, 0x00 }},	/* 18 use short buflen */
103     { 0,	0,	{ 0x00, 0x00, 0x00, 0x00, 0x00 }},
104 
105 };
106 
test01()107 int test01() {
108 	int i;
109 	int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2;
110 
111 	printf("Begin Test01\n"); fflush(stdout);
112 
113 	rval = 0;
114 	for (i = 0; utf8_testData[i].utf8_len; i++) {
115 		wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
116 		gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
117 		/* use truncated length for tests over 21 */
118 		if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
119 		gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
120 		if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
121 			printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
122 			i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0],
123 			utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2],
124 			utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4],
125 			utf8_testData[i].utf8_len);
126 			++rval;
127 		}
128 	}
129 
130 	return (rval ? 0 : 1);
131 }
132 
133 
134 /* ---------------------------------------------------------------------
135 	test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32
136 
137 	This is an exhaustive test of values 0 through 0x10FFFF.  It
	takes each integer value and converts from UTF-32 through the
	other encoding forms, and back to UTF-32, checking the results
140 	along the way.
141 
142 	It does not check the un-paired low surrogates, except for
143 	the first low surrogate.  It intends to get that one illegal
144 	result, prints a message, and continues with tests.
145 
146    --------------------------------------------------------------------- */
147 
test02()148 int test02() {
149 	int i, n;
150 	ConversionResult result;
151 	UTF32 utf32_buf[2], utf32_result[2];
152 	UTF16 utf16_buf[3], utf16_result[3];
153 	UTF8 utf8_buf[8];
154 	UTF32 *utf32SourceStart, *utf32TargetStart;
155 	UTF16 *utf16SourceStart, *utf16TargetStart;
156 	UTF8 *utf8SourceStart, *utf8TargetStart;
157 
158 	printf("Begin Test02\n"); fflush(stdout);
159 
160 	for (i = 0; i <= 0x10FFFF; i++) {
161 		utf32_buf[0] = i; utf32_buf[1] = 0;
162 		utf32_result[0] = utf32_result[1] = 0;
163 		utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0;
164 		utf16_result[0] = utf16_result[1] = utf16_result[2] = 0;
165 		for (n = 0; n < 8; n++) utf8_buf[n] = 0;
166 
167 		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
168 		utf16TargetStart = utf16SourceStart = utf16_buf;
169 		utf8TargetStart = utf8SourceStart = utf8_buf;
170 
171 		/*
172 		 * Test UTF32 -> UTF16
173 		 */
174 		result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
175 		if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
176 			/* skip result checking for all but 0000d800, which we know to be illegal */
177 			switch (result) {
178 			default: fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
179 			case conversionOK: break;
180 			case sourceExhausted: printf("sourceExhausted\t"); break;
181 			case targetExhausted: printf("targetExhausted\t"); break;
182 			case sourceIllegal: printf("sourceIllegal\t"); break;
183 		    }
184 		}
185 		if (result != conversionOK) {
186 			if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
187 				printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n",
188 				    i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result);
189 				if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) {
190 					return 0;
191 				} else {
192 					printf("!!! Test02A: note expected illegal result for 0x0000D800\n");
193 				}
194 			}
195 		}
196 		if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
197 
198 		/*
199 		 * Test UTF16 -> UTF8, with legality check on.  We check for everything except
200 		 * for unpaired low surrogates.  We do make one check that the lowest low
201 		 * surrogate, when unpaired, is illegal.
202 		 */
203 		result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
204 		switch (result) {
205 		default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
206 		case conversionOK: break;
207 		case sourceExhausted: printf("sourceExhausted\t"); break;
208 		case targetExhausted: printf("targetExhausted\t"); break;
209 		case sourceIllegal: printf("sourceIllegal\t"); break;
210 		}
211 		if (result != conversionOK) {
212 			printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
213 				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
214 			if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) {
215 				return 0;
216 			} else {
217 				/* Note: This illegal result only happens if we remove the surrogate
218 				    check in Test02A.  So it shouldn't be seen unless that check and
219 				    the "continue" are removed in the test above.
220 				*/
221 				if (i == UNI_SUR_LOW_START)
222 				    printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n");
223 				else if (i == UNI_SUR_HIGH_START)
224 				    printf("!!! Test02B: note expected illegal result for 0xD800,0000\n");
225 			}
226 		}
227 		if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) {
228 			printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
229 				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
230 				printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n");
231 				return 0;
232 		}
233 
234 		if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue;
235 
236 		/*
237 		 * Reset some result buffer pointers for the trip back.
238 		 */
239 		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
240 		utf16TargetStart = utf16SourceStart = utf16_result;
241 		utf8TargetStart = utf8SourceStart = utf8_buf;
242 
243 		/*
244 		 * Test UTF8 -> UTF16, with legality check on.
245 		 */
246 		result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion);
247 		switch (result) {
248 		default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
249 		case conversionOK: break;
250 		case sourceExhausted: printf("sourceExhausted\t"); break;
251 		case targetExhausted: printf("targetExhausted\t"); break;
252 		case sourceIllegal: printf("sourceIllegal\t"); break;
253 		}
254 		if (result != conversionOK) {
255 			printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n",
256 				i, utf32_buf[0], utf8_buf, utf16_buf[0], utf16_buf[1], result);
257 			return 0;
258 		}
259 		for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */
260 			if (utf16_buf[n] != utf16_result[n]) {
261 				printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n",
262 					utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]);
263 				return 0;
264 			}
265 		}
266 
267 		/*
268 		 * Test UTF16 -> UTF32, with legality check on.  If the result of our previous
269 		 * conversion gave us a "surrogate pair", then we need to convert 2 entities
270 		 * back to UTF32.
271 		 */
272 		if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
273 			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
274 		} else {
275 			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
276 		}
277 		switch (result) {
278 		default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
279 		case conversionOK: break;
280 		case sourceExhausted: printf("sourceExhausted\t"); break;
281 		case targetExhausted: printf("targetExhausted\t"); break;
282 		case sourceIllegal: printf("sourceIllegal\t"); break;
283 		}
284 		if (result != conversionOK) {
285 			printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n",
286 				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result);
287 			return 0;
288 		}
289 
290 		/*
291 		 * Now, check the final round-trip value.
292 		 */
293 		if (utf32_buf[0] != utf32_result[0]) {
294 			printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]);
295 			return 0;
296 		}
297 	}
298 	return 1;
299 }
300 
301 /* ---------------------------------------------------------------------
302 	test03 - Test round trip UTF32 -> UTF8 -> UTF32
303 
304 	This tests the functions that were not tested by test02 above.
305 	For each UTF32 value 0 through 0x10FFFF, it tests the conversion
306 	to UTF-8 and back.  The test is exhaustive.
307 
308    --------------------------------------------------------------------- */
309 
test03()310 int test03() {
311 	int i, n;
312 	ConversionResult result;
313 	UTF32 utf32_buf[2], utf32_result[2];
314 	UTF8 utf8_buf[8];
315 	UTF32 *utf32SourceStart, *utf32TargetStart;
316 	UTF8 *utf8SourceStart, *utf8TargetStart;
317 
318 	printf("Begin Test03\n"); fflush(stdout);
319 
320 	for (i = 0; i <= 0x10FFFF; i++) {
321 		/* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */
322 		if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
323 
324 		utf32_buf[0] = i; utf32_buf[1] = 0;
325 		utf32_result[0] = utf32_result[1] = 0;
326 		for (n = 0; n < 8; n++) utf8_buf[n] = 0;
327 
328 		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
329 		utf8TargetStart = utf8SourceStart = utf8_buf;
330 
331 		/*
332 		 * Test UTF32 -> UTF8, with legality check on.
333 		 */
334 		result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
335 		switch (result) {
336 		default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
337 		case conversionOK: break;
338 		case sourceExhausted: printf("sourceExhausted\t"); break;
339 		case targetExhausted: printf("targetExhausted\t"); break;
340 		case sourceIllegal: printf("sourceIllegal\t"); break;
341 		}
342 		if (result != conversionOK) {
343 			printf("Test03A for %d (0x%x); output %s; result %d\n",
344 				i, utf32_buf[0], utf8_buf, result);
345 			if (i != UNI_SUR_HIGH_START) {
346 				return 0;
347 			} else {
348 				printf("!!! Test03A: note expected illegal result for 0x0000D800\n");
349 			}
350 		}
351 		if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) {
352 			printf("Test03A for %d (0x%x); output %s; result %d\n",
353 				i, utf32_buf[0], utf8_buf, result);
354 				printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n");
355 				return 0;
356 		}
357 
358 		if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue;
359 
360 		/*
361 		 * Reset some result buffer pointers for the trip back.
362 		 */
363 		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
364 		utf8TargetStart = utf8SourceStart = utf8_buf;
365 
366 		/*
367 		 * Test UTF8 -> UTF32, with legality check on.
368 		 */
369 		result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
370 		switch (result) {
371 		default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
372 		case conversionOK: break;
373 		case sourceExhausted: printf("sourceExhausted\t"); break;
374 		case targetExhausted: printf("targetExhausted\t"); break;
375 		case sourceIllegal: printf("sourceIllegal\t"); break;
376 		}
377 		if (result != conversionOK) {
378 			printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n",
379 				i, utf32_buf[0], utf8_buf, utf32_result[0], result);
380 			return 0;
381 		}
382 
383 		/*
384 		 * Now, check the final round-trip value.
385 		 */
386 		if (utf32_buf[0] != utf32_result[0]) {
387 			printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]);
388 			return 0;
389 		}
390 	}
391 	return 1;
392 }
393 
394 /* ---------------------------------------------------------------------
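	test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8.
	Expect the converter to report sourceIllegal; that result code is
	the only thing this test checks.  (Original note: the value is
	expected to be turned into UNI_REPLACEMENT_CHAR.)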
397 
398    --------------------------------------------------------------------- */
399 
test04()400 int test04() {
401 	int i, n;
402 	ConversionResult result;
403 	UTF32 utf32_buf[2];
404 	UTF8 utf8_buf[8];
405 	UTF32 *utf32SourceStart, *utf32TargetStart;
406 	UTF8 *utf8SourceStart, *utf8TargetStart;
407 
408 	printf("Begin Test04\n"); fflush(stdout);
409 
410 	i = 0x10FFFF + 21; /* an arbitrary value > legal */
411 
412 	utf32_buf[0] = i; utf32_buf[1] = 0;
413 	for (n = 0; n < 8; n++) utf8_buf[n] = 0;
414 
415 	utf32SourceStart = utf32_buf;
416 	utf8TargetStart = utf8_buf;
417 
418 	/*
419 	 * Test UTF32 -> UTF8, with legality check on.
420 	 */
421 	result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
422 	if (result != sourceIllegal) {
423 		fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
424 	}
425 
426 	return 1;
427 }
428 
429 /* --------------------------------------------------------------------- */
430 
/*
 * Run test01 through test04 in order, printing a success or failure
 * banner for each.  Returns 0 when all four succeeded and 1 when any of
 * them reported failure, so callers can rely on the exit status.
 */
int main(void) {
	int failures = 0;

	printf("Three tests of round-trip conversions will be performed.\n");
	printf("One test of illegal UTF-32 will be performed.\n");
	printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
	printf("These are for tests of Surrogate conversion.\n\n");
	fflush(stdout);

	if (test01()) {	printf("******** Test01 succeeded without error. ********\n\n"); }
	else { printf("-------- Test01 failed. --------\n\n"); ++failures; }
	if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); }
	else { printf("-------- Test02 failed. --------\n\n"); ++failures; }
	if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
	else { printf("-------- Test03 failed. --------\n\n"); ++failures; }
	if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
	else { printf("-------- Test04 failed. --------\n\n"); ++failures; }

	return failures ? 1 : 0;
}
447