1 /**
2  * Test the UTF-8 decoding routines
3  *
4  * author: Daniel Veillard
5  * copy: see Copyright for the status of this software.
6  */
7 
8 #include <stdio.h>
9 #include <string.h>
10 #include <libxml/parser.h>
11 #include <libxml/parserInternals.h>
12 
13 #include "buf.h"
14 
/*
 * Code of the first error recorded by errorHandler() since the tests last
 * cleared this variable; 0 means that no error has been recorded.
 * Only used inside this file, hence the internal linkage.
 */
static int lastError = 0;
16 
/**
 * errorHandler:
 * @unused: user data pointer; nothing is recorded unless it is NULL
 * @err: the error being reported, may be NULL
 *
 * Error callback remembering in lastError the code of the first error
 * seen since lastError was last set back to 0. Later errors are ignored
 * until the variable is cleared again.
 */
static void errorHandler(void *unused, xmlErrorPtr err) {
    if (unused != NULL)
        return;
    if (err == NULL)
        return;
    /* keep the first recorded code, do not overwrite it */
    if (lastError != 0)
        return;

    lastError = err->code;
}
22 
/*
 * Template documents used by the in-document injection tests. The XXXX
 * placeholder (offset 5 in document1, offset 10 in document2) is
 * overwritten in place with the bytes under test, so the arrays must
 * stay writable. They are only used inside this file, hence the
 * internal linkage.
 */
static char document1[100] = "<doc>XXXX</doc>";
static char document2[100] = "<doc foo='XXXX'/>";
25 
/**
 * testDocumentRangeByte1:
 * @ctxt: parser context, reset before every attempt
 * @document: the document text to parse
 * @len: number of bytes of @document to parse
 * @data: location inside @document receiving the byte under test
 * @forbid1: a byte value which must be rejected at that place, or -1
 * @forbid2: a second byte value which must be rejected, or -1
 *
 * Store every value from 0x00 to 0xFF at @data[0], parse the document
 * and report on stderr each byte for which the outcome is not the
 * expected one.
 */
static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                  int len,  char *data, int forbid1, int forbid2) {
    int byte;
    int undetected;
    int plainChar;
    xmlDocPtr doc;

    for (byte = 0; byte <= 0xFF; byte++) {
        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = byte;

        doc = xmlReadMemory(document, len, "test", NULL, 0);

        /* a rejection requires both an error and no resulting tree */
        undetected = (lastError == 0) || (doc != NULL);
        /* tab, LF, CR and the 0x20..0x7F range are expected to parse */
        plainChar = (byte == 0x9) || (byte == 0xA) || (byte == 0xD) ||
                    ((byte >= 0x20) && (byte < 0x80));

        if ((byte == forbid1) || (byte == forbid2)) {
            if (undetected)
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                        byte, byte);
        } else if ((byte == '<') || (byte == '&')) {
            /* markup characters cannot appear literally there */
            if (undetected)
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n",
                        byte, byte);
        } else if (!plainChar) {
            /*
             * Only complain when a tree was built and the recorded
             * error is not the invalid character one.
             */
            if ((doc != NULL) && (lastError != XML_ERR_INVALID_CHAR))
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
        } else if (doc == NULL) {
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n",
                    byte, byte);
        }

        if (doc != NULL)
            xmlFreeDoc(doc);
    }
}
65 
/**
 * testDocumentRangeByte2:
 * @ctxt: parser context, reset before every attempt
 * @document: the document text to parse
 * @len: number of bytes of @document to parse
 * @data: location inside @document receiving the two bytes under test
 *
 * Store every pair made of a first byte from 0x80 to 0xFF and a second
 * byte from 0x00 to 0xFF at @data[0] and @data[1], parse the document
 * and report on stderr each pair whose outcome is not the expected one.
 */
static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                  int len,  char *data) {
    int lead, trail;
    int undetected;
    xmlDocPtr doc;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = lead;
            data[1] = trail;

            doc = xmlReadMemory(document, len, "test", NULL, 0);

            /* a rejection requires both an error and no resulting tree */
            undetected = (lastError == 0) || (doc != NULL);

            if ((lead & 0x80) &&
                (((lead & 0x40) == 0) ||
                 ((trail & 0xC0) != 0x80) ||
                 ((lead & 0x1E) == 0))) {
                /*
                 * With the high bit of the first byte set, the pair is
                 * malformed when either:
                 *  - the second bit of the first byte is clear,
                 *  - the second byte does not start with bits 10,
                 *  - none of the 0x1E bits of the first byte is set, as
                 *    a 2 byte encoding must carry a value of at least
                 *    0x80.
                 */
                if (undetected)
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
            } else if ((lead & 0xE0) == 0xE0) {
                /*
                 * The third bit of the first byte announces a sequence
                 * of at least 3 bytes, but only 2 are provided.
                 */
                if (undetected)
                    fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
            } else if ((lastError != 0) || (doc == NULL)) {
                /* no error is expected in the remaining cases */
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
            }

            if (doc != NULL)
                xmlFreeDoc(doc);
        }
    }
}
134 
135 /**
136  * testDocumentRanges:
137  *
138  * Test the correct UTF8 character parsing in context of XML documents
139  * Those are in-context injection tests checking the parser behaviour on
140  * edge case values at different point in content, beginning and end of
141  * CDATA in text or in attribute values.
142  */
143 
testDocumentRanges(void)144 static void testDocumentRanges(void) {
145     xmlParserCtxtPtr ctxt;
146     char *data;
147 
148     /*
149      * Set up a parsing context using the first document as
150      * the current input source.
151      */
152     ctxt = xmlNewParserCtxt();
153     if (ctxt == NULL) {
154         fprintf(stderr, "Failed to allocate parser context\n");
155 	return;
156     }
157 
158     printf("testing 1 byte char in document: 1");
159     fflush(stdout);
160     data = &document1[5];
161     data[0] = ' ';
162     data[1] = ' ';
163     data[2] = ' ';
164     data[3] = ' ';
165     /* test 1 byte injection at beginning of area */
166     testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
167                            data, -1, -1);
168     printf(" 2");
169     fflush(stdout);
170     data[0] = ' ';
171     data[1] = ' ';
172     data[2] = ' ';
173     data[3] = ' ';
174     /* test 1 byte injection at end of area */
175     testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
176                            data + 3, -1, -1);
177 
178     printf(" 3");
179     fflush(stdout);
180     data = &document2[10];
181     data[0] = ' ';
182     data[1] = ' ';
183     data[2] = ' ';
184     data[3] = ' ';
185     /* test 1 byte injection at beginning of area */
186     testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
187                            data, '\'', -1);
188     printf(" 4");
189     fflush(stdout);
190     data[0] = ' ';
191     data[1] = ' ';
192     data[2] = ' ';
193     data[3] = ' ';
194     /* test 1 byte injection at end of area */
195     testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
196                            data + 3, '\'', -1);
197     printf(" done\n");
198 
199     printf("testing 2 byte char in document: 1");
200     fflush(stdout);
201     data = &document1[5];
202     data[0] = ' ';
203     data[1] = ' ';
204     data[2] = ' ';
205     data[3] = ' ';
206     /* test 2 byte injection at beginning of area */
207     testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
208                            data);
209     printf(" 2");
210     fflush(stdout);
211     data[0] = ' ';
212     data[1] = ' ';
213     data[2] = ' ';
214     data[3] = ' ';
215     /* test 2 byte injection at end of area */
216     testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
217                            data + 2);
218 
219     printf(" 3");
220     fflush(stdout);
221     data = &document2[10];
222     data[0] = ' ';
223     data[1] = ' ';
224     data[2] = ' ';
225     data[3] = ' ';
226     /* test 2 byte injection at beginning of area */
227     testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
228                            data);
229     printf(" 4");
230     fflush(stdout);
231     data[0] = ' ';
232     data[1] = ' ';
233     data[2] = ' ';
234     data[3] = ' ';
235     /* test 2 byte injection at end of area */
236     testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
237                            data + 2);
238     printf(" done\n");
239 
240     xmlFreeParserCtxt(ctxt);
241 }
242 
/**
 * testCharRangeByte1:
 * @ctxt: parser context expected to use @data as its current input
 * @data: buffer of at least 4 bytes receiving the byte under test
 *
 * Call xmlCurrentChar() for every single byte value from 0x00 to 0xFF,
 * the following bytes being zeroed, and report on stderr each value for
 * which the outcome is not the expected one.
 */
static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
    int byte;
    int pos;
    int cur, consumed;

    for (pos = 1; pos <= 3; pos++)
        data[pos] = 0;

    for (byte = 0; byte <= 0xFF; byte++) {
        data[0] = byte;
        ctxt->charset = XML_CHAR_ENCODING_UTF8;

        lastError = 0;
        cur = xmlCurrentChar(ctxt, &consumed);

        if ((byte == 0) || (byte >= 0x80)) {
            /* zero or a lone byte with the high bit set: error expected */
            if (lastError != XML_ERR_INVALID_CHAR)
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
            continue;
        }

        if (byte == 0xD) {
            /* a carriage return is expected to come back as 0xA */
            if ((cur != 0xA) || (consumed != 1))
                fprintf(stderr,
                    "Failed to convert char for Byte 0x%02X\n", byte);
            continue;
        }

        if ((cur != byte) || (consumed != 1))
            fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
    }
}
269 
/**
 * testCharRangeByte2:
 * @ctxt: parser context expected to use @data as its current input
 * @data: buffer of at least 4 bytes receiving the bytes under test
 *
 * Call xmlCurrentChar() for every pair made of a first byte from 0x80 to
 * 0xFF and a second byte from 0x00 to 0xFF, the following bytes being
 * zeroed, and report on stderr each pair for which the outcome is not
 * the expected one.
 */
static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
    int lead, trail;
    int cur, consumed;
    int expected;

    data[2] = 0;
    data[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            data[0] = lead;
            data[1] = trail;
            ctxt->charset = XML_CHAR_ENCODING_UTF8;

            /* value carried by a well formed 2 byte sequence */
            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);

            lastError = 0;
            cur = xmlCurrentChar(ctxt, &consumed);

            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                /* high bit of the first byte set but second bit clear */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
            } else if ((lead & 0x80) &&
                       (((trail & 0xC0) != 0x80) ||
                        ((lead & 0x1E) == 0))) {
                /*
                 * Either the second byte does not start with bits 10,
                 * or none of the 0x1E bits of the first byte is set
                 * while a 2 byte encoding must carry a value of at
                 * least 0x80.
                 */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, cur);
            } else if ((lead & 0xE0) == 0xE0) {
                /*
                 * The third bit of the first byte announces a sequence
                 * of at least 3 bytes, but only 2 are provided.
                 */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
            } else if ((lastError != 0) || (consumed != 2)) {
                /* no error is expected in the remaining cases */
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
            } else if (cur != expected) {
                /* the sequence was accepted, its value must match too */
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, cur);
            }
        }
    }
}
345 
/**
 * testCharRangeByte3:
 * @ctxt: parser context expected to use @data as its current input
 * @data: buffer of at least 4 bytes receiving the bytes under test
 *
 * Call xmlCurrentChar() for 3 byte sequences built from every first byte
 * from 0xE0 to 0xFF, every second byte from 0x00 to 0xFF and a small set
 * of third bytes, @data[3] being zeroed. Each sequence for which the
 * outcome is not the expected one is reported on stderr.
 */
static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
    int i, j, k, K;
    int len, c;
    /* third byte samples, both valid and invalid continuation bytes */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    int value;

    data[3] = 0;
    for (i = 0xE0;i <= 0xFF;i++) {
        for (j = 0;j <= 0xFF;j++) {
            for (k = 0;k < 6;k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                /* value carried by a well formed 3 byte sequence */
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->charset = XML_CHAR_ENCODING_UTF8;

                lastError = 0;
                c = xmlCurrentChar(ctxt, &len);

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, data[3]);
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                }

                /*
                 * if using a 3 byte encoding then the value must be at
                 * least 0x800, i.e. one of bits 3 to 0 of i must be set
                 * or bit 5 (0x20) of j must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                }

                /*
                 * There are values in that range that are not allowed
                 * in XML-1.0
                 */
                else if (((value > 0xD7FF) && (value <0xE000)) ||
                         ((value > 0xFFFD) && (value <0x10000))) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                }

                /*
                 * Finally check the value is right. The third byte is
                 * printed from K and not from data[2]: the latter is a
                 * plain char which, when char is signed, would be sign
                 * extended and shown as 0xFFFFFFxx.
                 */
                else if (c != value) {
                    fprintf(stderr,
    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                }
            }
        }
    }
}
431 
/**
 * testCharRangeByte4:
 * @ctxt: parser context expected to use @data as its current input
 * @data: buffer of at least 5 bytes receiving the bytes under test
 *
 * Call xmlCurrentChar() for 4 byte sequences built from every first byte
 * from 0xF0 to 0xFF, every second byte from 0x00 to 0xFF and a small set
 * of third and fourth bytes, @data[4] being zeroed. Each sequence for
 * which the outcome is not the expected one is reported on stderr.
 */
static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
    int i, j, k, K, l, L;
    int len, c;
    /* third/fourth byte samples, valid and invalid continuation bytes */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    int value;

    data[4] = 0;
    for (i = 0xF0;i <= 0xFF;i++) {
        for (j = 0;j <= 0xFF;j++) {
            for (k = 0;k < 6;k++) {
                for (l = 0;l < 6;l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    /* value carried by a well formed 4 byte sequence */
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->charset = XML_CHAR_ENCODING_UTF8;

                    lastError = 0;
                    c = xmlCurrentChar(ctxt, &len);

                    /*
                     * if fifth bit of first char is set, then the
                     * sequence would need at least 5 bytes, but we give
                     * only 4 ! The bytes are printed from K and L and
                     * not from data[]: the latter holds plain chars
                     * which, when char is signed, would be sign extended
                     * and shown as 0xFFFFFFxx.
                     */
                    if ((i & 0xF8) == 0xF8) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
  "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * The second, third and fourth bytes must start
                     * with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * if using a 4 byte encoding then the value must be
                     * at least 0x10000, i.e. one of bits 2 to 0 of i
                     * must be set or one of bits 5 and 4 (0x30) of j
                     * must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * There are values in that range that are not
                     * allowed in XML-1.0
                     */
                    else if (((value > 0xD7FF) && (value <0xE000)) ||
                             ((value > 0xFFFD) && (value <0x10000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        fprintf(stderr,
            "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        fprintf(stderr,
"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                    }
                }
            }
        }
    }
}
524 
525 /**
526  * testCharRanges:
527  *
528  * Test the correct UTF8 character parsing in isolation i.e.
529  * not when parsing a full document, this is less expensive and we can
530  * cover the full range of UTF-8 chars accepted by XML-1.0
531  */
532 
testCharRanges(void)533 static void testCharRanges(void) {
534     char data[5];
535     xmlParserCtxtPtr ctxt;
536     xmlParserInputBufferPtr buf;
537     xmlParserInputPtr input;
538 
539     memset(data, 0, 5);
540 
541     /*
542      * Set up a parsing context using the above data buffer as
543      * the current input source.
544      */
545     ctxt = xmlNewParserCtxt();
546     if (ctxt == NULL) {
547         fprintf(stderr, "Failed to allocate parser context\n");
548 	return;
549     }
550     buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
551                                            XML_CHAR_ENCODING_NONE);
552     if (buf == NULL) {
553         fprintf(stderr, "Failed to allocate input buffer\n");
554 	goto error;
555     }
556     input = xmlNewInputStream(ctxt);
557     if (input == NULL) {
558         xmlFreeParserInputBuffer(buf);
559 	goto error;
560     }
561     input->filename = NULL;
562     input->buf = buf;
563     input->cur =
564     input->base = xmlBufContent(input->buf->buffer);
565     input->end = input->base + 4;
566     inputPush(ctxt, input);
567 
568     printf("testing char range: 1");
569     fflush(stdout);
570     testCharRangeByte1(ctxt, data);
571     printf(" 2");
572     fflush(stdout);
573     testCharRangeByte2(ctxt, data);
574     printf(" 3");
575     fflush(stdout);
576     testCharRangeByte3(ctxt, data);
577     printf(" 4");
578     fflush(stdout);
579     testCharRangeByte4(ctxt, data);
580     printf(" done\n");
581     fflush(stdout);
582 
583 error:
584     xmlFreeParserCtxt(ctxt);
585 }
586 
main(void)587 int main(void) {
588 
589     /*
590      * this initialize the library and check potential ABI mismatches
591      * between the version it was compiled for and the actual shared
592      * library used.
593      */
594     LIBXML_TEST_VERSION
595 
596     /*
597      * Catch errors separately
598      */
599 
600     xmlSetStructuredErrorFunc(NULL, errorHandler);
601 
602     /*
603      * Run the tests
604      */
605     testCharRanges();
606     testDocumentRanges();
607 
608     /*
609      * Cleanup function for the XML library.
610      */
611     xmlCleanupParser();
612     /*
613      * this is to debug memory for regression tests
614      */
615     xmlMemoryDump();
616     return(0);
617 }
618