1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
3 */
4
5 /* This file is included! */
6 #ifdef XML_TOK_IMPL_C
7
/* Fallback used when the including file has not supplied its own
   IS_INVALID_CHAR: no byte sequence is ever reported as invalid. */
#if !defined(IS_INVALID_CHAR)
#  define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
11
/* Expands to the `case BT_LEADn:` arm of a switch on the byte type of
   *ptr, for the lead byte of an n-byte sequence.  `enc` and `end` are
   taken from the scope of the expansion site.

   - fewer than n bytes left before end: return XML_TOK_PARTIAL_CHAR
   - IS_INVALID_CHAR says the sequence is invalid: store ptr through
     nextTokPtr and return XML_TOK_INVALID
   - otherwise: advance ptr past the sequence and leave the switch */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr)        \
  case BT_LEAD##n:                                   \
    if (n > end - ptr)                               \
      return XML_TOK_PARTIAL_CHAR;                   \
    if (IS_INVALID_CHAR(enc, ptr, n)) {              \
      *(nextTokPtr) = (ptr);                         \
      return XML_TOK_INVALID;                        \
    }                                                \
    ptr += n;                                        \
    break;
22
/* Expands to the switch arms shared by scanners that only need to reject
   bad input: the 2-, 3- and 4-byte lead cases (see INVALID_LEAD_CASE),
   plus BT_NONXML, BT_MALFORM and BT_TRAIL, which all store ptr through
   nextTokPtr and return XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr)               \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr)              \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr)              \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr)              \
  case BT_NONXML: case BT_MALFORM: case BT_TRAIL:    \
    *(nextTokPtr) = (ptr);                           \
    return XML_TOK_INVALID;
32
/* Expands to the `case BT_LEADn:` arm that accepts an n-byte sequence
   only if it is a name character.

   - fewer than n bytes left before end: return XML_TOK_PARTIAL_CHAR
   - the sequence is invalid in this encoding, or is not a name
     character: store ptr through nextTokPtr, return XML_TOK_INVALID
   - otherwise: advance ptr by n and leave the switch

   The IS_INVALID_CHAR test comes first so that a malformed sequence is
   never classified by IS_NAME_CHAR at all. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                   \
  case BT_LEAD##n:                                                      \
    if (end - ptr < n)                                                  \
      return XML_TOK_PARTIAL_CHAR;                                      \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) {   \
      *nextTokPtr = ptr;                                                \
      return XML_TOK_INVALID;                                           \
    }                                                                   \
    ptr += n;                                                           \
    break;
43
/* Expands to every switch arm that consumes one name character:
   - BT_NONASCII: rejected with XML_TOK_INVALID unless
     IS_NAME_CHAR_MINBPC accepts it, otherwise falls into the next group
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS: advance ptr by
     MINBPC(enc)
   - the 2-, 3- and 4-byte lead cases via CHECK_NAME_CASE */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)  \
  case BT_NONASCII:                                  \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) {            \
      *nextTokPtr = ptr;                             \
      return XML_TOK_INVALID;                        \
    }                                                \
    /* fall through */                               \
  case BT_NMSTRT: case BT_HEX: case BT_DIGIT:        \
  case BT_NAME: case BT_MINUS:                       \
    ptr += MINBPC(enc);                              \
    break;                                           \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)      \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)      \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60
/* Expands to the `case BT_LEADn:` arm that accepts an n-byte sequence
   only if it is a name-start character.

   - fewer than n bytes left before end: return XML_TOK_PARTIAL_CHAR
   - the sequence is invalid in this encoding, or is not a name-start
     character: store ptr through nextTokPtr, return XML_TOK_INVALID
   - otherwise: advance ptr by n and leave the switch

   The IS_INVALID_CHAR test comes first so that a malformed sequence is
   never classified by IS_NMSTRT_CHAR at all. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                   \
  case BT_LEAD##n:                                                        \
    if (end - ptr < n)                                                    \
      return XML_TOK_PARTIAL_CHAR;                                        \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) {   \
      *nextTokPtr = ptr;                                                  \
      return XML_TOK_INVALID;                                             \
    }                                                                     \
    ptr += n;                                                             \
    break;
71
/* Expands to every switch arm that consumes one name-start character:
   - BT_NONASCII: rejected with XML_TOK_INVALID unless
     IS_NMSTRT_CHAR_MINBPC accepts it, otherwise falls into the next group
   - BT_NMSTRT, BT_HEX: advance ptr by MINBPC(enc)
   - the 2-, 3- and 4-byte lead cases via CHECK_NMSTRT_CASE */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)  \
  case BT_NONASCII:                                    \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {            \
      *nextTokPtr = ptr;                               \
      return XML_TOK_INVALID;                          \
    }                                                  \
    /* fall through */                                 \
  case BT_NMSTRT: case BT_HEX:                         \
    ptr += MINBPC(enc);                                \
    break;                                             \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)      \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)      \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85
/* Fallback used when the including file has not supplied PREFIX:
   identifiers are left unchanged. */
#if !defined(PREFIX)
#  define PREFIX(ident) ident
#endif
89
90
/* True when at least `count` minimum-width units (MINBPC(enc) bytes
   each) lie between ptr and end.  Arguments are parenthesized so that
   expressions such as `1 + 1` can be passed as count. */
#define HAS_CHARS(enc, ptr, end, count) \
  ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-width unit lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
  HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   units are available.  Kept as a plain brace block, so it is written
   with a trailing semicolon at every use. */
#define REQUIRE_CHARS(enc, ptr, end, count)    \
  {                                            \
    if (! HAS_CHARS(enc, ptr, end, count)) {   \
      return XML_TOK_PARTIAL;                  \
    }                                          \
  }

/* Single-unit form of REQUIRE_CHARS. */
#define REQUIRE_CHAR(enc, ptr, end) \
  REQUIRE_CHARS(enc, ptr, end, 1)
106
107
108 /* ptr points to character following "<!-" */
109
110 static int PTRCALL
PREFIX(scanComment)111 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112 const char *end, const char **nextTokPtr)
113 {
114 if (HAS_CHAR(enc, ptr, end)) {
115 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116 *nextTokPtr = ptr;
117 return XML_TOK_INVALID;
118 }
119 ptr += MINBPC(enc);
120 while (HAS_CHAR(enc, ptr, end)) {
121 switch (BYTE_TYPE(enc, ptr)) {
122 INVALID_CASES(ptr, nextTokPtr)
123 case BT_MINUS:
124 ptr += MINBPC(enc);
125 REQUIRE_CHAR(enc, ptr, end);
126 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
127 ptr += MINBPC(enc);
128 REQUIRE_CHAR(enc, ptr, end);
129 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130 *nextTokPtr = ptr;
131 return XML_TOK_INVALID;
132 }
133 *nextTokPtr = ptr + MINBPC(enc);
134 return XML_TOK_COMMENT;
135 }
136 break;
137 default:
138 ptr += MINBPC(enc);
139 break;
140 }
141 }
142 }
143 return XML_TOK_PARTIAL;
144 }
145
146 /* ptr points to character following "<!" */
147
148 static int PTRCALL
PREFIX(scanDecl)149 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150 const char *end, const char **nextTokPtr)
151 {
152 REQUIRE_CHAR(enc, ptr, end);
153 switch (BYTE_TYPE(enc, ptr)) {
154 case BT_MINUS:
155 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156 case BT_LSQB:
157 *nextTokPtr = ptr + MINBPC(enc);
158 return XML_TOK_COND_SECT_OPEN;
159 case BT_NMSTRT:
160 case BT_HEX:
161 ptr += MINBPC(enc);
162 break;
163 default:
164 *nextTokPtr = ptr;
165 return XML_TOK_INVALID;
166 }
167 while (HAS_CHAR(enc, ptr, end)) {
168 switch (BYTE_TYPE(enc, ptr)) {
169 case BT_PERCNT:
170 REQUIRE_CHARS(enc, ptr, end, 2);
171 /* don't allow <!ENTITY% foo "whatever"> */
172 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174 *nextTokPtr = ptr;
175 return XML_TOK_INVALID;
176 }
177 /* fall through */
178 case BT_S: case BT_CR: case BT_LF:
179 *nextTokPtr = ptr;
180 return XML_TOK_DECL_OPEN;
181 case BT_NMSTRT:
182 case BT_HEX:
183 ptr += MINBPC(enc);
184 break;
185 default:
186 *nextTokPtr = ptr;
187 return XML_TOK_INVALID;
188 }
189 }
190 return XML_TOK_PARTIAL;
191 }
192
193 static int PTRCALL
PREFIX(checkPiTarget)194 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195 const char *end, int *tokPtr)
196 {
197 int upper = 0;
198 *tokPtr = XML_TOK_PI;
199 if (end - ptr != MINBPC(enc)*3)
200 return 1;
201 switch (BYTE_TO_ASCII(enc, ptr)) {
202 case ASCII_x:
203 break;
204 case ASCII_X:
205 upper = 1;
206 break;
207 default:
208 return 1;
209 }
210 ptr += MINBPC(enc);
211 switch (BYTE_TO_ASCII(enc, ptr)) {
212 case ASCII_m:
213 break;
214 case ASCII_M:
215 upper = 1;
216 break;
217 default:
218 return 1;
219 }
220 ptr += MINBPC(enc);
221 switch (BYTE_TO_ASCII(enc, ptr)) {
222 case ASCII_l:
223 break;
224 case ASCII_L:
225 upper = 1;
226 break;
227 default:
228 return 1;
229 }
230 if (upper)
231 return 0;
232 *tokPtr = XML_TOK_XML_DECL;
233 return 1;
234 }
235
236 /* ptr points to character following "<?" */
237
238 static int PTRCALL
PREFIX(scanPi)239 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240 const char *end, const char **nextTokPtr)
241 {
242 int tok;
243 const char *target = ptr;
244 REQUIRE_CHAR(enc, ptr, end);
245 switch (BYTE_TYPE(enc, ptr)) {
246 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
247 default:
248 *nextTokPtr = ptr;
249 return XML_TOK_INVALID;
250 }
251 while (HAS_CHAR(enc, ptr, end)) {
252 switch (BYTE_TYPE(enc, ptr)) {
253 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254 case BT_S: case BT_CR: case BT_LF:
255 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
256 *nextTokPtr = ptr;
257 return XML_TOK_INVALID;
258 }
259 ptr += MINBPC(enc);
260 while (HAS_CHAR(enc, ptr, end)) {
261 switch (BYTE_TYPE(enc, ptr)) {
262 INVALID_CASES(ptr, nextTokPtr)
263 case BT_QUEST:
264 ptr += MINBPC(enc);
265 REQUIRE_CHAR(enc, ptr, end);
266 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267 *nextTokPtr = ptr + MINBPC(enc);
268 return tok;
269 }
270 break;
271 default:
272 ptr += MINBPC(enc);
273 break;
274 }
275 }
276 return XML_TOK_PARTIAL;
277 case BT_QUEST:
278 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
279 *nextTokPtr = ptr;
280 return XML_TOK_INVALID;
281 }
282 ptr += MINBPC(enc);
283 REQUIRE_CHAR(enc, ptr, end);
284 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285 *nextTokPtr = ptr + MINBPC(enc);
286 return tok;
287 }
288 /* fall through */
289 default:
290 *nextTokPtr = ptr;
291 return XML_TOK_INVALID;
292 }
293 }
294 return XML_TOK_PARTIAL;
295 }
296
297 static int PTRCALL
PREFIX(scanCdataSection)298 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299 const char *end, const char **nextTokPtr)
300 {
301 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302 ASCII_T, ASCII_A, ASCII_LSQB };
303 int i;
304 /* CDATA[ */
305 REQUIRE_CHARS(enc, ptr, end, 6);
306 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308 *nextTokPtr = ptr;
309 return XML_TOK_INVALID;
310 }
311 }
312 *nextTokPtr = ptr;
313 return XML_TOK_CDATA_SECT_OPEN;
314 }
315
316 static int PTRCALL
PREFIX(cdataSectionTok)317 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318 const char *end, const char **nextTokPtr)
319 {
320 if (ptr >= end)
321 return XML_TOK_NONE;
322 if (MINBPC(enc) > 1) {
323 size_t n = end - ptr;
324 if (n & (MINBPC(enc) - 1)) {
325 n &= ~(MINBPC(enc) - 1);
326 if (n == 0)
327 return XML_TOK_PARTIAL;
328 end = ptr + n;
329 }
330 }
331 switch (BYTE_TYPE(enc, ptr)) {
332 case BT_RSQB:
333 ptr += MINBPC(enc);
334 REQUIRE_CHAR(enc, ptr, end);
335 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
336 break;
337 ptr += MINBPC(enc);
338 REQUIRE_CHAR(enc, ptr, end);
339 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
340 ptr -= MINBPC(enc);
341 break;
342 }
343 *nextTokPtr = ptr + MINBPC(enc);
344 return XML_TOK_CDATA_SECT_CLOSE;
345 case BT_CR:
346 ptr += MINBPC(enc);
347 REQUIRE_CHAR(enc, ptr, end);
348 if (BYTE_TYPE(enc, ptr) == BT_LF)
349 ptr += MINBPC(enc);
350 *nextTokPtr = ptr;
351 return XML_TOK_DATA_NEWLINE;
352 case BT_LF:
353 *nextTokPtr = ptr + MINBPC(enc);
354 return XML_TOK_DATA_NEWLINE;
355 INVALID_CASES(ptr, nextTokPtr)
356 default:
357 ptr += MINBPC(enc);
358 break;
359 }
360 while (HAS_CHAR(enc, ptr, end)) {
361 switch (BYTE_TYPE(enc, ptr)) {
362 #define LEAD_CASE(n) \
363 case BT_LEAD ## n: \
364 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
365 *nextTokPtr = ptr; \
366 return XML_TOK_DATA_CHARS; \
367 } \
368 ptr += n; \
369 break;
370 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371 #undef LEAD_CASE
372 case BT_NONXML:
373 case BT_MALFORM:
374 case BT_TRAIL:
375 case BT_CR:
376 case BT_LF:
377 case BT_RSQB:
378 *nextTokPtr = ptr;
379 return XML_TOK_DATA_CHARS;
380 default:
381 ptr += MINBPC(enc);
382 break;
383 }
384 }
385 *nextTokPtr = ptr;
386 return XML_TOK_DATA_CHARS;
387 }
388
389 /* ptr points to character following "</" */
390
391 static int PTRCALL
PREFIX(scanEndTag)392 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393 const char *end, const char **nextTokPtr)
394 {
395 REQUIRE_CHAR(enc, ptr, end);
396 switch (BYTE_TYPE(enc, ptr)) {
397 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
398 default:
399 *nextTokPtr = ptr;
400 return XML_TOK_INVALID;
401 }
402 while (HAS_CHAR(enc, ptr, end)) {
403 switch (BYTE_TYPE(enc, ptr)) {
404 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405 case BT_S: case BT_CR: case BT_LF:
406 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
407 switch (BYTE_TYPE(enc, ptr)) {
408 case BT_S: case BT_CR: case BT_LF:
409 break;
410 case BT_GT:
411 *nextTokPtr = ptr + MINBPC(enc);
412 return XML_TOK_END_TAG;
413 default:
414 *nextTokPtr = ptr;
415 return XML_TOK_INVALID;
416 }
417 }
418 return XML_TOK_PARTIAL;
419 #ifdef XML_NS
420 case BT_COLON:
421 /* no need to check qname syntax here,
422 since end-tag must match exactly */
423 ptr += MINBPC(enc);
424 break;
425 #endif
426 case BT_GT:
427 *nextTokPtr = ptr + MINBPC(enc);
428 return XML_TOK_END_TAG;
429 default:
430 *nextTokPtr = ptr;
431 return XML_TOK_INVALID;
432 }
433 }
434 return XML_TOK_PARTIAL;
435 }
436
437 /* ptr points to character following "&#X" */
438
439 static int PTRCALL
PREFIX(scanHexCharRef)440 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441 const char *end, const char **nextTokPtr)
442 {
443 if (HAS_CHAR(enc, ptr, end)) {
444 switch (BYTE_TYPE(enc, ptr)) {
445 case BT_DIGIT:
446 case BT_HEX:
447 break;
448 default:
449 *nextTokPtr = ptr;
450 return XML_TOK_INVALID;
451 }
452 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453 switch (BYTE_TYPE(enc, ptr)) {
454 case BT_DIGIT:
455 case BT_HEX:
456 break;
457 case BT_SEMI:
458 *nextTokPtr = ptr + MINBPC(enc);
459 return XML_TOK_CHAR_REF;
460 default:
461 *nextTokPtr = ptr;
462 return XML_TOK_INVALID;
463 }
464 }
465 }
466 return XML_TOK_PARTIAL;
467 }
468
469 /* ptr points to character following "&#" */
470
471 static int PTRCALL
PREFIX(scanCharRef)472 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473 const char *end, const char **nextTokPtr)
474 {
475 if (HAS_CHAR(enc, ptr, end)) {
476 if (CHAR_MATCHES(enc, ptr, ASCII_x))
477 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478 switch (BYTE_TYPE(enc, ptr)) {
479 case BT_DIGIT:
480 break;
481 default:
482 *nextTokPtr = ptr;
483 return XML_TOK_INVALID;
484 }
485 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486 switch (BYTE_TYPE(enc, ptr)) {
487 case BT_DIGIT:
488 break;
489 case BT_SEMI:
490 *nextTokPtr = ptr + MINBPC(enc);
491 return XML_TOK_CHAR_REF;
492 default:
493 *nextTokPtr = ptr;
494 return XML_TOK_INVALID;
495 }
496 }
497 }
498 return XML_TOK_PARTIAL;
499 }
500
501 /* ptr points to character following "&" */
502
503 static int PTRCALL
PREFIX(scanRef)504 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505 const char **nextTokPtr)
506 {
507 REQUIRE_CHAR(enc, ptr, end);
508 switch (BYTE_TYPE(enc, ptr)) {
509 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510 case BT_NUM:
511 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512 default:
513 *nextTokPtr = ptr;
514 return XML_TOK_INVALID;
515 }
516 while (HAS_CHAR(enc, ptr, end)) {
517 switch (BYTE_TYPE(enc, ptr)) {
518 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519 case BT_SEMI:
520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_ENTITY_REF;
522 default:
523 *nextTokPtr = ptr;
524 return XML_TOK_INVALID;
525 }
526 }
527 return XML_TOK_PARTIAL;
528 }
529
530 /* ptr points to character following first character of attribute name */
531
532 static int PTRCALL
PREFIX(scanAtts)533 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534 const char **nextTokPtr)
535 {
536 #ifdef XML_NS
537 int hadColon = 0;
538 #endif
539 while (HAS_CHAR(enc, ptr, end)) {
540 switch (BYTE_TYPE(enc, ptr)) {
541 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
542 #ifdef XML_NS
543 case BT_COLON:
544 if (hadColon) {
545 *nextTokPtr = ptr;
546 return XML_TOK_INVALID;
547 }
548 hadColon = 1;
549 ptr += MINBPC(enc);
550 REQUIRE_CHAR(enc, ptr, end);
551 switch (BYTE_TYPE(enc, ptr)) {
552 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553 default:
554 *nextTokPtr = ptr;
555 return XML_TOK_INVALID;
556 }
557 break;
558 #endif
559 case BT_S: case BT_CR: case BT_LF:
560 for (;;) {
561 int t;
562
563 ptr += MINBPC(enc);
564 REQUIRE_CHAR(enc, ptr, end);
565 t = BYTE_TYPE(enc, ptr);
566 if (t == BT_EQUALS)
567 break;
568 switch (t) {
569 case BT_S:
570 case BT_LF:
571 case BT_CR:
572 break;
573 default:
574 *nextTokPtr = ptr;
575 return XML_TOK_INVALID;
576 }
577 }
578 /* fall through */
579 case BT_EQUALS:
580 {
581 int open;
582 #ifdef XML_NS
583 hadColon = 0;
584 #endif
585 for (;;) {
586 ptr += MINBPC(enc);
587 REQUIRE_CHAR(enc, ptr, end);
588 open = BYTE_TYPE(enc, ptr);
589 if (open == BT_QUOT || open == BT_APOS)
590 break;
591 switch (open) {
592 case BT_S:
593 case BT_LF:
594 case BT_CR:
595 break;
596 default:
597 *nextTokPtr = ptr;
598 return XML_TOK_INVALID;
599 }
600 }
601 ptr += MINBPC(enc);
602 /* in attribute value */
603 for (;;) {
604 int t;
605 REQUIRE_CHAR(enc, ptr, end);
606 t = BYTE_TYPE(enc, ptr);
607 if (t == open)
608 break;
609 switch (t) {
610 INVALID_CASES(ptr, nextTokPtr)
611 case BT_AMP:
612 {
613 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
614 if (tok <= 0) {
615 if (tok == XML_TOK_INVALID)
616 *nextTokPtr = ptr;
617 return tok;
618 }
619 break;
620 }
621 case BT_LT:
622 *nextTokPtr = ptr;
623 return XML_TOK_INVALID;
624 default:
625 ptr += MINBPC(enc);
626 break;
627 }
628 }
629 ptr += MINBPC(enc);
630 REQUIRE_CHAR(enc, ptr, end);
631 switch (BYTE_TYPE(enc, ptr)) {
632 case BT_S:
633 case BT_CR:
634 case BT_LF:
635 break;
636 case BT_SOL:
637 goto sol;
638 case BT_GT:
639 goto gt;
640 default:
641 *nextTokPtr = ptr;
642 return XML_TOK_INVALID;
643 }
644 /* ptr points to closing quote */
645 for (;;) {
646 ptr += MINBPC(enc);
647 REQUIRE_CHAR(enc, ptr, end);
648 switch (BYTE_TYPE(enc, ptr)) {
649 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650 case BT_S: case BT_CR: case BT_LF:
651 continue;
652 case BT_GT:
653 gt:
654 *nextTokPtr = ptr + MINBPC(enc);
655 return XML_TOK_START_TAG_WITH_ATTS;
656 case BT_SOL:
657 sol:
658 ptr += MINBPC(enc);
659 REQUIRE_CHAR(enc, ptr, end);
660 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661 *nextTokPtr = ptr;
662 return XML_TOK_INVALID;
663 }
664 *nextTokPtr = ptr + MINBPC(enc);
665 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666 default:
667 *nextTokPtr = ptr;
668 return XML_TOK_INVALID;
669 }
670 break;
671 }
672 break;
673 }
674 default:
675 *nextTokPtr = ptr;
676 return XML_TOK_INVALID;
677 }
678 }
679 return XML_TOK_PARTIAL;
680 }
681
682 /* ptr points to character following "<" */
683
684 static int PTRCALL
PREFIX(scanLt)685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686 const char **nextTokPtr)
687 {
688 #ifdef XML_NS
689 int hadColon;
690 #endif
691 REQUIRE_CHAR(enc, ptr, end);
692 switch (BYTE_TYPE(enc, ptr)) {
693 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
694 case BT_EXCL:
695 ptr += MINBPC(enc);
696 REQUIRE_CHAR(enc, ptr, end);
697 switch (BYTE_TYPE(enc, ptr)) {
698 case BT_MINUS:
699 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700 case BT_LSQB:
701 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
702 end, nextTokPtr);
703 }
704 *nextTokPtr = ptr;
705 return XML_TOK_INVALID;
706 case BT_QUEST:
707 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708 case BT_SOL:
709 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710 default:
711 *nextTokPtr = ptr;
712 return XML_TOK_INVALID;
713 }
714 #ifdef XML_NS
715 hadColon = 0;
716 #endif
717 /* we have a start-tag */
718 while (HAS_CHAR(enc, ptr, end)) {
719 switch (BYTE_TYPE(enc, ptr)) {
720 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
721 #ifdef XML_NS
722 case BT_COLON:
723 if (hadColon) {
724 *nextTokPtr = ptr;
725 return XML_TOK_INVALID;
726 }
727 hadColon = 1;
728 ptr += MINBPC(enc);
729 REQUIRE_CHAR(enc, ptr, end);
730 switch (BYTE_TYPE(enc, ptr)) {
731 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
732 default:
733 *nextTokPtr = ptr;
734 return XML_TOK_INVALID;
735 }
736 break;
737 #endif
738 case BT_S: case BT_CR: case BT_LF:
739 {
740 ptr += MINBPC(enc);
741 while (HAS_CHAR(enc, ptr, end)) {
742 switch (BYTE_TYPE(enc, ptr)) {
743 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
744 case BT_GT:
745 goto gt;
746 case BT_SOL:
747 goto sol;
748 case BT_S: case BT_CR: case BT_LF:
749 ptr += MINBPC(enc);
750 continue;
751 default:
752 *nextTokPtr = ptr;
753 return XML_TOK_INVALID;
754 }
755 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756 }
757 return XML_TOK_PARTIAL;
758 }
759 case BT_GT:
760 gt:
761 *nextTokPtr = ptr + MINBPC(enc);
762 return XML_TOK_START_TAG_NO_ATTS;
763 case BT_SOL:
764 sol:
765 ptr += MINBPC(enc);
766 REQUIRE_CHAR(enc, ptr, end);
767 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768 *nextTokPtr = ptr;
769 return XML_TOK_INVALID;
770 }
771 *nextTokPtr = ptr + MINBPC(enc);
772 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773 default:
774 *nextTokPtr = ptr;
775 return XML_TOK_INVALID;
776 }
777 }
778 return XML_TOK_PARTIAL;
779 }
780
781 static int PTRCALL
PREFIX(contentTok)782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783 const char **nextTokPtr)
784 {
785 if (ptr >= end)
786 return XML_TOK_NONE;
787 if (MINBPC(enc) > 1) {
788 size_t n = end - ptr;
789 if (n & (MINBPC(enc) - 1)) {
790 n &= ~(MINBPC(enc) - 1);
791 if (n == 0)
792 return XML_TOK_PARTIAL;
793 end = ptr + n;
794 }
795 }
796 switch (BYTE_TYPE(enc, ptr)) {
797 case BT_LT:
798 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799 case BT_AMP:
800 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801 case BT_CR:
802 ptr += MINBPC(enc);
803 if (! HAS_CHAR(enc, ptr, end))
804 return XML_TOK_TRAILING_CR;
805 if (BYTE_TYPE(enc, ptr) == BT_LF)
806 ptr += MINBPC(enc);
807 *nextTokPtr = ptr;
808 return XML_TOK_DATA_NEWLINE;
809 case BT_LF:
810 *nextTokPtr = ptr + MINBPC(enc);
811 return XML_TOK_DATA_NEWLINE;
812 case BT_RSQB:
813 ptr += MINBPC(enc);
814 if (! HAS_CHAR(enc, ptr, end))
815 return XML_TOK_TRAILING_RSQB;
816 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817 break;
818 ptr += MINBPC(enc);
819 if (! HAS_CHAR(enc, ptr, end))
820 return XML_TOK_TRAILING_RSQB;
821 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
822 ptr -= MINBPC(enc);
823 break;
824 }
825 *nextTokPtr = ptr;
826 return XML_TOK_INVALID;
827 INVALID_CASES(ptr, nextTokPtr)
828 default:
829 ptr += MINBPC(enc);
830 break;
831 }
832 while (HAS_CHAR(enc, ptr, end)) {
833 switch (BYTE_TYPE(enc, ptr)) {
834 #define LEAD_CASE(n) \
835 case BT_LEAD ## n: \
836 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837 *nextTokPtr = ptr; \
838 return XML_TOK_DATA_CHARS; \
839 } \
840 ptr += n; \
841 break;
842 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843 #undef LEAD_CASE
844 case BT_RSQB:
845 if (HAS_CHARS(enc, ptr, end, 2)) {
846 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847 ptr += MINBPC(enc);
848 break;
849 }
850 if (HAS_CHARS(enc, ptr, end, 3)) {
851 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852 ptr += MINBPC(enc);
853 break;
854 }
855 *nextTokPtr = ptr + 2*MINBPC(enc);
856 return XML_TOK_INVALID;
857 }
858 }
859 /* fall through */
860 case BT_AMP:
861 case BT_LT:
862 case BT_NONXML:
863 case BT_MALFORM:
864 case BT_TRAIL:
865 case BT_CR:
866 case BT_LF:
867 *nextTokPtr = ptr;
868 return XML_TOK_DATA_CHARS;
869 default:
870 ptr += MINBPC(enc);
871 break;
872 }
873 }
874 *nextTokPtr = ptr;
875 return XML_TOK_DATA_CHARS;
876 }
877
878 /* ptr points to character following "%" */
879
880 static int PTRCALL
PREFIX(scanPercent)881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882 const char **nextTokPtr)
883 {
884 REQUIRE_CHAR(enc, ptr, end);
885 switch (BYTE_TYPE(enc, ptr)) {
886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888 *nextTokPtr = ptr;
889 return XML_TOK_PERCENT;
890 default:
891 *nextTokPtr = ptr;
892 return XML_TOK_INVALID;
893 }
894 while (HAS_CHAR(enc, ptr, end)) {
895 switch (BYTE_TYPE(enc, ptr)) {
896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897 case BT_SEMI:
898 *nextTokPtr = ptr + MINBPC(enc);
899 return XML_TOK_PARAM_ENTITY_REF;
900 default:
901 *nextTokPtr = ptr;
902 return XML_TOK_INVALID;
903 }
904 }
905 return XML_TOK_PARTIAL;
906 }
907
908 static int PTRCALL
PREFIX(scanPoundName)909 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 const char **nextTokPtr)
911 {
912 REQUIRE_CHAR(enc, ptr, end);
913 switch (BYTE_TYPE(enc, ptr)) {
914 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915 default:
916 *nextTokPtr = ptr;
917 return XML_TOK_INVALID;
918 }
919 while (HAS_CHAR(enc, ptr, end)) {
920 switch (BYTE_TYPE(enc, ptr)) {
921 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922 case BT_CR: case BT_LF: case BT_S:
923 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924 *nextTokPtr = ptr;
925 return XML_TOK_POUND_NAME;
926 default:
927 *nextTokPtr = ptr;
928 return XML_TOK_INVALID;
929 }
930 }
931 return -XML_TOK_POUND_NAME;
932 }
933
934 static int PTRCALL
PREFIX(scanLit)935 PREFIX(scanLit)(int open, const ENCODING *enc,
936 const char *ptr, const char *end,
937 const char **nextTokPtr)
938 {
939 while (HAS_CHAR(enc, ptr, end)) {
940 int t = BYTE_TYPE(enc, ptr);
941 switch (t) {
942 INVALID_CASES(ptr, nextTokPtr)
943 case BT_QUOT:
944 case BT_APOS:
945 ptr += MINBPC(enc);
946 if (t != open)
947 break;
948 if (! HAS_CHAR(enc, ptr, end))
949 return -XML_TOK_LITERAL;
950 *nextTokPtr = ptr;
951 switch (BYTE_TYPE(enc, ptr)) {
952 case BT_S: case BT_CR: case BT_LF:
953 case BT_GT: case BT_PERCNT: case BT_LSQB:
954 return XML_TOK_LITERAL;
955 default:
956 return XML_TOK_INVALID;
957 }
958 default:
959 ptr += MINBPC(enc);
960 break;
961 }
962 }
963 return XML_TOK_PARTIAL;
964 }
965
966 static int PTRCALL
PREFIX(prologTok)967 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968 const char **nextTokPtr)
969 {
970 int tok;
971 if (ptr >= end)
972 return XML_TOK_NONE;
973 if (MINBPC(enc) > 1) {
974 size_t n = end - ptr;
975 if (n & (MINBPC(enc) - 1)) {
976 n &= ~(MINBPC(enc) - 1);
977 if (n == 0)
978 return XML_TOK_PARTIAL;
979 end = ptr + n;
980 }
981 }
982 switch (BYTE_TYPE(enc, ptr)) {
983 case BT_QUOT:
984 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985 case BT_APOS:
986 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987 case BT_LT:
988 {
989 ptr += MINBPC(enc);
990 REQUIRE_CHAR(enc, ptr, end);
991 switch (BYTE_TYPE(enc, ptr)) {
992 case BT_EXCL:
993 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994 case BT_QUEST:
995 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996 case BT_NMSTRT:
997 case BT_HEX:
998 case BT_NONASCII:
999 case BT_LEAD2:
1000 case BT_LEAD3:
1001 case BT_LEAD4:
1002 *nextTokPtr = ptr - MINBPC(enc);
1003 return XML_TOK_INSTANCE_START;
1004 }
1005 *nextTokPtr = ptr;
1006 return XML_TOK_INVALID;
1007 }
1008 case BT_CR:
1009 if (ptr + MINBPC(enc) == end) {
1010 *nextTokPtr = end;
1011 /* indicate that this might be part of a CR/LF pair */
1012 return -XML_TOK_PROLOG_S;
1013 }
1014 /* fall through */
1015 case BT_S: case BT_LF:
1016 for (;;) {
1017 ptr += MINBPC(enc);
1018 if (! HAS_CHAR(enc, ptr, end))
1019 break;
1020 switch (BYTE_TYPE(enc, ptr)) {
1021 case BT_S: case BT_LF:
1022 break;
1023 case BT_CR:
1024 /* don't split CR/LF pair */
1025 if (ptr + MINBPC(enc) != end)
1026 break;
1027 /* fall through */
1028 default:
1029 *nextTokPtr = ptr;
1030 return XML_TOK_PROLOG_S;
1031 }
1032 }
1033 *nextTokPtr = ptr;
1034 return XML_TOK_PROLOG_S;
1035 case BT_PERCNT:
1036 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037 case BT_COMMA:
1038 *nextTokPtr = ptr + MINBPC(enc);
1039 return XML_TOK_COMMA;
1040 case BT_LSQB:
1041 *nextTokPtr = ptr + MINBPC(enc);
1042 return XML_TOK_OPEN_BRACKET;
1043 case BT_RSQB:
1044 ptr += MINBPC(enc);
1045 if (! HAS_CHAR(enc, ptr, end))
1046 return -XML_TOK_CLOSE_BRACKET;
1047 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1048 REQUIRE_CHARS(enc, ptr, end, 2);
1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 *nextTokPtr = ptr + 2*MINBPC(enc);
1051 return XML_TOK_COND_SECT_CLOSE;
1052 }
1053 }
1054 *nextTokPtr = ptr;
1055 return XML_TOK_CLOSE_BRACKET;
1056 case BT_LPAR:
1057 *nextTokPtr = ptr + MINBPC(enc);
1058 return XML_TOK_OPEN_PAREN;
1059 case BT_RPAR:
1060 ptr += MINBPC(enc);
1061 if (! HAS_CHAR(enc, ptr, end))
1062 return -XML_TOK_CLOSE_PAREN;
1063 switch (BYTE_TYPE(enc, ptr)) {
1064 case BT_AST:
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_CLOSE_PAREN_ASTERISK;
1067 case BT_QUEST:
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_CLOSE_PAREN_QUESTION;
1070 case BT_PLUS:
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_CLOSE_PAREN_PLUS;
1073 case BT_CR: case BT_LF: case BT_S:
1074 case BT_GT: case BT_COMMA: case BT_VERBAR:
1075 case BT_RPAR:
1076 *nextTokPtr = ptr;
1077 return XML_TOK_CLOSE_PAREN;
1078 }
1079 *nextTokPtr = ptr;
1080 return XML_TOK_INVALID;
1081 case BT_VERBAR:
1082 *nextTokPtr = ptr + MINBPC(enc);
1083 return XML_TOK_OR;
1084 case BT_GT:
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_DECL_CLOSE;
1087 case BT_NUM:
1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089 #define LEAD_CASE(n) \
1090 case BT_LEAD ## n: \
1091 if (end - ptr < n) \
1092 return XML_TOK_PARTIAL_CHAR; \
1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094 ptr += n; \
1095 tok = XML_TOK_NAME; \
1096 break; \
1097 } \
1098 if (IS_NAME_CHAR(enc, ptr, n)) { \
1099 ptr += n; \
1100 tok = XML_TOK_NMTOKEN; \
1101 break; \
1102 } \
1103 *nextTokPtr = ptr; \
1104 return XML_TOK_INVALID;
1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106 #undef LEAD_CASE
1107 case BT_NMSTRT:
1108 case BT_HEX:
1109 tok = XML_TOK_NAME;
1110 ptr += MINBPC(enc);
1111 break;
1112 case BT_DIGIT:
1113 case BT_NAME:
1114 case BT_MINUS:
1115 #ifdef XML_NS
1116 case BT_COLON:
1117 #endif
1118 tok = XML_TOK_NMTOKEN;
1119 ptr += MINBPC(enc);
1120 break;
1121 case BT_NONASCII:
1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123 ptr += MINBPC(enc);
1124 tok = XML_TOK_NAME;
1125 break;
1126 }
1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128 ptr += MINBPC(enc);
1129 tok = XML_TOK_NMTOKEN;
1130 break;
1131 }
1132 /* fall through */
1133 default:
1134 *nextTokPtr = ptr;
1135 return XML_TOK_INVALID;
1136 }
1137 while (HAS_CHAR(enc, ptr, end)) {
1138 switch (BYTE_TYPE(enc, ptr)) {
1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140 case BT_GT: case BT_RPAR: case BT_COMMA:
1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142 case BT_S: case BT_CR: case BT_LF:
1143 *nextTokPtr = ptr;
1144 return tok;
1145 #ifdef XML_NS
1146 case BT_COLON:
1147 ptr += MINBPC(enc);
1148 switch (tok) {
1149 case XML_TOK_NAME:
1150 REQUIRE_CHAR(enc, ptr, end);
1151 tok = XML_TOK_PREFIXED_NAME;
1152 switch (BYTE_TYPE(enc, ptr)) {
1153 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154 default:
1155 tok = XML_TOK_NMTOKEN;
1156 break;
1157 }
1158 break;
1159 case XML_TOK_PREFIXED_NAME:
1160 tok = XML_TOK_NMTOKEN;
1161 break;
1162 }
1163 break;
1164 #endif
1165 case BT_PLUS:
1166 if (tok == XML_TOK_NMTOKEN) {
1167 *nextTokPtr = ptr;
1168 return XML_TOK_INVALID;
1169 }
1170 *nextTokPtr = ptr + MINBPC(enc);
1171 return XML_TOK_NAME_PLUS;
1172 case BT_AST:
1173 if (tok == XML_TOK_NMTOKEN) {
1174 *nextTokPtr = ptr;
1175 return XML_TOK_INVALID;
1176 }
1177 *nextTokPtr = ptr + MINBPC(enc);
1178 return XML_TOK_NAME_ASTERISK;
1179 case BT_QUEST:
1180 if (tok == XML_TOK_NMTOKEN) {
1181 *nextTokPtr = ptr;
1182 return XML_TOK_INVALID;
1183 }
1184 *nextTokPtr = ptr + MINBPC(enc);
1185 return XML_TOK_NAME_QUESTION;
1186 default:
1187 *nextTokPtr = ptr;
1188 return XML_TOK_INVALID;
1189 }
1190 }
1191 return -tok;
1192 }
1193
1194 static int PTRCALL
PREFIX(attributeValueTok)1195 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196 const char *end, const char **nextTokPtr)
1197 {
1198 const char *start;
1199 if (ptr >= end)
1200 return XML_TOK_NONE;
1201 else if (! HAS_CHAR(enc, ptr, end))
1202 return XML_TOK_PARTIAL;
1203 start = ptr;
1204 while (HAS_CHAR(enc, ptr, end)) {
1205 switch (BYTE_TYPE(enc, ptr)) {
1206 #define LEAD_CASE(n) \
1207 case BT_LEAD ## n: ptr += n; break;
1208 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1209 #undef LEAD_CASE
1210 case BT_AMP:
1211 if (ptr == start)
1212 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1213 *nextTokPtr = ptr;
1214 return XML_TOK_DATA_CHARS;
1215 case BT_LT:
1216 /* this is for inside entity references */
1217 *nextTokPtr = ptr;
1218 return XML_TOK_INVALID;
1219 case BT_LF:
1220 if (ptr == start) {
1221 *nextTokPtr = ptr + MINBPC(enc);
1222 return XML_TOK_DATA_NEWLINE;
1223 }
1224 *nextTokPtr = ptr;
1225 return XML_TOK_DATA_CHARS;
1226 case BT_CR:
1227 if (ptr == start) {
1228 ptr += MINBPC(enc);
1229 if (! HAS_CHAR(enc, ptr, end))
1230 return XML_TOK_TRAILING_CR;
1231 if (BYTE_TYPE(enc, ptr) == BT_LF)
1232 ptr += MINBPC(enc);
1233 *nextTokPtr = ptr;
1234 return XML_TOK_DATA_NEWLINE;
1235 }
1236 *nextTokPtr = ptr;
1237 return XML_TOK_DATA_CHARS;
1238 case BT_S:
1239 if (ptr == start) {
1240 *nextTokPtr = ptr + MINBPC(enc);
1241 return XML_TOK_ATTRIBUTE_VALUE_S;
1242 }
1243 *nextTokPtr = ptr;
1244 return XML_TOK_DATA_CHARS;
1245 default:
1246 ptr += MINBPC(enc);
1247 break;
1248 }
1249 }
1250 *nextTokPtr = ptr;
1251 return XML_TOK_DATA_CHARS;
1252 }
1253
1254 static int PTRCALL
PREFIX(entityValueTok)1255 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256 const char *end, const char **nextTokPtr)
1257 {
1258 const char *start;
1259 if (ptr >= end)
1260 return XML_TOK_NONE;
1261 else if (! HAS_CHAR(enc, ptr, end))
1262 return XML_TOK_PARTIAL;
1263 start = ptr;
1264 while (HAS_CHAR(enc, ptr, end)) {
1265 switch (BYTE_TYPE(enc, ptr)) {
1266 #define LEAD_CASE(n) \
1267 case BT_LEAD ## n: ptr += n; break;
1268 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1269 #undef LEAD_CASE
1270 case BT_AMP:
1271 if (ptr == start)
1272 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1273 *nextTokPtr = ptr;
1274 return XML_TOK_DATA_CHARS;
1275 case BT_PERCNT:
1276 if (ptr == start) {
1277 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1278 end, nextTokPtr);
1279 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1280 }
1281 *nextTokPtr = ptr;
1282 return XML_TOK_DATA_CHARS;
1283 case BT_LF:
1284 if (ptr == start) {
1285 *nextTokPtr = ptr + MINBPC(enc);
1286 return XML_TOK_DATA_NEWLINE;
1287 }
1288 *nextTokPtr = ptr;
1289 return XML_TOK_DATA_CHARS;
1290 case BT_CR:
1291 if (ptr == start) {
1292 ptr += MINBPC(enc);
1293 if (! HAS_CHAR(enc, ptr, end))
1294 return XML_TOK_TRAILING_CR;
1295 if (BYTE_TYPE(enc, ptr) == BT_LF)
1296 ptr += MINBPC(enc);
1297 *nextTokPtr = ptr;
1298 return XML_TOK_DATA_NEWLINE;
1299 }
1300 *nextTokPtr = ptr;
1301 return XML_TOK_DATA_CHARS;
1302 default:
1303 ptr += MINBPC(enc);
1304 break;
1305 }
1306 }
1307 *nextTokPtr = ptr;
1308 return XML_TOK_DATA_CHARS;
1309 }
1310
1311 #ifdef XML_DTD
1312
1313 static int PTRCALL
PREFIX(ignoreSectionTok)1314 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1315 const char *end, const char **nextTokPtr)
1316 {
1317 int level = 0;
1318 if (MINBPC(enc) > 1) {
1319 size_t n = end - ptr;
1320 if (n & (MINBPC(enc) - 1)) {
1321 n &= ~(MINBPC(enc) - 1);
1322 end = ptr + n;
1323 }
1324 }
1325 while (HAS_CHAR(enc, ptr, end)) {
1326 switch (BYTE_TYPE(enc, ptr)) {
1327 INVALID_CASES(ptr, nextTokPtr)
1328 case BT_LT:
1329 ptr += MINBPC(enc);
1330 REQUIRE_CHAR(enc, ptr, end);
1331 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1332 ptr += MINBPC(enc);
1333 REQUIRE_CHAR(enc, ptr, end);
1334 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1335 ++level;
1336 ptr += MINBPC(enc);
1337 }
1338 }
1339 break;
1340 case BT_RSQB:
1341 ptr += MINBPC(enc);
1342 REQUIRE_CHAR(enc, ptr, end);
1343 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1344 ptr += MINBPC(enc);
1345 REQUIRE_CHAR(enc, ptr, end);
1346 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1347 ptr += MINBPC(enc);
1348 if (level == 0) {
1349 *nextTokPtr = ptr;
1350 return XML_TOK_IGNORE_SECT;
1351 }
1352 --level;
1353 }
1354 }
1355 break;
1356 default:
1357 ptr += MINBPC(enc);
1358 break;
1359 }
1360 }
1361 return XML_TOK_PARTIAL;
1362 }
1363
1364 #endif /* XML_DTD */
1365
1366 static int PTRCALL
PREFIX(isPublicId)1367 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368 const char **badPtr)
1369 {
1370 ptr += MINBPC(enc);
1371 end -= MINBPC(enc);
1372 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1373 switch (BYTE_TYPE(enc, ptr)) {
1374 case BT_DIGIT:
1375 case BT_HEX:
1376 case BT_MINUS:
1377 case BT_APOS:
1378 case BT_LPAR:
1379 case BT_RPAR:
1380 case BT_PLUS:
1381 case BT_COMMA:
1382 case BT_SOL:
1383 case BT_EQUALS:
1384 case BT_QUEST:
1385 case BT_CR:
1386 case BT_LF:
1387 case BT_SEMI:
1388 case BT_EXCL:
1389 case BT_AST:
1390 case BT_PERCNT:
1391 case BT_NUM:
1392 #ifdef XML_NS
1393 case BT_COLON:
1394 #endif
1395 break;
1396 case BT_S:
1397 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1398 *badPtr = ptr;
1399 return 0;
1400 }
1401 break;
1402 case BT_NAME:
1403 case BT_NMSTRT:
1404 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1405 break;
1406 default:
1407 switch (BYTE_TO_ASCII(enc, ptr)) {
1408 case 0x24: /* $ */
1409 case 0x40: /* @ */
1410 break;
1411 default:
1412 *badPtr = ptr;
1413 return 0;
1414 }
1415 break;
1416 }
1417 }
1418 return 1;
1419 }
1420
1421 /* This must only be called for a well-formed start-tag or empty
1422 element tag. Returns the number of attributes. Pointers to the
1423 first attsMax attributes are stored in atts.
1424 */
1425
1426 static int PTRCALL
PREFIX(getAtts)1427 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428 int attsMax, ATTRIBUTE *atts)
1429 {
1430 enum { other, inName, inValue } state = inName;
1431 int nAtts = 0;
1432 int open = 0; /* defined when state == inValue;
1433 initialization just to shut up compilers */
1434
1435 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436 switch (BYTE_TYPE(enc, ptr)) {
1437 #define START_NAME \
1438 if (state == other) { \
1439 if (nAtts < attsMax) { \
1440 atts[nAtts].name = ptr; \
1441 atts[nAtts].normalized = 1; \
1442 } \
1443 state = inName; \
1444 }
1445 #define LEAD_CASE(n) \
1446 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448 #undef LEAD_CASE
1449 case BT_NONASCII:
1450 case BT_NMSTRT:
1451 case BT_HEX:
1452 START_NAME
1453 break;
1454 #undef START_NAME
1455 case BT_QUOT:
1456 if (state != inValue) {
1457 if (nAtts < attsMax)
1458 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459 state = inValue;
1460 open = BT_QUOT;
1461 }
1462 else if (open == BT_QUOT) {
1463 state = other;
1464 if (nAtts < attsMax)
1465 atts[nAtts].valueEnd = ptr;
1466 nAtts++;
1467 }
1468 break;
1469 case BT_APOS:
1470 if (state != inValue) {
1471 if (nAtts < attsMax)
1472 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473 state = inValue;
1474 open = BT_APOS;
1475 }
1476 else if (open == BT_APOS) {
1477 state = other;
1478 if (nAtts < attsMax)
1479 atts[nAtts].valueEnd = ptr;
1480 nAtts++;
1481 }
1482 break;
1483 case BT_AMP:
1484 if (nAtts < attsMax)
1485 atts[nAtts].normalized = 0;
1486 break;
1487 case BT_S:
1488 if (state == inName)
1489 state = other;
1490 else if (state == inValue
1491 && nAtts < attsMax
1492 && atts[nAtts].normalized
1493 && (ptr == atts[nAtts].valuePtr
1494 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497 atts[nAtts].normalized = 0;
1498 break;
1499 case BT_CR: case BT_LF:
1500 /* This case ensures that the first attribute name is counted
1501 Apart from that we could just change state on the quote. */
1502 if (state == inName)
1503 state = other;
1504 else if (state == inValue && nAtts < attsMax)
1505 atts[nAtts].normalized = 0;
1506 break;
1507 case BT_GT:
1508 case BT_SOL:
1509 if (state != inValue)
1510 return nAtts;
1511 break;
1512 default:
1513 break;
1514 }
1515 }
1516 /* not reached */
1517 }
1518
1519 static int PTRFASTCALL
PREFIX(charRefNumber)1520 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1521 {
1522 int result = 0;
1523 /* skip &# */
1524 ptr += 2*MINBPC(enc);
1525 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526 for (ptr += MINBPC(enc);
1527 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528 ptr += MINBPC(enc)) {
1529 int c = BYTE_TO_ASCII(enc, ptr);
1530 switch (c) {
1531 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1533 result <<= 4;
1534 result |= (c - ASCII_0);
1535 break;
1536 case ASCII_A: case ASCII_B: case ASCII_C:
1537 case ASCII_D: case ASCII_E: case ASCII_F:
1538 result <<= 4;
1539 result += 10 + (c - ASCII_A);
1540 break;
1541 case ASCII_a: case ASCII_b: case ASCII_c:
1542 case ASCII_d: case ASCII_e: case ASCII_f:
1543 result <<= 4;
1544 result += 10 + (c - ASCII_a);
1545 break;
1546 }
1547 if (result >= 0x110000)
1548 return -1;
1549 }
1550 }
1551 else {
1552 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553 int c = BYTE_TO_ASCII(enc, ptr);
1554 result *= 10;
1555 result += (c - ASCII_0);
1556 if (result >= 0x110000)
1557 return -1;
1558 }
1559 }
1560 return checkCharRefNumber(result);
1561 }
1562
1563 static int PTRCALL
PREFIX(predefinedEntityName)1564 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1565 const char *end)
1566 {
1567 switch ((end - ptr)/MINBPC(enc)) {
1568 case 2:
1569 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570 switch (BYTE_TO_ASCII(enc, ptr)) {
1571 case ASCII_l:
1572 return ASCII_LT;
1573 case ASCII_g:
1574 return ASCII_GT;
1575 }
1576 }
1577 break;
1578 case 3:
1579 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1580 ptr += MINBPC(enc);
1581 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1582 ptr += MINBPC(enc);
1583 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1584 return ASCII_AMP;
1585 }
1586 }
1587 break;
1588 case 4:
1589 switch (BYTE_TO_ASCII(enc, ptr)) {
1590 case ASCII_q:
1591 ptr += MINBPC(enc);
1592 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1593 ptr += MINBPC(enc);
1594 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1595 ptr += MINBPC(enc);
1596 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1597 return ASCII_QUOT;
1598 }
1599 }
1600 break;
1601 case ASCII_a:
1602 ptr += MINBPC(enc);
1603 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1604 ptr += MINBPC(enc);
1605 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1606 ptr += MINBPC(enc);
1607 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1608 return ASCII_APOS;
1609 }
1610 }
1611 break;
1612 }
1613 }
1614 return 0;
1615 }
1616
1617 static int PTRCALL
PREFIX(sameName)1618 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1619 {
1620 for (;;) {
1621 switch (BYTE_TYPE(enc, ptr1)) {
1622 #define LEAD_CASE(n) \
1623 case BT_LEAD ## n: \
1624 if (*ptr1++ != *ptr2++) \
1625 return 0;
1626 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1627 #undef LEAD_CASE
1628 /* fall through */
1629 if (*ptr1++ != *ptr2++)
1630 return 0;
1631 break;
1632 case BT_NONASCII:
1633 case BT_NMSTRT:
1634 #ifdef XML_NS
1635 case BT_COLON:
1636 #endif
1637 case BT_HEX:
1638 case BT_DIGIT:
1639 case BT_NAME:
1640 case BT_MINUS:
1641 if (*ptr2++ != *ptr1++)
1642 return 0;
1643 if (MINBPC(enc) > 1) {
1644 if (*ptr2++ != *ptr1++)
1645 return 0;
1646 if (MINBPC(enc) > 2) {
1647 if (*ptr2++ != *ptr1++)
1648 return 0;
1649 if (MINBPC(enc) > 3) {
1650 if (*ptr2++ != *ptr1++)
1651 return 0;
1652 }
1653 }
1654 }
1655 break;
1656 default:
1657 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1658 return 1;
1659 switch (BYTE_TYPE(enc, ptr2)) {
1660 case BT_LEAD2:
1661 case BT_LEAD3:
1662 case BT_LEAD4:
1663 case BT_NONASCII:
1664 case BT_NMSTRT:
1665 #ifdef XML_NS
1666 case BT_COLON:
1667 #endif
1668 case BT_HEX:
1669 case BT_DIGIT:
1670 case BT_NAME:
1671 case BT_MINUS:
1672 return 0;
1673 default:
1674 return 1;
1675 }
1676 }
1677 }
1678 /* not reached */
1679 }
1680
1681 static int PTRCALL
PREFIX(nameMatchesAscii)1682 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1683 const char *end1, const char *ptr2)
1684 {
1685 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1686 if (end1 - ptr1 < MINBPC(enc))
1687 return 0;
1688 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1689 return 0;
1690 }
1691 return ptr1 == end1;
1692 }
1693
1694 static int PTRFASTCALL
PREFIX(nameLength)1695 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1696 {
1697 const char *start = ptr;
1698 for (;;) {
1699 switch (BYTE_TYPE(enc, ptr)) {
1700 #define LEAD_CASE(n) \
1701 case BT_LEAD ## n: ptr += n; break;
1702 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1703 #undef LEAD_CASE
1704 case BT_NONASCII:
1705 case BT_NMSTRT:
1706 #ifdef XML_NS
1707 case BT_COLON:
1708 #endif
1709 case BT_HEX:
1710 case BT_DIGIT:
1711 case BT_NAME:
1712 case BT_MINUS:
1713 ptr += MINBPC(enc);
1714 break;
1715 default:
1716 return (int)(ptr - start);
1717 }
1718 }
1719 }
1720
1721 static const char * PTRFASTCALL
PREFIX(skipS)1722 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1723 {
1724 for (;;) {
1725 switch (BYTE_TYPE(enc, ptr)) {
1726 case BT_LF:
1727 case BT_CR:
1728 case BT_S:
1729 ptr += MINBPC(enc);
1730 break;
1731 default:
1732 return ptr;
1733 }
1734 }
1735 }
1736
1737 static void PTRCALL
PREFIX(updatePosition)1738 PREFIX(updatePosition)(const ENCODING *enc,
1739 const char *ptr,
1740 const char *end,
1741 POSITION *pos)
1742 {
1743 while (HAS_CHAR(enc, ptr, end)) {
1744 switch (BYTE_TYPE(enc, ptr)) {
1745 #define LEAD_CASE(n) \
1746 case BT_LEAD ## n: \
1747 ptr += n; \
1748 break;
1749 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750 #undef LEAD_CASE
1751 case BT_LF:
1752 pos->columnNumber = (XML_Size)-1;
1753 pos->lineNumber++;
1754 ptr += MINBPC(enc);
1755 break;
1756 case BT_CR:
1757 pos->lineNumber++;
1758 ptr += MINBPC(enc);
1759 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1760 ptr += MINBPC(enc);
1761 pos->columnNumber = (XML_Size)-1;
1762 break;
1763 default:
1764 ptr += MINBPC(enc);
1765 break;
1766 }
1767 pos->columnNumber++;
1768 }
1769 }
1770
1771 #undef DO_LEAD_CASE
1772 #undef MULTIBYTE_CASES
1773 #undef INVALID_CASES
1774 #undef CHECK_NAME_CASE
1775 #undef CHECK_NAME_CASES
1776 #undef CHECK_NMSTRT_CASE
1777 #undef CHECK_NMSTRT_CASES
1778
1779 #endif /* XML_TOK_IMPL_C */
1780