1 /* This file is included!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000-2017 Expat development team
11    Licensed under the MIT license:
12 
13    Permission is  hereby granted,  free of charge,  to any  person obtaining
14    a  copy  of  this  software   and  associated  documentation  files  (the
15    "Software"),  to  deal in  the  Software  without restriction,  including
16    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17    distribute, sublicense, and/or sell copies of the Software, and to permit
18    persons  to whom  the Software  is  furnished to  do so,  subject to  the
19    following conditions:
20 
21    The above copyright  notice and this permission notice  shall be included
22    in all copies or substantial portions of the Software.
23 
24    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30    USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32 
33 #ifdef XML_TOK_IMPL_C
34 
/* Fallback for includers that do not supply IS_INVALID_CHAR: with this
   definition no multi-byte sequence is ever reported as invalid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
38 
/* Case label for the lead byte of an n-byte sequence.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, reports
   XML_TOK_INVALID (with *nextTokPtr at the sequence) when IS_INVALID_CHAR
   rejects it, and otherwise advances ptr by n.  `enc` and `end` are picked
   up from the scope in which the macro is expanded. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - (ptr) < (n)) { \
      return XML_TOK_PARTIAL_CHAR; \
    } \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* All case labels that can make a character invalid: the three multi-byte
   lead cases above plus the byte types that are rejected outright. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
59 
/* Case label for the lead byte of an n-byte character inside a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
   sequence is rejected with XML_TOK_INVALID (and *nextTokPtr at its first
   byte) when it is malformed according to IS_INVALID_CHAR or is not a name
   character; otherwise ptr is advanced by n.  The IS_INVALID_CHAR test
   matches INVALID_LEAD_CASE, so a malformed sequence cannot be accepted
   merely because the name-character lookup happens to succeed. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) { \
       return XML_TOK_PARTIAL_CHAR; \
     } \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Case labels that consume one name character of any width.  BT_NONASCII
   characters are validated with IS_NAME_CHAR_MINBPC and then advance like
   the single-unit name byte types; multi-byte characters go through
   CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
88 
/* Case label for the lead byte of an n-byte character at the start of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
   sequence is rejected with XML_TOK_INVALID (and *nextTokPtr at its first
   byte) when it is malformed according to IS_INVALID_CHAR or is not a
   name-start character; otherwise ptr is advanced by n.  The
   IS_INVALID_CHAR test matches INVALID_LEAD_CASE, so a malformed sequence
   cannot be accepted merely because the name-start lookup succeeds. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) { \
       return XML_TOK_PARTIAL_CHAR; \
     } \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Case labels that consume one name-start character of any width.
   BT_NONASCII characters are validated with IS_NMSTRT_CHAR_MINBPC and then
   advance like BT_NMSTRT / BT_HEX; multi-byte characters go through
   CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
114 
/* Fallback for includers that do not supply PREFIX: identifiers are emitted
   unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
118 
119 
/* True when at least `count` characters of MINBPC(enc) bytes each lie
   between ptr and end.  Every argument is parenthesised so that expression
   arguments (e.g. `1 + 1`) are measured correctly. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Makes the enclosing function return XML_TOK_PARTIAL unless one character
   is available. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
135 
136 
137 /* ptr points to character following "<!-" */
138 
139 static int PTRCALL
PREFIX(scanComment)140 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
141                     const char *end, const char **nextTokPtr)
142 {
143   if (HAS_CHAR(enc, ptr, end)) {
144     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
145       *nextTokPtr = ptr;
146       return XML_TOK_INVALID;
147     }
148     ptr += MINBPC(enc);
149     while (HAS_CHAR(enc, ptr, end)) {
150       switch (BYTE_TYPE(enc, ptr)) {
151       INVALID_CASES(ptr, nextTokPtr)
152       case BT_MINUS:
153         ptr += MINBPC(enc);
154         REQUIRE_CHAR(enc, ptr, end);
155         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
156           ptr += MINBPC(enc);
157           REQUIRE_CHAR(enc, ptr, end);
158           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
159             *nextTokPtr = ptr;
160             return XML_TOK_INVALID;
161           }
162           *nextTokPtr = ptr + MINBPC(enc);
163           return XML_TOK_COMMENT;
164         }
165         break;
166       default:
167         ptr += MINBPC(enc);
168         break;
169       }
170     }
171   }
172   return XML_TOK_PARTIAL;
173 }
174 
175 /* ptr points to character following "<!" */
176 
177 static int PTRCALL
PREFIX(scanDecl)178 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
179                  const char *end, const char **nextTokPtr)
180 {
181   REQUIRE_CHAR(enc, ptr, end);
182   switch (BYTE_TYPE(enc, ptr)) {
183   case BT_MINUS:
184     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
185   case BT_LSQB:
186     *nextTokPtr = ptr + MINBPC(enc);
187     return XML_TOK_COND_SECT_OPEN;
188   case BT_NMSTRT:
189   case BT_HEX:
190     ptr += MINBPC(enc);
191     break;
192   default:
193     *nextTokPtr = ptr;
194     return XML_TOK_INVALID;
195   }
196   while (HAS_CHAR(enc, ptr, end)) {
197     switch (BYTE_TYPE(enc, ptr)) {
198     case BT_PERCNT:
199       REQUIRE_CHARS(enc, ptr, end, 2);
200       /* don't allow <!ENTITY% foo "whatever"> */
201       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
202       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
203         *nextTokPtr = ptr;
204         return XML_TOK_INVALID;
205       }
206       /* fall through */
207     case BT_S: case BT_CR: case BT_LF:
208       *nextTokPtr = ptr;
209       return XML_TOK_DECL_OPEN;
210     case BT_NMSTRT:
211     case BT_HEX:
212       ptr += MINBPC(enc);
213       break;
214     default:
215       *nextTokPtr = ptr;
216       return XML_TOK_INVALID;
217     }
218   }
219   return XML_TOK_PARTIAL;
220 }
221 
222 static int PTRCALL
PREFIX(checkPiTarget)223 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
224                       const char *end, int *tokPtr)
225 {
226   int upper = 0;
227   *tokPtr = XML_TOK_PI;
228   if (end - ptr != MINBPC(enc)*3)
229     return 1;
230   switch (BYTE_TO_ASCII(enc, ptr)) {
231   case ASCII_x:
232     break;
233   case ASCII_X:
234     upper = 1;
235     break;
236   default:
237     return 1;
238   }
239   ptr += MINBPC(enc);
240   switch (BYTE_TO_ASCII(enc, ptr)) {
241   case ASCII_m:
242     break;
243   case ASCII_M:
244     upper = 1;
245     break;
246   default:
247     return 1;
248   }
249   ptr += MINBPC(enc);
250   switch (BYTE_TO_ASCII(enc, ptr)) {
251   case ASCII_l:
252     break;
253   case ASCII_L:
254     upper = 1;
255     break;
256   default:
257     return 1;
258   }
259   if (upper)
260     return 0;
261   *tokPtr = XML_TOK_XML_DECL;
262   return 1;
263 }
264 
265 /* ptr points to character following "<?" */
266 
267 static int PTRCALL
PREFIX(scanPi)268 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
269                const char *end, const char **nextTokPtr)
270 {
271   int tok;
272   const char *target = ptr;
273   REQUIRE_CHAR(enc, ptr, end);
274   switch (BYTE_TYPE(enc, ptr)) {
275   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
276   default:
277     *nextTokPtr = ptr;
278     return XML_TOK_INVALID;
279   }
280   while (HAS_CHAR(enc, ptr, end)) {
281     switch (BYTE_TYPE(enc, ptr)) {
282     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
283     case BT_S: case BT_CR: case BT_LF:
284       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
285         *nextTokPtr = ptr;
286         return XML_TOK_INVALID;
287       }
288       ptr += MINBPC(enc);
289       while (HAS_CHAR(enc, ptr, end)) {
290         switch (BYTE_TYPE(enc, ptr)) {
291         INVALID_CASES(ptr, nextTokPtr)
292         case BT_QUEST:
293           ptr += MINBPC(enc);
294           REQUIRE_CHAR(enc, ptr, end);
295           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
296             *nextTokPtr = ptr + MINBPC(enc);
297             return tok;
298           }
299           break;
300         default:
301           ptr += MINBPC(enc);
302           break;
303         }
304       }
305       return XML_TOK_PARTIAL;
306     case BT_QUEST:
307       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
308         *nextTokPtr = ptr;
309         return XML_TOK_INVALID;
310       }
311       ptr += MINBPC(enc);
312       REQUIRE_CHAR(enc, ptr, end);
313       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
314         *nextTokPtr = ptr + MINBPC(enc);
315         return tok;
316       }
317       /* fall through */
318     default:
319       *nextTokPtr = ptr;
320       return XML_TOK_INVALID;
321     }
322   }
323   return XML_TOK_PARTIAL;
324 }
325 
326 static int PTRCALL
PREFIX(scanCdataSection)327 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
328                          const char *end, const char **nextTokPtr)
329 {
330   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
331                                      ASCII_T, ASCII_A, ASCII_LSQB };
332   int i;
333   /* CDATA[ */
334   REQUIRE_CHARS(enc, ptr, end, 6);
335   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
336     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
337       *nextTokPtr = ptr;
338       return XML_TOK_INVALID;
339     }
340   }
341   *nextTokPtr = ptr;
342   return XML_TOK_CDATA_SECT_OPEN;
343 }
344 
345 static int PTRCALL
PREFIX(cdataSectionTok)346 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
347                         const char *end, const char **nextTokPtr)
348 {
349   if (ptr >= end)
350     return XML_TOK_NONE;
351   if (MINBPC(enc) > 1) {
352     size_t n = end - ptr;
353     if (n & (MINBPC(enc) - 1)) {
354       n &= ~(MINBPC(enc) - 1);
355       if (n == 0)
356         return XML_TOK_PARTIAL;
357       end = ptr + n;
358     }
359   }
360   switch (BYTE_TYPE(enc, ptr)) {
361   case BT_RSQB:
362     ptr += MINBPC(enc);
363     REQUIRE_CHAR(enc, ptr, end);
364     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
365       break;
366     ptr += MINBPC(enc);
367     REQUIRE_CHAR(enc, ptr, end);
368     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
369       ptr -= MINBPC(enc);
370       break;
371     }
372     *nextTokPtr = ptr + MINBPC(enc);
373     return XML_TOK_CDATA_SECT_CLOSE;
374   case BT_CR:
375     ptr += MINBPC(enc);
376     REQUIRE_CHAR(enc, ptr, end);
377     if (BYTE_TYPE(enc, ptr) == BT_LF)
378       ptr += MINBPC(enc);
379     *nextTokPtr = ptr;
380     return XML_TOK_DATA_NEWLINE;
381   case BT_LF:
382     *nextTokPtr = ptr + MINBPC(enc);
383     return XML_TOK_DATA_NEWLINE;
384   INVALID_CASES(ptr, nextTokPtr)
385   default:
386     ptr += MINBPC(enc);
387     break;
388   }
389   while (HAS_CHAR(enc, ptr, end)) {
390     switch (BYTE_TYPE(enc, ptr)) {
391 #define LEAD_CASE(n) \
392     case BT_LEAD ## n: \
393       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
394         *nextTokPtr = ptr; \
395         return XML_TOK_DATA_CHARS; \
396       } \
397       ptr += n; \
398       break;
399     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
400 #undef LEAD_CASE
401     case BT_NONXML:
402     case BT_MALFORM:
403     case BT_TRAIL:
404     case BT_CR:
405     case BT_LF:
406     case BT_RSQB:
407       *nextTokPtr = ptr;
408       return XML_TOK_DATA_CHARS;
409     default:
410       ptr += MINBPC(enc);
411       break;
412     }
413   }
414   *nextTokPtr = ptr;
415   return XML_TOK_DATA_CHARS;
416 }
417 
418 /* ptr points to character following "</" */
419 
420 static int PTRCALL
PREFIX(scanEndTag)421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
422                    const char *end, const char **nextTokPtr)
423 {
424   REQUIRE_CHAR(enc, ptr, end);
425   switch (BYTE_TYPE(enc, ptr)) {
426   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
427   default:
428     *nextTokPtr = ptr;
429     return XML_TOK_INVALID;
430   }
431   while (HAS_CHAR(enc, ptr, end)) {
432     switch (BYTE_TYPE(enc, ptr)) {
433     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
434     case BT_S: case BT_CR: case BT_LF:
435       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
436         switch (BYTE_TYPE(enc, ptr)) {
437         case BT_S: case BT_CR: case BT_LF:
438           break;
439         case BT_GT:
440           *nextTokPtr = ptr + MINBPC(enc);
441           return XML_TOK_END_TAG;
442         default:
443           *nextTokPtr = ptr;
444           return XML_TOK_INVALID;
445         }
446       }
447       return XML_TOK_PARTIAL;
448 #ifdef XML_NS
449     case BT_COLON:
450       /* no need to check qname syntax here,
451          since end-tag must match exactly */
452       ptr += MINBPC(enc);
453       break;
454 #endif
455     case BT_GT:
456       *nextTokPtr = ptr + MINBPC(enc);
457       return XML_TOK_END_TAG;
458     default:
459       *nextTokPtr = ptr;
460       return XML_TOK_INVALID;
461     }
462   }
463   return XML_TOK_PARTIAL;
464 }
465 
466 /* ptr points to character following "&#X" */
467 
468 static int PTRCALL
PREFIX(scanHexCharRef)469 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
470                        const char *end, const char **nextTokPtr)
471 {
472   if (HAS_CHAR(enc, ptr, end)) {
473     switch (BYTE_TYPE(enc, ptr)) {
474     case BT_DIGIT:
475     case BT_HEX:
476       break;
477     default:
478       *nextTokPtr = ptr;
479       return XML_TOK_INVALID;
480     }
481     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
482       switch (BYTE_TYPE(enc, ptr)) {
483       case BT_DIGIT:
484       case BT_HEX:
485         break;
486       case BT_SEMI:
487         *nextTokPtr = ptr + MINBPC(enc);
488         return XML_TOK_CHAR_REF;
489       default:
490         *nextTokPtr = ptr;
491         return XML_TOK_INVALID;
492       }
493     }
494   }
495   return XML_TOK_PARTIAL;
496 }
497 
498 /* ptr points to character following "&#" */
499 
500 static int PTRCALL
PREFIX(scanCharRef)501 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
502                     const char *end, const char **nextTokPtr)
503 {
504   if (HAS_CHAR(enc, ptr, end)) {
505     if (CHAR_MATCHES(enc, ptr, ASCII_x))
506       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
507     switch (BYTE_TYPE(enc, ptr)) {
508     case BT_DIGIT:
509       break;
510     default:
511       *nextTokPtr = ptr;
512       return XML_TOK_INVALID;
513     }
514     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
515       switch (BYTE_TYPE(enc, ptr)) {
516       case BT_DIGIT:
517         break;
518       case BT_SEMI:
519         *nextTokPtr = ptr + MINBPC(enc);
520         return XML_TOK_CHAR_REF;
521       default:
522         *nextTokPtr = ptr;
523         return XML_TOK_INVALID;
524       }
525     }
526   }
527   return XML_TOK_PARTIAL;
528 }
529 
530 /* ptr points to character following "&" */
531 
532 static int PTRCALL
PREFIX(scanRef)533 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
534                 const char **nextTokPtr)
535 {
536   REQUIRE_CHAR(enc, ptr, end);
537   switch (BYTE_TYPE(enc, ptr)) {
538   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539   case BT_NUM:
540     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541   default:
542     *nextTokPtr = ptr;
543     return XML_TOK_INVALID;
544   }
545   while (HAS_CHAR(enc, ptr, end)) {
546     switch (BYTE_TYPE(enc, ptr)) {
547     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548     case BT_SEMI:
549       *nextTokPtr = ptr + MINBPC(enc);
550       return XML_TOK_ENTITY_REF;
551     default:
552       *nextTokPtr = ptr;
553       return XML_TOK_INVALID;
554     }
555   }
556   return XML_TOK_PARTIAL;
557 }
558 
559 /* ptr points to character following first character of attribute name */
560 
561 static int PTRCALL
PREFIX(scanAtts)562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563                  const char **nextTokPtr)
564 {
565 #ifdef XML_NS
566   int hadColon = 0;
567 #endif
568   while (HAS_CHAR(enc, ptr, end)) {
569     switch (BYTE_TYPE(enc, ptr)) {
570     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
571 #ifdef XML_NS
572     case BT_COLON:
573       if (hadColon) {
574         *nextTokPtr = ptr;
575         return XML_TOK_INVALID;
576       }
577       hadColon = 1;
578       ptr += MINBPC(enc);
579       REQUIRE_CHAR(enc, ptr, end);
580       switch (BYTE_TYPE(enc, ptr)) {
581       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
582       default:
583         *nextTokPtr = ptr;
584         return XML_TOK_INVALID;
585       }
586       break;
587 #endif
588     case BT_S: case BT_CR: case BT_LF:
589       for (;;) {
590         int t;
591 
592         ptr += MINBPC(enc);
593         REQUIRE_CHAR(enc, ptr, end);
594         t = BYTE_TYPE(enc, ptr);
595         if (t == BT_EQUALS)
596           break;
597         switch (t) {
598         case BT_S:
599         case BT_LF:
600         case BT_CR:
601           break;
602         default:
603           *nextTokPtr = ptr;
604           return XML_TOK_INVALID;
605         }
606       }
607       /* fall through */
608     case BT_EQUALS:
609       {
610         int open;
611 #ifdef XML_NS
612         hadColon = 0;
613 #endif
614         for (;;) {
615           ptr += MINBPC(enc);
616           REQUIRE_CHAR(enc, ptr, end);
617           open = BYTE_TYPE(enc, ptr);
618           if (open == BT_QUOT || open == BT_APOS)
619             break;
620           switch (open) {
621           case BT_S:
622           case BT_LF:
623           case BT_CR:
624             break;
625           default:
626             *nextTokPtr = ptr;
627             return XML_TOK_INVALID;
628           }
629         }
630         ptr += MINBPC(enc);
631         /* in attribute value */
632         for (;;) {
633           int t;
634           REQUIRE_CHAR(enc, ptr, end);
635           t = BYTE_TYPE(enc, ptr);
636           if (t == open)
637             break;
638           switch (t) {
639           INVALID_CASES(ptr, nextTokPtr)
640           case BT_AMP:
641             {
642               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
643               if (tok <= 0) {
644                 if (tok == XML_TOK_INVALID)
645                   *nextTokPtr = ptr;
646                 return tok;
647               }
648               break;
649             }
650           case BT_LT:
651             *nextTokPtr = ptr;
652             return XML_TOK_INVALID;
653           default:
654             ptr += MINBPC(enc);
655             break;
656           }
657         }
658         ptr += MINBPC(enc);
659         REQUIRE_CHAR(enc, ptr, end);
660         switch (BYTE_TYPE(enc, ptr)) {
661         case BT_S:
662         case BT_CR:
663         case BT_LF:
664           break;
665         case BT_SOL:
666           goto sol;
667         case BT_GT:
668           goto gt;
669         default:
670           *nextTokPtr = ptr;
671           return XML_TOK_INVALID;
672         }
673         /* ptr points to closing quote */
674         for (;;) {
675           ptr += MINBPC(enc);
676           REQUIRE_CHAR(enc, ptr, end);
677           switch (BYTE_TYPE(enc, ptr)) {
678           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
679           case BT_S: case BT_CR: case BT_LF:
680             continue;
681           case BT_GT:
682           gt:
683             *nextTokPtr = ptr + MINBPC(enc);
684             return XML_TOK_START_TAG_WITH_ATTS;
685           case BT_SOL:
686           sol:
687             ptr += MINBPC(enc);
688             REQUIRE_CHAR(enc, ptr, end);
689             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
690               *nextTokPtr = ptr;
691               return XML_TOK_INVALID;
692             }
693             *nextTokPtr = ptr + MINBPC(enc);
694             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
695           default:
696             *nextTokPtr = ptr;
697             return XML_TOK_INVALID;
698           }
699           break;
700         }
701         break;
702       }
703     default:
704       *nextTokPtr = ptr;
705       return XML_TOK_INVALID;
706     }
707   }
708   return XML_TOK_PARTIAL;
709 }
710 
711 /* ptr points to character following "<" */
712 
713 static int PTRCALL
PREFIX(scanLt)714 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
715                const char **nextTokPtr)
716 {
717 #ifdef XML_NS
718   int hadColon;
719 #endif
720   REQUIRE_CHAR(enc, ptr, end);
721   switch (BYTE_TYPE(enc, ptr)) {
722   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723   case BT_EXCL:
724     ptr += MINBPC(enc);
725     REQUIRE_CHAR(enc, ptr, end);
726     switch (BYTE_TYPE(enc, ptr)) {
727     case BT_MINUS:
728       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729     case BT_LSQB:
730       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
731                                       end, nextTokPtr);
732     }
733     *nextTokPtr = ptr;
734     return XML_TOK_INVALID;
735   case BT_QUEST:
736     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737   case BT_SOL:
738     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739   default:
740     *nextTokPtr = ptr;
741     return XML_TOK_INVALID;
742   }
743 #ifdef XML_NS
744   hadColon = 0;
745 #endif
746   /* we have a start-tag */
747   while (HAS_CHAR(enc, ptr, end)) {
748     switch (BYTE_TYPE(enc, ptr)) {
749     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
750 #ifdef XML_NS
751     case BT_COLON:
752       if (hadColon) {
753         *nextTokPtr = ptr;
754         return XML_TOK_INVALID;
755       }
756       hadColon = 1;
757       ptr += MINBPC(enc);
758       REQUIRE_CHAR(enc, ptr, end);
759       switch (BYTE_TYPE(enc, ptr)) {
760       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
761       default:
762         *nextTokPtr = ptr;
763         return XML_TOK_INVALID;
764       }
765       break;
766 #endif
767     case BT_S: case BT_CR: case BT_LF:
768       {
769         ptr += MINBPC(enc);
770         while (HAS_CHAR(enc, ptr, end)) {
771           switch (BYTE_TYPE(enc, ptr)) {
772           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773           case BT_GT:
774             goto gt;
775           case BT_SOL:
776             goto sol;
777           case BT_S: case BT_CR: case BT_LF:
778             ptr += MINBPC(enc);
779             continue;
780           default:
781             *nextTokPtr = ptr;
782             return XML_TOK_INVALID;
783           }
784           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
785         }
786         return XML_TOK_PARTIAL;
787       }
788     case BT_GT:
789     gt:
790       *nextTokPtr = ptr + MINBPC(enc);
791       return XML_TOK_START_TAG_NO_ATTS;
792     case BT_SOL:
793     sol:
794       ptr += MINBPC(enc);
795       REQUIRE_CHAR(enc, ptr, end);
796       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
797         *nextTokPtr = ptr;
798         return XML_TOK_INVALID;
799       }
800       *nextTokPtr = ptr + MINBPC(enc);
801       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
802     default:
803       *nextTokPtr = ptr;
804       return XML_TOK_INVALID;
805     }
806   }
807   return XML_TOK_PARTIAL;
808 }
809 
810 static int PTRCALL
PREFIX(contentTok)811 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
812                    const char **nextTokPtr)
813 {
814   if (ptr >= end)
815     return XML_TOK_NONE;
816   if (MINBPC(enc) > 1) {
817     size_t n = end - ptr;
818     if (n & (MINBPC(enc) - 1)) {
819       n &= ~(MINBPC(enc) - 1);
820       if (n == 0)
821         return XML_TOK_PARTIAL;
822       end = ptr + n;
823     }
824   }
825   switch (BYTE_TYPE(enc, ptr)) {
826   case BT_LT:
827     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
828   case BT_AMP:
829     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
830   case BT_CR:
831     ptr += MINBPC(enc);
832     if (! HAS_CHAR(enc, ptr, end))
833       return XML_TOK_TRAILING_CR;
834     if (BYTE_TYPE(enc, ptr) == BT_LF)
835       ptr += MINBPC(enc);
836     *nextTokPtr = ptr;
837     return XML_TOK_DATA_NEWLINE;
838   case BT_LF:
839     *nextTokPtr = ptr + MINBPC(enc);
840     return XML_TOK_DATA_NEWLINE;
841   case BT_RSQB:
842     ptr += MINBPC(enc);
843     if (! HAS_CHAR(enc, ptr, end))
844       return XML_TOK_TRAILING_RSQB;
845     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
846       break;
847     ptr += MINBPC(enc);
848     if (! HAS_CHAR(enc, ptr, end))
849       return XML_TOK_TRAILING_RSQB;
850     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
851       ptr -= MINBPC(enc);
852       break;
853     }
854     *nextTokPtr = ptr;
855     return XML_TOK_INVALID;
856   INVALID_CASES(ptr, nextTokPtr)
857   default:
858     ptr += MINBPC(enc);
859     break;
860   }
861   while (HAS_CHAR(enc, ptr, end)) {
862     switch (BYTE_TYPE(enc, ptr)) {
863 #define LEAD_CASE(n) \
864     case BT_LEAD ## n: \
865       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
866         *nextTokPtr = ptr; \
867         return XML_TOK_DATA_CHARS; \
868       } \
869       ptr += n; \
870       break;
871     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
872 #undef LEAD_CASE
873     case BT_RSQB:
874       if (HAS_CHARS(enc, ptr, end, 2)) {
875          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
876            ptr += MINBPC(enc);
877            break;
878          }
879          if (HAS_CHARS(enc, ptr, end, 3)) {
880            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
881              ptr += MINBPC(enc);
882              break;
883            }
884            *nextTokPtr = ptr + 2*MINBPC(enc);
885            return XML_TOK_INVALID;
886          }
887       }
888       /* fall through */
889     case BT_AMP:
890     case BT_LT:
891     case BT_NONXML:
892     case BT_MALFORM:
893     case BT_TRAIL:
894     case BT_CR:
895     case BT_LF:
896       *nextTokPtr = ptr;
897       return XML_TOK_DATA_CHARS;
898     default:
899       ptr += MINBPC(enc);
900       break;
901     }
902   }
903   *nextTokPtr = ptr;
904   return XML_TOK_DATA_CHARS;
905 }
906 
907 /* ptr points to character following "%" */
908 
909 static int PTRCALL
PREFIX(scanPercent)910 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
911                     const char **nextTokPtr)
912 {
913   REQUIRE_CHAR(enc, ptr, end);
914   switch (BYTE_TYPE(enc, ptr)) {
915   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
917     *nextTokPtr = ptr;
918     return XML_TOK_PERCENT;
919   default:
920     *nextTokPtr = ptr;
921     return XML_TOK_INVALID;
922   }
923   while (HAS_CHAR(enc, ptr, end)) {
924     switch (BYTE_TYPE(enc, ptr)) {
925     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
926     case BT_SEMI:
927       *nextTokPtr = ptr + MINBPC(enc);
928       return XML_TOK_PARAM_ENTITY_REF;
929     default:
930       *nextTokPtr = ptr;
931       return XML_TOK_INVALID;
932     }
933   }
934   return XML_TOK_PARTIAL;
935 }
936 
937 static int PTRCALL
PREFIX(scanPoundName)938 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
939                       const char **nextTokPtr)
940 {
941   REQUIRE_CHAR(enc, ptr, end);
942   switch (BYTE_TYPE(enc, ptr)) {
943   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
944   default:
945     *nextTokPtr = ptr;
946     return XML_TOK_INVALID;
947   }
948   while (HAS_CHAR(enc, ptr, end)) {
949     switch (BYTE_TYPE(enc, ptr)) {
950     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
951     case BT_CR: case BT_LF: case BT_S:
952     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
953       *nextTokPtr = ptr;
954       return XML_TOK_POUND_NAME;
955     default:
956       *nextTokPtr = ptr;
957       return XML_TOK_INVALID;
958     }
959   }
960   return -XML_TOK_POUND_NAME;
961 }
962 
963 static int PTRCALL
PREFIX(scanLit)964 PREFIX(scanLit)(int open, const ENCODING *enc,
965                 const char *ptr, const char *end,
966                 const char **nextTokPtr)
967 {
968   while (HAS_CHAR(enc, ptr, end)) {
969     int t = BYTE_TYPE(enc, ptr);
970     switch (t) {
971     INVALID_CASES(ptr, nextTokPtr)
972     case BT_QUOT:
973     case BT_APOS:
974       ptr += MINBPC(enc);
975       if (t != open)
976         break;
977       if (! HAS_CHAR(enc, ptr, end))
978         return -XML_TOK_LITERAL;
979       *nextTokPtr = ptr;
980       switch (BYTE_TYPE(enc, ptr)) {
981       case BT_S: case BT_CR: case BT_LF:
982       case BT_GT: case BT_PERCNT: case BT_LSQB:
983         return XML_TOK_LITERAL;
984       default:
985         return XML_TOK_INVALID;
986       }
987     default:
988       ptr += MINBPC(enc);
989       break;
990     }
991   }
992   return XML_TOK_PARTIAL;
993 }
994 
995 static int PTRCALL
PREFIX(prologTok)996 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
997                   const char **nextTokPtr)
998 {
999   int tok;
1000   if (ptr >= end)
1001     return XML_TOK_NONE;
1002   if (MINBPC(enc) > 1) {
1003     size_t n = end - ptr;
1004     if (n & (MINBPC(enc) - 1)) {
1005       n &= ~(MINBPC(enc) - 1);
1006       if (n == 0)
1007         return XML_TOK_PARTIAL;
1008       end = ptr + n;
1009     }
1010   }
1011   switch (BYTE_TYPE(enc, ptr)) {
1012   case BT_QUOT:
1013     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1014   case BT_APOS:
1015     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1016   case BT_LT:
1017     {
1018       ptr += MINBPC(enc);
1019       REQUIRE_CHAR(enc, ptr, end);
1020       switch (BYTE_TYPE(enc, ptr)) {
1021       case BT_EXCL:
1022         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1023       case BT_QUEST:
1024         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1025       case BT_NMSTRT:
1026       case BT_HEX:
1027       case BT_NONASCII:
1028       case BT_LEAD2:
1029       case BT_LEAD3:
1030       case BT_LEAD4:
1031         *nextTokPtr = ptr - MINBPC(enc);
1032         return XML_TOK_INSTANCE_START;
1033       }
1034       *nextTokPtr = ptr;
1035       return XML_TOK_INVALID;
1036     }
1037   case BT_CR:
1038     if (ptr + MINBPC(enc) == end) {
1039       *nextTokPtr = end;
1040       /* indicate that this might be part of a CR/LF pair */
1041       return -XML_TOK_PROLOG_S;
1042     }
1043     /* fall through */
1044   case BT_S: case BT_LF:
1045     for (;;) {
1046       ptr += MINBPC(enc);
1047       if (! HAS_CHAR(enc, ptr, end))
1048         break;
1049       switch (BYTE_TYPE(enc, ptr)) {
1050       case BT_S: case BT_LF:
1051         break;
1052       case BT_CR:
1053         /* don't split CR/LF pair */
1054         if (ptr + MINBPC(enc) != end)
1055           break;
1056         /* fall through */
1057       default:
1058         *nextTokPtr = ptr;
1059         return XML_TOK_PROLOG_S;
1060       }
1061     }
1062     *nextTokPtr = ptr;
1063     return XML_TOK_PROLOG_S;
1064   case BT_PERCNT:
1065     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1066   case BT_COMMA:
1067     *nextTokPtr = ptr + MINBPC(enc);
1068     return XML_TOK_COMMA;
1069   case BT_LSQB:
1070     *nextTokPtr = ptr + MINBPC(enc);
1071     return XML_TOK_OPEN_BRACKET;
1072   case BT_RSQB:
1073     ptr += MINBPC(enc);
1074     if (! HAS_CHAR(enc, ptr, end))
1075       return -XML_TOK_CLOSE_BRACKET;
1076     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1077       REQUIRE_CHARS(enc, ptr, end, 2);
1078       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1079         *nextTokPtr = ptr + 2*MINBPC(enc);
1080         return XML_TOK_COND_SECT_CLOSE;
1081       }
1082     }
1083     *nextTokPtr = ptr;
1084     return XML_TOK_CLOSE_BRACKET;
1085   case BT_LPAR:
1086     *nextTokPtr = ptr + MINBPC(enc);
1087     return XML_TOK_OPEN_PAREN;
1088   case BT_RPAR:
1089     ptr += MINBPC(enc);
1090     if (! HAS_CHAR(enc, ptr, end))
1091       return -XML_TOK_CLOSE_PAREN;
1092     switch (BYTE_TYPE(enc, ptr)) {
1093     case BT_AST:
1094       *nextTokPtr = ptr + MINBPC(enc);
1095       return XML_TOK_CLOSE_PAREN_ASTERISK;
1096     case BT_QUEST:
1097       *nextTokPtr = ptr + MINBPC(enc);
1098       return XML_TOK_CLOSE_PAREN_QUESTION;
1099     case BT_PLUS:
1100       *nextTokPtr = ptr + MINBPC(enc);
1101       return XML_TOK_CLOSE_PAREN_PLUS;
1102     case BT_CR: case BT_LF: case BT_S:
1103     case BT_GT: case BT_COMMA: case BT_VERBAR:
1104     case BT_RPAR:
1105       *nextTokPtr = ptr;
1106       return XML_TOK_CLOSE_PAREN;
1107     }
1108     *nextTokPtr = ptr;
1109     return XML_TOK_INVALID;
1110   case BT_VERBAR:
1111     *nextTokPtr = ptr + MINBPC(enc);
1112     return XML_TOK_OR;
1113   case BT_GT:
1114     *nextTokPtr = ptr + MINBPC(enc);
1115     return XML_TOK_DECL_CLOSE;
1116   case BT_NUM:
1117     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1118 #define LEAD_CASE(n) \
1119   case BT_LEAD ## n: \
1120     if (end - ptr < n) \
1121       return XML_TOK_PARTIAL_CHAR; \
1122     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1123       ptr += n; \
1124       tok = XML_TOK_NAME; \
1125       break; \
1126     } \
1127     if (IS_NAME_CHAR(enc, ptr, n)) { \
1128       ptr += n; \
1129       tok = XML_TOK_NMTOKEN; \
1130       break; \
1131     } \
1132     *nextTokPtr = ptr; \
1133     return XML_TOK_INVALID;
1134     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1135 #undef LEAD_CASE
1136   case BT_NMSTRT:
1137   case BT_HEX:
1138     tok = XML_TOK_NAME;
1139     ptr += MINBPC(enc);
1140     break;
1141   case BT_DIGIT:
1142   case BT_NAME:
1143   case BT_MINUS:
1144 #ifdef XML_NS
1145   case BT_COLON:
1146 #endif
1147     tok = XML_TOK_NMTOKEN;
1148     ptr += MINBPC(enc);
1149     break;
1150   case BT_NONASCII:
1151     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1152       ptr += MINBPC(enc);
1153       tok = XML_TOK_NAME;
1154       break;
1155     }
1156     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1157       ptr += MINBPC(enc);
1158       tok = XML_TOK_NMTOKEN;
1159       break;
1160     }
1161     /* fall through */
1162   default:
1163     *nextTokPtr = ptr;
1164     return XML_TOK_INVALID;
1165   }
1166   while (HAS_CHAR(enc, ptr, end)) {
1167     switch (BYTE_TYPE(enc, ptr)) {
1168     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1169     case BT_GT: case BT_RPAR: case BT_COMMA:
1170     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1171     case BT_S: case BT_CR: case BT_LF:
1172       *nextTokPtr = ptr;
1173       return tok;
1174 #ifdef XML_NS
1175     case BT_COLON:
1176       ptr += MINBPC(enc);
1177       switch (tok) {
1178       case XML_TOK_NAME:
1179         REQUIRE_CHAR(enc, ptr, end);
1180         tok = XML_TOK_PREFIXED_NAME;
1181         switch (BYTE_TYPE(enc, ptr)) {
1182         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1183         default:
1184           tok = XML_TOK_NMTOKEN;
1185           break;
1186         }
1187         break;
1188       case XML_TOK_PREFIXED_NAME:
1189         tok = XML_TOK_NMTOKEN;
1190         break;
1191       }
1192       break;
1193 #endif
1194     case BT_PLUS:
1195       if (tok == XML_TOK_NMTOKEN)  {
1196         *nextTokPtr = ptr;
1197         return XML_TOK_INVALID;
1198       }
1199       *nextTokPtr = ptr + MINBPC(enc);
1200       return XML_TOK_NAME_PLUS;
1201     case BT_AST:
1202       if (tok == XML_TOK_NMTOKEN)  {
1203         *nextTokPtr = ptr;
1204         return XML_TOK_INVALID;
1205       }
1206       *nextTokPtr = ptr + MINBPC(enc);
1207       return XML_TOK_NAME_ASTERISK;
1208     case BT_QUEST:
1209       if (tok == XML_TOK_NMTOKEN)  {
1210         *nextTokPtr = ptr;
1211         return XML_TOK_INVALID;
1212       }
1213       *nextTokPtr = ptr + MINBPC(enc);
1214       return XML_TOK_NAME_QUESTION;
1215     default:
1216       *nextTokPtr = ptr;
1217       return XML_TOK_INVALID;
1218     }
1219   }
1220   return -tok;
1221 }
1222 
1223 static int PTRCALL
PREFIX(attributeValueTok)1224 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1225                           const char *end, const char **nextTokPtr)
1226 {
1227   const char *start;
1228   if (ptr >= end)
1229     return XML_TOK_NONE;
1230   else if (! HAS_CHAR(enc, ptr, end)) {
1231     /* This line cannot be executed.  The incoming data has already
1232      * been tokenized once, so incomplete characters like this have
1233      * already been eliminated from the input.  Retaining the paranoia
1234      * check is still valuable, however.
1235      */
1236     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1237   }
1238   start = ptr;
1239   while (HAS_CHAR(enc, ptr, end)) {
1240     switch (BYTE_TYPE(enc, ptr)) {
1241 #define LEAD_CASE(n) \
1242     case BT_LEAD ## n: ptr += n; break;
1243     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1244 #undef LEAD_CASE
1245     case BT_AMP:
1246       if (ptr == start)
1247         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1248       *nextTokPtr = ptr;
1249       return XML_TOK_DATA_CHARS;
1250     case BT_LT:
1251       /* this is for inside entity references */
1252       *nextTokPtr = ptr;
1253       return XML_TOK_INVALID;
1254     case BT_LF:
1255       if (ptr == start) {
1256         *nextTokPtr = ptr + MINBPC(enc);
1257         return XML_TOK_DATA_NEWLINE;
1258       }
1259       *nextTokPtr = ptr;
1260       return XML_TOK_DATA_CHARS;
1261     case BT_CR:
1262       if (ptr == start) {
1263         ptr += MINBPC(enc);
1264         if (! HAS_CHAR(enc, ptr, end))
1265           return XML_TOK_TRAILING_CR;
1266         if (BYTE_TYPE(enc, ptr) == BT_LF)
1267           ptr += MINBPC(enc);
1268         *nextTokPtr = ptr;
1269         return XML_TOK_DATA_NEWLINE;
1270       }
1271       *nextTokPtr = ptr;
1272       return XML_TOK_DATA_CHARS;
1273     case BT_S:
1274       if (ptr == start) {
1275         *nextTokPtr = ptr + MINBPC(enc);
1276         return XML_TOK_ATTRIBUTE_VALUE_S;
1277       }
1278       *nextTokPtr = ptr;
1279       return XML_TOK_DATA_CHARS;
1280     default:
1281       ptr += MINBPC(enc);
1282       break;
1283     }
1284   }
1285   *nextTokPtr = ptr;
1286   return XML_TOK_DATA_CHARS;
1287 }
1288 
1289 static int PTRCALL
PREFIX(entityValueTok)1290 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1291                        const char *end, const char **nextTokPtr)
1292 {
1293   const char *start;
1294   if (ptr >= end)
1295     return XML_TOK_NONE;
1296   else if (! HAS_CHAR(enc, ptr, end)) {
1297     /* This line cannot be executed.  The incoming data has already
1298      * been tokenized once, so incomplete characters like this have
1299      * already been eliminated from the input.  Retaining the paranoia
1300      * check is still valuable, however.
1301      */
1302     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1303   }
1304   start = ptr;
1305   while (HAS_CHAR(enc, ptr, end)) {
1306     switch (BYTE_TYPE(enc, ptr)) {
1307 #define LEAD_CASE(n) \
1308     case BT_LEAD ## n: ptr += n; break;
1309     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1310 #undef LEAD_CASE
1311     case BT_AMP:
1312       if (ptr == start)
1313         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1314       *nextTokPtr = ptr;
1315       return XML_TOK_DATA_CHARS;
1316     case BT_PERCNT:
1317       if (ptr == start) {
1318         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1319                                        end, nextTokPtr);
1320         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1321       }
1322       *nextTokPtr = ptr;
1323       return XML_TOK_DATA_CHARS;
1324     case BT_LF:
1325       if (ptr == start) {
1326         *nextTokPtr = ptr + MINBPC(enc);
1327         return XML_TOK_DATA_NEWLINE;
1328       }
1329       *nextTokPtr = ptr;
1330       return XML_TOK_DATA_CHARS;
1331     case BT_CR:
1332       if (ptr == start) {
1333         ptr += MINBPC(enc);
1334         if (! HAS_CHAR(enc, ptr, end))
1335           return XML_TOK_TRAILING_CR;
1336         if (BYTE_TYPE(enc, ptr) == BT_LF)
1337           ptr += MINBPC(enc);
1338         *nextTokPtr = ptr;
1339         return XML_TOK_DATA_NEWLINE;
1340       }
1341       *nextTokPtr = ptr;
1342       return XML_TOK_DATA_CHARS;
1343     default:
1344       ptr += MINBPC(enc);
1345       break;
1346     }
1347   }
1348   *nextTokPtr = ptr;
1349   return XML_TOK_DATA_CHARS;
1350 }
1351 
1352 #ifdef XML_DTD
1353 
1354 static int PTRCALL
PREFIX(ignoreSectionTok)1355 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1356                          const char *end, const char **nextTokPtr)
1357 {
1358   int level = 0;
1359   if (MINBPC(enc) > 1) {
1360     size_t n = end - ptr;
1361     if (n & (MINBPC(enc) - 1)) {
1362       n &= ~(MINBPC(enc) - 1);
1363       end = ptr + n;
1364     }
1365   }
1366   while (HAS_CHAR(enc, ptr, end)) {
1367     switch (BYTE_TYPE(enc, ptr)) {
1368     INVALID_CASES(ptr, nextTokPtr)
1369     case BT_LT:
1370       ptr += MINBPC(enc);
1371       REQUIRE_CHAR(enc, ptr, end);
1372       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1373         ptr += MINBPC(enc);
1374         REQUIRE_CHAR(enc, ptr, end);
1375         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1376           ++level;
1377           ptr += MINBPC(enc);
1378         }
1379       }
1380       break;
1381     case BT_RSQB:
1382       ptr += MINBPC(enc);
1383       REQUIRE_CHAR(enc, ptr, end);
1384       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1385         ptr += MINBPC(enc);
1386         REQUIRE_CHAR(enc, ptr, end);
1387         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1388           ptr += MINBPC(enc);
1389           if (level == 0) {
1390             *nextTokPtr = ptr;
1391             return XML_TOK_IGNORE_SECT;
1392           }
1393           --level;
1394         }
1395       }
1396       break;
1397     default:
1398       ptr += MINBPC(enc);
1399       break;
1400     }
1401   }
1402   return XML_TOK_PARTIAL;
1403 }
1404 
1405 #endif /* XML_DTD */
1406 
1407 static int PTRCALL
PREFIX(isPublicId)1408 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1409                    const char **badPtr)
1410 {
1411   ptr += MINBPC(enc);
1412   end -= MINBPC(enc);
1413   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1414     switch (BYTE_TYPE(enc, ptr)) {
1415     case BT_DIGIT:
1416     case BT_HEX:
1417     case BT_MINUS:
1418     case BT_APOS:
1419     case BT_LPAR:
1420     case BT_RPAR:
1421     case BT_PLUS:
1422     case BT_COMMA:
1423     case BT_SOL:
1424     case BT_EQUALS:
1425     case BT_QUEST:
1426     case BT_CR:
1427     case BT_LF:
1428     case BT_SEMI:
1429     case BT_EXCL:
1430     case BT_AST:
1431     case BT_PERCNT:
1432     case BT_NUM:
1433 #ifdef XML_NS
1434     case BT_COLON:
1435 #endif
1436       break;
1437     case BT_S:
1438       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1439         *badPtr = ptr;
1440         return 0;
1441       }
1442       break;
1443     case BT_NAME:
1444     case BT_NMSTRT:
1445       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1446         break;
1447       /* fall through */
1448     default:
1449       switch (BYTE_TO_ASCII(enc, ptr)) {
1450       case 0x24: /* $ */
1451       case 0x40: /* @ */
1452         break;
1453       default:
1454         *badPtr = ptr;
1455         return 0;
1456       }
1457       break;
1458     }
1459   }
1460   return 1;
1461 }
1462 
1463 /* This must only be called for a well-formed start-tag or empty
1464    element tag.  Returns the number of attributes.  Pointers to the
1465    first attsMax attributes are stored in atts.
1466 */
1467 
1468 static int PTRCALL
PREFIX(getAtts)1469 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1470                 int attsMax, ATTRIBUTE *atts)
1471 {
1472   enum { other, inName, inValue } state = inName;
1473   int nAtts = 0;
1474   int open = 0; /* defined when state == inValue;
1475                    initialization just to shut up compilers */
1476 
1477   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1478     switch (BYTE_TYPE(enc, ptr)) {
1479 #define START_NAME \
1480       if (state == other) { \
1481         if (nAtts < attsMax) { \
1482           atts[nAtts].name = ptr; \
1483           atts[nAtts].normalized = 1; \
1484         } \
1485         state = inName; \
1486       }
1487 #define LEAD_CASE(n) \
1488     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1489     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1490 #undef LEAD_CASE
1491     case BT_NONASCII:
1492     case BT_NMSTRT:
1493     case BT_HEX:
1494       START_NAME
1495       break;
1496 #undef START_NAME
1497     case BT_QUOT:
1498       if (state != inValue) {
1499         if (nAtts < attsMax)
1500           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1501         state = inValue;
1502         open = BT_QUOT;
1503       }
1504       else if (open == BT_QUOT) {
1505         state = other;
1506         if (nAtts < attsMax)
1507           atts[nAtts].valueEnd = ptr;
1508         nAtts++;
1509       }
1510       break;
1511     case BT_APOS:
1512       if (state != inValue) {
1513         if (nAtts < attsMax)
1514           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1515         state = inValue;
1516         open = BT_APOS;
1517       }
1518       else if (open == BT_APOS) {
1519         state = other;
1520         if (nAtts < attsMax)
1521           atts[nAtts].valueEnd = ptr;
1522         nAtts++;
1523       }
1524       break;
1525     case BT_AMP:
1526       if (nAtts < attsMax)
1527         atts[nAtts].normalized = 0;
1528       break;
1529     case BT_S:
1530       if (state == inName)
1531         state = other;
1532       else if (state == inValue
1533                && nAtts < attsMax
1534                && atts[nAtts].normalized
1535                && (ptr == atts[nAtts].valuePtr
1536                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1537                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1538                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1539         atts[nAtts].normalized = 0;
1540       break;
1541     case BT_CR: case BT_LF:
1542       /* This case ensures that the first attribute name is counted
1543          Apart from that we could just change state on the quote. */
1544       if (state == inName)
1545         state = other;
1546       else if (state == inValue && nAtts < attsMax)
1547         atts[nAtts].normalized = 0;
1548       break;
1549     case BT_GT:
1550     case BT_SOL:
1551       if (state != inValue)
1552         return nAtts;
1553       break;
1554     default:
1555       break;
1556     }
1557   }
1558   /* not reached */
1559 }
1560 
1561 static int PTRFASTCALL
PREFIX(charRefNumber)1562 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1563 {
1564   int result = 0;
1565   /* skip &# */
1566   ptr += 2*MINBPC(enc);
1567   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1568     for (ptr += MINBPC(enc);
1569          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1570          ptr += MINBPC(enc)) {
1571       int c = BYTE_TO_ASCII(enc, ptr);
1572       switch (c) {
1573       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1574       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1575         result <<= 4;
1576         result |= (c - ASCII_0);
1577         break;
1578       case ASCII_A: case ASCII_B: case ASCII_C:
1579       case ASCII_D: case ASCII_E: case ASCII_F:
1580         result <<= 4;
1581         result += 10 + (c - ASCII_A);
1582         break;
1583       case ASCII_a: case ASCII_b: case ASCII_c:
1584       case ASCII_d: case ASCII_e: case ASCII_f:
1585         result <<= 4;
1586         result += 10 + (c - ASCII_a);
1587         break;
1588       }
1589       if (result >= 0x110000)
1590         return -1;
1591     }
1592   }
1593   else {
1594     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1595       int c = BYTE_TO_ASCII(enc, ptr);
1596       result *= 10;
1597       result += (c - ASCII_0);
1598       if (result >= 0x110000)
1599         return -1;
1600     }
1601   }
1602   return checkCharRefNumber(result);
1603 }
1604 
1605 static int PTRCALL
PREFIX(predefinedEntityName)1606 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1607                              const char *end)
1608 {
1609   switch ((end - ptr)/MINBPC(enc)) {
1610   case 2:
1611     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1612       switch (BYTE_TO_ASCII(enc, ptr)) {
1613       case ASCII_l:
1614         return ASCII_LT;
1615       case ASCII_g:
1616         return ASCII_GT;
1617       }
1618     }
1619     break;
1620   case 3:
1621     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1622       ptr += MINBPC(enc);
1623       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1624         ptr += MINBPC(enc);
1625         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1626           return ASCII_AMP;
1627       }
1628     }
1629     break;
1630   case 4:
1631     switch (BYTE_TO_ASCII(enc, ptr)) {
1632     case ASCII_q:
1633       ptr += MINBPC(enc);
1634       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1635         ptr += MINBPC(enc);
1636         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1637           ptr += MINBPC(enc);
1638           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1639             return ASCII_QUOT;
1640         }
1641       }
1642       break;
1643     case ASCII_a:
1644       ptr += MINBPC(enc);
1645       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1646         ptr += MINBPC(enc);
1647         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1648           ptr += MINBPC(enc);
1649           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1650             return ASCII_APOS;
1651         }
1652       }
1653       break;
1654     }
1655   }
1656   return 0;
1657 }
1658 
1659 static int PTRCALL
PREFIX(nameMatchesAscii)1660 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1661                          const char *end1, const char *ptr2)
1662 {
1663   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1664     if (end1 - ptr1 < MINBPC(enc)) {
1665       /* This line cannot be executed.  The incoming data has already
1666        * been tokenized once, so incomplete characters like this have
1667        * already been eliminated from the input.  Retaining the
1668        * paranoia check is still valuable, however.
1669        */
1670       return 0; /* LCOV_EXCL_LINE */
1671     }
1672     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1673       return 0;
1674   }
1675   return ptr1 == end1;
1676 }
1677 
1678 static int PTRFASTCALL
PREFIX(nameLength)1679 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1680 {
1681   const char *start = ptr;
1682   for (;;) {
1683     switch (BYTE_TYPE(enc, ptr)) {
1684 #define LEAD_CASE(n) \
1685     case BT_LEAD ## n: ptr += n; break;
1686     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1687 #undef LEAD_CASE
1688     case BT_NONASCII:
1689     case BT_NMSTRT:
1690 #ifdef XML_NS
1691     case BT_COLON:
1692 #endif
1693     case BT_HEX:
1694     case BT_DIGIT:
1695     case BT_NAME:
1696     case BT_MINUS:
1697       ptr += MINBPC(enc);
1698       break;
1699     default:
1700       return (int)(ptr - start);
1701     }
1702   }
1703 }
1704 
1705 static const char * PTRFASTCALL
PREFIX(skipS)1706 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1707 {
1708   for (;;) {
1709     switch (BYTE_TYPE(enc, ptr)) {
1710     case BT_LF:
1711     case BT_CR:
1712     case BT_S:
1713       ptr += MINBPC(enc);
1714       break;
1715     default:
1716       return ptr;
1717     }
1718   }
1719 }
1720 
1721 static void PTRCALL
PREFIX(updatePosition)1722 PREFIX(updatePosition)(const ENCODING *enc,
1723                        const char *ptr,
1724                        const char *end,
1725                        POSITION *pos)
1726 {
1727   while (HAS_CHAR(enc, ptr, end)) {
1728     switch (BYTE_TYPE(enc, ptr)) {
1729 #define LEAD_CASE(n) \
1730     case BT_LEAD ## n: \
1731       ptr += n; \
1732       break;
1733     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1734 #undef LEAD_CASE
1735     case BT_LF:
1736       pos->columnNumber = (XML_Size)-1;
1737       pos->lineNumber++;
1738       ptr += MINBPC(enc);
1739       break;
1740     case BT_CR:
1741       pos->lineNumber++;
1742       ptr += MINBPC(enc);
1743       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1744         ptr += MINBPC(enc);
1745       pos->columnNumber = (XML_Size)-1;
1746       break;
1747     default:
1748       ptr += MINBPC(enc);
1749       break;
1750     }
1751     pos->columnNumber++;
1752   }
1753 }
1754 
1755 #undef DO_LEAD_CASE
1756 #undef MULTIBYTE_CASES
1757 #undef INVALID_CASES
1758 #undef CHECK_NAME_CASE
1759 #undef CHECK_NAME_CASES
1760 #undef CHECK_NMSTRT_CASE
1761 #undef CHECK_NMSTRT_CASES
1762 
1763 #endif /* XML_TOK_IMPL_C */
1764