1 /* This file is included! 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000-2017 Expat development team 11 Licensed under the MIT license: 12 13 Permission is hereby granted, free of charge, to any person obtaining 14 a copy of this software and associated documentation files (the 15 "Software"), to deal in the Software without restriction, including 16 without limitation the rights to use, copy, modify, merge, publish, 17 distribute, sublicense, and/or sell copies of the Software, and to permit 18 persons to whom the Software is furnished to do so, subject to the 19 following conditions: 20 21 The above copyright notice and this permission notice shall be included 22 in all copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 30 USE OR OTHER DEALINGS IN THE SOFTWARE. 31 */ 32 33 #ifdef XML_TOK_IMPL_C 34 35 #ifndef IS_INVALID_CHAR 36 #define IS_INVALID_CHAR(enc, ptr, n) (0) 37 #endif 38 39 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 40 case BT_LEAD ## n: \ 41 if (end - ptr < n) \ 42 return XML_TOK_PARTIAL_CHAR; \ 43 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 44 *(nextTokPtr) = (ptr); \ 45 return XML_TOK_INVALID; \ 46 } \ 47 ptr += n; \ 48 break; 49 50 #define INVALID_CASES(ptr, nextTokPtr) \ 51 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 52 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 53 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 54 case BT_NONXML: \ 55 case BT_MALFORM: \ 56 case BT_TRAIL: \ 57 *(nextTokPtr) = (ptr); \ 58 return XML_TOK_INVALID; 59 60 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 61 case BT_LEAD ## n: \ 62 if (end - ptr < n) \ 63 return XML_TOK_PARTIAL_CHAR; \ 64 if (!IS_NAME_CHAR(enc, ptr, n)) { \ 65 *nextTokPtr = ptr; \ 66 return XML_TOK_INVALID; \ 67 } \ 68 ptr += n; \ 69 break; 70 71 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 72 case BT_NONASCII: \ 73 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 74 *nextTokPtr = ptr; \ 75 return XML_TOK_INVALID; \ 76 } \ 77 case BT_NMSTRT: \ 78 case BT_HEX: \ 79 case BT_DIGIT: \ 80 case BT_NAME: \ 81 case BT_MINUS: \ 82 ptr += MINBPC(enc); \ 83 break; \ 84 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 85 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 86 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 87 88 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 89 case BT_LEAD ## n: \ 90 if (end - ptr < n) \ 91 return XML_TOK_PARTIAL_CHAR; \ 92 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 93 *nextTokPtr = ptr; \ 94 return XML_TOK_INVALID; \ 95 } \ 96 ptr += n; \ 97 break; 98 99 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 100 case BT_NONASCII: \ 101 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 102 *nextTokPtr = ptr; \ 103 return XML_TOK_INVALID; \ 104 } \ 105 case BT_NMSTRT: \ 106 case BT_HEX: \ 107 ptr += MINBPC(enc); \ 108 break; \ 109 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 110 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 111 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 112 113 #ifndef PREFIX 114 #define PREFIX(ident) ident 115 #endif 116 117 118 #define HAS_CHARS(enc, ptr, end, count) \ 119 (end - ptr >= count * MINBPC(enc)) 120 121 #define HAS_CHAR(enc, ptr, end) \ 122 HAS_CHARS(enc, ptr, end, 1) 123 124 #define REQUIRE_CHARS(enc, ptr, end, count) \ 125 { \ 126 if (! HAS_CHARS(enc, ptr, end, count)) { \ 127 return XML_TOK_PARTIAL; \ 128 } \ 129 } 130 131 #define REQUIRE_CHAR(enc, ptr, end) \ 132 REQUIRE_CHARS(enc, ptr, end, 1) 133 134 135 /* ptr points to character following "<!-" */ 136 137 static int PTRCALL 138 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, 139 const char *end, const char **nextTokPtr) 140 { 141 if (HAS_CHAR(enc, ptr, end)) { 142 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 143 *nextTokPtr = ptr; 144 return XML_TOK_INVALID; 145 } 146 ptr += MINBPC(enc); 147 while (HAS_CHAR(enc, ptr, end)) { 148 switch (BYTE_TYPE(enc, ptr)) { 149 INVALID_CASES(ptr, nextTokPtr) 150 case BT_MINUS: 151 ptr += MINBPC(enc); 152 REQUIRE_CHAR(enc, ptr, end); 153 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 154 ptr += MINBPC(enc); 155 REQUIRE_CHAR(enc, ptr, end); 156 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 157 *nextTokPtr = ptr; 158 return XML_TOK_INVALID; 159 } 160 *nextTokPtr = ptr + MINBPC(enc); 161 return XML_TOK_COMMENT; 162 } 163 break; 164 default: 165 ptr += MINBPC(enc); 166 break; 167 } 168 } 169 } 170 return XML_TOK_PARTIAL; 171 } 172 173 /* ptr points to character following "<!" */ 174 175 static int PTRCALL 176 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, 177 const char *end, const char **nextTokPtr) 178 { 179 REQUIRE_CHAR(enc, ptr, end); 180 switch (BYTE_TYPE(enc, ptr)) { 181 case BT_MINUS: 182 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 183 case BT_LSQB: 184 *nextTokPtr = ptr + MINBPC(enc); 185 return XML_TOK_COND_SECT_OPEN; 186 case BT_NMSTRT: 187 case BT_HEX: 188 ptr += MINBPC(enc); 189 break; 190 default: 191 *nextTokPtr = ptr; 192 return XML_TOK_INVALID; 193 } 194 while (HAS_CHAR(enc, ptr, end)) { 195 switch (BYTE_TYPE(enc, ptr)) { 196 case BT_PERCNT: 197 REQUIRE_CHARS(enc, ptr, end, 2); 198 /* don't allow <!ENTITY% foo "whatever"> */ 199 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 200 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 201 *nextTokPtr = ptr; 202 return XML_TOK_INVALID; 203 } 204 /* fall through */ 205 case BT_S: case BT_CR: case BT_LF: 206 *nextTokPtr = ptr; 207 return XML_TOK_DECL_OPEN; 208 case BT_NMSTRT: 209 case BT_HEX: 210 ptr += MINBPC(enc); 211 break; 212 default: 213 *nextTokPtr = ptr; 214 return XML_TOK_INVALID; 215 } 216 } 217 return XML_TOK_PARTIAL; 218 } 219 220 static int PTRCALL 221 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr, 222 const char *end, int *tokPtr) 223 { 224 int upper = 0; 225 *tokPtr = XML_TOK_PI; 226 if (end - ptr != MINBPC(enc)*3) 227 return 1; 228 switch (BYTE_TO_ASCII(enc, ptr)) { 229 case ASCII_x: 230 break; 231 case ASCII_X: 232 upper = 1; 233 break; 234 default: 235 return 1; 236 } 237 ptr += MINBPC(enc); 238 switch (BYTE_TO_ASCII(enc, ptr)) { 239 case ASCII_m: 240 break; 241 case ASCII_M: 242 upper = 1; 243 break; 244 default: 245 return 1; 246 } 247 ptr += MINBPC(enc); 248 switch (BYTE_TO_ASCII(enc, ptr)) { 249 case ASCII_l: 250 break; 251 case ASCII_L: 252 upper = 1; 253 break; 254 default: 255 return 1; 256 } 257 if (upper) 258 return 0; 259 *tokPtr = XML_TOK_XML_DECL; 260 return 1; 261 } 262 263 /* ptr points to character following "<?" */ 264 265 static int PTRCALL 266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, 267 const char *end, const char **nextTokPtr) 268 { 269 int tok; 270 const char *target = ptr; 271 REQUIRE_CHAR(enc, ptr, end); 272 switch (BYTE_TYPE(enc, ptr)) { 273 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 274 default: 275 *nextTokPtr = ptr; 276 return XML_TOK_INVALID; 277 } 278 while (HAS_CHAR(enc, ptr, end)) { 279 switch (BYTE_TYPE(enc, ptr)) { 280 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 281 case BT_S: case BT_CR: case BT_LF: 282 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 283 *nextTokPtr = ptr; 284 return XML_TOK_INVALID; 285 } 286 ptr += MINBPC(enc); 287 while (HAS_CHAR(enc, ptr, end)) { 288 switch (BYTE_TYPE(enc, ptr)) { 289 INVALID_CASES(ptr, nextTokPtr) 290 case BT_QUEST: 291 ptr += MINBPC(enc); 292 REQUIRE_CHAR(enc, ptr, end); 293 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 294 *nextTokPtr = ptr + MINBPC(enc); 295 return tok; 296 } 297 break; 298 default: 299 ptr += MINBPC(enc); 300 break; 301 } 302 } 303 return XML_TOK_PARTIAL; 304 case BT_QUEST: 305 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 306 *nextTokPtr = ptr; 307 return XML_TOK_INVALID; 308 } 309 ptr += MINBPC(enc); 310 REQUIRE_CHAR(enc, ptr, end); 311 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 312 *nextTokPtr = ptr + MINBPC(enc); 313 return tok; 314 } 315 /* fall through */ 316 default: 317 *nextTokPtr = ptr; 318 return XML_TOK_INVALID; 319 } 320 } 321 return XML_TOK_PARTIAL; 322 } 323 324 static int PTRCALL 325 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr, 326 const char *end, const char **nextTokPtr) 327 { 328 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, 329 ASCII_T, ASCII_A, ASCII_LSQB }; 330 int i; 331 /* CDATA[ */ 332 REQUIRE_CHARS(enc, ptr, end, 6); 333 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { 334 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { 335 *nextTokPtr = ptr; 336 return XML_TOK_INVALID; 337 } 338 } 339 *nextTokPtr = ptr; 340 return XML_TOK_CDATA_SECT_OPEN; 341 } 342 343 static int PTRCALL 344 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, 345 const char *end, const char **nextTokPtr) 346 { 347 if (ptr >= end) 348 return XML_TOK_NONE; 349 if (MINBPC(enc) > 1) { 350 size_t n = end - ptr; 351 if (n & (MINBPC(enc) - 1)) { 352 n &= ~(MINBPC(enc) - 1); 353 if (n == 0) 354 return XML_TOK_PARTIAL; 355 end = ptr + n; 356 } 357 } 358 switch (BYTE_TYPE(enc, ptr)) { 359 case BT_RSQB: 360 ptr += MINBPC(enc); 361 REQUIRE_CHAR(enc, ptr, end); 362 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 363 break; 364 ptr += MINBPC(enc); 365 REQUIRE_CHAR(enc, ptr, end); 366 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 367 ptr -= MINBPC(enc); 368 break; 369 } 370 *nextTokPtr = ptr + MINBPC(enc); 371 return XML_TOK_CDATA_SECT_CLOSE; 372 case BT_CR: 373 ptr += MINBPC(enc); 374 REQUIRE_CHAR(enc, ptr, end); 375 if (BYTE_TYPE(enc, ptr) == BT_LF) 376 ptr += MINBPC(enc); 377 *nextTokPtr = ptr; 378 return XML_TOK_DATA_NEWLINE; 379 case BT_LF: 380 *nextTokPtr = ptr + MINBPC(enc); 381 return XML_TOK_DATA_NEWLINE; 382 INVALID_CASES(ptr, nextTokPtr) 383 default: 384 ptr += MINBPC(enc); 385 break; 386 } 387 while (HAS_CHAR(enc, ptr, end)) { 388 switch (BYTE_TYPE(enc, ptr)) { 389 #define LEAD_CASE(n) \ 390 case BT_LEAD ## n: \ 391 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 392 *nextTokPtr = ptr; \ 393 return XML_TOK_DATA_CHARS; \ 394 } \ 395 ptr += n; \ 396 break; 397 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 398 #undef LEAD_CASE 399 case BT_NONXML: 400 case BT_MALFORM: 401 case BT_TRAIL: 402 case BT_CR: 403 case BT_LF: 404 case BT_RSQB: 405 *nextTokPtr = ptr; 406 return XML_TOK_DATA_CHARS; 407 default: 408 ptr += MINBPC(enc); 409 break; 410 } 411 } 412 *nextTokPtr = ptr; 413 return XML_TOK_DATA_CHARS; 414 } 415 416 /* ptr points to character following "</" */ 417 418 static int PTRCALL 419 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, 420 const char *end, const char **nextTokPtr) 421 { 422 REQUIRE_CHAR(enc, ptr, end); 423 switch (BYTE_TYPE(enc, ptr)) { 424 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 425 default: 426 *nextTokPtr = ptr; 427 return XML_TOK_INVALID; 428 } 429 while (HAS_CHAR(enc, ptr, end)) { 430 switch (BYTE_TYPE(enc, ptr)) { 431 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 432 case BT_S: case BT_CR: case BT_LF: 433 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 434 switch (BYTE_TYPE(enc, ptr)) { 435 case BT_S: case BT_CR: case BT_LF: 436 break; 437 case BT_GT: 438 *nextTokPtr = ptr + MINBPC(enc); 439 return XML_TOK_END_TAG; 440 default: 441 *nextTokPtr = ptr; 442 return XML_TOK_INVALID; 443 } 444 } 445 return XML_TOK_PARTIAL; 446 #ifdef XML_NS 447 case BT_COLON: 448 /* no need to check qname syntax here, 449 since end-tag must match exactly */ 450 ptr += MINBPC(enc); 451 break; 452 #endif 453 case BT_GT: 454 *nextTokPtr = ptr + MINBPC(enc); 455 return XML_TOK_END_TAG; 456 default: 457 *nextTokPtr = ptr; 458 return XML_TOK_INVALID; 459 } 460 } 461 return XML_TOK_PARTIAL; 462 } 463 464 /* ptr points to character following "&#X" */ 465 466 static int PTRCALL 467 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, 468 const char *end, const char **nextTokPtr) 469 { 470 if (HAS_CHAR(enc, ptr, end)) { 471 switch (BYTE_TYPE(enc, ptr)) { 472 case BT_DIGIT: 473 case BT_HEX: 474 break; 475 default: 476 *nextTokPtr = ptr; 477 return XML_TOK_INVALID; 478 } 479 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 480 switch (BYTE_TYPE(enc, ptr)) { 481 case BT_DIGIT: 482 case BT_HEX: 483 break; 484 case BT_SEMI: 485 *nextTokPtr = ptr + MINBPC(enc); 486 return XML_TOK_CHAR_REF; 487 default: 488 *nextTokPtr = ptr; 489 return XML_TOK_INVALID; 490 } 491 } 492 } 493 return XML_TOK_PARTIAL; 494 } 495 496 /* ptr points to character following "&#" */ 497 498 static int PTRCALL 499 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, 500 const char *end, const char **nextTokPtr) 501 { 502 if (HAS_CHAR(enc, ptr, end)) { 503 if (CHAR_MATCHES(enc, ptr, ASCII_x)) 504 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 505 switch (BYTE_TYPE(enc, ptr)) { 506 case BT_DIGIT: 507 break; 508 default: 509 *nextTokPtr = ptr; 510 return XML_TOK_INVALID; 511 } 512 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 513 switch (BYTE_TYPE(enc, ptr)) { 514 case BT_DIGIT: 515 break; 516 case BT_SEMI: 517 *nextTokPtr = ptr + MINBPC(enc); 518 return XML_TOK_CHAR_REF; 519 default: 520 *nextTokPtr = ptr; 521 return XML_TOK_INVALID; 522 } 523 } 524 } 525 return XML_TOK_PARTIAL; 526 } 527 528 /* ptr points to character following "&" */ 529 530 static int PTRCALL 531 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 532 const char **nextTokPtr) 533 { 534 REQUIRE_CHAR(enc, ptr, end); 535 switch (BYTE_TYPE(enc, ptr)) { 536 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 537 case BT_NUM: 538 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 539 default: 540 *nextTokPtr = ptr; 541 return XML_TOK_INVALID; 542 } 543 while (HAS_CHAR(enc, ptr, end)) { 544 switch (BYTE_TYPE(enc, ptr)) { 545 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 546 case BT_SEMI: 547 *nextTokPtr = ptr + MINBPC(enc); 548 return XML_TOK_ENTITY_REF; 549 default: 550 *nextTokPtr = ptr; 551 return XML_TOK_INVALID; 552 } 553 } 554 return XML_TOK_PARTIAL; 555 } 556 557 /* ptr points to character following first character of attribute name */ 558 559 static int PTRCALL 560 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 561 const char **nextTokPtr) 562 { 563 #ifdef XML_NS 564 int hadColon = 0; 565 #endif 566 while (HAS_CHAR(enc, ptr, end)) { 567 switch (BYTE_TYPE(enc, ptr)) { 568 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 569 #ifdef XML_NS 570 case BT_COLON: 571 if (hadColon) { 572 *nextTokPtr = ptr; 573 return XML_TOK_INVALID; 574 } 575 hadColon = 1; 576 ptr += MINBPC(enc); 577 REQUIRE_CHAR(enc, ptr, end); 578 switch (BYTE_TYPE(enc, ptr)) { 579 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 580 default: 581 *nextTokPtr = ptr; 582 return XML_TOK_INVALID; 583 } 584 break; 585 #endif 586 case BT_S: case BT_CR: case BT_LF: 587 for (;;) { 588 int t; 589 590 ptr += MINBPC(enc); 591 REQUIRE_CHAR(enc, ptr, end); 592 t = BYTE_TYPE(enc, ptr); 593 if (t == BT_EQUALS) 594 break; 595 switch (t) { 596 case BT_S: 597 case BT_LF: 598 case BT_CR: 599 break; 600 default: 601 *nextTokPtr = ptr; 602 return XML_TOK_INVALID; 603 } 604 } 605 /* fall through */ 606 case BT_EQUALS: 607 { 608 int open; 609 #ifdef XML_NS 610 hadColon = 0; 611 #endif 612 for (;;) { 613 ptr += MINBPC(enc); 614 REQUIRE_CHAR(enc, ptr, end); 615 open = BYTE_TYPE(enc, ptr); 616 if (open == BT_QUOT || open == BT_APOS) 617 break; 618 switch (open) { 619 case BT_S: 620 case BT_LF: 621 case BT_CR: 622 break; 623 default: 624 *nextTokPtr = ptr; 625 return XML_TOK_INVALID; 626 } 627 } 628 ptr += MINBPC(enc); 629 /* in attribute value */ 630 for (;;) { 631 int t; 632 REQUIRE_CHAR(enc, ptr, end); 633 t = BYTE_TYPE(enc, ptr); 634 if (t == open) 635 break; 636 switch (t) { 637 INVALID_CASES(ptr, nextTokPtr) 638 case BT_AMP: 639 { 640 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); 641 if (tok <= 0) { 642 if (tok == XML_TOK_INVALID) 643 *nextTokPtr = ptr; 644 return tok; 645 } 646 break; 647 } 648 case BT_LT: 649 *nextTokPtr = ptr; 650 return XML_TOK_INVALID; 651 default: 652 ptr += MINBPC(enc); 653 break; 654 } 655 } 656 ptr += MINBPC(enc); 657 REQUIRE_CHAR(enc, ptr, end); 658 switch (BYTE_TYPE(enc, ptr)) { 659 case BT_S: 660 case BT_CR: 661 case BT_LF: 662 break; 663 case BT_SOL: 664 goto sol; 665 case BT_GT: 666 goto gt; 667 default: 668 *nextTokPtr = ptr; 669 return XML_TOK_INVALID; 670 } 671 /* ptr points to closing quote */ 672 for (;;) { 673 ptr += MINBPC(enc); 674 REQUIRE_CHAR(enc, ptr, end); 675 switch (BYTE_TYPE(enc, ptr)) { 676 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 677 case BT_S: case BT_CR: case BT_LF: 678 continue; 679 case BT_GT: 680 gt: 681 *nextTokPtr = ptr + MINBPC(enc); 682 return XML_TOK_START_TAG_WITH_ATTS; 683 case BT_SOL: 684 sol: 685 ptr += MINBPC(enc); 686 REQUIRE_CHAR(enc, ptr, end); 687 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 688 *nextTokPtr = ptr; 689 return XML_TOK_INVALID; 690 } 691 *nextTokPtr = ptr + MINBPC(enc); 692 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 693 default: 694 *nextTokPtr = ptr; 695 return XML_TOK_INVALID; 696 } 697 break; 698 } 699 break; 700 } 701 default: 702 *nextTokPtr = ptr; 703 return XML_TOK_INVALID; 704 } 705 } 706 return XML_TOK_PARTIAL; 707 } 708 709 /* ptr points to character following "<" */ 710 711 static int PTRCALL 712 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 713 const char **nextTokPtr) 714 { 715 #ifdef XML_NS 716 int hadColon; 717 #endif 718 REQUIRE_CHAR(enc, ptr, end); 719 switch (BYTE_TYPE(enc, ptr)) { 720 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 721 case BT_EXCL: 722 ptr += MINBPC(enc); 723 REQUIRE_CHAR(enc, ptr, end); 724 switch (BYTE_TYPE(enc, ptr)) { 725 case BT_MINUS: 726 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 727 case BT_LSQB: 728 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), 729 end, nextTokPtr); 730 } 731 *nextTokPtr = ptr; 732 return XML_TOK_INVALID; 733 case BT_QUEST: 734 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 735 case BT_SOL: 736 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); 737 default: 738 *nextTokPtr = ptr; 739 return XML_TOK_INVALID; 740 } 741 #ifdef XML_NS 742 hadColon = 0; 743 #endif 744 /* we have a start-tag */ 745 while (HAS_CHAR(enc, ptr, end)) { 746 switch (BYTE_TYPE(enc, ptr)) { 747 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 748 #ifdef XML_NS 749 case BT_COLON: 750 if (hadColon) { 751 *nextTokPtr = ptr; 752 return XML_TOK_INVALID; 753 } 754 hadColon = 1; 755 ptr += MINBPC(enc); 756 REQUIRE_CHAR(enc, ptr, end); 757 switch (BYTE_TYPE(enc, ptr)) { 758 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 759 default: 760 *nextTokPtr = ptr; 761 return XML_TOK_INVALID; 762 } 763 break; 764 #endif 765 case BT_S: case BT_CR: case BT_LF: 766 { 767 ptr += MINBPC(enc); 768 while (HAS_CHAR(enc, ptr, end)) { 769 switch (BYTE_TYPE(enc, ptr)) { 770 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 771 case BT_GT: 772 goto gt; 773 case BT_SOL: 774 goto sol; 775 case BT_S: case BT_CR: case BT_LF: 776 ptr += MINBPC(enc); 777 continue; 778 default: 779 *nextTokPtr = ptr; 780 return XML_TOK_INVALID; 781 } 782 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 783 } 784 return XML_TOK_PARTIAL; 785 } 786 case BT_GT: 787 gt: 788 *nextTokPtr = ptr + MINBPC(enc); 789 return XML_TOK_START_TAG_NO_ATTS; 790 case BT_SOL: 791 sol: 792 ptr += MINBPC(enc); 793 REQUIRE_CHAR(enc, ptr, end); 794 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 795 *nextTokPtr = ptr; 796 return XML_TOK_INVALID; 797 } 798 *nextTokPtr = ptr + MINBPC(enc); 799 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 800 default: 801 *nextTokPtr = ptr; 802 return XML_TOK_INVALID; 803 } 804 } 805 return XML_TOK_PARTIAL; 806 } 807 808 static int PTRCALL 809 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 810 const char **nextTokPtr) 811 { 812 if (ptr >= end) 813 return XML_TOK_NONE; 814 if (MINBPC(enc) > 1) { 815 size_t n = end - ptr; 816 if (n & (MINBPC(enc) - 1)) { 817 n &= ~(MINBPC(enc) - 1); 818 if (n == 0) 819 return XML_TOK_PARTIAL; 820 end = ptr + n; 821 } 822 } 823 switch (BYTE_TYPE(enc, ptr)) { 824 case BT_LT: 825 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); 826 case BT_AMP: 827 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 828 case BT_CR: 829 ptr += MINBPC(enc); 830 if (! HAS_CHAR(enc, ptr, end)) 831 return XML_TOK_TRAILING_CR; 832 if (BYTE_TYPE(enc, ptr) == BT_LF) 833 ptr += MINBPC(enc); 834 *nextTokPtr = ptr; 835 return XML_TOK_DATA_NEWLINE; 836 case BT_LF: 837 *nextTokPtr = ptr + MINBPC(enc); 838 return XML_TOK_DATA_NEWLINE; 839 case BT_RSQB: 840 ptr += MINBPC(enc); 841 if (! HAS_CHAR(enc, ptr, end)) 842 return XML_TOK_TRAILING_RSQB; 843 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 844 break; 845 ptr += MINBPC(enc); 846 if (! HAS_CHAR(enc, ptr, end)) 847 return XML_TOK_TRAILING_RSQB; 848 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 849 ptr -= MINBPC(enc); 850 break; 851 } 852 *nextTokPtr = ptr; 853 return XML_TOK_INVALID; 854 INVALID_CASES(ptr, nextTokPtr) 855 default: 856 ptr += MINBPC(enc); 857 break; 858 } 859 while (HAS_CHAR(enc, ptr, end)) { 860 switch (BYTE_TYPE(enc, ptr)) { 861 #define LEAD_CASE(n) \ 862 case BT_LEAD ## n: \ 863 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 864 *nextTokPtr = ptr; \ 865 return XML_TOK_DATA_CHARS; \ 866 } \ 867 ptr += n; \ 868 break; 869 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 870 #undef LEAD_CASE 871 case BT_RSQB: 872 if (HAS_CHARS(enc, ptr, end, 2)) { 873 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { 874 ptr += MINBPC(enc); 875 break; 876 } 877 if (HAS_CHARS(enc, ptr, end, 3)) { 878 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { 879 ptr += MINBPC(enc); 880 break; 881 } 882 *nextTokPtr = ptr + 2*MINBPC(enc); 883 return XML_TOK_INVALID; 884 } 885 } 886 /* fall through */ 887 case BT_AMP: 888 case BT_LT: 889 case BT_NONXML: 890 case BT_MALFORM: 891 case BT_TRAIL: 892 case BT_CR: 893 case BT_LF: 894 *nextTokPtr = ptr; 895 return XML_TOK_DATA_CHARS; 896 default: 897 ptr += MINBPC(enc); 898 break; 899 } 900 } 901 *nextTokPtr = ptr; 902 return XML_TOK_DATA_CHARS; 903 } 904 905 /* ptr points to character following "%" */ 906 907 static int PTRCALL 908 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 909 const char **nextTokPtr) 910 { 911 REQUIRE_CHAR(enc, ptr, end); 912 switch (BYTE_TYPE(enc, ptr)) { 913 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 914 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: 915 *nextTokPtr = ptr; 916 return XML_TOK_PERCENT; 917 default: 918 *nextTokPtr = ptr; 919 return XML_TOK_INVALID; 920 } 921 while (HAS_CHAR(enc, ptr, end)) { 922 switch (BYTE_TYPE(enc, ptr)) { 923 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 924 case BT_SEMI: 925 *nextTokPtr = ptr + MINBPC(enc); 926 return XML_TOK_PARAM_ENTITY_REF; 927 default: 928 *nextTokPtr = ptr; 929 return XML_TOK_INVALID; 930 } 931 } 932 return XML_TOK_PARTIAL; 933 } 934 935 static int PTRCALL 936 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 937 const char **nextTokPtr) 938 { 939 REQUIRE_CHAR(enc, ptr, end); 940 switch (BYTE_TYPE(enc, ptr)) { 941 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 942 default: 943 *nextTokPtr = ptr; 944 return XML_TOK_INVALID; 945 } 946 while (HAS_CHAR(enc, ptr, end)) { 947 switch (BYTE_TYPE(enc, ptr)) { 948 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 949 case BT_CR: case BT_LF: case BT_S: 950 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: 951 *nextTokPtr = ptr; 952 return XML_TOK_POUND_NAME; 953 default: 954 *nextTokPtr = ptr; 955 return XML_TOK_INVALID; 956 } 957 } 958 return -XML_TOK_POUND_NAME; 959 } 960 961 static int PTRCALL 962 PREFIX(scanLit)(int open, const ENCODING *enc, 963 const char *ptr, const char *end, 964 const char **nextTokPtr) 965 { 966 while (HAS_CHAR(enc, ptr, end)) { 967 int t = BYTE_TYPE(enc, ptr); 968 switch (t) { 969 INVALID_CASES(ptr, nextTokPtr) 970 case BT_QUOT: 971 case BT_APOS: 972 ptr += MINBPC(enc); 973 if (t != open) 974 break; 975 if (! HAS_CHAR(enc, ptr, end)) 976 return -XML_TOK_LITERAL; 977 *nextTokPtr = ptr; 978 switch (BYTE_TYPE(enc, ptr)) { 979 case BT_S: case BT_CR: case BT_LF: 980 case BT_GT: case BT_PERCNT: case BT_LSQB: 981 return XML_TOK_LITERAL; 982 default: 983 return XML_TOK_INVALID; 984 } 985 default: 986 ptr += MINBPC(enc); 987 break; 988 } 989 } 990 return XML_TOK_PARTIAL; 991 } 992 993 static int PTRCALL 994 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 995 const char **nextTokPtr) 996 { 997 int tok; 998 if (ptr >= end) 999 return XML_TOK_NONE; 1000 if (MINBPC(enc) > 1) { 1001 size_t n = end - ptr; 1002 if (n & (MINBPC(enc) - 1)) { 1003 n &= ~(MINBPC(enc) - 1); 1004 if (n == 0) 1005 return XML_TOK_PARTIAL; 1006 end = ptr + n; 1007 } 1008 } 1009 switch (BYTE_TYPE(enc, ptr)) { 1010 case BT_QUOT: 1011 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); 1012 case BT_APOS: 1013 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); 1014 case BT_LT: 1015 { 1016 ptr += MINBPC(enc); 1017 REQUIRE_CHAR(enc, ptr, end); 1018 switch (BYTE_TYPE(enc, ptr)) { 1019 case BT_EXCL: 1020 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1021 case BT_QUEST: 1022 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1023 case BT_NMSTRT: 1024 case BT_HEX: 1025 case BT_NONASCII: 1026 case BT_LEAD2: 1027 case BT_LEAD3: 1028 case BT_LEAD4: 1029 *nextTokPtr = ptr - MINBPC(enc); 1030 return XML_TOK_INSTANCE_START; 1031 } 1032 *nextTokPtr = ptr; 1033 return XML_TOK_INVALID; 1034 } 1035 case BT_CR: 1036 if (ptr + MINBPC(enc) == end) { 1037 *nextTokPtr = end; 1038 /* indicate that this might be part of a CR/LF pair */ 1039 return -XML_TOK_PROLOG_S; 1040 } 1041 /* fall through */ 1042 case BT_S: case BT_LF: 1043 for (;;) { 1044 ptr += MINBPC(enc); 1045 if (! HAS_CHAR(enc, ptr, end)) 1046 break; 1047 switch (BYTE_TYPE(enc, ptr)) { 1048 case BT_S: case BT_LF: 1049 break; 1050 case BT_CR: 1051 /* don't split CR/LF pair */ 1052 if (ptr + MINBPC(enc) != end) 1053 break; 1054 /* fall through */ 1055 default: 1056 *nextTokPtr = ptr; 1057 return XML_TOK_PROLOG_S; 1058 } 1059 } 1060 *nextTokPtr = ptr; 1061 return XML_TOK_PROLOG_S; 1062 case BT_PERCNT: 1063 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1064 case BT_COMMA: 1065 *nextTokPtr = ptr + MINBPC(enc); 1066 return XML_TOK_COMMA; 1067 case BT_LSQB: 1068 *nextTokPtr = ptr + MINBPC(enc); 1069 return XML_TOK_OPEN_BRACKET; 1070 case BT_RSQB: 1071 ptr += MINBPC(enc); 1072 if (! HAS_CHAR(enc, ptr, end)) 1073 return -XML_TOK_CLOSE_BRACKET; 1074 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1075 REQUIRE_CHARS(enc, ptr, end, 2); 1076 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { 1077 *nextTokPtr = ptr + 2*MINBPC(enc); 1078 return XML_TOK_COND_SECT_CLOSE; 1079 } 1080 } 1081 *nextTokPtr = ptr; 1082 return XML_TOK_CLOSE_BRACKET; 1083 case BT_LPAR: 1084 *nextTokPtr = ptr + MINBPC(enc); 1085 return XML_TOK_OPEN_PAREN; 1086 case BT_RPAR: 1087 ptr += MINBPC(enc); 1088 if (! HAS_CHAR(enc, ptr, end)) 1089 return -XML_TOK_CLOSE_PAREN; 1090 switch (BYTE_TYPE(enc, ptr)) { 1091 case BT_AST: 1092 *nextTokPtr = ptr + MINBPC(enc); 1093 return XML_TOK_CLOSE_PAREN_ASTERISK; 1094 case BT_QUEST: 1095 *nextTokPtr = ptr + MINBPC(enc); 1096 return XML_TOK_CLOSE_PAREN_QUESTION; 1097 case BT_PLUS: 1098 *nextTokPtr = ptr + MINBPC(enc); 1099 return XML_TOK_CLOSE_PAREN_PLUS; 1100 case BT_CR: case BT_LF: case BT_S: 1101 case BT_GT: case BT_COMMA: case BT_VERBAR: 1102 case BT_RPAR: 1103 *nextTokPtr = ptr; 1104 return XML_TOK_CLOSE_PAREN; 1105 } 1106 *nextTokPtr = ptr; 1107 return XML_TOK_INVALID; 1108 case BT_VERBAR: 1109 *nextTokPtr = ptr + MINBPC(enc); 1110 return XML_TOK_OR; 1111 case BT_GT: 1112 *nextTokPtr = ptr + MINBPC(enc); 1113 return XML_TOK_DECL_CLOSE; 1114 case BT_NUM: 1115 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1116 #define LEAD_CASE(n) \ 1117 case BT_LEAD ## n: \ 1118 if (end - ptr < n) \ 1119 return XML_TOK_PARTIAL_CHAR; \ 1120 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1121 ptr += n; \ 1122 tok = XML_TOK_NAME; \ 1123 break; \ 1124 } \ 1125 if (IS_NAME_CHAR(enc, ptr, n)) { \ 1126 ptr += n; \ 1127 tok = XML_TOK_NMTOKEN; \ 1128 break; \ 1129 } \ 1130 *nextTokPtr = ptr; \ 1131 return XML_TOK_INVALID; 1132 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1133 #undef LEAD_CASE 1134 case BT_NMSTRT: 1135 case BT_HEX: 1136 tok = XML_TOK_NAME; 1137 ptr += MINBPC(enc); 1138 break; 1139 case BT_DIGIT: 1140 case BT_NAME: 1141 case BT_MINUS: 1142 #ifdef XML_NS 1143 case BT_COLON: 1144 #endif 1145 tok = XML_TOK_NMTOKEN; 1146 ptr += MINBPC(enc); 1147 break; 1148 case BT_NONASCII: 1149 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1150 ptr += MINBPC(enc); 1151 tok = XML_TOK_NAME; 1152 break; 1153 } 1154 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1155 ptr += MINBPC(enc); 1156 tok = XML_TOK_NMTOKEN; 1157 break; 1158 } 1159 /* fall through */ 1160 default: 1161 *nextTokPtr = ptr; 1162 return XML_TOK_INVALID; 1163 } 1164 while (HAS_CHAR(enc, ptr, end)) { 1165 switch (BYTE_TYPE(enc, ptr)) { 1166 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1167 case BT_GT: case BT_RPAR: case BT_COMMA: 1168 case BT_VERBAR: case BT_LSQB: case BT_PERCNT: 1169 case BT_S: case BT_CR: case BT_LF: 1170 *nextTokPtr = ptr; 1171 return tok; 1172 #ifdef XML_NS 1173 case BT_COLON: 1174 ptr += MINBPC(enc); 1175 switch (tok) { 1176 case XML_TOK_NAME: 1177 REQUIRE_CHAR(enc, ptr, end); 1178 tok = XML_TOK_PREFIXED_NAME; 1179 switch (BYTE_TYPE(enc, ptr)) { 1180 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1181 default: 1182 tok = XML_TOK_NMTOKEN; 1183 break; 1184 } 1185 break; 1186 case XML_TOK_PREFIXED_NAME: 1187 tok = XML_TOK_NMTOKEN; 1188 break; 1189 } 1190 break; 1191 #endif 1192 case BT_PLUS: 1193 if (tok == XML_TOK_NMTOKEN) { 1194 *nextTokPtr = ptr; 1195 return XML_TOK_INVALID; 1196 } 1197 *nextTokPtr = ptr + MINBPC(enc); 1198 return XML_TOK_NAME_PLUS; 1199 case BT_AST: 1200 if (tok == XML_TOK_NMTOKEN) { 1201 *nextTokPtr = ptr; 1202 return XML_TOK_INVALID; 1203 } 1204 *nextTokPtr = ptr + MINBPC(enc); 1205 return XML_TOK_NAME_ASTERISK; 1206 case BT_QUEST: 1207 if (tok == XML_TOK_NMTOKEN) { 1208 *nextTokPtr = ptr; 1209 return XML_TOK_INVALID; 1210 } 1211 *nextTokPtr = ptr + MINBPC(enc); 1212 return XML_TOK_NAME_QUESTION; 1213 default: 1214 *nextTokPtr = ptr; 1215 return XML_TOK_INVALID; 1216 } 1217 } 1218 return -tok; 1219 } 1220 1221 static int PTRCALL 1222 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, 1223 const char *end, const char **nextTokPtr) 1224 { 1225 const char *start; 1226 if (ptr >= end) 1227 return XML_TOK_NONE; 1228 else if (! HAS_CHAR(enc, ptr, end)) { 1229 /* This line cannot be executed. The incoming data has already 1230 * been tokenized once, so incomplete characters like this have 1231 * already been eliminated from the input. Retaining the paranoia 1232 * check is still valuable, however. 1233 */ 1234 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ 1235 } 1236 start = ptr; 1237 while (HAS_CHAR(enc, ptr, end)) { 1238 switch (BYTE_TYPE(enc, ptr)) { 1239 #define LEAD_CASE(n) \ 1240 case BT_LEAD ## n: ptr += n; break; 1241 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1242 #undef LEAD_CASE 1243 case BT_AMP: 1244 if (ptr == start) 1245 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1246 *nextTokPtr = ptr; 1247 return XML_TOK_DATA_CHARS; 1248 case BT_LT: 1249 /* this is for inside entity references */ 1250 *nextTokPtr = ptr; 1251 return XML_TOK_INVALID; 1252 case BT_LF: 1253 if (ptr == start) { 1254 *nextTokPtr = ptr + MINBPC(enc); 1255 return XML_TOK_DATA_NEWLINE; 1256 } 1257 *nextTokPtr = ptr; 1258 return XML_TOK_DATA_CHARS; 1259 case BT_CR: 1260 if (ptr == start) { 1261 ptr += MINBPC(enc); 1262 if (! HAS_CHAR(enc, ptr, end)) 1263 return XML_TOK_TRAILING_CR; 1264 if (BYTE_TYPE(enc, ptr) == BT_LF) 1265 ptr += MINBPC(enc); 1266 *nextTokPtr = ptr; 1267 return XML_TOK_DATA_NEWLINE; 1268 } 1269 *nextTokPtr = ptr; 1270 return XML_TOK_DATA_CHARS; 1271 case BT_S: 1272 if (ptr == start) { 1273 *nextTokPtr = ptr + MINBPC(enc); 1274 return XML_TOK_ATTRIBUTE_VALUE_S; 1275 } 1276 *nextTokPtr = ptr; 1277 return XML_TOK_DATA_CHARS; 1278 default: 1279 ptr += MINBPC(enc); 1280 break; 1281 } 1282 } 1283 *nextTokPtr = ptr; 1284 return XML_TOK_DATA_CHARS; 1285 } 1286 1287 static int PTRCALL 1288 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, 1289 const char *end, const char **nextTokPtr) 1290 { 1291 const char *start; 1292 if (ptr >= end) 1293 return XML_TOK_NONE; 1294 else if (! HAS_CHAR(enc, ptr, end)) { 1295 /* This line cannot be executed. The incoming data has already 1296 * been tokenized once, so incomplete characters like this have 1297 * already been eliminated from the input. Retaining the paranoia 1298 * check is still valuable, however. 1299 */ 1300 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ 1301 } 1302 start = ptr; 1303 while (HAS_CHAR(enc, ptr, end)) { 1304 switch (BYTE_TYPE(enc, ptr)) { 1305 #define LEAD_CASE(n) \ 1306 case BT_LEAD ## n: ptr += n; break; 1307 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1308 #undef LEAD_CASE 1309 case BT_AMP: 1310 if (ptr == start) 1311 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1312 *nextTokPtr = ptr; 1313 return XML_TOK_DATA_CHARS; 1314 case BT_PERCNT: 1315 if (ptr == start) { 1316 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), 1317 end, nextTokPtr); 1318 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; 1319 } 1320 *nextTokPtr = ptr; 1321 return XML_TOK_DATA_CHARS; 1322 case BT_LF: 1323 if (ptr == start) { 1324 *nextTokPtr = ptr + MINBPC(enc); 1325 return XML_TOK_DATA_NEWLINE; 1326 } 1327 *nextTokPtr = ptr; 1328 return XML_TOK_DATA_CHARS; 1329 case BT_CR: 1330 if (ptr == start) { 1331 ptr += MINBPC(enc); 1332 if (! HAS_CHAR(enc, ptr, end)) 1333 return XML_TOK_TRAILING_CR; 1334 if (BYTE_TYPE(enc, ptr) == BT_LF) 1335 ptr += MINBPC(enc); 1336 *nextTokPtr = ptr; 1337 return XML_TOK_DATA_NEWLINE; 1338 } 1339 *nextTokPtr = ptr; 1340 return XML_TOK_DATA_CHARS; 1341 default: 1342 ptr += MINBPC(enc); 1343 break; 1344 } 1345 } 1346 *nextTokPtr = ptr; 1347 return XML_TOK_DATA_CHARS; 1348 } 1349 1350 #ifdef XML_DTD 1351 1352 static int PTRCALL 1353 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, 1354 const char *end, const char **nextTokPtr) 1355 { 1356 int level = 0; 1357 if (MINBPC(enc) > 1) { 1358 size_t n = end - ptr; 1359 if (n & (MINBPC(enc) - 1)) { 1360 n &= ~(MINBPC(enc) - 1); 1361 end = ptr + n; 1362 } 1363 } 1364 while (HAS_CHAR(enc, ptr, end)) { 1365 switch (BYTE_TYPE(enc, ptr)) { 1366 INVALID_CASES(ptr, nextTokPtr) 1367 case BT_LT: 1368 ptr += MINBPC(enc); 1369 REQUIRE_CHAR(enc, ptr, end); 1370 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { 1371 ptr += MINBPC(enc); 1372 REQUIRE_CHAR(enc, ptr, end); 1373 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { 1374 ++level; 1375 ptr += MINBPC(enc); 1376 } 1377 } 1378 break; 1379 case BT_RSQB: 1380 ptr += MINBPC(enc); 1381 REQUIRE_CHAR(enc, ptr, end); 1382 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1383 ptr += MINBPC(enc); 1384 REQUIRE_CHAR(enc, ptr, end); 1385 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 1386 ptr += MINBPC(enc); 1387 if (level == 0) { 1388 *nextTokPtr = ptr; 1389 return XML_TOK_IGNORE_SECT; 1390 } 1391 --level; 1392 } 1393 } 1394 break; 1395 default: 1396 ptr += MINBPC(enc); 1397 break; 1398 } 1399 } 1400 return XML_TOK_PARTIAL; 1401 } 1402 1403 #endif /* XML_DTD */ 1404 1405 static int PTRCALL 1406 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1407 const char **badPtr) 1408 { 1409 ptr += MINBPC(enc); 1410 end -= MINBPC(enc); 1411 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 1412 switch (BYTE_TYPE(enc, ptr)) { 1413 case BT_DIGIT: 1414 case BT_HEX: 1415 case BT_MINUS: 1416 case BT_APOS: 1417 case BT_LPAR: 1418 case BT_RPAR: 1419 case BT_PLUS: 1420 case BT_COMMA: 1421 case BT_SOL: 1422 case BT_EQUALS: 1423 case BT_QUEST: 1424 case BT_CR: 1425 case BT_LF: 1426 case BT_SEMI: 1427 case BT_EXCL: 1428 case BT_AST: 1429 case BT_PERCNT: 1430 case BT_NUM: 1431 #ifdef XML_NS 1432 case BT_COLON: 1433 #endif 1434 break; 1435 case BT_S: 1436 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { 1437 *badPtr = ptr; 1438 return 0; 1439 } 1440 break; 1441 case BT_NAME: 1442 case BT_NMSTRT: 1443 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1444 break; 1445 default: 1446 switch (BYTE_TO_ASCII(enc, ptr)) { 1447 case 0x24: /* $ */ 1448 case 0x40: /* @ */ 1449 break; 1450 default: 1451 *badPtr = ptr; 1452 return 0; 1453 } 1454 break; 1455 } 1456 } 1457 return 1; 1458 } 1459 1460 /* This must only be called for a well-formed start-tag or empty 1461 element tag. Returns the number of attributes. Pointers to the 1462 first attsMax attributes are stored in atts. 1463 */ 1464 1465 static int PTRCALL 1466 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1467 int attsMax, ATTRIBUTE *atts) 1468 { 1469 enum { other, inName, inValue } state = inName; 1470 int nAtts = 0; 1471 int open = 0; /* defined when state == inValue; 1472 initialization just to shut up compilers */ 1473 1474 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1475 switch (BYTE_TYPE(enc, ptr)) { 1476 #define START_NAME \ 1477 if (state == other) { \ 1478 if (nAtts < attsMax) { \ 1479 atts[nAtts].name = ptr; \ 1480 atts[nAtts].normalized = 1; \ 1481 } \ 1482 state = inName; \ 1483 } 1484 #define LEAD_CASE(n) \ 1485 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; 1486 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1487 #undef LEAD_CASE 1488 case BT_NONASCII: 1489 case BT_NMSTRT: 1490 case BT_HEX: 1491 START_NAME 1492 break; 1493 #undef START_NAME 1494 case BT_QUOT: 1495 if (state != inValue) { 1496 if (nAtts < attsMax) 1497 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1498 state = inValue; 1499 open = BT_QUOT; 1500 } 1501 else if (open == BT_QUOT) { 1502 state = other; 1503 if (nAtts < attsMax) 1504 atts[nAtts].valueEnd = ptr; 1505 nAtts++; 1506 } 1507 break; 1508 case BT_APOS: 1509 if (state != inValue) { 1510 if (nAtts < attsMax) 1511 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1512 state = inValue; 1513 open = BT_APOS; 1514 } 1515 else if (open == BT_APOS) { 1516 state = other; 1517 if (nAtts < attsMax) 1518 atts[nAtts].valueEnd = ptr; 1519 nAtts++; 1520 } 1521 break; 1522 case BT_AMP: 1523 if (nAtts < attsMax) 1524 atts[nAtts].normalized = 0; 1525 break; 1526 case BT_S: 1527 if (state == inName) 1528 state = other; 1529 else if (state == inValue 1530 && nAtts < attsMax 1531 && atts[nAtts].normalized 1532 && (ptr == atts[nAtts].valuePtr 1533 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1534 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1535 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1536 atts[nAtts].normalized = 0; 1537 break; 1538 case BT_CR: case BT_LF: 1539 /* This case ensures that the first attribute name is counted 1540 Apart from that we could just change state on the quote. */ 1541 if (state == inName) 1542 state = other; 1543 else if (state == inValue && nAtts < attsMax) 1544 atts[nAtts].normalized = 0; 1545 break; 1546 case BT_GT: 1547 case BT_SOL: 1548 if (state != inValue) 1549 return nAtts; 1550 break; 1551 default: 1552 break; 1553 } 1554 } 1555 /* not reached */ 1556 } 1557 1558 static int PTRFASTCALL 1559 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr) 1560 { 1561 int result = 0; 1562 /* skip &# */ 1563 ptr += 2*MINBPC(enc); 1564 if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1565 for (ptr += MINBPC(enc); 1566 !CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1567 ptr += MINBPC(enc)) { 1568 int c = BYTE_TO_ASCII(enc, ptr); 1569 switch (c) { 1570 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: 1571 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: 1572 result <<= 4; 1573 result |= (c - ASCII_0); 1574 break; 1575 case ASCII_A: case ASCII_B: case ASCII_C: 1576 case ASCII_D: case ASCII_E: case ASCII_F: 1577 result <<= 4; 1578 result += 10 + (c - ASCII_A); 1579 break; 1580 case ASCII_a: case ASCII_b: case ASCII_c: 1581 case ASCII_d: case ASCII_e: case ASCII_f: 1582 result <<= 4; 1583 result += 10 + (c - ASCII_a); 1584 break; 1585 } 1586 if (result >= 0x110000) 1587 return -1; 1588 } 1589 } 1590 else { 1591 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1592 int c = BYTE_TO_ASCII(enc, ptr); 1593 result *= 10; 1594 result += (c - ASCII_0); 1595 if (result >= 0x110000) 1596 return -1; 1597 } 1598 } 1599 return checkCharRefNumber(result); 1600 } 1601 1602 static int PTRCALL 1603 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr, 1604 const char *end) 1605 { 1606 switch ((end - ptr)/MINBPC(enc)) { 1607 case 2: 1608 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1609 switch (BYTE_TO_ASCII(enc, ptr)) { 1610 case ASCII_l: 1611 return ASCII_LT; 1612 case ASCII_g: 1613 return ASCII_GT; 1614 } 1615 } 1616 break; 1617 case 3: 1618 if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1619 ptr += MINBPC(enc); 1620 if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1621 ptr += MINBPC(enc); 1622 if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1623 return ASCII_AMP; 1624 } 1625 } 1626 break; 1627 case 4: 1628 switch (BYTE_TO_ASCII(enc, ptr)) { 1629 case ASCII_q: 1630 ptr += MINBPC(enc); 1631 if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1632 ptr += MINBPC(enc); 1633 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1634 ptr += MINBPC(enc); 1635 if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1636 return ASCII_QUOT; 1637 } 1638 } 1639 break; 1640 case ASCII_a: 1641 ptr += MINBPC(enc); 1642 if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1643 ptr += MINBPC(enc); 1644 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1645 ptr += MINBPC(enc); 1646 if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1647 return ASCII_APOS; 1648 } 1649 } 1650 break; 1651 } 1652 } 1653 return 0; 1654 } 1655 1656 static int PTRCALL 1657 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1, 1658 const char *end1, const char *ptr2) 1659 { 1660 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1661 if (end1 - ptr1 < MINBPC(enc)) { 1662 /* This line cannot be executed. THe incoming data has already 1663 * been tokenized once, so imcomplete characters like this have 1664 * already been eliminated from the input. Retaining the 1665 * paranoia check is still valuable, however. 1666 */ 1667 return 0; /* LCOV_EXCL_LINE */ 1668 } 1669 if (!CHAR_MATCHES(enc, ptr1, *ptr2)) 1670 return 0; 1671 } 1672 return ptr1 == end1; 1673 } 1674 1675 static int PTRFASTCALL 1676 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1677 { 1678 const char *start = ptr; 1679 for (;;) { 1680 switch (BYTE_TYPE(enc, ptr)) { 1681 #define LEAD_CASE(n) \ 1682 case BT_LEAD ## n: ptr += n; break; 1683 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1684 #undef LEAD_CASE 1685 case BT_NONASCII: 1686 case BT_NMSTRT: 1687 #ifdef XML_NS 1688 case BT_COLON: 1689 #endif 1690 case BT_HEX: 1691 case BT_DIGIT: 1692 case BT_NAME: 1693 case BT_MINUS: 1694 ptr += MINBPC(enc); 1695 break; 1696 default: 1697 return (int)(ptr - start); 1698 } 1699 } 1700 } 1701 1702 static const char * PTRFASTCALL 1703 PREFIX(skipS)(const ENCODING *enc, const char *ptr) 1704 { 1705 for (;;) { 1706 switch (BYTE_TYPE(enc, ptr)) { 1707 case BT_LF: 1708 case BT_CR: 1709 case BT_S: 1710 ptr += MINBPC(enc); 1711 break; 1712 default: 1713 return ptr; 1714 } 1715 } 1716 } 1717 1718 static void PTRCALL 1719 PREFIX(updatePosition)(const ENCODING *enc, 1720 const char *ptr, 1721 const char *end, 1722 POSITION *pos) 1723 { 1724 while (HAS_CHAR(enc, ptr, end)) { 1725 switch (BYTE_TYPE(enc, ptr)) { 1726 #define LEAD_CASE(n) \ 1727 case BT_LEAD ## n: \ 1728 ptr += n; \ 1729 break; 1730 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1731 #undef LEAD_CASE 1732 case BT_LF: 1733 pos->columnNumber = (XML_Size)-1; 1734 pos->lineNumber++; 1735 ptr += MINBPC(enc); 1736 break; 1737 case BT_CR: 1738 pos->lineNumber++; 1739 ptr += MINBPC(enc); 1740 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) 1741 ptr += MINBPC(enc); 1742 pos->columnNumber = (XML_Size)-1; 1743 break; 1744 default: 1745 ptr += MINBPC(enc); 1746 break; 1747 } 1748 pos->columnNumber++; 1749 } 1750 } 1751 1752 #undef DO_LEAD_CASE 1753 #undef MULTIBYTE_CASES 1754 #undef INVALID_CASES 1755 #undef CHECK_NAME_CASE 1756 #undef CHECK_NAME_CASES 1757 #undef CHECK_NMSTRT_CASE 1758 #undef CHECK_NMSTRT_CASES 1759 1760 #endif /* XML_TOK_IMPL_C */ 1761