1from __future__ import absolute_import, division, unicode_literals 2 3try: 4 chr = unichr # flake8: noqa 5except NameError: 6 pass 7 8from collections import deque 9 10from .constants import spaceCharacters 11from .constants import entities 12from .constants import asciiLetters, asciiUpper2Lower 13from .constants import digits, hexDigits, EOF 14from .constants import tokenTypes, tagTokenTypes 15from .constants import replacementCharacters 16 17from .inputstream import HTMLInputStream 18 19from .trie import Trie 20 21entitiesTrie = Trie(entities) 22 23 24class HTMLTokenizer(object): 25 """ This class takes care of tokenizing HTML. 26 27 * self.currentToken 28 Holds the token that is currently being processed. 29 30 * self.state 31 Holds a reference to the method to be invoked... XXX 32 33 * self.stream 34 Points to HTMLInputStream object. 35 """ 36 37 def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, 38 lowercaseElementName=True, lowercaseAttrName=True, parser=None): 39 40 self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) 41 self.parser = parser 42 43 # Perform case conversions? 44 self.lowercaseElementName = lowercaseElementName 45 self.lowercaseAttrName = lowercaseAttrName 46 47 # Setup the initial tokenizer state 48 self.escapeFlag = False 49 self.lastFourChars = [] 50 self.state = self.dataState 51 self.escape = False 52 53 # The current token being created 54 self.currentToken = None 55 super(HTMLTokenizer, self).__init__() 56 57 def __iter__(self): 58 """ This is where the magic happens. 59 60 We do our usually processing through the states and when we have a token 61 to return we yield the token which pauses processing until the next token 62 is requested. 63 """ 64 self.tokenQueue = deque([]) 65 # Start processing. When EOF is reached self.state will return False 66 # instead of True and the loop will terminate. 67 while self.state(): 68 while self.stream.errors: 69 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} 70 while self.tokenQueue: 71 yield self.tokenQueue.popleft() 72 73 def consumeNumberEntity(self, isHex): 74 """This function returns either U+FFFD or the character based on the 75 decimal or hexadecimal representation. It also discards ";" if present. 76 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. 77 """ 78 79 allowed = digits 80 radix = 10 81 if isHex: 82 allowed = hexDigits 83 radix = 16 84 85 charStack = [] 86 87 # Consume all the characters that are in range while making sure we 88 # don't hit an EOF. 89 c = self.stream.char() 90 while c in allowed and c is not EOF: 91 charStack.append(c) 92 c = self.stream.char() 93 94 # Convert the set of characters consumed to an int. 95 charAsInt = int("".join(charStack), radix) 96 97 # Certain characters get replaced with others 98 if charAsInt in replacementCharacters: 99 char = replacementCharacters[charAsInt] 100 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 101 "illegal-codepoint-for-numeric-entity", 102 "datavars": {"charAsInt": charAsInt}}) 103 elif ((0xD800 <= charAsInt <= 0xDFFF) or 104 (charAsInt > 0x10FFFF)): 105 char = "\uFFFD" 106 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 107 "illegal-codepoint-for-numeric-entity", 108 "datavars": {"charAsInt": charAsInt}}) 109 else: 110 # Should speed up this check somehow (e.g. move the set to a constant) 111 if ((0x0001 <= charAsInt <= 0x0008) or 112 (0x000E <= charAsInt <= 0x001F) or 113 (0x007F <= charAsInt <= 0x009F) or 114 (0xFDD0 <= charAsInt <= 0xFDEF) or 115 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 116 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 117 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 118 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 119 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 120 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 121 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 122 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 123 0xFFFFF, 0x10FFFE, 0x10FFFF])): 124 self.tokenQueue.append({"type": tokenTypes["ParseError"], 125 "data": 126 "illegal-codepoint-for-numeric-entity", 127 "datavars": {"charAsInt": charAsInt}}) 128 try: 129 # Try/except needed as UCS-2 Python builds' unichar only works 130 # within the BMP. 131 char = chr(charAsInt) 132 except ValueError: 133 v = charAsInt - 0x10000 134 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) 135 136 # Discard the ; if present. Otherwise, put it back on the queue and 137 # invoke parseError on parser. 138 if c != ";": 139 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 140 "numeric-entity-without-semicolon"}) 141 self.stream.unget(c) 142 143 return char 144 145 def consumeEntity(self, allowedChar=None, fromAttribute=False): 146 # Initialise to the default output for when no entity is matched 147 output = "&" 148 149 charStack = [self.stream.char()] 150 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") 151 or (allowedChar is not None and allowedChar == charStack[0])): 152 self.stream.unget(charStack[0]) 153 154 elif charStack[0] == "#": 155 # Read the next character to see if it's hex or decimal 156 hex = False 157 charStack.append(self.stream.char()) 158 if charStack[-1] in ("x", "X"): 159 hex = True 160 charStack.append(self.stream.char()) 161 162 # charStack[-1] should be the first digit 163 if (hex and charStack[-1] in hexDigits) \ 164 or (not hex and charStack[-1] in digits): 165 # At least one digit found, so consume the whole number 166 self.stream.unget(charStack[-1]) 167 output = self.consumeNumberEntity(hex) 168 else: 169 # No digits found 170 self.tokenQueue.append({"type": tokenTypes["ParseError"], 171 "data": "expected-numeric-entity"}) 172 self.stream.unget(charStack.pop()) 173 output = "&" + "".join(charStack) 174 175 else: 176 # At this point in the process might have named entity. Entities 177 # are stored in the global variable "entities". 178 # 179 # Consume characters and compare to these to a substring of the 180 # entity names in the list until the substring no longer matches. 181 while (charStack[-1] is not EOF): 182 if not entitiesTrie.has_keys_with_prefix("".join(charStack)): 183 break 184 charStack.append(self.stream.char()) 185 186 # At this point we have a string that starts with some characters 187 # that may match an entity 188 # Try to find the longest entity the string will match to take care 189 # of ¬i for instance. 190 try: 191 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) 192 entityLength = len(entityName) 193 except KeyError: 194 entityName = None 195 196 if entityName is not None: 197 if entityName[-1] != ";": 198 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 199 "named-entity-without-semicolon"}) 200 if (entityName[-1] != ";" and fromAttribute and 201 (charStack[entityLength] in asciiLetters or 202 charStack[entityLength] in digits or 203 charStack[entityLength] == "=")): 204 self.stream.unget(charStack.pop()) 205 output = "&" + "".join(charStack) 206 else: 207 output = entities[entityName] 208 self.stream.unget(charStack.pop()) 209 output += "".join(charStack[entityLength:]) 210 else: 211 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 212 "expected-named-entity"}) 213 self.stream.unget(charStack.pop()) 214 output = "&" + "".join(charStack) 215 216 if fromAttribute: 217 self.currentToken["data"][-1][1] += output 218 else: 219 if output in spaceCharacters: 220 tokenType = "SpaceCharacters" 221 else: 222 tokenType = "Characters" 223 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) 224 225 def processEntityInAttribute(self, allowedChar): 226 """This method replaces the need for "entityInAttributeValueState". 227 """ 228 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) 229 230 def emitCurrentToken(self): 231 """This method is a generic handler for emitting the tags. It also sets 232 the state to "data" because that's what's needed after a token has been 233 emitted. 234 """ 235 token = self.currentToken 236 # Add token to the queue to be yielded 237 if (token["type"] in tagTokenTypes): 238 if self.lowercaseElementName: 239 token["name"] = token["name"].translate(asciiUpper2Lower) 240 if token["type"] == tokenTypes["EndTag"]: 241 if token["data"]: 242 self.tokenQueue.append({"type": tokenTypes["ParseError"], 243 "data": "attributes-in-end-tag"}) 244 if token["selfClosing"]: 245 self.tokenQueue.append({"type": tokenTypes["ParseError"], 246 "data": "self-closing-flag-on-end-tag"}) 247 self.tokenQueue.append(token) 248 self.state = self.dataState 249 250 # Below are the various tokenizer states worked out. 251 def dataState(self): 252 data = self.stream.char() 253 if data == "&": 254 self.state = self.entityDataState 255 elif data == "<": 256 self.state = self.tagOpenState 257 elif data == "\u0000": 258 self.tokenQueue.append({"type": tokenTypes["ParseError"], 259 "data": "invalid-codepoint"}) 260 self.tokenQueue.append({"type": tokenTypes["Characters"], 261 "data": "\u0000"}) 262 elif data is EOF: 263 # Tokenization ends. 264 return False 265 elif data in spaceCharacters: 266 # Directly after emitting a token you switch back to the "data 267 # state". At that point spaceCharacters are important so they are 268 # emitted separately. 269 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 270 data + self.stream.charsUntil(spaceCharacters, True)}) 271 # No need to update lastFourChars here, since the first space will 272 # have already been appended to lastFourChars and will have broken 273 # any <!-- or --> sequences 274 else: 275 chars = self.stream.charsUntil(("&", "<", "\u0000")) 276 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 277 data + chars}) 278 return True 279 280 def entityDataState(self): 281 self.consumeEntity() 282 self.state = self.dataState 283 return True 284 285 def rcdataState(self): 286 data = self.stream.char() 287 if data == "&": 288 self.state = self.characterReferenceInRcdata 289 elif data == "<": 290 self.state = self.rcdataLessThanSignState 291 elif data == EOF: 292 # Tokenization ends. 293 return False 294 elif data == "\u0000": 295 self.tokenQueue.append({"type": tokenTypes["ParseError"], 296 "data": "invalid-codepoint"}) 297 self.tokenQueue.append({"type": tokenTypes["Characters"], 298 "data": "\uFFFD"}) 299 elif data in spaceCharacters: 300 # Directly after emitting a token you switch back to the "data 301 # state". At that point spaceCharacters are important so they are 302 # emitted separately. 303 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": 304 data + self.stream.charsUntil(spaceCharacters, True)}) 305 # No need to update lastFourChars here, since the first space will 306 # have already been appended to lastFourChars and will have broken 307 # any <!-- or --> sequences 308 else: 309 chars = self.stream.charsUntil(("&", "<", "\u0000")) 310 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 311 data + chars}) 312 return True 313 314 def characterReferenceInRcdata(self): 315 self.consumeEntity() 316 self.state = self.rcdataState 317 return True 318 319 def rawtextState(self): 320 data = self.stream.char() 321 if data == "<": 322 self.state = self.rawtextLessThanSignState 323 elif data == "\u0000": 324 self.tokenQueue.append({"type": tokenTypes["ParseError"], 325 "data": "invalid-codepoint"}) 326 self.tokenQueue.append({"type": tokenTypes["Characters"], 327 "data": "\uFFFD"}) 328 elif data == EOF: 329 # Tokenization ends. 330 return False 331 else: 332 chars = self.stream.charsUntil(("<", "\u0000")) 333 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 334 data + chars}) 335 return True 336 337 def scriptDataState(self): 338 data = self.stream.char() 339 if data == "<": 340 self.state = self.scriptDataLessThanSignState 341 elif data == "\u0000": 342 self.tokenQueue.append({"type": tokenTypes["ParseError"], 343 "data": "invalid-codepoint"}) 344 self.tokenQueue.append({"type": tokenTypes["Characters"], 345 "data": "\uFFFD"}) 346 elif data == EOF: 347 # Tokenization ends. 348 return False 349 else: 350 chars = self.stream.charsUntil(("<", "\u0000")) 351 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 352 data + chars}) 353 return True 354 355 def plaintextState(self): 356 data = self.stream.char() 357 if data == EOF: 358 # Tokenization ends. 359 return False 360 elif data == "\u0000": 361 self.tokenQueue.append({"type": tokenTypes["ParseError"], 362 "data": "invalid-codepoint"}) 363 self.tokenQueue.append({"type": tokenTypes["Characters"], 364 "data": "\uFFFD"}) 365 else: 366 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 367 data + self.stream.charsUntil("\u0000")}) 368 return True 369 370 def tagOpenState(self): 371 data = self.stream.char() 372 if data == "!": 373 self.state = self.markupDeclarationOpenState 374 elif data == "/": 375 self.state = self.closeTagOpenState 376 elif data in asciiLetters: 377 self.currentToken = {"type": tokenTypes["StartTag"], 378 "name": data, "data": [], 379 "selfClosing": False, 380 "selfClosingAcknowledged": False} 381 self.state = self.tagNameState 382 elif data == ">": 383 # XXX In theory it could be something besides a tag name. But 384 # do we really care? 385 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 386 "expected-tag-name-but-got-right-bracket"}) 387 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) 388 self.state = self.dataState 389 elif data == "?": 390 # XXX In theory it could be something besides a tag name. But 391 # do we really care? 392 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 393 "expected-tag-name-but-got-question-mark"}) 394 self.stream.unget(data) 395 self.state = self.bogusCommentState 396 else: 397 # XXX 398 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 399 "expected-tag-name"}) 400 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 401 self.stream.unget(data) 402 self.state = self.dataState 403 return True 404 405 def closeTagOpenState(self): 406 data = self.stream.char() 407 if data in asciiLetters: 408 self.currentToken = {"type": tokenTypes["EndTag"], "name": data, 409 "data": [], "selfClosing": False} 410 self.state = self.tagNameState 411 elif data == ">": 412 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 413 "expected-closing-tag-but-got-right-bracket"}) 414 self.state = self.dataState 415 elif data is EOF: 416 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 417 "expected-closing-tag-but-got-eof"}) 418 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 419 self.state = self.dataState 420 else: 421 # XXX data can be _'_... 422 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 423 "expected-closing-tag-but-got-char", 424 "datavars": {"data": data}}) 425 self.stream.unget(data) 426 self.state = self.bogusCommentState 427 return True 428 429 def tagNameState(self): 430 data = self.stream.char() 431 if data in spaceCharacters: 432 self.state = self.beforeAttributeNameState 433 elif data == ">": 434 self.emitCurrentToken() 435 elif data is EOF: 436 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 437 "eof-in-tag-name"}) 438 self.state = self.dataState 439 elif data == "/": 440 self.state = self.selfClosingStartTagState 441 elif data == "\u0000": 442 self.tokenQueue.append({"type": tokenTypes["ParseError"], 443 "data": "invalid-codepoint"}) 444 self.currentToken["name"] += "\uFFFD" 445 else: 446 self.currentToken["name"] += data 447 # (Don't use charsUntil here, because tag names are 448 # very short and it's faster to not do anything fancy) 449 return True 450 451 def rcdataLessThanSignState(self): 452 data = self.stream.char() 453 if data == "/": 454 self.temporaryBuffer = "" 455 self.state = self.rcdataEndTagOpenState 456 else: 457 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 458 self.stream.unget(data) 459 self.state = self.rcdataState 460 return True 461 462 def rcdataEndTagOpenState(self): 463 data = self.stream.char() 464 if data in asciiLetters: 465 self.temporaryBuffer += data 466 self.state = self.rcdataEndTagNameState 467 else: 468 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 469 self.stream.unget(data) 470 self.state = self.rcdataState 471 return True 472 473 def rcdataEndTagNameState(self): 474 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 475 data = self.stream.char() 476 if data in spaceCharacters and appropriate: 477 self.currentToken = {"type": tokenTypes["EndTag"], 478 "name": self.temporaryBuffer, 479 "data": [], "selfClosing": False} 480 self.state = self.beforeAttributeNameState 481 elif data == "/" and appropriate: 482 self.currentToken = {"type": tokenTypes["EndTag"], 483 "name": self.temporaryBuffer, 484 "data": [], "selfClosing": False} 485 self.state = self.selfClosingStartTagState 486 elif data == ">" and appropriate: 487 self.currentToken = {"type": tokenTypes["EndTag"], 488 "name": self.temporaryBuffer, 489 "data": [], "selfClosing": False} 490 self.emitCurrentToken() 491 self.state = self.dataState 492 elif data in asciiLetters: 493 self.temporaryBuffer += data 494 else: 495 self.tokenQueue.append({"type": tokenTypes["Characters"], 496 "data": "</" + self.temporaryBuffer}) 497 self.stream.unget(data) 498 self.state = self.rcdataState 499 return True 500 501 def rawtextLessThanSignState(self): 502 data = self.stream.char() 503 if data == "/": 504 self.temporaryBuffer = "" 505 self.state = self.rawtextEndTagOpenState 506 else: 507 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 508 self.stream.unget(data) 509 self.state = self.rawtextState 510 return True 511 512 def rawtextEndTagOpenState(self): 513 data = self.stream.char() 514 if data in asciiLetters: 515 self.temporaryBuffer += data 516 self.state = self.rawtextEndTagNameState 517 else: 518 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 519 self.stream.unget(data) 520 self.state = self.rawtextState 521 return True 522 523 def rawtextEndTagNameState(self): 524 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 525 data = self.stream.char() 526 if data in spaceCharacters and appropriate: 527 self.currentToken = {"type": tokenTypes["EndTag"], 528 "name": self.temporaryBuffer, 529 "data": [], "selfClosing": False} 530 self.state = self.beforeAttributeNameState 531 elif data == "/" and appropriate: 532 self.currentToken = {"type": tokenTypes["EndTag"], 533 "name": self.temporaryBuffer, 534 "data": [], "selfClosing": False} 535 self.state = self.selfClosingStartTagState 536 elif data == ">" and appropriate: 537 self.currentToken = {"type": tokenTypes["EndTag"], 538 "name": self.temporaryBuffer, 539 "data": [], "selfClosing": False} 540 self.emitCurrentToken() 541 self.state = self.dataState 542 elif data in asciiLetters: 543 self.temporaryBuffer += data 544 else: 545 self.tokenQueue.append({"type": tokenTypes["Characters"], 546 "data": "</" + self.temporaryBuffer}) 547 self.stream.unget(data) 548 self.state = self.rawtextState 549 return True 550 551 def scriptDataLessThanSignState(self): 552 data = self.stream.char() 553 if data == "/": 554 self.temporaryBuffer = "" 555 self.state = self.scriptDataEndTagOpenState 556 elif data == "!": 557 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) 558 self.state = self.scriptDataEscapeStartState 559 else: 560 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 561 self.stream.unget(data) 562 self.state = self.scriptDataState 563 return True 564 565 def scriptDataEndTagOpenState(self): 566 data = self.stream.char() 567 if data in asciiLetters: 568 self.temporaryBuffer += data 569 self.state = self.scriptDataEndTagNameState 570 else: 571 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 572 self.stream.unget(data) 573 self.state = self.scriptDataState 574 return True 575 576 def scriptDataEndTagNameState(self): 577 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 578 data = self.stream.char() 579 if data in spaceCharacters and appropriate: 580 self.currentToken = {"type": tokenTypes["EndTag"], 581 "name": self.temporaryBuffer, 582 "data": [], "selfClosing": False} 583 self.state = self.beforeAttributeNameState 584 elif data == "/" and appropriate: 585 self.currentToken = {"type": tokenTypes["EndTag"], 586 "name": self.temporaryBuffer, 587 "data": [], "selfClosing": False} 588 self.state = self.selfClosingStartTagState 589 elif data == ">" and appropriate: 590 self.currentToken = {"type": tokenTypes["EndTag"], 591 "name": self.temporaryBuffer, 592 "data": [], "selfClosing": False} 593 self.emitCurrentToken() 594 self.state = self.dataState 595 elif data in asciiLetters: 596 self.temporaryBuffer += data 597 else: 598 self.tokenQueue.append({"type": tokenTypes["Characters"], 599 "data": "</" + self.temporaryBuffer}) 600 self.stream.unget(data) 601 self.state = self.scriptDataState 602 return True 603 604 def scriptDataEscapeStartState(self): 605 data = self.stream.char() 606 if data == "-": 607 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 608 self.state = self.scriptDataEscapeStartDashState 609 else: 610 self.stream.unget(data) 611 self.state = self.scriptDataState 612 return True 613 614 def scriptDataEscapeStartDashState(self): 615 data = self.stream.char() 616 if data == "-": 617 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 618 self.state = self.scriptDataEscapedDashDashState 619 else: 620 self.stream.unget(data) 621 self.state = self.scriptDataState 622 return True 623 624 def scriptDataEscapedState(self): 625 data = self.stream.char() 626 if data == "-": 627 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 628 self.state = self.scriptDataEscapedDashState 629 elif data == "<": 630 self.state = self.scriptDataEscapedLessThanSignState 631 elif data == "\u0000": 632 self.tokenQueue.append({"type": tokenTypes["ParseError"], 633 "data": "invalid-codepoint"}) 634 self.tokenQueue.append({"type": tokenTypes["Characters"], 635 "data": "\uFFFD"}) 636 elif data == EOF: 637 self.state = self.dataState 638 else: 639 chars = self.stream.charsUntil(("<", "-", "\u0000")) 640 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 641 data + chars}) 642 return True 643 644 def scriptDataEscapedDashState(self): 645 data = self.stream.char() 646 if data == "-": 647 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 648 self.state = self.scriptDataEscapedDashDashState 649 elif data == "<": 650 self.state = self.scriptDataEscapedLessThanSignState 651 elif data == "\u0000": 652 self.tokenQueue.append({"type": tokenTypes["ParseError"], 653 "data": "invalid-codepoint"}) 654 self.tokenQueue.append({"type": tokenTypes["Characters"], 655 "data": "\uFFFD"}) 656 self.state = self.scriptDataEscapedState 657 elif data == EOF: 658 self.state = self.dataState 659 else: 660 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 661 self.state = self.scriptDataEscapedState 662 return True 663 664 def scriptDataEscapedDashDashState(self): 665 data = self.stream.char() 666 if data == "-": 667 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 668 elif data == "<": 669 self.state = self.scriptDataEscapedLessThanSignState 670 elif data == ">": 671 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 672 self.state = self.scriptDataState 673 elif data == "\u0000": 674 self.tokenQueue.append({"type": tokenTypes["ParseError"], 675 "data": "invalid-codepoint"}) 676 self.tokenQueue.append({"type": tokenTypes["Characters"], 677 "data": "\uFFFD"}) 678 self.state = self.scriptDataEscapedState 679 elif data == EOF: 680 self.state = self.dataState 681 else: 682 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 683 self.state = self.scriptDataEscapedState 684 return True 685 686 def scriptDataEscapedLessThanSignState(self): 687 data = self.stream.char() 688 if data == "/": 689 self.temporaryBuffer = "" 690 self.state = self.scriptDataEscapedEndTagOpenState 691 elif data in asciiLetters: 692 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) 693 self.temporaryBuffer = data 694 self.state = self.scriptDataDoubleEscapeStartState 695 else: 696 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 697 self.stream.unget(data) 698 self.state = self.scriptDataEscapedState 699 return True 700 701 def scriptDataEscapedEndTagOpenState(self): 702 data = self.stream.char() 703 if data in asciiLetters: 704 self.temporaryBuffer = data 705 self.state = self.scriptDataEscapedEndTagNameState 706 else: 707 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) 708 self.stream.unget(data) 709 self.state = self.scriptDataEscapedState 710 return True 711 712 def scriptDataEscapedEndTagNameState(self): 713 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() 714 data = self.stream.char() 715 if data in spaceCharacters and appropriate: 716 self.currentToken = {"type": tokenTypes["EndTag"], 717 "name": self.temporaryBuffer, 718 "data": [], "selfClosing": False} 719 self.state = self.beforeAttributeNameState 720 elif data == "/" and appropriate: 721 self.currentToken = {"type": tokenTypes["EndTag"], 722 "name": self.temporaryBuffer, 723 "data": [], "selfClosing": False} 724 self.state = self.selfClosingStartTagState 725 elif data == ">" and appropriate: 726 self.currentToken = {"type": tokenTypes["EndTag"], 727 "name": self.temporaryBuffer, 728 "data": [], "selfClosing": False} 729 self.emitCurrentToken() 730 self.state = self.dataState 731 elif data in asciiLetters: 732 self.temporaryBuffer += data 733 else: 734 self.tokenQueue.append({"type": tokenTypes["Characters"], 735 "data": "</" + self.temporaryBuffer}) 736 self.stream.unget(data) 737 self.state = self.scriptDataEscapedState 738 return True 739 740 def scriptDataDoubleEscapeStartState(self): 741 data = self.stream.char() 742 if data in (spaceCharacters | frozenset(("/", ">"))): 743 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 744 if self.temporaryBuffer.lower() == "script": 745 self.state = self.scriptDataDoubleEscapedState 746 else: 747 self.state = self.scriptDataEscapedState 748 elif data in asciiLetters: 749 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 750 self.temporaryBuffer += data 751 else: 752 self.stream.unget(data) 753 self.state = self.scriptDataEscapedState 754 return True 755 756 def scriptDataDoubleEscapedState(self): 757 data = self.stream.char() 758 if data == "-": 759 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 760 self.state = self.scriptDataDoubleEscapedDashState 761 elif data == "<": 762 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 763 self.state = self.scriptDataDoubleEscapedLessThanSignState 764 elif data == "\u0000": 765 self.tokenQueue.append({"type": tokenTypes["ParseError"], 766 "data": "invalid-codepoint"}) 767 self.tokenQueue.append({"type": tokenTypes["Characters"], 768 "data": "\uFFFD"}) 769 elif data == EOF: 770 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 771 "eof-in-script-in-script"}) 772 self.state = self.dataState 773 else: 774 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 775 return True 776 777 def scriptDataDoubleEscapedDashState(self): 778 data = self.stream.char() 779 if data == "-": 780 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 781 self.state = self.scriptDataDoubleEscapedDashDashState 782 elif data == "<": 783 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 784 self.state = self.scriptDataDoubleEscapedLessThanSignState 785 elif data == "\u0000": 786 self.tokenQueue.append({"type": tokenTypes["ParseError"], 787 "data": "invalid-codepoint"}) 788 self.tokenQueue.append({"type": tokenTypes["Characters"], 789 "data": "\uFFFD"}) 790 self.state = self.scriptDataDoubleEscapedState 791 elif data == EOF: 792 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 793 "eof-in-script-in-script"}) 794 self.state = self.dataState 795 else: 796 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 797 self.state = self.scriptDataDoubleEscapedState 798 return True 799 800 def scriptDataDoubleEscapedDashDashState(self): 801 data = self.stream.char() 802 if data == "-": 803 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) 804 elif data == "<": 805 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) 806 self.state = self.scriptDataDoubleEscapedLessThanSignState 807 elif data == ">": 808 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) 809 self.state = self.scriptDataState 810 elif data == "\u0000": 811 self.tokenQueue.append({"type": tokenTypes["ParseError"], 812 "data": "invalid-codepoint"}) 813 self.tokenQueue.append({"type": tokenTypes["Characters"], 814 "data": "\uFFFD"}) 815 self.state = self.scriptDataDoubleEscapedState 816 elif data == EOF: 817 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 818 "eof-in-script-in-script"}) 819 self.state = self.dataState 820 else: 821 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 822 self.state = self.scriptDataDoubleEscapedState 823 return True 824 825 def scriptDataDoubleEscapedLessThanSignState(self): 826 data = self.stream.char() 827 if data == "/": 828 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) 829 self.temporaryBuffer = "" 830 self.state = self.scriptDataDoubleEscapeEndState 831 else: 832 self.stream.unget(data) 833 self.state = self.scriptDataDoubleEscapedState 834 return True 835 836 def scriptDataDoubleEscapeEndState(self): 837 data = self.stream.char() 838 if data in (spaceCharacters | frozenset(("/", ">"))): 839 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 840 if self.temporaryBuffer.lower() == "script": 841 self.state = self.scriptDataEscapedState 842 else: 843 self.state = self.scriptDataDoubleEscapedState 844 elif data in asciiLetters: 845 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) 846 self.temporaryBuffer += data 847 else: 848 self.stream.unget(data) 849 self.state = self.scriptDataDoubleEscapedState 850 return True 851 852 def beforeAttributeNameState(self): 853 data = self.stream.char() 854 if data in spaceCharacters: 855 self.stream.charsUntil(spaceCharacters, True) 856 elif data in asciiLetters: 857 self.currentToken["data"].append([data, ""]) 858 self.state = self.attributeNameState 859 elif data == ">": 860 self.emitCurrentToken() 861 elif data == "/": 862 self.state = self.selfClosingStartTagState 863 elif data in ("'", '"', "=", "<"): 864 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 865 "invalid-character-in-attribute-name"}) 866 self.currentToken["data"].append([data, ""]) 867 self.state = self.attributeNameState 868 elif data == "\u0000": 869 self.tokenQueue.append({"type": tokenTypes["ParseError"], 870 "data": "invalid-codepoint"}) 871 self.currentToken["data"].append(["\uFFFD", ""]) 872 self.state = self.attributeNameState 873 elif data is EOF: 874 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 875 "expected-attribute-name-but-got-eof"}) 876 self.state = self.dataState 877 else: 878 self.currentToken["data"].append([data, ""]) 879 self.state = self.attributeNameState 880 return True 881 882 def attributeNameState(self): 883 data = self.stream.char() 884 leavingThisState = True 885 emitToken = False 886 if data == "=": 887 self.state = self.beforeAttributeValueState 888 elif data in asciiLetters: 889 self.currentToken["data"][-1][0] += data +\ 890 self.stream.charsUntil(asciiLetters, True) 891 leavingThisState = False 892 elif data == ">": 893 # XXX If we emit here the attributes are converted to a dict 894 # without being checked and when the code below runs we error 895 # because data is a dict not a list 896 emitToken = True 897 elif data in spaceCharacters: 898 self.state = self.afterAttributeNameState 899 elif data == "/": 900 self.state = self.selfClosingStartTagState 901 elif data == "\u0000": 902 self.tokenQueue.append({"type": tokenTypes["ParseError"], 903 "data": "invalid-codepoint"}) 904 self.currentToken["data"][-1][0] += "\uFFFD" 905 leavingThisState = False 906 elif data in ("'", '"', "<"): 907 self.tokenQueue.append({"type": tokenTypes["ParseError"], 908 "data": 909 "invalid-character-in-attribute-name"}) 910 self.currentToken["data"][-1][0] += data 911 leavingThisState = False 912 elif data is EOF: 913 self.tokenQueue.append({"type": tokenTypes["ParseError"], 914 "data": "eof-in-attribute-name"}) 915 self.state = self.dataState 916 else: 917 self.currentToken["data"][-1][0] += data 918 leavingThisState = False 919 920 if leavingThisState: 921 # Attributes are not dropped at this stage. That happens when the 922 # start tag token is emitted so values can still be safely appended 923 # to attributes, but we do want to report the parse error in time. 924 if self.lowercaseAttrName: 925 self.currentToken["data"][-1][0] = ( 926 self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) 927 for name, value in self.currentToken["data"][:-1]: 928 if self.currentToken["data"][-1][0] == name: 929 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 930 "duplicate-attribute"}) 931 break 932 # XXX Fix for above XXX 933 if emitToken: 934 self.emitCurrentToken() 935 return True 936 937 def afterAttributeNameState(self): 938 data = self.stream.char() 939 if data in spaceCharacters: 940 self.stream.charsUntil(spaceCharacters, True) 941 elif data == "=": 942 self.state = self.beforeAttributeValueState 943 elif data == ">": 944 self.emitCurrentToken() 945 elif data in asciiLetters: 946 self.currentToken["data"].append([data, ""]) 947 self.state = self.attributeNameState 948 elif data == "/": 949 self.state = self.selfClosingStartTagState 950 elif data == "\u0000": 951 self.tokenQueue.append({"type": tokenTypes["ParseError"], 952 "data": "invalid-codepoint"}) 953 self.currentToken["data"].append(["\uFFFD", ""]) 954 self.state = self.attributeNameState 955 elif data in ("'", '"', "<"): 956 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 957 "invalid-character-after-attribute-name"}) 958 self.currentToken["data"].append([data, ""]) 959 self.state = self.attributeNameState 960 elif data is EOF: 961 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 962 "expected-end-of-tag-but-got-eof"}) 963 self.state = self.dataState 964 else: 965 self.currentToken["data"].append([data, ""]) 966 self.state = self.attributeNameState 967 return True 968 969 def beforeAttributeValueState(self): 970 data = self.stream.char() 971 if data in spaceCharacters: 972 self.stream.charsUntil(spaceCharacters, True) 973 elif data == "\"": 974 self.state = self.attributeValueDoubleQuotedState 975 elif data == "&": 976 self.state = self.attributeValueUnQuotedState 977 self.stream.unget(data) 978 elif data == "'": 979 self.state = self.attributeValueSingleQuotedState 980 elif data == ">": 981 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 982 "expected-attribute-value-but-got-right-bracket"}) 983 self.emitCurrentToken() 984 elif data == "\u0000": 985 self.tokenQueue.append({"type": tokenTypes["ParseError"], 986 "data": "invalid-codepoint"}) 987 self.currentToken["data"][-1][1] += "\uFFFD" 988 self.state = self.attributeValueUnQuotedState 989 elif data in ("=", "<", "`"): 990 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 991 "equals-in-unquoted-attribute-value"}) 992 self.currentToken["data"][-1][1] += data 993 self.state = self.attributeValueUnQuotedState 994 elif data is EOF: 995 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 996 "expected-attribute-value-but-got-eof"}) 997 self.state = self.dataState 998 else: 999 self.currentToken["data"][-1][1] += data 1000 self.state = self.attributeValueUnQuotedState 1001 return True 1002 1003 def attributeValueDoubleQuotedState(self): 1004 data = self.stream.char() 1005 if data == "\"": 1006 self.state = self.afterAttributeValueState 1007 elif data == "&": 1008 self.processEntityInAttribute('"') 1009 elif data == "\u0000": 1010 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1011 "data": "invalid-codepoint"}) 1012 self.currentToken["data"][-1][1] += "\uFFFD" 1013 elif data is EOF: 1014 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1015 "eof-in-attribute-value-double-quote"}) 1016 self.state = self.dataState 1017 else: 1018 self.currentToken["data"][-1][1] += data +\ 1019 self.stream.charsUntil(("\"", "&", "\u0000")) 1020 return True 1021 1022 def attributeValueSingleQuotedState(self): 1023 data = self.stream.char() 1024 if data == "'": 1025 self.state = self.afterAttributeValueState 1026 elif data == "&": 1027 self.processEntityInAttribute("'") 1028 elif data == "\u0000": 1029 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1030 "data": "invalid-codepoint"}) 1031 self.currentToken["data"][-1][1] += "\uFFFD" 1032 elif data is EOF: 1033 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1034 "eof-in-attribute-value-single-quote"}) 1035 self.state = self.dataState 1036 else: 1037 self.currentToken["data"][-1][1] += data +\ 1038 self.stream.charsUntil(("'", "&", "\u0000")) 1039 return True 1040 1041 def attributeValueUnQuotedState(self): 1042 data = self.stream.char() 1043 if data in spaceCharacters: 1044 self.state = self.beforeAttributeNameState 1045 elif data == "&": 1046 self.processEntityInAttribute(">") 1047 elif data == ">": 1048 self.emitCurrentToken() 1049 elif data in ('"', "'", "=", "<", "`"): 1050 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1051 "unexpected-character-in-unquoted-attribute-value"}) 1052 self.currentToken["data"][-1][1] += data 1053 elif data == "\u0000": 1054 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1055 "data": "invalid-codepoint"}) 1056 self.currentToken["data"][-1][1] += "\uFFFD" 1057 elif data is EOF: 1058 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1059 "eof-in-attribute-value-no-quotes"}) 1060 self.state = self.dataState 1061 else: 1062 self.currentToken["data"][-1][1] += data + self.stream.charsUntil( 1063 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) 1064 return True 1065 1066 def afterAttributeValueState(self): 1067 data = self.stream.char() 1068 if data in spaceCharacters: 1069 self.state = self.beforeAttributeNameState 1070 elif data == ">": 1071 self.emitCurrentToken() 1072 elif data == "/": 1073 self.state = self.selfClosingStartTagState 1074 elif data is EOF: 1075 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1076 "unexpected-EOF-after-attribute-value"}) 1077 self.stream.unget(data) 1078 self.state = self.dataState 1079 else: 1080 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1081 "unexpected-character-after-attribute-value"}) 1082 self.stream.unget(data) 1083 self.state = self.beforeAttributeNameState 1084 return True 1085 1086 def selfClosingStartTagState(self): 1087 data = self.stream.char() 1088 if data == ">": 1089 self.currentToken["selfClosing"] = True 1090 self.emitCurrentToken() 1091 elif data is EOF: 1092 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1093 "data": 1094 "unexpected-EOF-after-solidus-in-tag"}) 1095 self.stream.unget(data) 1096 self.state = self.dataState 1097 else: 1098 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1099 "unexpected-character-after-solidus-in-tag"}) 1100 self.stream.unget(data) 1101 self.state = self.beforeAttributeNameState 1102 return True 1103 1104 def bogusCommentState(self): 1105 # Make a new comment token and give it as value all the characters 1106 # until the first > or EOF (charsUntil checks for EOF automatically) 1107 # and emit it. 1108 data = self.stream.charsUntil(">") 1109 data = data.replace("\u0000", "\uFFFD") 1110 self.tokenQueue.append( 1111 {"type": tokenTypes["Comment"], "data": data}) 1112 1113 # Eat the character directly after the bogus comment which is either a 1114 # ">" or an EOF. 1115 self.stream.char() 1116 self.state = self.dataState 1117 return True 1118 1119 def markupDeclarationOpenState(self): 1120 charStack = [self.stream.char()] 1121 if charStack[-1] == "-": 1122 charStack.append(self.stream.char()) 1123 if charStack[-1] == "-": 1124 self.currentToken = {"type": tokenTypes["Comment"], "data": ""} 1125 self.state = self.commentStartState 1126 return True 1127 elif charStack[-1] in ('d', 'D'): 1128 matched = True 1129 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), 1130 ('y', 'Y'), ('p', 'P'), ('e', 'E')): 1131 charStack.append(self.stream.char()) 1132 if charStack[-1] not in expected: 1133 matched = False 1134 break 1135 if matched: 1136 self.currentToken = {"type": tokenTypes["Doctype"], 1137 "name": "", 1138 "publicId": None, "systemId": None, 1139 "correct": True} 1140 self.state = self.doctypeState 1141 return True 1142 elif (charStack[-1] == "[" and 1143 self.parser is not None and 1144 self.parser.tree.openElements and 1145 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): 1146 matched = True 1147 for expected in ["C", "D", "A", "T", "A", "["]: 1148 charStack.append(self.stream.char()) 1149 if charStack[-1] != expected: 1150 matched = False 1151 break 1152 if matched: 1153 self.state = self.cdataSectionState 1154 return True 1155 1156 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1157 "expected-dashes-or-doctype"}) 1158 1159 while charStack: 1160 self.stream.unget(charStack.pop()) 1161 self.state = self.bogusCommentState 1162 return True 1163 1164 def commentStartState(self): 1165 data = self.stream.char() 1166 if data == "-": 1167 self.state = self.commentStartDashState 1168 elif data == "\u0000": 1169 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1170 "data": "invalid-codepoint"}) 1171 self.currentToken["data"] += "\uFFFD" 1172 elif data == ">": 1173 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1174 "incorrect-comment"}) 1175 self.tokenQueue.append(self.currentToken) 1176 self.state = self.dataState 1177 elif data is EOF: 1178 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1179 "eof-in-comment"}) 1180 self.tokenQueue.append(self.currentToken) 1181 self.state = self.dataState 1182 else: 1183 self.currentToken["data"] += data 1184 self.state = self.commentState 1185 return True 1186 1187 def commentStartDashState(self): 1188 data = self.stream.char() 1189 if data == "-": 1190 self.state = self.commentEndState 1191 elif data == "\u0000": 1192 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1193 "data": "invalid-codepoint"}) 1194 self.currentToken["data"] += "-\uFFFD" 1195 elif data == ">": 1196 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1197 "incorrect-comment"}) 1198 self.tokenQueue.append(self.currentToken) 1199 self.state = self.dataState 1200 elif data is EOF: 1201 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1202 "eof-in-comment"}) 1203 self.tokenQueue.append(self.currentToken) 1204 self.state = self.dataState 1205 else: 1206 self.currentToken["data"] += "-" + data 1207 self.state = self.commentState 1208 return True 1209 1210 def commentState(self): 1211 data = self.stream.char() 1212 if data == "-": 1213 self.state = self.commentEndDashState 1214 elif data == "\u0000": 1215 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1216 "data": "invalid-codepoint"}) 1217 self.currentToken["data"] += "\uFFFD" 1218 elif data is EOF: 1219 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1220 "data": "eof-in-comment"}) 1221 self.tokenQueue.append(self.currentToken) 1222 self.state = self.dataState 1223 else: 1224 self.currentToken["data"] += data + \ 1225 self.stream.charsUntil(("-", "\u0000")) 1226 return True 1227 1228 def commentEndDashState(self): 1229 data = self.stream.char() 1230 if data == "-": 1231 self.state = self.commentEndState 1232 elif data == "\u0000": 1233 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1234 "data": "invalid-codepoint"}) 1235 self.currentToken["data"] += "-\uFFFD" 1236 self.state = self.commentState 1237 elif data is EOF: 1238 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1239 "eof-in-comment-end-dash"}) 1240 self.tokenQueue.append(self.currentToken) 1241 self.state = self.dataState 1242 else: 1243 self.currentToken["data"] += "-" + data 1244 self.state = self.commentState 1245 return True 1246 1247 def commentEndState(self): 1248 data = self.stream.char() 1249 if data == ">": 1250 self.tokenQueue.append(self.currentToken) 1251 self.state = self.dataState 1252 elif data == "\u0000": 1253 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1254 "data": "invalid-codepoint"}) 1255 self.currentToken["data"] += "--\uFFFD" 1256 self.state = self.commentState 1257 elif data == "!": 1258 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1259 "unexpected-bang-after-double-dash-in-comment"}) 1260 self.state = self.commentEndBangState 1261 elif data == "-": 1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1263 "unexpected-dash-after-double-dash-in-comment"}) 1264 self.currentToken["data"] += data 1265 elif data is EOF: 1266 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1267 "eof-in-comment-double-dash"}) 1268 self.tokenQueue.append(self.currentToken) 1269 self.state = self.dataState 1270 else: 1271 # XXX 1272 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1273 "unexpected-char-in-comment"}) 1274 self.currentToken["data"] += "--" + data 1275 self.state = self.commentState 1276 return True 1277 1278 def commentEndBangState(self): 1279 data = self.stream.char() 1280 if data == ">": 1281 self.tokenQueue.append(self.currentToken) 1282 self.state = self.dataState 1283 elif data == "-": 1284 self.currentToken["data"] += "--!" 1285 self.state = self.commentEndDashState 1286 elif data == "\u0000": 1287 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1288 "data": "invalid-codepoint"}) 1289 self.currentToken["data"] += "--!\uFFFD" 1290 self.state = self.commentState 1291 elif data is EOF: 1292 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1293 "eof-in-comment-end-bang-state"}) 1294 self.tokenQueue.append(self.currentToken) 1295 self.state = self.dataState 1296 else: 1297 self.currentToken["data"] += "--!" + data 1298 self.state = self.commentState 1299 return True 1300 1301 def doctypeState(self): 1302 data = self.stream.char() 1303 if data in spaceCharacters: 1304 self.state = self.beforeDoctypeNameState 1305 elif data is EOF: 1306 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1307 "expected-doctype-name-but-got-eof"}) 1308 self.currentToken["correct"] = False 1309 self.tokenQueue.append(self.currentToken) 1310 self.state = self.dataState 1311 else: 1312 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1313 "need-space-after-doctype"}) 1314 self.stream.unget(data) 1315 self.state = self.beforeDoctypeNameState 1316 return True 1317 1318 def beforeDoctypeNameState(self): 1319 data = self.stream.char() 1320 if data in spaceCharacters: 1321 pass 1322 elif data == ">": 1323 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1324 "expected-doctype-name-but-got-right-bracket"}) 1325 self.currentToken["correct"] = False 1326 self.tokenQueue.append(self.currentToken) 1327 self.state = self.dataState 1328 elif data == "\u0000": 1329 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1330 "data": "invalid-codepoint"}) 1331 self.currentToken["name"] = "\uFFFD" 1332 self.state = self.doctypeNameState 1333 elif data is EOF: 1334 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1335 "expected-doctype-name-but-got-eof"}) 1336 self.currentToken["correct"] = False 1337 self.tokenQueue.append(self.currentToken) 1338 self.state = self.dataState 1339 else: 1340 self.currentToken["name"] = data 1341 self.state = self.doctypeNameState 1342 return True 1343 1344 def doctypeNameState(self): 1345 data = self.stream.char() 1346 if data in spaceCharacters: 1347 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1348 self.state = self.afterDoctypeNameState 1349 elif data == ">": 1350 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1351 self.tokenQueue.append(self.currentToken) 1352 self.state = self.dataState 1353 elif data == "\u0000": 1354 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1355 "data": "invalid-codepoint"}) 1356 self.currentToken["name"] += "\uFFFD" 1357 self.state = self.doctypeNameState 1358 elif data is EOF: 1359 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1360 "eof-in-doctype-name"}) 1361 self.currentToken["correct"] = False 1362 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) 1363 self.tokenQueue.append(self.currentToken) 1364 self.state = self.dataState 1365 else: 1366 self.currentToken["name"] += data 1367 return True 1368 1369 def afterDoctypeNameState(self): 1370 data = self.stream.char() 1371 if data in spaceCharacters: 1372 pass 1373 elif data == ">": 1374 self.tokenQueue.append(self.currentToken) 1375 self.state = self.dataState 1376 elif data is EOF: 1377 self.currentToken["correct"] = False 1378 self.stream.unget(data) 1379 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1380 "eof-in-doctype"}) 1381 self.tokenQueue.append(self.currentToken) 1382 self.state = self.dataState 1383 else: 1384 if data in ("p", "P"): 1385 matched = True 1386 for expected in (("u", "U"), ("b", "B"), ("l", "L"), 1387 ("i", "I"), ("c", "C")): 1388 data = self.stream.char() 1389 if data not in expected: 1390 matched = False 1391 break 1392 if matched: 1393 self.state = self.afterDoctypePublicKeywordState 1394 return True 1395 elif data in ("s", "S"): 1396 matched = True 1397 for expected in (("y", "Y"), ("s", "S"), ("t", "T"), 1398 ("e", "E"), ("m", "M")): 1399 data = self.stream.char() 1400 if data not in expected: 1401 matched = False 1402 break 1403 if matched: 1404 self.state = self.afterDoctypeSystemKeywordState 1405 return True 1406 1407 # All the characters read before the current 'data' will be 1408 # [a-zA-Z], so they're garbage in the bogus doctype and can be 1409 # discarded; only the latest character might be '>' or EOF 1410 # and needs to be ungetted 1411 self.stream.unget(data) 1412 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1413 "expected-space-or-right-bracket-in-doctype", "datavars": 1414 {"data": data}}) 1415 self.currentToken["correct"] = False 1416 self.state = self.bogusDoctypeState 1417 1418 return True 1419 1420 def afterDoctypePublicKeywordState(self): 1421 data = self.stream.char() 1422 if data in spaceCharacters: 1423 self.state = self.beforeDoctypePublicIdentifierState 1424 elif data in ("'", '"'): 1425 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1426 "unexpected-char-in-doctype"}) 1427 self.stream.unget(data) 1428 self.state = self.beforeDoctypePublicIdentifierState 1429 elif data is EOF: 1430 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1431 "eof-in-doctype"}) 1432 self.currentToken["correct"] = False 1433 self.tokenQueue.append(self.currentToken) 1434 self.state = self.dataState 1435 else: 1436 self.stream.unget(data) 1437 self.state = self.beforeDoctypePublicIdentifierState 1438 return True 1439 1440 def beforeDoctypePublicIdentifierState(self): 1441 data = self.stream.char() 1442 if data in spaceCharacters: 1443 pass 1444 elif data == "\"": 1445 self.currentToken["publicId"] = "" 1446 self.state = self.doctypePublicIdentifierDoubleQuotedState 1447 elif data == "'": 1448 self.currentToken["publicId"] = "" 1449 self.state = self.doctypePublicIdentifierSingleQuotedState 1450 elif data == ">": 1451 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1452 "unexpected-end-of-doctype"}) 1453 self.currentToken["correct"] = False 1454 self.tokenQueue.append(self.currentToken) 1455 self.state = self.dataState 1456 elif data is EOF: 1457 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1458 "eof-in-doctype"}) 1459 self.currentToken["correct"] = False 1460 self.tokenQueue.append(self.currentToken) 1461 self.state = self.dataState 1462 else: 1463 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1464 "unexpected-char-in-doctype"}) 1465 self.currentToken["correct"] = False 1466 self.state = self.bogusDoctypeState 1467 return True 1468 1469 def doctypePublicIdentifierDoubleQuotedState(self): 1470 data = self.stream.char() 1471 if data == "\"": 1472 self.state = self.afterDoctypePublicIdentifierState 1473 elif data == "\u0000": 1474 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1475 "data": "invalid-codepoint"}) 1476 self.currentToken["publicId"] += "\uFFFD" 1477 elif data == ">": 1478 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1479 "unexpected-end-of-doctype"}) 1480 self.currentToken["correct"] = False 1481 self.tokenQueue.append(self.currentToken) 1482 self.state = self.dataState 1483 elif data is EOF: 1484 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1485 "eof-in-doctype"}) 1486 self.currentToken["correct"] = False 1487 self.tokenQueue.append(self.currentToken) 1488 self.state = self.dataState 1489 else: 1490 self.currentToken["publicId"] += data 1491 return True 1492 1493 def doctypePublicIdentifierSingleQuotedState(self): 1494 data = self.stream.char() 1495 if data == "'": 1496 self.state = self.afterDoctypePublicIdentifierState 1497 elif data == "\u0000": 1498 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1499 "data": "invalid-codepoint"}) 1500 self.currentToken["publicId"] += "\uFFFD" 1501 elif data == ">": 1502 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1503 "unexpected-end-of-doctype"}) 1504 self.currentToken["correct"] = False 1505 self.tokenQueue.append(self.currentToken) 1506 self.state = self.dataState 1507 elif data is EOF: 1508 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1509 "eof-in-doctype"}) 1510 self.currentToken["correct"] = False 1511 self.tokenQueue.append(self.currentToken) 1512 self.state = self.dataState 1513 else: 1514 self.currentToken["publicId"] += data 1515 return True 1516 1517 def afterDoctypePublicIdentifierState(self): 1518 data = self.stream.char() 1519 if data in spaceCharacters: 1520 self.state = self.betweenDoctypePublicAndSystemIdentifiersState 1521 elif data == ">": 1522 self.tokenQueue.append(self.currentToken) 1523 self.state = self.dataState 1524 elif data == '"': 1525 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1526 "unexpected-char-in-doctype"}) 1527 self.currentToken["systemId"] = "" 1528 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1529 elif data == "'": 1530 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1531 "unexpected-char-in-doctype"}) 1532 self.currentToken["systemId"] = "" 1533 self.state = self.doctypeSystemIdentifierSingleQuotedState 1534 elif data is EOF: 1535 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1536 "eof-in-doctype"}) 1537 self.currentToken["correct"] = False 1538 self.tokenQueue.append(self.currentToken) 1539 self.state = self.dataState 1540 else: 1541 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1542 "unexpected-char-in-doctype"}) 1543 self.currentToken["correct"] = False 1544 self.state = self.bogusDoctypeState 1545 return True 1546 1547 def betweenDoctypePublicAndSystemIdentifiersState(self): 1548 data = self.stream.char() 1549 if data in spaceCharacters: 1550 pass 1551 elif data == ">": 1552 self.tokenQueue.append(self.currentToken) 1553 self.state = self.dataState 1554 elif data == '"': 1555 self.currentToken["systemId"] = "" 1556 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1557 elif data == "'": 1558 self.currentToken["systemId"] = "" 1559 self.state = self.doctypeSystemIdentifierSingleQuotedState 1560 elif data == EOF: 1561 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1562 "eof-in-doctype"}) 1563 self.currentToken["correct"] = False 1564 self.tokenQueue.append(self.currentToken) 1565 self.state = self.dataState 1566 else: 1567 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1568 "unexpected-char-in-doctype"}) 1569 self.currentToken["correct"] = False 1570 self.state = self.bogusDoctypeState 1571 return True 1572 1573 def afterDoctypeSystemKeywordState(self): 1574 data = self.stream.char() 1575 if data in spaceCharacters: 1576 self.state = self.beforeDoctypeSystemIdentifierState 1577 elif data in ("'", '"'): 1578 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1579 "unexpected-char-in-doctype"}) 1580 self.stream.unget(data) 1581 self.state = self.beforeDoctypeSystemIdentifierState 1582 elif data is EOF: 1583 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1584 "eof-in-doctype"}) 1585 self.currentToken["correct"] = False 1586 self.tokenQueue.append(self.currentToken) 1587 self.state = self.dataState 1588 else: 1589 self.stream.unget(data) 1590 self.state = self.beforeDoctypeSystemIdentifierState 1591 return True 1592 1593 def beforeDoctypeSystemIdentifierState(self): 1594 data = self.stream.char() 1595 if data in spaceCharacters: 1596 pass 1597 elif data == "\"": 1598 self.currentToken["systemId"] = "" 1599 self.state = self.doctypeSystemIdentifierDoubleQuotedState 1600 elif data == "'": 1601 self.currentToken["systemId"] = "" 1602 self.state = self.doctypeSystemIdentifierSingleQuotedState 1603 elif data == ">": 1604 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1605 "unexpected-char-in-doctype"}) 1606 self.currentToken["correct"] = False 1607 self.tokenQueue.append(self.currentToken) 1608 self.state = self.dataState 1609 elif data is EOF: 1610 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1611 "eof-in-doctype"}) 1612 self.currentToken["correct"] = False 1613 self.tokenQueue.append(self.currentToken) 1614 self.state = self.dataState 1615 else: 1616 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1617 "unexpected-char-in-doctype"}) 1618 self.currentToken["correct"] = False 1619 self.state = self.bogusDoctypeState 1620 return True 1621 1622 def doctypeSystemIdentifierDoubleQuotedState(self): 1623 data = self.stream.char() 1624 if data == "\"": 1625 self.state = self.afterDoctypeSystemIdentifierState 1626 elif data == "\u0000": 1627 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1628 "data": "invalid-codepoint"}) 1629 self.currentToken["systemId"] += "\uFFFD" 1630 elif data == ">": 1631 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1632 "unexpected-end-of-doctype"}) 1633 self.currentToken["correct"] = False 1634 self.tokenQueue.append(self.currentToken) 1635 self.state = self.dataState 1636 elif data is EOF: 1637 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1638 "eof-in-doctype"}) 1639 self.currentToken["correct"] = False 1640 self.tokenQueue.append(self.currentToken) 1641 self.state = self.dataState 1642 else: 1643 self.currentToken["systemId"] += data 1644 return True 1645 1646 def doctypeSystemIdentifierSingleQuotedState(self): 1647 data = self.stream.char() 1648 if data == "'": 1649 self.state = self.afterDoctypeSystemIdentifierState 1650 elif data == "\u0000": 1651 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1652 "data": "invalid-codepoint"}) 1653 self.currentToken["systemId"] += "\uFFFD" 1654 elif data == ">": 1655 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1656 "unexpected-end-of-doctype"}) 1657 self.currentToken["correct"] = False 1658 self.tokenQueue.append(self.currentToken) 1659 self.state = self.dataState 1660 elif data is EOF: 1661 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1662 "eof-in-doctype"}) 1663 self.currentToken["correct"] = False 1664 self.tokenQueue.append(self.currentToken) 1665 self.state = self.dataState 1666 else: 1667 self.currentToken["systemId"] += data 1668 return True 1669 1670 def afterDoctypeSystemIdentifierState(self): 1671 data = self.stream.char() 1672 if data in spaceCharacters: 1673 pass 1674 elif data == ">": 1675 self.tokenQueue.append(self.currentToken) 1676 self.state = self.dataState 1677 elif data is EOF: 1678 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1679 "eof-in-doctype"}) 1680 self.currentToken["correct"] = False 1681 self.tokenQueue.append(self.currentToken) 1682 self.state = self.dataState 1683 else: 1684 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 1685 "unexpected-char-in-doctype"}) 1686 self.state = self.bogusDoctypeState 1687 return True 1688 1689 def bogusDoctypeState(self): 1690 data = self.stream.char() 1691 if data == ">": 1692 self.tokenQueue.append(self.currentToken) 1693 self.state = self.dataState 1694 elif data is EOF: 1695 # XXX EMIT 1696 self.stream.unget(data) 1697 self.tokenQueue.append(self.currentToken) 1698 self.state = self.dataState 1699 else: 1700 pass 1701 return True 1702 1703 def cdataSectionState(self): 1704 data = [] 1705 while True: 1706 data.append(self.stream.charsUntil("]")) 1707 data.append(self.stream.charsUntil(">")) 1708 char = self.stream.char() 1709 if char == EOF: 1710 break 1711 else: 1712 assert char == ">" 1713 if data[-1][-2:] == "]]": 1714 data[-1] = data[-1][:-2] 1715 break 1716 else: 1717 data.append(char) 1718 1719 data = "".join(data) 1720 # Deal with null here rather than in the parser 1721 nullCount = data.count("\u0000") 1722 if nullCount > 0: 1723 for i in range(nullCount): 1724 self.tokenQueue.append({"type": tokenTypes["ParseError"], 1725 "data": "invalid-codepoint"}) 1726 data = data.replace("\u0000", "\uFFFD") 1727 if data: 1728 self.tokenQueue.append({"type": tokenTypes["Characters"], 1729 "data": data}) 1730 self.state = self.dataState 1731 return True 1732