from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass

import types

from . import inputstream
from . import tokenizer

from . import treebuilders
from .treebuilders._base import Marker

from . import utils
from . import constants
from .constants import spaceCharacters, asciiUpper2Lower
from .constants import specialElements
from .constants import headingElements
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
from .constants import E


def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree"""
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parse(doc, encoding=encoding)


def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True):
    """Parse a string or file-like object as a fragment inside `container`

    Builds an HTMLParser around the named treebuilder and returns the
    result of its parseFragment() method.
    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, encoding=encoding)


def method_decorator_metaclass(function):
    """Return a metaclass that applies `function` to every plain function
    found in the namespace of each class it creates"""
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)

                # Only existing keys are rebound, so the dict keeps its size
                # while it is being iterated.
                classDict[attributeName] = attribute
            return type.__new__(meta, classname, bases, classDict)
    return Decorated


class HTMLParser(object):
    """HTML parser. Generates a tree structure from a stream of (possibly
    malformed) HTML"""

    def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
                 strict=False, namespaceHTMLElements=True, debug=False):
        """
        strict - raise an exception when a parse error is encountered

        tree - a treebuilder class controlling the type of tree that will be
        returned. Built in treebuilders can be accessed through
        html5lib.treebuilders.getTreeBuilder(treeType)

        tokenizer - a class that provides a stream of tokens to the treebuilder.
        This may be replaced for e.g. a sanitizer which converts some tags to
        text

        namespaceHTMLElements - passed as the only argument to the
        treebuilder class when it is instantiated

        debug - passed to getPhases() when the phase objects are built
        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        self.tree = tree(namespaceHTMLElements)
        self.tokenizer_class = tokenizer
        self.errors = []

        # One phase instance per name, each bound to this parser and its tree
        self.phases = dict([(name, cls(self, self.tree)) for name, cls in
                            getPhases(debug).items()])

    def _parse(self, stream, innerHTML=False, container="div",
               encoding=None, parseMeta=True, useChardet=True, **kwargs):
        """Tokenize `stream` and run the main loop until it completes

        A new tokenizer is built from the arguments, the parser state is
        reset, and mainLoop() is run. Whenever mainLoop() raises
        ReparseException the state is reset and the loop is started again.
        """

        self.innerHTMLMode = innerHTML
        self.container = container
        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
                                              parseMeta=parseMeta,
                                              useChardet=useChardet,
                                              parser=self, **kwargs)
        self.reset()

        while True:
            try:
                self.mainLoop()
                break
            except ReparseException:
                self.reset()

    def reset(self):
        """Reset the tree and all per-parse state, then pick the first phase"""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # NOTE(review): the constant names read as if swapped relative to
            # the tokenizer states they select - confirm against the
            # constants module before renaming anything.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """The name of the character encoding
        that was used to decode the input stream,
        or :obj:`None` if that is not determined yet.

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0]

    def isHTMLIntegrationPoint(self, element):
        """Return whether `element` is an HTML integration point

        A MathML annotation-xml element qualifies only when its encoding
        attribute is (ASCII case-insensitively) text/html or
        application/xhtml+xml; every other element is looked up in
        htmlIntegrationPointElements.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether `element` is in mathmlTextIntegrationPointElements"""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token to the appropriate phase, then process EOF

        A phase method may return a token, in which case that token is
        dispatched again (possibly to a different phase) until a phase
        returns None.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Start tags that stay in foreign content even at a MathML text
        # integration point
        mathMLTextExceptions = frozenset(["mglyph", "malignmark"])

        for token in self.normalizedTokens():
            new_token = token
            while new_token is not None:
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The tag name is only read after the token is known to
                    # be a start tag: other token types carry no "name" key,
                    # and the annotation-xml/svg rule applies to start tags
                    # only.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((tokenType == StartTagToken and
                           new_token["name"] not in mathMLTextExceptions) or
                          tokenType in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         tokenType == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if tokenType == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif tokenType == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif tokenType == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif tokenType == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif tokenType == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif tokenType == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # Checked once the token has been fully handled, so that a phase
            # reached through reprocessing can still acknowledge the flag.
            if (tokenType == StartTagToken and token["selfClosing"]
                    and not token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Each phase may see EOF only once
                assert self.phase not in phases

    def normalizedTokens(self):
        """Yield each tokenizer token after normalizeToken() has been applied"""
        for token in self.tokenizer:
            yield self.normalizeToken(token)

    def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
        """Parse a HTML document into a well-formed tree

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta, useChardet - handed unchanged to the tokenizer class
        """
        self._parse(stream, innerHTML=False, encoding=encoding,
                    parseMeta=parseMeta, useChardet=useChardet)
        return self.tree.getDocument()

    def parseFragment(self, stream, container="div", encoding=None,
                      parseMeta=False, useChardet=True):
        """Parse a HTML fragment into a well-formed tree fragment

        container - name of the element we're setting the innerHTML property
        if set to None, default to 'div'

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta, useChardet - handed unchanged to the tokenizer class
        """
        # Honour the documented None -> 'div' fallback; reset() would
        # otherwise fail when it lower-cases the container name.
        if container is None:
            container = "div"
        # Forward the caller's flags; they used to be dropped here, which
        # made _parse() fall back to its own defaults.
        self._parse(stream, True, container=container, encoding=encoding,
                    parseMeta=parseMeta, useChardet=useChardet)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error; raise ParseError when in strict mode

        errorcode - key into the E message table
        datavars - mapping interpolated into the message; a fresh empty dict
        is used when omitted
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            # A new dict per call: the value is stored in self.errors and
            # must not be shared between unrelated entries.
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """

        if token["type"] == tokenTypes["StartTag"]:
            # Building the dict from the reversed pair list makes the first
            # occurrence of a duplicated attribute name the one that is kept.
            token["data"] = dict(token["data"][::-1])

        return token

    def adjustMathMLAttributes(self, token):
        """Rename, in place, MathML attributes that need mixed case"""
        replacements = {"definitionurl": "definitionURL"}
        for k, v in replacements.items():
            if k in token["data"]:
                token["data"][v] = token["data"][k]
                del token["data"][k]

    def adjustSVGAttributes(self, token):
        """Rename, in place, SVG attributes that need mixed case"""
        replacements = {
            "attributename": "attributeName",
            "attributetype": "attributeType",
            "basefrequency": "baseFrequency",
            "baseprofile": "baseProfile",
            "calcmode": "calcMode",
            "clippathunits": "clipPathUnits",
            "contentscripttype": "contentScriptType",
            "contentstyletype": "contentStyleType",
            "diffuseconstant": "diffuseConstant",
            "edgemode": "edgeMode",
            "externalresourcesrequired": "externalResourcesRequired",
            "filterres": "filterRes",
            "filterunits": "filterUnits",
            "glyphref": "glyphRef",
            "gradienttransform": "gradientTransform",
            "gradientunits": "gradientUnits",
            "kernelmatrix": "kernelMatrix",
            "kernelunitlength": "kernelUnitLength",
            "keypoints": "keyPoints",
            "keysplines": "keySplines",
            "keytimes": "keyTimes",
            "lengthadjust": "lengthAdjust",
            "limitingconeangle": "limitingConeAngle",
            "markerheight": "markerHeight",
            "markerunits": "markerUnits",
            "markerwidth": "markerWidth",
            "maskcontentunits": "maskContentUnits",
            "maskunits": "maskUnits",
            "numoctaves": "numOctaves",
            "pathlength": "pathLength",
            "patterncontentunits": "patternContentUnits",
            "patterntransform": "patternTransform",
            "patternunits": "patternUnits",
            "pointsatx": "pointsAtX",
            "pointsaty": "pointsAtY",
            "pointsatz": "pointsAtZ",
            "preservealpha": "preserveAlpha",
            "preserveaspectratio": "preserveAspectRatio",
            "primitiveunits": "primitiveUnits",
            "refx": "refX",
            "refy": "refY",
            "repeatcount": "repeatCount",
            "repeatdur": "repeatDur",
            "requiredextensions": "requiredExtensions",
            "requiredfeatures": "requiredFeatures",
            "specularconstant": "specularConstant",
            "specularexponent": "specularExponent",
            "spreadmethod": "spreadMethod",
            "startoffset": "startOffset",
            "stddeviation": "stdDeviation",
            "stitchtiles": "stitchTiles",
            "surfacescale": "surfaceScale",
            "systemlanguage": "systemLanguage",
            "tablevalues": "tableValues",
            "targetx": "targetX",
            "targety": "targetY",
            "textlength": "textLength",
            "viewbox": "viewBox",
            "viewtarget": "viewTarget",
            "xchannelselector": "xChannelSelector",
            "ychannelselector": "yChannelSelector",
            "zoomandpan": "zoomAndPan"
        }
        for originalName in list(token["data"].keys()):
            if originalName in replacements:
                svgName = replacements[originalName]
                token["data"][svgName] = token["data"][originalName]
                del token["data"][originalName]

    def adjustForeignAttributes(self, token):
        """Re-key, in place, attributes listed in adjustForeignAttributesMap"""
        replacements = adjustForeignAttributesMap

        # Iterate over a snapshot of the keys: the loop body inserts and
        # deletes entries, which is not safe on a live key view.
        for originalName in list(token["data"].keys()):
            if originalName in replacements:
                foreignName = replacements[originalName]
                token["data"][foreignName] = token["data"][originalName]
                del token["data"][originalName]

    def reparseTokenNormal(self, token):
        # NOTE(review): nothing in this class assigns a `parser` attribute,
        # so this call looks like it would raise AttributeError - confirm
        # whether the method is still used before relying on it.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose self.phase from the stack of open elements

        The stack is walked from the innermost element outwards; the first
        element whose name has an entry in the table below decides the
        phase. For the outermost element the fragment container name is
        used instead of the element's own name, with "inBody" as fallback.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            # Elements outside the default namespace never pick the phase
            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Generic RCDATA/RAWTEXT Parsing algorithm
        contentType - RCDATA or RAWTEXT

        Inserts the element for `token`, switches the tokenizer state,
        remembers the current phase in self.originalPhase and moves to the
        "text" phase.
        """
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]


def getPhases(debug):
    # HTMLParser.__init__ iterates getPhases(debug).items() as
    # (name, phase class) pairs. When `debug` is true every phase method is
    # wrapped by log() through the metaclass chosen in getMetaclass().
    def log(function):
        """Logger that records which phase processes each token"""
        type_names = dict((value, key) for key, value in
                          constants.tokenTypes.items())

        def wrapped(self, *args, **kwargs):
            if function.__name__.startswith("process") and len(args) > 0:
                token = args[0]
                info = {"type": type_names[token['type']]}
                if token['type'] in constants.tagTokenTypes:
                    info["name"] = token['name']

                self.parser.log.append((self.parser.tokenizer.state.__name__,
                                        self.parser.phase.__class__.__name__,
                                        self.__class__.__name__,
                                        function.__name__,
                                        info))
                return function(self, *args, **kwargs)
            else:
                return function(self, *args, **kwargs)
        return wrapped

    def getMetaclass(use_metaclass, metaclass_func):
        """Return the decorating metaclass when enabled, plain `type` otherwise"""
        if use_metaclass:
            return method_decorator_metaclass(metaclass_func)
        else:
            return type

    class Phase(with_metaclass(getMetaclass(debug, log))):
        """Base class for helper object that implements each phase of processing
        """

        def __init__(self, parser, tree):
            self.parser = parser
            self.tree = tree

        def processEOF(self):
            raise NotImplementedError

        def processComment(self, token):
            # For most phases the following is correct. Where it's not it will be
            # overridden.
            self.tree.insertComment(token, self.tree.openElements[-1])

        def processDoctype(self, token):
            self.parser.parseError("unexpected-doctype")

        def processCharacters(self, token):
            self.tree.insertText(token["data"])

        def processSpaceCharacters(self, token):
            self.tree.insertText(token["data"])

        def processStartTag(self, token):
            # startTagHandler is set up by the subclasses that rely on this
            # default dispatch
            return self.startTagHandler[token["name"]](token)

        def startTagHtml(self, token):
            if not self.parser.firstStartTag and token["name"] == "html":
                self.parser.parseError("non-html-root")
            # XXX Need a check here to see if the first start tag token emitted is
            # this token... If it's not, invoke self.parser.parseError().
            # Copy over only the attributes the root element does not have yet
            for attr, value in token["data"].items():
                if attr not in self.tree.openElements[0].attributes:
                    self.tree.openElements[0].attributes[attr] = value
            self.parser.firstStartTag = False

        def processEndTag(self, token):
            # endTagHandler is set up by the subclasses that rely on this
            # default dispatch
            return self.endTagHandler[token["name"]](token)

    class InitialPhase(Phase):
        def processSpaceCharacters(self, token):
            pass

        def processComment(self, token):
            self.tree.insertComment(token, self.tree.document)

        def processDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]
            correct = token["correct"]

            if (name != "html" or publicId is not None or
                    systemId is not None and systemId != "about:legacy-compat"):
                self.parser.parseError("unknown-doctype")

            if publicId is None:
                publicId = ""

            self.tree.insertDoctype(token)

            # Public identifiers are compared ASCII case-insensitively
            if publicId != "":
                publicId = publicId.translate(asciiUpper2Lower)

            if (not correct or token["name"] != "html"
                or publicId.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
                     "-//ietf//dtd html 2.0 level 1//",
                     "-//ietf//dtd html 2.0 level 2//",
                     "-//ietf//dtd html 2.0 strict level 1//",
                     "-//ietf//dtd html 2.0 strict level 2//",
                     "-//ietf//dtd html 2.0 strict//",
                     "-//ietf//dtd html 2.0//",
                     "-//ietf//dtd html 2.1e//",
                     "-//ietf//dtd html 3.0//",
                     "-//ietf//dtd html 3.2 final//",
                     "-//ietf//dtd html 3.2//",
                     "-//ietf//dtd html 3//",
                     "-//ietf//dtd html level 0//",
                     "-//ietf//dtd html level 1//",
                     "-//ietf//dtd html level 2//",
                     "-//ietf//dtd html level 3//",
                     "-//ietf//dtd html strict level 0//",
                     "-//ietf//dtd html strict level 1//",
                     "-//ietf//dtd html strict level 2//",
                     "-//ietf//dtd html strict level 3//",
                     "-//ietf//dtd html strict//",
                     "-//ietf//dtd html//",
                     "-//metrius//dtd metrius presentational//",
                     "-//microsoft//dtd internet explorer 2.0 html strict//",
                     "-//microsoft//dtd internet explorer 2.0 html//",
                     "-//microsoft//dtd internet explorer 2.0 tables//",
                     "-//microsoft//dtd internet explorer 3.0 html strict//",
                     "-//microsoft//dtd internet explorer 3.0 html//",
                     "-//microsoft//dtd internet explorer 3.0 tables//",
                     "-//netscape comm. corp.//dtd html//",
                     "-//netscape comm. corp.//dtd strict html//",
                     "-//o'reilly and associates//dtd html 2.0//",
                     "-//o'reilly and associates//dtd html extended 1.0//",
                     "-//o'reilly and associates//dtd html extended relaxed 1.0//",
                     "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
                     "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
                     "-//spyglass//dtd html 2.0 extended//",
                     "-//sq//dtd html 2.0 hotmetal + extensions//",
                     "-//sun microsystems corp.//dtd hotjava html//",
                     "-//sun microsystems corp.//dtd hotjava strict html//",
                     "-//w3c//dtd html 3 1995-03-24//",
                     "-//w3c//dtd html 3.2 draft//",
                     "-//w3c//dtd html 3.2 final//",
                     "-//w3c//dtd html 3.2//",
                     "-//w3c//dtd html 3.2s draft//",
                     "-//w3c//dtd html 4.0 frameset//",
                     "-//w3c//dtd html 4.0 transitional//",
                     "-//w3c//dtd html experimental 19960712//",
                     "-//w3c//dtd html experimental 970421//",
                     "-//w3c//dtd w3 html//",
                     "-//w3o//dtd w3 html 3.0//",
                     "-//webtechs//dtd mozilla html 2.0//",
                     "-//webtechs//dtd mozilla html//"))
                or publicId in
                    ("-//w3o//dtd w3 html strict 3.0//en//",
                     "-/w3c/dtd html 4.0 transitional/en",
                     "html")
                or publicId.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
                    systemId is None
                    or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
                self.parser.compatMode = "quirks"
            elif (publicId.startswith(
                    ("-//w3c//dtd xhtml 1.0 frameset//",
                     "-//w3c//dtd xhtml 1.0 transitional//"))
                  or publicId.startswith(
                      ("-//w3c//dtd html 4.01 frameset//",
                       "-//w3c//dtd html 4.01 transitional//")) and
                  systemId is not None):
                self.parser.compatMode = "limited quirks"

            self.parser.phase = self.parser.phases["beforeHtml"]

        def anythingElse(self):
            # No doctype was seen: quirks mode, then carry on before <html>
            self.parser.compatMode = "quirks"
            self.parser.phase = self.parser.phases["beforeHtml"]

        def processCharacters(self, token):
            self.parser.parseError("expected-doctype-but-got-chars")
            self.anythingElse()
            return token

        def processStartTag(self, token):
            self.parser.parseError("expected-doctype-but-got-start-tag",
                                   {"name": token["name"]})
            self.anythingElse()
            return token

        def processEndTag(self, token):
            self.parser.parseError("expected-doctype-but-got-end-tag",
                                   {"name": token["name"]})
            self.anythingElse()
            return token

        def processEOF(self):
            self.parser.parseError("expected-doctype-but-got-eof")
            self.anythingElse()
            return True

    class BeforeHtmlPhase(Phase):
        # helper methods
        def insertHtmlElement(self):
            self.tree.insertRoot(impliedTagToken("html", "StartTag"))
            self.parser.phase = self.parser.phases["beforeHead"]

        # other
        def processEOF(self):
            self.insertHtmlElement()
            return True

        def processComment(self, token):
            self.tree.insertComment(token, self.tree.document)

        def processSpaceCharacters(self, token):
            pass

        def processCharacters(self, token):
            self.insertHtmlElement()
            return token

        def processStartTag(self, token):
            if token["name"] == "html":
                self.parser.firstStartTag = True
            self.insertHtmlElement()
            return token

        def processEndTag(self, token):
            if token["name"] not in ("head", "body", "html", "br"):
                self.parser.parseError("unexpected-end-tag-before-html",
                                       {"name": token["name"]})
            else:
                self.insertHtmlElement()
                return token

    class BeforeHeadPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                ("head", self.startTagHead)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                (("head", "body", "html", "br"), self.endTagImplyHead)
            ])
            self.endTagHandler.default = self.endTagOther

        def processEOF(self):
            self.startTagHead(impliedTagToken("head", "StartTag"))
            return True

        def processSpaceCharacters(self, token):
            pass

        def processCharacters(self, token):
            self.startTagHead(impliedTagToken("head", "StartTag"))
            return token

        def startTagHtml(self, token):
            return self.parser.phases["inBody"].processStartTag(token)

        def startTagHead(self, token):
            self.tree.insertElement(token)
            self.tree.headPointer = self.tree.openElements[-1]
            self.parser.phase = self.parser.phases["inHead"]

        def startTagOther(self, token):
            self.startTagHead(impliedTagToken("head", "StartTag"))
            return token

        def endTagImplyHead(self, token):
            self.startTagHead(impliedTagToken("head", "StartTag"))
            return token

        def endTagOther(self, token):
            self.parser.parseError("end-tag-after-implied-root",
                                   {"name": token["name"]})

    class InHeadPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                ("title", self.startTagTitle),
                (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
                ("script", self.startTagScript),
                (("base", "basefont", "bgsound", "command", "link"),
                 self.startTagBaseLinkCommand),
                ("meta", self.startTagMeta),
                ("head", self.startTagHead)
            ])
            self.startTagHandler.default = self.startTagOther

            self. endTagHandler = utils.MethodDispatcher([
                ("head", self.endTagHead),
                (("br", "html", "body"), self.endTagHtmlBodyBr)
            ])
            self.endTagHandler.default = self.endTagOther

        # the real thing
        def processEOF(self):
            self.anythingElse()
            return True

        def processCharacters(self, token):
            self.anythingElse()
            return token

        def startTagHtml(self, token):
            return self.parser.phases["inBody"].processStartTag(token)

        def startTagHead(self, token):
            self.parser.parseError("two-heads-are-not-better-than-one")

        def startTagBaseLinkCommand(self, token):
            # Insert and immediately pop: these elements have no content
            self.tree.insertElement(token)
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

        def startTagMeta(self, token):
            self.tree.insertElement(token)
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

            attributes = token["data"]
            # Only a tentative encoding may still be replaced by what the
            # meta element declares
            if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
                if "charset" in attributes:
                    self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
                elif ("content" in attributes and
                      "http-equiv" in attributes and
                      attributes["http-equiv"].lower() == "content-type"):
                    # Encoding it as UTF-8 here is a hack, as really we should pass
                    # the abstract Unicode string, and just use the
                    # ContentAttrParser on that, but using UTF-8 allows all chars
                    # to be encoded and as a ASCII-superset works.
                    data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
                    parser = inputstream.ContentAttrParser(data)
                    codec = parser.parse()
                    self.parser.tokenizer.stream.changeEncoding(codec)

        def startTagTitle(self, token):
            self.parser.parseRCDataRawtext(token, "RCDATA")

        def startTagNoScriptNoFramesStyle(self, token):
            # Need to decide whether to implement the scripting-disabled case
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

        def startTagScript(self, token):
            self.tree.insertElement(token)
            self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
            self.parser.originalPhase = self.parser.phase
            self.parser.phase = self.parser.phases["text"]

        def startTagOther(self, token):
            self.anythingElse()
            return token

        def endTagHead(self, token):
            node = self.parser.tree.openElements.pop()
            assert node.name == "head", "Expected head got %s" % node.name
            self.parser.phase = self.parser.phases["afterHead"]

        def endTagHtmlBodyBr(self, token):
            self.anythingElse()
            return token

        def endTagOther(self, token):
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def anythingElse(self):
            # Act as if </head> had been seen
            self.endTagHead(impliedTagToken("head"))

    # XXX If we implement a parser for which scripting is disabled we need to
    # implement this phase.
802 # 803 # class InHeadNoScriptPhase(Phase): 804 class AfterHeadPhase(Phase): 805 def __init__(self, parser, tree): 806 Phase.__init__(self, parser, tree) 807 808 self.startTagHandler = utils.MethodDispatcher([ 809 ("html", self.startTagHtml), 810 ("body", self.startTagBody), 811 ("frameset", self.startTagFrameset), 812 (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", 813 "style", "title"), 814 self.startTagFromHead), 815 ("head", self.startTagHead) 816 ]) 817 self.startTagHandler.default = self.startTagOther 818 self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), 819 self.endTagHtmlBodyBr)]) 820 self.endTagHandler.default = self.endTagOther 821 822 def processEOF(self): 823 self.anythingElse() 824 return True 825 826 def processCharacters(self, token): 827 self.anythingElse() 828 return token 829 830 def startTagHtml(self, token): 831 return self.parser.phases["inBody"].processStartTag(token) 832 833 def startTagBody(self, token): 834 self.parser.framesetOK = False 835 self.tree.insertElement(token) 836 self.parser.phase = self.parser.phases["inBody"] 837 838 def startTagFrameset(self, token): 839 self.tree.insertElement(token) 840 self.parser.phase = self.parser.phases["inFrameset"] 841 842 def startTagFromHead(self, token): 843 self.parser.parseError("unexpected-start-tag-out-of-my-head", 844 {"name": token["name"]}) 845 self.tree.openElements.append(self.tree.headPointer) 846 self.parser.phases["inHead"].processStartTag(token) 847 for node in self.tree.openElements[::-1]: 848 if node.name == "head": 849 self.tree.openElements.remove(node) 850 break 851 852 def startTagHead(self, token): 853 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 854 855 def startTagOther(self, token): 856 self.anythingElse() 857 return token 858 859 def endTagHtmlBodyBr(self, token): 860 self.anythingElse() 861 return token 862 863 def endTagOther(self, token): 864 self.parser.parseError("unexpected-end-tag", {"name": 
token["name"]}) 865 866 def anythingElse(self): 867 self.tree.insertElement(impliedTagToken("body", "StartTag")) 868 self.parser.phase = self.parser.phases["inBody"] 869 self.parser.framesetOK = True 870 871 class InBodyPhase(Phase): 872 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody 873 # the really-really-really-very crazy mode 874 def __init__(self, parser, tree): 875 Phase.__init__(self, parser, tree) 876 877 # Keep a ref to this for special handling of whitespace in <pre> 878 self.processSpaceCharactersNonPre = self.processSpaceCharacters 879 880 self.startTagHandler = utils.MethodDispatcher([ 881 ("html", self.startTagHtml), 882 (("base", "basefont", "bgsound", "command", "link", "meta", 883 "script", "style", "title"), 884 self.startTagProcessInHead), 885 ("body", self.startTagBody), 886 ("frameset", self.startTagFrameset), 887 (("address", "article", "aside", "blockquote", "center", "details", 888 "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", 889 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", 890 "section", "summary", "ul"), 891 self.startTagCloseP), 892 (headingElements, self.startTagHeading), 893 (("pre", "listing"), self.startTagPreListing), 894 ("form", self.startTagForm), 895 (("li", "dd", "dt"), self.startTagListItem), 896 ("plaintext", self.startTagPlaintext), 897 ("a", self.startTagA), 898 (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 899 "strong", "tt", "u"), self.startTagFormatting), 900 ("nobr", self.startTagNobr), 901 ("button", self.startTagButton), 902 (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), 903 ("xmp", self.startTagXmp), 904 ("table", self.startTagTable), 905 (("area", "br", "embed", "img", "keygen", "wbr"), 906 self.startTagVoidFormatting), 907 (("param", "source", "track"), self.startTagParamSource), 908 ("input", self.startTagInput), 909 ("hr", self.startTagHr), 910 ("image", self.startTagImage), 911 ("isindex", 
self.startTagIsIndex), 912 ("textarea", self.startTagTextarea), 913 ("iframe", self.startTagIFrame), 914 (("noembed", "noframes", "noscript"), self.startTagRawtext), 915 ("select", self.startTagSelect), 916 (("rp", "rt"), self.startTagRpRt), 917 (("option", "optgroup"), self.startTagOpt), 918 (("math"), self.startTagMath), 919 (("svg"), self.startTagSvg), 920 (("caption", "col", "colgroup", "frame", "head", 921 "tbody", "td", "tfoot", "th", "thead", 922 "tr"), self.startTagMisplaced) 923 ]) 924 self.startTagHandler.default = self.startTagOther 925 926 self.endTagHandler = utils.MethodDispatcher([ 927 ("body", self.endTagBody), 928 ("html", self.endTagHtml), 929 (("address", "article", "aside", "blockquote", "button", "center", 930 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", 931 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", 932 "section", "summary", "ul"), self.endTagBlock), 933 ("form", self.endTagForm), 934 ("p", self.endTagP), 935 (("dd", "dt", "li"), self.endTagListItem), 936 (headingElements, self.endTagHeading), 937 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", 938 "strike", "strong", "tt", "u"), self.endTagFormatting), 939 (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), 940 ("br", self.endTagBr), 941 ]) 942 self.endTagHandler.default = self.endTagOther 943 944 def isMatchingFormattingElement(self, node1, node2): 945 if node1.name != node2.name or node1.namespace != node2.namespace: 946 return False 947 elif len(node1.attributes) != len(node2.attributes): 948 return False 949 else: 950 attributes1 = sorted(node1.attributes.items()) 951 attributes2 = sorted(node2.attributes.items()) 952 for attr1, attr2 in zip(attributes1, attributes2): 953 if attr1 != attr2: 954 return False 955 return True 956 957 # helper 958 def addFormattingElement(self, token): 959 self.tree.insertElement(token) 960 element = self.tree.openElements[-1] 961 962 matchingElements = [] 963 
for node in self.tree.activeFormattingElements[::-1]: 964 if node is Marker: 965 break 966 elif self.isMatchingFormattingElement(node, element): 967 matchingElements.append(node) 968 969 assert len(matchingElements) <= 3 970 if len(matchingElements) == 3: 971 self.tree.activeFormattingElements.remove(matchingElements[-1]) 972 self.tree.activeFormattingElements.append(element) 973 974 # the real deal 975 def processEOF(self): 976 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", 977 "tfoot", "th", "thead", "tr", "body", 978 "html")) 979 for node in self.tree.openElements[::-1]: 980 if node.name not in allowed_elements: 981 self.parser.parseError("expected-closing-tag-but-got-eof") 982 break 983 # Stop parsing 984 985 def processSpaceCharactersDropNewline(self, token): 986 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we 987 # want to drop leading newlines 988 data = token["data"] 989 self.processSpaceCharacters = self.processSpaceCharactersNonPre 990 if (data.startswith("\n") and 991 self.tree.openElements[-1].name in ("pre", "listing", "textarea") 992 and not self.tree.openElements[-1].hasContent()): 993 data = data[1:] 994 if data: 995 self.tree.reconstructActiveFormattingElements() 996 self.tree.insertText(data) 997 998 def processCharacters(self, token): 999 if token["data"] == "\u0000": 1000 # The tokenizer should always emit null on its own 1001 return 1002 self.tree.reconstructActiveFormattingElements() 1003 self.tree.insertText(token["data"]) 1004 # This must be bad for performance 1005 if (self.parser.framesetOK and 1006 any([char not in spaceCharacters 1007 for char in token["data"]])): 1008 self.parser.framesetOK = False 1009 1010 def processSpaceCharacters(self, token): 1011 self.tree.reconstructActiveFormattingElements() 1012 self.tree.insertText(token["data"]) 1013 1014 def startTagProcessInHead(self, token): 1015 return self.parser.phases["inHead"].processStartTag(token) 1016 1017 def startTagBody(self, token): 1018 
self.parser.parseError("unexpected-start-tag", {"name": "body"}) 1019 if (len(self.tree.openElements) == 1 1020 or self.tree.openElements[1].name != "body"): 1021 assert self.parser.innerHTML 1022 else: 1023 self.parser.framesetOK = False 1024 for attr, value in token["data"].items(): 1025 if attr not in self.tree.openElements[1].attributes: 1026 self.tree.openElements[1].attributes[attr] = value 1027 1028 def startTagFrameset(self, token): 1029 self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) 1030 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): 1031 assert self.parser.innerHTML 1032 elif not self.parser.framesetOK: 1033 pass 1034 else: 1035 if self.tree.openElements[1].parent: 1036 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) 1037 while self.tree.openElements[-1].name != "html": 1038 self.tree.openElements.pop() 1039 self.tree.insertElement(token) 1040 self.parser.phase = self.parser.phases["inFrameset"] 1041 1042 def startTagCloseP(self, token): 1043 if self.tree.elementInScope("p", variant="button"): 1044 self.endTagP(impliedTagToken("p")) 1045 self.tree.insertElement(token) 1046 1047 def startTagPreListing(self, token): 1048 if self.tree.elementInScope("p", variant="button"): 1049 self.endTagP(impliedTagToken("p")) 1050 self.tree.insertElement(token) 1051 self.parser.framesetOK = False 1052 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 1053 1054 def startTagForm(self, token): 1055 if self.tree.formPointer: 1056 self.parser.parseError("unexpected-start-tag", {"name": "form"}) 1057 else: 1058 if self.tree.elementInScope("p", variant="button"): 1059 self.endTagP(impliedTagToken("p")) 1060 self.tree.insertElement(token) 1061 self.tree.formPointer = self.tree.openElements[-1] 1062 1063 def startTagListItem(self, token): 1064 self.parser.framesetOK = False 1065 1066 stopNamesMap = {"li": ["li"], 1067 "dt": ["dt", "dd"], 1068 "dd": ["dt", "dd"]} 1069 stopNames = 
stopNamesMap[token["name"]] 1070 for node in reversed(self.tree.openElements): 1071 if node.name in stopNames: 1072 self.parser.phase.processEndTag( 1073 impliedTagToken(node.name, "EndTag")) 1074 break 1075 if (node.nameTuple in specialElements and 1076 node.name not in ("address", "div", "p")): 1077 break 1078 1079 if self.tree.elementInScope("p", variant="button"): 1080 self.parser.phase.processEndTag( 1081 impliedTagToken("p", "EndTag")) 1082 1083 self.tree.insertElement(token) 1084 1085 def startTagPlaintext(self, token): 1086 if self.tree.elementInScope("p", variant="button"): 1087 self.endTagP(impliedTagToken("p")) 1088 self.tree.insertElement(token) 1089 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState 1090 1091 def startTagHeading(self, token): 1092 if self.tree.elementInScope("p", variant="button"): 1093 self.endTagP(impliedTagToken("p")) 1094 if self.tree.openElements[-1].name in headingElements: 1095 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 1096 self.tree.openElements.pop() 1097 self.tree.insertElement(token) 1098 1099 def startTagA(self, token): 1100 afeAElement = self.tree.elementInActiveFormattingElements("a") 1101 if afeAElement: 1102 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1103 {"startName": "a", "endName": "a"}) 1104 self.endTagFormatting(impliedTagToken("a")) 1105 if afeAElement in self.tree.openElements: 1106 self.tree.openElements.remove(afeAElement) 1107 if afeAElement in self.tree.activeFormattingElements: 1108 self.tree.activeFormattingElements.remove(afeAElement) 1109 self.tree.reconstructActiveFormattingElements() 1110 self.addFormattingElement(token) 1111 1112 def startTagFormatting(self, token): 1113 self.tree.reconstructActiveFormattingElements() 1114 self.addFormattingElement(token) 1115 1116 def startTagNobr(self, token): 1117 self.tree.reconstructActiveFormattingElements() 1118 if self.tree.elementInScope("nobr"): 1119 
self.parser.parseError("unexpected-start-tag-implies-end-tag", 1120 {"startName": "nobr", "endName": "nobr"}) 1121 self.processEndTag(impliedTagToken("nobr")) 1122 # XXX Need tests that trigger the following 1123 self.tree.reconstructActiveFormattingElements() 1124 self.addFormattingElement(token) 1125 1126 def startTagButton(self, token): 1127 if self.tree.elementInScope("button"): 1128 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1129 {"startName": "button", "endName": "button"}) 1130 self.processEndTag(impliedTagToken("button")) 1131 return token 1132 else: 1133 self.tree.reconstructActiveFormattingElements() 1134 self.tree.insertElement(token) 1135 self.parser.framesetOK = False 1136 1137 def startTagAppletMarqueeObject(self, token): 1138 self.tree.reconstructActiveFormattingElements() 1139 self.tree.insertElement(token) 1140 self.tree.activeFormattingElements.append(Marker) 1141 self.parser.framesetOK = False 1142 1143 def startTagXmp(self, token): 1144 if self.tree.elementInScope("p", variant="button"): 1145 self.endTagP(impliedTagToken("p")) 1146 self.tree.reconstructActiveFormattingElements() 1147 self.parser.framesetOK = False 1148 self.parser.parseRCDataRawtext(token, "RAWTEXT") 1149 1150 def startTagTable(self, token): 1151 if self.parser.compatMode != "quirks": 1152 if self.tree.elementInScope("p", variant="button"): 1153 self.processEndTag(impliedTagToken("p")) 1154 self.tree.insertElement(token) 1155 self.parser.framesetOK = False 1156 self.parser.phase = self.parser.phases["inTable"] 1157 1158 def startTagVoidFormatting(self, token): 1159 self.tree.reconstructActiveFormattingElements() 1160 self.tree.insertElement(token) 1161 self.tree.openElements.pop() 1162 token["selfClosingAcknowledged"] = True 1163 self.parser.framesetOK = False 1164 1165 def startTagInput(self, token): 1166 framesetOK = self.parser.framesetOK 1167 self.startTagVoidFormatting(token) 1168 if ("type" in token["data"] and 1169 
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 1170 # input type=hidden doesn't change framesetOK 1171 self.parser.framesetOK = framesetOK 1172 1173 def startTagParamSource(self, token): 1174 self.tree.insertElement(token) 1175 self.tree.openElements.pop() 1176 token["selfClosingAcknowledged"] = True 1177 1178 def startTagHr(self, token): 1179 if self.tree.elementInScope("p", variant="button"): 1180 self.endTagP(impliedTagToken("p")) 1181 self.tree.insertElement(token) 1182 self.tree.openElements.pop() 1183 token["selfClosingAcknowledged"] = True 1184 self.parser.framesetOK = False 1185 1186 def startTagImage(self, token): 1187 # No really... 1188 self.parser.parseError("unexpected-start-tag-treated-as", 1189 {"originalName": "image", "newName": "img"}) 1190 self.processStartTag(impliedTagToken("img", "StartTag", 1191 attributes=token["data"], 1192 selfClosing=token["selfClosing"])) 1193 1194 def startTagIsIndex(self, token): 1195 self.parser.parseError("deprecated-tag", {"name": "isindex"}) 1196 if self.tree.formPointer: 1197 return 1198 form_attrs = {} 1199 if "action" in token["data"]: 1200 form_attrs["action"] = token["data"]["action"] 1201 self.processStartTag(impliedTagToken("form", "StartTag", 1202 attributes=form_attrs)) 1203 self.processStartTag(impliedTagToken("hr", "StartTag")) 1204 self.processStartTag(impliedTagToken("label", "StartTag")) 1205 # XXX Localization ... 1206 if "prompt" in token["data"]: 1207 prompt = token["data"]["prompt"] 1208 else: 1209 prompt = "This is a searchable index. 
Enter search keywords: " 1210 self.processCharacters( 1211 {"type": tokenTypes["Characters"], "data": prompt}) 1212 attributes = token["data"].copy() 1213 if "action" in attributes: 1214 del attributes["action"] 1215 if "prompt" in attributes: 1216 del attributes["prompt"] 1217 attributes["name"] = "isindex" 1218 self.processStartTag(impliedTagToken("input", "StartTag", 1219 attributes=attributes, 1220 selfClosing=token["selfClosing"])) 1221 self.processEndTag(impliedTagToken("label")) 1222 self.processStartTag(impliedTagToken("hr", "StartTag")) 1223 self.processEndTag(impliedTagToken("form")) 1224 1225 def startTagTextarea(self, token): 1226 self.tree.insertElement(token) 1227 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState 1228 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 1229 self.parser.framesetOK = False 1230 1231 def startTagIFrame(self, token): 1232 self.parser.framesetOK = False 1233 self.startTagRawtext(token) 1234 1235 def startTagRawtext(self, token): 1236 """iframe, noembed noframes, noscript(if scripting enabled)""" 1237 self.parser.parseRCDataRawtext(token, "RAWTEXT") 1238 1239 def startTagOpt(self, token): 1240 if self.tree.openElements[-1].name == "option": 1241 self.parser.phase.processEndTag(impliedTagToken("option")) 1242 self.tree.reconstructActiveFormattingElements() 1243 self.parser.tree.insertElement(token) 1244 1245 def startTagSelect(self, token): 1246 self.tree.reconstructActiveFormattingElements() 1247 self.tree.insertElement(token) 1248 self.parser.framesetOK = False 1249 if self.parser.phase in (self.parser.phases["inTable"], 1250 self.parser.phases["inCaption"], 1251 self.parser.phases["inColumnGroup"], 1252 self.parser.phases["inTableBody"], 1253 self.parser.phases["inRow"], 1254 self.parser.phases["inCell"]): 1255 self.parser.phase = self.parser.phases["inSelectInTable"] 1256 else: 1257 self.parser.phase = self.parser.phases["inSelect"] 1258 1259 def startTagRpRt(self, token): 1260 if 
self.tree.elementInScope("ruby"): 1261 self.tree.generateImpliedEndTags() 1262 if self.tree.openElements[-1].name != "ruby": 1263 self.parser.parseError() 1264 self.tree.insertElement(token) 1265 1266 def startTagMath(self, token): 1267 self.tree.reconstructActiveFormattingElements() 1268 self.parser.adjustMathMLAttributes(token) 1269 self.parser.adjustForeignAttributes(token) 1270 token["namespace"] = namespaces["mathml"] 1271 self.tree.insertElement(token) 1272 # Need to get the parse error right for the case where the token 1273 # has a namespace not equal to the xmlns attribute 1274 if token["selfClosing"]: 1275 self.tree.openElements.pop() 1276 token["selfClosingAcknowledged"] = True 1277 1278 def startTagSvg(self, token): 1279 self.tree.reconstructActiveFormattingElements() 1280 self.parser.adjustSVGAttributes(token) 1281 self.parser.adjustForeignAttributes(token) 1282 token["namespace"] = namespaces["svg"] 1283 self.tree.insertElement(token) 1284 # Need to get the parse error right for the case where the token 1285 # has a namespace not equal to the xmlns attribute 1286 if token["selfClosing"]: 1287 self.tree.openElements.pop() 1288 token["selfClosingAcknowledged"] = True 1289 1290 def startTagMisplaced(self, token): 1291 """ Elements that should be children of other elements that have a 1292 different insertion mode; here they are ignored 1293 "caption", "col", "colgroup", "frame", "frameset", "head", 1294 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", 1295 "tr", "noscript" 1296 """ 1297 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) 1298 1299 def startTagOther(self, token): 1300 self.tree.reconstructActiveFormattingElements() 1301 self.tree.insertElement(token) 1302 1303 def endTagP(self, token): 1304 if not self.tree.elementInScope("p", variant="button"): 1305 self.startTagCloseP(impliedTagToken("p", "StartTag")) 1306 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 1307 
self.endTagP(impliedTagToken("p", "EndTag")) 1308 else: 1309 self.tree.generateImpliedEndTags("p") 1310 if self.tree.openElements[-1].name != "p": 1311 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 1312 node = self.tree.openElements.pop() 1313 while node.name != "p": 1314 node = self.tree.openElements.pop() 1315 1316 def endTagBody(self, token): 1317 if not self.tree.elementInScope("body"): 1318 self.parser.parseError() 1319 return 1320 elif self.tree.openElements[-1].name != "body": 1321 for node in self.tree.openElements[2:]: 1322 if node.name not in frozenset(("dd", "dt", "li", "optgroup", 1323 "option", "p", "rp", "rt", 1324 "tbody", "td", "tfoot", 1325 "th", "thead", "tr", "body", 1326 "html")): 1327 # Not sure this is the correct name for the parse error 1328 self.parser.parseError( 1329 "expected-one-end-tag-but-got-another", 1330 {"expectedName": "body", "gotName": node.name}) 1331 break 1332 self.parser.phase = self.parser.phases["afterBody"] 1333 1334 def endTagHtml(self, token): 1335 # We repeat the test for the body end tag token being ignored here 1336 if self.tree.elementInScope("body"): 1337 self.endTagBody(impliedTagToken("body")) 1338 return token 1339 1340 def endTagBlock(self, token): 1341 # Put us back in the right whitespace handling mode 1342 if token["name"] == "pre": 1343 self.processSpaceCharacters = self.processSpaceCharactersNonPre 1344 inScope = self.tree.elementInScope(token["name"]) 1345 if inScope: 1346 self.tree.generateImpliedEndTags() 1347 if self.tree.openElements[-1].name != token["name"]: 1348 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1349 if inScope: 1350 node = self.tree.openElements.pop() 1351 while node.name != token["name"]: 1352 node = self.tree.openElements.pop() 1353 1354 def endTagForm(self, token): 1355 node = self.tree.formPointer 1356 self.tree.formPointer = None 1357 if node is None or not self.tree.elementInScope(node): 1358 self.parser.parseError("unexpected-end-tag", 1359 
{"name": "form"}) 1360 else: 1361 self.tree.generateImpliedEndTags() 1362 if self.tree.openElements[-1] != node: 1363 self.parser.parseError("end-tag-too-early-ignored", 1364 {"name": "form"}) 1365 self.tree.openElements.remove(node) 1366 1367 def endTagListItem(self, token): 1368 if token["name"] == "li": 1369 variant = "list" 1370 else: 1371 variant = None 1372 if not self.tree.elementInScope(token["name"], variant=variant): 1373 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1374 else: 1375 self.tree.generateImpliedEndTags(exclude=token["name"]) 1376 if self.tree.openElements[-1].name != token["name"]: 1377 self.parser.parseError( 1378 "end-tag-too-early", 1379 {"name": token["name"]}) 1380 node = self.tree.openElements.pop() 1381 while node.name != token["name"]: 1382 node = self.tree.openElements.pop() 1383 1384 def endTagHeading(self, token): 1385 for item in headingElements: 1386 if self.tree.elementInScope(item): 1387 self.tree.generateImpliedEndTags() 1388 break 1389 if self.tree.openElements[-1].name != token["name"]: 1390 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1391 1392 for item in headingElements: 1393 if self.tree.elementInScope(item): 1394 item = self.tree.openElements.pop() 1395 while item.name not in headingElements: 1396 item = self.tree.openElements.pop() 1397 break 1398 1399 def endTagFormatting(self, token): 1400 """The much-feared adoption agency algorithm""" 1401 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 1402 # XXX Better parseError messages appreciated. 
1403 1404 # Step 1 1405 outerLoopCounter = 0 1406 1407 # Step 2 1408 while outerLoopCounter < 8: 1409 1410 # Step 3 1411 outerLoopCounter += 1 1412 1413 # Step 4: 1414 1415 # Let the formatting element be the last element in 1416 # the list of active formatting elements that: 1417 # - is between the end of the list and the last scope 1418 # marker in the list, if any, or the start of the list 1419 # otherwise, and 1420 # - has the same tag name as the token. 1421 formattingElement = self.tree.elementInActiveFormattingElements( 1422 token["name"]) 1423 if (not formattingElement or 1424 (formattingElement in self.tree.openElements and 1425 not self.tree.elementInScope(formattingElement.name))): 1426 # If there is no such node, then abort these steps 1427 # and instead act as described in the "any other 1428 # end tag" entry below. 1429 self.endTagOther(token) 1430 return 1431 1432 # Otherwise, if there is such a node, but that node is 1433 # not in the stack of open elements, then this is a 1434 # parse error; remove the element from the list, and 1435 # abort these steps. 1436 elif formattingElement not in self.tree.openElements: 1437 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) 1438 self.tree.activeFormattingElements.remove(formattingElement) 1439 return 1440 1441 # Otherwise, if there is such a node, and that node is 1442 # also in the stack of open elements, but the element 1443 # is not in scope, then this is a parse error; ignore 1444 # the token, and abort these steps. 1445 elif not self.tree.elementInScope(formattingElement.name): 1446 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) 1447 return 1448 1449 # Otherwise, there is a formatting element and that 1450 # element is in the stack and is in scope. If the 1451 # element is not the current node, this is a parse 1452 # error. In any case, proceed with the algorithm as 1453 # written in the following steps. 
1454 else: 1455 if formattingElement != self.tree.openElements[-1]: 1456 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) 1457 1458 # Step 5: 1459 1460 # Let the furthest block be the topmost node in the 1461 # stack of open elements that is lower in the stack 1462 # than the formatting element, and is an element in 1463 # the special category. There might not be one. 1464 afeIndex = self.tree.openElements.index(formattingElement) 1465 furthestBlock = None 1466 for element in self.tree.openElements[afeIndex:]: 1467 if element.nameTuple in specialElements: 1468 furthestBlock = element 1469 break 1470 1471 # Step 6: 1472 1473 # If there is no furthest block, then the UA must 1474 # first pop all the nodes from the bottom of the stack 1475 # of open elements, from the current node up to and 1476 # including the formatting element, then remove the 1477 # formatting element from the list of active 1478 # formatting elements, and finally abort these steps. 1479 if furthestBlock is None: 1480 element = self.tree.openElements.pop() 1481 while element != formattingElement: 1482 element = self.tree.openElements.pop() 1483 self.tree.activeFormattingElements.remove(element) 1484 return 1485 1486 # Step 7 1487 commonAncestor = self.tree.openElements[afeIndex - 1] 1488 1489 # Step 8: 1490 # The bookmark is supposed to help us identify where to reinsert 1491 # nodes in step 15. We have to ensure that we reinsert nodes after 1492 # the node before the active formatting element. 
Note the bookmark 1493 # can move in step 9.7 1494 bookmark = self.tree.activeFormattingElements.index(formattingElement) 1495 1496 # Step 9 1497 lastNode = node = furthestBlock 1498 innerLoopCounter = 0 1499 1500 index = self.tree.openElements.index(node) 1501 while innerLoopCounter < 3: 1502 innerLoopCounter += 1 1503 # Node is element before node in open elements 1504 index -= 1 1505 node = self.tree.openElements[index] 1506 if node not in self.tree.activeFormattingElements: 1507 self.tree.openElements.remove(node) 1508 continue 1509 # Step 9.6 1510 if node == formattingElement: 1511 break 1512 # Step 9.7 1513 if lastNode == furthestBlock: 1514 bookmark = self.tree.activeFormattingElements.index(node) + 1 1515 # Step 9.8 1516 clone = node.cloneNode() 1517 # Replace node with clone 1518 self.tree.activeFormattingElements[ 1519 self.tree.activeFormattingElements.index(node)] = clone 1520 self.tree.openElements[ 1521 self.tree.openElements.index(node)] = clone 1522 node = clone 1523 # Step 9.9 1524 # Remove lastNode from its parents, if any 1525 if lastNode.parent: 1526 lastNode.parent.removeChild(lastNode) 1527 node.appendChild(lastNode) 1528 # Step 9.10 1529 lastNode = node 1530 1531 # Step 10 1532 # Foster parent lastNode if commonAncestor is a 1533 # table, tbody, tfoot, thead, or tr we need to foster 1534 # parent the lastNode 1535 if lastNode.parent: 1536 lastNode.parent.removeChild(lastNode) 1537 1538 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): 1539 parent, insertBefore = self.tree.getTableMisnestedNodePosition() 1540 parent.insertBefore(lastNode, insertBefore) 1541 else: 1542 commonAncestor.appendChild(lastNode) 1543 1544 # Step 11 1545 clone = formattingElement.cloneNode() 1546 1547 # Step 12 1548 furthestBlock.reparentChildren(clone) 1549 1550 # Step 13 1551 furthestBlock.appendChild(clone) 1552 1553 # Step 14 1554 self.tree.activeFormattingElements.remove(formattingElement) 1555 
self.tree.activeFormattingElements.insert(bookmark, clone) 1556 1557 # Step 15 1558 self.tree.openElements.remove(formattingElement) 1559 self.tree.openElements.insert( 1560 self.tree.openElements.index(furthestBlock) + 1, clone) 1561 1562 def endTagAppletMarqueeObject(self, token): 1563 if self.tree.elementInScope(token["name"]): 1564 self.tree.generateImpliedEndTags() 1565 if self.tree.openElements[-1].name != token["name"]: 1566 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 1567 1568 if self.tree.elementInScope(token["name"]): 1569 element = self.tree.openElements.pop() 1570 while element.name != token["name"]: 1571 element = self.tree.openElements.pop() 1572 self.tree.clearActiveFormattingElements() 1573 1574 def endTagBr(self, token): 1575 self.parser.parseError("unexpected-end-tag-treated-as", 1576 {"originalName": "br", "newName": "br element"}) 1577 self.tree.reconstructActiveFormattingElements() 1578 self.tree.insertElement(impliedTagToken("br", "StartTag")) 1579 self.tree.openElements.pop() 1580 1581 def endTagOther(self, token): 1582 for node in self.tree.openElements[::-1]: 1583 if node.name == token["name"]: 1584 self.tree.generateImpliedEndTags(exclude=token["name"]) 1585 if self.tree.openElements[-1].name != token["name"]: 1586 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1587 while self.tree.openElements.pop() != node: 1588 pass 1589 break 1590 else: 1591 if node.nameTuple in specialElements: 1592 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 1593 break 1594 1595 class TextPhase(Phase): 1596 def __init__(self, parser, tree): 1597 Phase.__init__(self, parser, tree) 1598 self.startTagHandler = utils.MethodDispatcher([]) 1599 self.startTagHandler.default = self.startTagOther 1600 self.endTagHandler = utils.MethodDispatcher([ 1601 ("script", self.endTagScript)]) 1602 self.endTagHandler.default = self.endTagOther 1603 1604 def processCharacters(self, token): 1605 
self.tree.insertText(token["data"]) 1606 1607 def processEOF(self): 1608 self.parser.parseError("expected-named-closing-tag-but-got-eof", 1609 {"name": self.tree.openElements[-1].name}) 1610 self.tree.openElements.pop() 1611 self.parser.phase = self.parser.originalPhase 1612 return True 1613 1614 def startTagOther(self, token): 1615 assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] 1616 1617 def endTagScript(self, token): 1618 node = self.tree.openElements.pop() 1619 assert node.name == "script" 1620 self.parser.phase = self.parser.originalPhase 1621 # The rest of this method is all stuff that only happens if 1622 # document.write works 1623 1624 def endTagOther(self, token): 1625 self.tree.openElements.pop() 1626 self.parser.phase = self.parser.originalPhase 1627 1628 class InTablePhase(Phase): 1629 # http://www.whatwg.org/specs/web-apps/current-work/#in-table 1630 def __init__(self, parser, tree): 1631 Phase.__init__(self, parser, tree) 1632 self.startTagHandler = utils.MethodDispatcher([ 1633 ("html", self.startTagHtml), 1634 ("caption", self.startTagCaption), 1635 ("colgroup", self.startTagColgroup), 1636 ("col", self.startTagCol), 1637 (("tbody", "tfoot", "thead"), self.startTagRowGroup), 1638 (("td", "th", "tr"), self.startTagImplyTbody), 1639 ("table", self.startTagTable), 1640 (("style", "script"), self.startTagStyleScript), 1641 ("input", self.startTagInput), 1642 ("form", self.startTagForm) 1643 ]) 1644 self.startTagHandler.default = self.startTagOther 1645 1646 self.endTagHandler = utils.MethodDispatcher([ 1647 ("table", self.endTagTable), 1648 (("body", "caption", "col", "colgroup", "html", "tbody", "td", 1649 "tfoot", "th", "thead", "tr"), self.endTagIgnore) 1650 ]) 1651 self.endTagHandler.default = self.endTagOther 1652 1653 # helper methods 1654 def clearStackToTableContext(self): 1655 # "clear the stack back to a table context" 1656 while self.tree.openElements[-1].name not in ("table", "html"): 1657 # 
self.parser.parseError("unexpected-implied-end-tag-in-table", 1658 # {"name": self.tree.openElements[-1].name}) 1659 self.tree.openElements.pop() 1660 # When the current node is <html> it's an innerHTML case 1661 1662 # processing methods 1663 def processEOF(self): 1664 if self.tree.openElements[-1].name != "html": 1665 self.parser.parseError("eof-in-table") 1666 else: 1667 assert self.parser.innerHTML 1668 # Stop parsing 1669 1670 def processSpaceCharacters(self, token): 1671 originalPhase = self.parser.phase 1672 self.parser.phase = self.parser.phases["inTableText"] 1673 self.parser.phase.originalPhase = originalPhase 1674 self.parser.phase.processSpaceCharacters(token) 1675 1676 def processCharacters(self, token): 1677 originalPhase = self.parser.phase 1678 self.parser.phase = self.parser.phases["inTableText"] 1679 self.parser.phase.originalPhase = originalPhase 1680 self.parser.phase.processCharacters(token) 1681 1682 def insertText(self, token): 1683 # If we get here there must be at least one non-whitespace character 1684 # Do the table magic! 
1685 self.tree.insertFromTable = True 1686 self.parser.phases["inBody"].processCharacters(token) 1687 self.tree.insertFromTable = False 1688 1689 def startTagCaption(self, token): 1690 self.clearStackToTableContext() 1691 self.tree.activeFormattingElements.append(Marker) 1692 self.tree.insertElement(token) 1693 self.parser.phase = self.parser.phases["inCaption"] 1694 1695 def startTagColgroup(self, token): 1696 self.clearStackToTableContext() 1697 self.tree.insertElement(token) 1698 self.parser.phase = self.parser.phases["inColumnGroup"] 1699 1700 def startTagCol(self, token): 1701 self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) 1702 return token 1703 1704 def startTagRowGroup(self, token): 1705 self.clearStackToTableContext() 1706 self.tree.insertElement(token) 1707 self.parser.phase = self.parser.phases["inTableBody"] 1708 1709 def startTagImplyTbody(self, token): 1710 self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) 1711 return token 1712 1713 def startTagTable(self, token): 1714 self.parser.parseError("unexpected-start-tag-implies-end-tag", 1715 {"startName": "table", "endName": "table"}) 1716 self.parser.phase.processEndTag(impliedTagToken("table")) 1717 if not self.parser.innerHTML: 1718 return token 1719 1720 def startTagStyleScript(self, token): 1721 return self.parser.phases["inHead"].processStartTag(token) 1722 1723 def startTagInput(self, token): 1724 if ("type" in token["data"] and 1725 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 1726 self.parser.parseError("unexpected-hidden-input-in-table") 1727 self.tree.insertElement(token) 1728 # XXX associate with form 1729 self.tree.openElements.pop() 1730 else: 1731 self.startTagOther(token) 1732 1733 def startTagForm(self, token): 1734 self.parser.parseError("unexpected-form-in-table") 1735 if self.tree.formPointer is None: 1736 self.tree.insertElement(token) 1737 self.tree.formPointer = self.tree.openElements[-1] 1738 self.tree.openElements.pop() 1739 1740 def 
startTagOther(self, token):
            # NOTE(review): the `def` keyword of this method sits on the
            # preceding physical line, outside this block.
            # Reports a parse error, then lets the "inBody" phase handle the
            # start tag while tree.insertFromTable is switched on.
            self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
            # Do the table magic!
            self.tree.insertFromTable = True
            self.parser.phases["inBody"].processStartTag(token)
            self.tree.insertFromTable = False

        def endTagTable(self, token):
            """Close the open table element, if one is in table scope.

            Pops the stack of open elements down to and including the
            "table" element and resets the insertion mode. When no table is
            in table scope, a parse error is reported instead.
            """
            if self.tree.elementInScope("table", variant="table"):
                self.tree.generateImpliedEndTags()
                if self.tree.openElements[-1].name != "table":
                    self.parser.parseError("end-tag-too-early-named",
                                           {"gotName": "table",
                                            "expectedName": self.tree.openElements[-1].name})
                while self.tree.openElements[-1].name != "table":
                    self.tree.openElements.pop()
                self.tree.openElements.pop()
                self.parser.resetInsertionMode()
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagIgnore(self, token):
            """Report the end tag as unexpected; the tree is not modified."""
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def endTagOther(self, token):
            """Report a parse error and let "inBody" process the end tag.

            tree.insertFromTable is switched on for the duration of the call.
            """
            self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
            # Do the table magic!
            self.tree.insertFromTable = True
            self.parser.phases["inBody"].processEndTag(token)
            self.tree.insertFromTable = False

    class InTableTextPhase(Phase):
        """Buffer character tokens until a non-character token arrives.

        Any comment, EOF, start tag or end tag flushes the buffer, makes
        ``originalPhase`` the current parser phase again and hands the
        token back to the caller.
        """

        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
            # Phase to switch back to; None here, so it is presumably
            # assigned by whoever activates this phase (not visible here).
            self.originalPhase = None
            # Character tokens collected since the last flush.
            self.characterTokens = []

        def flushCharacters(self):
            """Insert the buffered text into the tree and empty the buffer.

            Text containing any character outside ``spaceCharacters`` goes
            through the "inTable" phase's ``insertText``; non-empty
            whitespace-only text is inserted directly.
            """
            data = "".join([item["data"] for item in self.characterTokens])
            if any([item not in spaceCharacters for item in data]):
                token = {"type": tokenTypes["Characters"], "data": data}
                self.parser.phases["inTable"].insertText(token)
            elif data:
                self.tree.insertText(data)
            self.characterTokens = []

        def processComment(self, token):
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token

        def processEOF(self):
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return True

        def processCharacters(self, token):
            # A token whose data is exactly U+0000 is dropped, not buffered.
            if token["data"] == "\u0000":
                return
            self.characterTokens.append(token)

        def processSpaceCharacters(self, token):
            # pretty sure we should never reach here
            self.characterTokens.append(token)
            # assert False

        def processStartTag(self, token):
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token

        def processEndTag(self, token):
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token

    class InCaptionPhase(Phase):
        """Token handlers used while a caption element is open."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
                  "thead", "tr"), self.startTagTableElement)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                ("caption", self.endTagCaption),
                ("table", self.endTagTable),
                (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
                  "thead", "tr"), self.endTagIgnore)
            ])
            self.endTagHandler.default = self.endTagOther

        def ignoreEndTagCaption(self):
            """Return True when no "caption" element is in table scope."""
            return not self.tree.elementInScope("caption", variant="table")

        def processEOF(self):
            self.parser.phases["inBody"].processEOF()

        def processCharacters(self, token):
            return self.parser.phases["inBody"].processCharacters(token)

        def startTagTableElement(self, token):
            """Imply ``</caption>``; return the token unless that was ignored."""
            self.parser.parseError()
            # XXX Have to duplicate logic here to find out if the tag is ignored
            ignoreEndTag = self.ignoreEndTagCaption()
            self.parser.phase.processEndTag(impliedTagToken("caption"))
            if not ignoreEndTag:
                return token

        def startTagOther(self, token):
            return self.parser.phases["inBody"].processStartTag(token)

        def endTagCaption(self, token):
            """Close the caption and switch the parser to the "inTable" phase.

            When no caption is in table scope, a parse error is reported
            and nothing is popped.
            """
            if not self.ignoreEndTagCaption():
                # AT this code is quite similar to endTagTable in "InTable"
                self.tree.generateImpliedEndTags()
                if self.tree.openElements[-1].name != "caption":
                    self.parser.parseError("expected-one-end-tag-but-got-another",
                                           {"gotName": "caption",
                                            "expectedName": self.tree.openElements[-1].name})
                while self.tree.openElements[-1].name != "caption":
                    self.tree.openElements.pop()
                self.tree.openElements.pop()
                self.tree.clearActiveFormattingElements()
                self.parser.phase = self.parser.phases["inTable"]
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagTable(self, token):
            """Imply ``</caption>``; return the token unless that was ignored."""
            self.parser.parseError()
            ignoreEndTag = self.ignoreEndTagCaption()
            self.parser.phase.processEndTag(impliedTagToken("caption"))
            if not ignoreEndTag:
                return token

        def endTagIgnore(self, token):
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def endTagOther(self, token):
            return self.parser.phases["inBody"].processEndTag(token)

    class InColumnGroupPhase(Phase):
        """Token handlers used while a colgroup element is open."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-column

        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                ("col", self.startTagCol)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                ("colgroup", self.endTagColgroup),
                ("col", self.endTagCol)
            ])
            self.endTagHandler.default = self.endTagOther

        def ignoreEndTagColgroup(self):
            """Return True when the current node is the "html" element."""
            return self.tree.openElements[-1].name == "html"

        def processEOF(self):
            if self.tree.openElements[-1].name == "html":
                assert self.parser.innerHTML
                return
            else:
                ignoreEndTag = self.ignoreEndTagColgroup()
                self.endTagColgroup(impliedTagToken("colgroup"))
                if not ignoreEndTag:
                    return True

        def processCharacters(self, token):
            # Imply </colgroup>, then hand the token back unless the implied
            # end tag was ignored.
            ignoreEndTag = self.ignoreEndTagColgroup()
            self.endTagColgroup(impliedTagToken("colgroup"))
            if not ignoreEndTag:
                return token

        def startTagCol(self, token):
            # The element is inserted and immediately popped again, so it
            # never stays on the stack of open elements.
            self.tree.insertElement(token)
            self.tree.openElements.pop()

        def startTagOther(self, token):
            ignoreEndTag = self.ignoreEndTagColgroup()
            self.endTagColgroup(impliedTagToken("colgroup"))
            if not ignoreEndTag:
                return token

        def endTagColgroup(self, token):
            """Pop the current node and switch to the "inTable" phase.

            When the current node is "html", a parse error is reported
            instead and the stack is left alone.
            """
            if self.ignoreEndTagColgroup():
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
            else:
                self.tree.openElements.pop()
                self.parser.phase = self.parser.phases["inTable"]

        def endTagCol(self, token):
            self.parser.parseError("no-end-tag", {"name": "col"})

        def endTagOther(self, token):
            ignoreEndTag = self.ignoreEndTagColgroup()
            self.endTagColgroup(impliedTagToken("colgroup"))
            if not ignoreEndTag:
                return token

    class InTableBodyPhase(Phase):
        """Token handlers used while a tbody, thead or tfoot is open."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                ("tr", self.startTagTr),
                (("td", "th"), self.startTagTableCell),
                (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
                 self.startTagTableOther)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
                ("table", self.endTagTable),
                (("body", "caption", "col", "colgroup", "html", "td", "th",
                  "tr"), self.endTagIgnore)
            ])
            self.endTagHandler.default = self.endTagOther

        # helper methods
        def clearStackToTableBodyContext(self):
            """Pop open elements until tbody, tfoot, thead or html is current."""
            while self.tree.openElements[-1].name not in ("tbody", "tfoot",
                                                          "thead", "html"):
                # self.parser.parseError("unexpected-implied-end-tag-in-table",
                #  {"name": self.tree.openElements[-1].name})
                self.tree.openElements.pop()
            if self.tree.openElements[-1].name == "html":
                assert self.parser.innerHTML

        # the rest
        def processEOF(self):
            self.parser.phases["inTable"].processEOF()

        def processSpaceCharacters(self, token):
            return self.parser.phases["inTable"].processSpaceCharacters(token)

        def processCharacters(self, token):
            return self.parser.phases["inTable"].processCharacters(token)

        def startTagTr(self, token):
            self.clearStackToTableBodyContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inRow"]

        def startTagTableCell(self, token):
            # A cell directly inside a row group: report it, open an implied
            # <tr>, and hand the cell token back.
            self.parser.parseError("unexpected-cell-in-table-body",
                                   {"name": token["name"]})
            self.startTagTr(impliedTagToken("tr", "StartTag"))
            return token

        def startTagTableOther(self, token):
            # XXX AT Any ideas on how to share this with endTagTable?
            if (self.tree.elementInScope("tbody", variant="table") or
                self.tree.elementInScope("thead", variant="table") or
                    self.tree.elementInScope("tfoot", variant="table")):
                self.clearStackToTableBodyContext()
                self.endTagTableRowGroup(
                    impliedTagToken(self.tree.openElements[-1].name))
                return token
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def startTagOther(self, token):
            return self.parser.phases["inTable"].processStartTag(token)

        def endTagTableRowGroup(self, token):
            """Close the named row group and switch to the "inTable" phase.

            When the element is not in table scope, a parse error is
            reported instead.
            """
            if self.tree.elementInScope(token["name"], variant="table"):
                self.clearStackToTableBodyContext()
                self.tree.openElements.pop()
                self.parser.phase = self.parser.phases["inTable"]
            else:
                self.parser.parseError("unexpected-end-tag-in-table-body",
                                       {"name": token["name"]})

        def endTagTable(self, token):
            # Same steps as startTagTableOther: close the current row group
            # with an implied end tag, then hand the token back.
            if (self.tree.elementInScope("tbody", variant="table") or
                self.tree.elementInScope("thead", variant="table") or
                    self.tree.elementInScope("tfoot", variant="table")):
                self.clearStackToTableBodyContext()
                self.endTagTableRowGroup(
                    impliedTagToken(self.tree.openElements[-1].name))
                return token
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagIgnore(self, token):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": token["name"]})

        def endTagOther(self, token):
            return self.parser.phases["inTable"].processEndTag(token)

    class InRowPhase(Phase):
        """Token handlers used while a tr element is open."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-row
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                (("td", "th"), self.startTagTableCell),
                (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
                  "tr"), self.startTagTableOther)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                ("tr", self.endTagTr),
                ("table", self.endTagTable),
                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
                (("body", "caption", "col", "colgroup", "html", "td", "th"),
                 self.endTagIgnore)
            ])
            self.endTagHandler.default = self.endTagOther

        # helper methods (XXX unify this with other table helper methods)
        def clearStackToTableRowContext(self):
            """Pop open elements until tr or html is current.

            A parse error is reported for every element popped.
            """
            while self.tree.openElements[-1].name not in ("tr", "html"):
                self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                       {"name": self.tree.openElements[-1].name})
                self.tree.openElements.pop()

        def ignoreEndTagTr(self):
            """Return True when no "tr" element is in table scope."""
            return not self.tree.elementInScope("tr", variant="table")

        # the rest
        def processEOF(self):
            self.parser.phases["inTable"].processEOF()

        def processSpaceCharacters(self, token):
            return self.parser.phases["inTable"].processSpaceCharacters(token)

        def processCharacters(self, token):
            return self.parser.phases["inTable"].processCharacters(token)

        def startTagTableCell(self, token):
            self.clearStackToTableRowContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inCell"]
            # A Marker is appended to the active formatting elements for
            # each cell that is opened.
            self.tree.activeFormattingElements.append(Marker)

        def startTagTableOther(self, token):
            ignoreEndTag = self.ignoreEndTagTr()
            self.endTagTr(impliedTagToken("tr"))
            # XXX how are we sure it's always ignored in the innerHTML case?
            if not ignoreEndTag:
                return token

        def startTagOther(self, token):
            return self.parser.phases["inTable"].processStartTag(token)

        def endTagTr(self, token):
            """Close the row and switch to the "inTableBody" phase.

            When no tr is in table scope, a parse error is reported instead.
            """
            if not self.ignoreEndTagTr():
                self.clearStackToTableRowContext()
                self.tree.openElements.pop()
                self.parser.phase = self.parser.phases["inTableBody"]
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagTable(self, token):
            ignoreEndTag = self.ignoreEndTagTr()
            self.endTagTr(impliedTagToken("tr"))
            # Reprocess the current tag if the tr end tag was not ignored
            # XXX how are we sure it's always ignored in the innerHTML case?
            if not ignoreEndTag:
                return token

        def endTagTableRowGroup(self, token):
            if self.tree.elementInScope(token["name"], variant="table"):
                self.endTagTr(impliedTagToken("tr"))
                return token
            else:
                self.parser.parseError()

        def endTagIgnore(self, token):
            self.parser.parseError("unexpected-end-tag-in-table-row",
                                   {"name": token["name"]})

        def endTagOther(self, token):
            return self.parser.phases["inTable"].processEndTag(token)

    class InCellPhase(Phase):
        """Token handlers used while a td or th element is open."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
                  "thead", "tr"), self.startTagTableOther)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                (("td", "th"), self.endTagTableCell),
                (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
                (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
            ])
            self.endTagHandler.default = self.endTagOther

        # helper
        def closeCell(self):
            """Close whichever of td / th is in table scope (td is tried first)."""
            if self.tree.elementInScope("td", variant="table"):
                self.endTagTableCell(impliedTagToken("td"))
            elif self.tree.elementInScope("th", variant="table"):
                self.endTagTableCell(impliedTagToken("th"))

        # the rest
        def processEOF(self):
            self.parser.phases["inBody"].processEOF()

        def processCharacters(self, token):
            return self.parser.phases["inBody"].processCharacters(token)

        def startTagTableOther(self, token):
            if (self.tree.elementInScope("td", variant="table") or
                    self.tree.elementInScope("th", variant="table")):
                self.closeCell()
                return token
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def startTagOther(self, token):
            return self.parser.phases["inBody"].processStartTag(token)

        def endTagTableCell(self, token):
            """Close the named cell and switch to the "inRow" phase.

            When the cell is not the current node after implied end tags
            have been generated, a parse error is reported and elements are
            popped until the cell itself has been removed.
            """
            if self.tree.elementInScope(token["name"], variant="table"):
                self.tree.generateImpliedEndTags(token["name"])
                if self.tree.openElements[-1].name != token["name"]:
                    self.parser.parseError("unexpected-cell-end-tag",
                                           {"name": token["name"]})
                    while True:
                        node = self.tree.openElements.pop()
                        if node.name == token["name"]:
                            break
                else:
                    self.tree.openElements.pop()
                self.tree.clearActiveFormattingElements()
                self.parser.phase = self.parser.phases["inRow"]
            else:
                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def endTagIgnore(self, token):
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def endTagImply(self, token):
            if self.tree.elementInScope(token["name"], variant="table"):
                self.closeCell()
                return token
            else:
                # sometimes innerHTML case
                self.parser.parseError()

        def endTagOther(self, token):
            return self.parser.phases["inBody"].processEndTag(token)

    class InSelectPhase(Phase):
        """Token handlers used while a select element is open."""

        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                ("option", self.startTagOption),
                ("optgroup", self.startTagOptgroup),
                ("select", self.startTagSelect),
                (("input", "keygen", "textarea"), self.startTagInput),
                ("script", self.startTagScript)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                ("option", self.endTagOption),
                ("optgroup", self.endTagOptgroup),
                ("select", self.endTagSelect)
            ])
            self.endTagHandler.default = self.endTagOther

        # http://www.whatwg.org/specs/web-apps/current-work/#in-select
        def processEOF(self):
            if self.tree.openElements[-1].name != "html":
                self.parser.parseError("eof-in-select")
            else:
                assert self.parser.innerHTML

        def processCharacters(self, token):
            # A token whose data is exactly U+0000 is dropped.
            if token["data"] == "\u0000":
                return
            self.tree.insertText(token["data"])

        def startTagOption(self, token):
            # We need to imply </option> if <option> is the current node.
            if self.tree.openElements[-1].name == "option":
                self.tree.openElements.pop()
            self.tree.insertElement(token)

        def startTagOptgroup(self, token):
            # An open <option> and then an open <optgroup> are closed before
            # the new optgroup is inserted.
            if self.tree.openElements[-1].name == "option":
                self.tree.openElements.pop()
            if self.tree.openElements[-1].name == "optgroup":
                self.tree.openElements.pop()
            self.tree.insertElement(token)

        def startTagSelect(self, token):
            # A nested <select> is handled like </select>.
            self.parser.parseError("unexpected-select-in-select")
            self.endTagSelect(impliedTagToken("select"))

        def startTagInput(self, token):
            self.parser.parseError("unexpected-input-in-select")
            if self.tree.elementInScope("select", variant="select"):
                self.endTagSelect(impliedTagToken("select"))
                return token
            else:
                assert self.parser.innerHTML

        def startTagScript(self, token):
            return self.parser.phases["inHead"].processStartTag(token)

        def startTagOther(self, token):
            self.parser.parseError("unexpected-start-tag-in-select",
                                   {"name": token["name"]})

        def endTagOption(self, token):
            if self.tree.openElements[-1].name == "option":
                self.tree.openElements.pop()
            else:
                self.parser.parseError("unexpected-end-tag-in-select",
                                       {"name": "option"})

        def endTagOptgroup(self, token):
            # </optgroup> implicitly closes <option>
            if (self.tree.openElements[-1].name == "option" and
                    self.tree.openElements[-2].name == "optgroup"):
                self.tree.openElements.pop()
            # It also closes </optgroup>
            if self.tree.openElements[-1].name == "optgroup":
                self.tree.openElements.pop()
            # But nothing else
            else:
                self.parser.parseError("unexpected-end-tag-in-select",
                                       {"name": "optgroup"})

        def endTagSelect(self, token):
            """Pop elements up to and including "select"; reset insertion mode.

            When no select is in select scope, a parse error is reported
            instead.
            """
            if self.tree.elementInScope("select", variant="select"):
                node = self.tree.openElements.pop()
                while node.name != "select":
                    node = self.tree.openElements.pop()
                self.parser.resetInsertionMode()
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagOther(self, token):
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": token["name"]})

    class InSelectInTablePhase(Phase):
        """Wrap the "inSelect" phase, treating table tags specially.

        Table-related start and end tags close the select through an
        implied ``</select>``; every other token is delegated to the
        "inSelect" phase.
        """

        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

            self.startTagHandler = utils.MethodDispatcher([
                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
                 self.startTagTable)
            ])
            self.startTagHandler.default = self.startTagOther

            self.endTagHandler = utils.MethodDispatcher([
                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
                 self.endTagTable)
            ])
            self.endTagHandler.default = self.endTagOther

        def processEOF(self):
            self.parser.phases["inSelect"].processEOF()

        def processCharacters(self, token):
            return self.parser.phases["inSelect"].processCharacters(token)

        def startTagTable(self, token):
            self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
            self.endTagOther(impliedTagToken("select"))
            return token

        def startTagOther(self, token):
            return self.parser.phases["inSelect"].processStartTag(token)

        def endTagTable(self, token):
            self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
            # The select is closed and the token handed back only when the
            # named table element is in table scope.
            if self.tree.elementInScope(token["name"], variant="table"):
                self.endTagOther(impliedTagToken("select"))
                return token

        def endTagOther(self, token):
            return self.parser.phases["inSelect"].processEndTag(token)

    class InForeignContentPhase(Phase):
        """Token handlers used while the current node is not in the default namespace.

        NOTE(review): the namespaces handled explicitly below are "mathml"
        and "svg"; when this phase is selected is decided outside this block.
        """
        # Start tags that make the parser leave foreign content.
        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                      "center", "code", "dd", "div", "dl", "dt",
                                      "em", "embed", "h1", "h2", "h3",
                                      "h4", "h5", "h6", "head", "hr", "i", "img",
                                      "li", "listing", "menu", "meta", "nobr",
                                      "ol", "p", "pre", "ruby", "s", "small",
                                      "span", "strong", "strike", "sub", "sup",
                                      "table", "tt", "u", "ul", "var"])

        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)

        def adjustSVGTagNames(self, token):
            """Replace a lower-cased SVG tag name by its mixed-case spelling.

            ``token["name"]`` is rewritten in place; names missing from the
            table are left unchanged.
            """
            replacements = {"altglyph": "altGlyph",
                            "altglyphdef": "altGlyphDef",
                            "altglyphitem": "altGlyphItem",
                            "animatecolor": "animateColor",
                            "animatemotion": "animateMotion",
                            "animatetransform": "animateTransform",
                            "clippath": "clipPath",
                            "feblend": "feBlend",
                            "fecolormatrix": "feColorMatrix",
                            "fecomponenttransfer": "feComponentTransfer",
                            "fecomposite": "feComposite",
                            "feconvolvematrix": "feConvolveMatrix",
                            "fediffuselighting": "feDiffuseLighting",
                            "fedisplacementmap": "feDisplacementMap",
                            "fedistantlight": "feDistantLight",
                            "feflood": "feFlood",
                            "fefunca": "feFuncA",
                            "fefuncb": "feFuncB",
                            "fefuncg": "feFuncG",
                            "fefuncr": "feFuncR",
                            "fegaussianblur": "feGaussianBlur",
                            "feimage": "feImage",
                            "femerge": "feMerge",
                            "femergenode": "feMergeNode",
                            "femorphology": "feMorphology",
                            "feoffset": "feOffset",
                            "fepointlight": "fePointLight",
                            "fespecularlighting": "feSpecularLighting",
                            "fespotlight": "feSpotLight",
                            "fetile": "feTile",
                            "feturbulence": "feTurbulence",
                            "foreignobject": "foreignObject",
                            "glyphref": "glyphRef",
                            "lineargradient": "linearGradient",
                            "radialgradient": "radialGradient",
                            "textpath": "textPath"}

            if token["name"] in replacements:
                token["name"] = replacements[token["name"]]

        def processCharacters(self, token):
            # U+0000 is replaced by U+FFFD; any other data containing a
            # non-space character clears parser.framesetOK.
            if token["data"] == "\u0000":
                token["data"] = "\uFFFD"
            elif (self.parser.framesetOK and
                  any(char not in spaceCharacters for char in token["data"])):
                self.parser.framesetOK = False
            Phase.processCharacters(self, token)

        def processStartTag(self, token):
            """Insert a foreign element, or break out of foreign content.

            A breakout element (or a ``font`` tag carrying a color, face or
            size attribute) is a parse error: elements are popped until the
            current node is in the default namespace or is an integration
            point, and the token is returned. Any other tag is inserted in
            the current node's namespace after name/attribute adjustment.
            """
            currentNode = self.tree.openElements[-1]
            if (token["name"] in self.breakoutElements or
                (token["name"] == "font" and
                 set(token["data"].keys()) & set(["color", "face", "size"]))):
                self.parser.parseError("unexpected-html-element-in-foreign-content",
                                       {"name": token["name"]})
                while (self.tree.openElements[-1].namespace !=
                       self.tree.defaultNamespace and
                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                       not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                    self.tree.openElements.pop()
                return token

            else:
                if currentNode.namespace == namespaces["mathml"]:
                    self.parser.adjustMathMLAttributes(token)
                elif currentNode.namespace == namespaces["svg"]:
                    self.adjustSVGTagNames(token)
                    self.parser.adjustSVGAttributes(token)
                self.parser.adjustForeignAttributes(token)
                token["namespace"] = currentNode.namespace
                self.tree.insertElement(token)
                if token["selfClosing"]:
                    self.tree.openElements.pop()
                    token["selfClosingAcknowledged"] = True

        def processEndTag(self, token):
            """Close the matching foreign element or defer to the current phase.

            Walks the stack of open elements from the top. The first node
            whose lower-cased name equals the tag name is popped together
            with everything above it. If a node in the default namespace is
            reached first, the end tag is passed to the parser's current
            phase and that phase's result is returned.
            """
            nodeIndex = len(self.tree.openElements) - 1
            node = self.tree.openElements[-1]
            if node.name != token["name"]:
                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

            while True:
                if node.name.translate(asciiUpper2Lower) == token["name"]:
                    # XXX this isn't in the spec but it seems necessary
                    if self.parser.phase == self.parser.phases["inTableText"]:
                        self.parser.phase.flushCharacters()
                        self.parser.phase = self.parser.phase.originalPhase
                    while self.tree.openElements.pop() != node:
                        assert self.tree.openElements
                    new_token = None
                    break
                nodeIndex -= 1

                node = self.tree.openElements[nodeIndex]
                if node.namespace != self.tree.defaultNamespace:
                    continue
                else:
                    new_token = self.parser.phase.processEndTag(token)
                    break
            return new_token
2470 2471 class AfterBodyPhase(Phase): 2472 def __init__(self, parser, tree): 2473 Phase.__init__(self, parser, tree) 2474 2475 self.startTagHandler = utils.MethodDispatcher([ 2476 ("html", self.startTagHtml) 2477 ]) 2478 self.startTagHandler.default = self.startTagOther 2479 2480 self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) 2481 self.endTagHandler.default = self.endTagOther 2482 2483 def processEOF(self): 2484 # Stop parsing 2485 pass 2486 2487 def processComment(self, token): 2488 # This is needed because data is to be appended to the <html> element 2489 # here and not to whatever is currently open. 2490 self.tree.insertComment(token, self.tree.openElements[0]) 2491 2492 def processCharacters(self, token): 2493 self.parser.parseError("unexpected-char-after-body") 2494 self.parser.phase = self.parser.phases["inBody"] 2495 return token 2496 2497 def startTagHtml(self, token): 2498 return self.parser.phases["inBody"].processStartTag(token) 2499 2500 def startTagOther(self, token): 2501 self.parser.parseError("unexpected-start-tag-after-body", 2502 {"name": token["name"]}) 2503 self.parser.phase = self.parser.phases["inBody"] 2504 return token 2505 2506 def endTagHtml(self, name): 2507 if self.parser.innerHTML: 2508 self.parser.parseError("unexpected-end-tag-after-body-innerhtml") 2509 else: 2510 self.parser.phase = self.parser.phases["afterAfterBody"] 2511 2512 def endTagOther(self, token): 2513 self.parser.parseError("unexpected-end-tag-after-body", 2514 {"name": token["name"]}) 2515 self.parser.phase = self.parser.phases["inBody"] 2516 return token 2517 2518 class InFramesetPhase(Phase): 2519 # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset 2520 def __init__(self, parser, tree): 2521 Phase.__init__(self, parser, tree) 2522 2523 self.startTagHandler = utils.MethodDispatcher([ 2524 ("html", self.startTagHtml), 2525 ("frameset", self.startTagFrameset), 2526 ("frame", self.startTagFrame), 2527 ("noframes", 
self.startTagNoframes) 2528 ]) 2529 self.startTagHandler.default = self.startTagOther 2530 2531 self.endTagHandler = utils.MethodDispatcher([ 2532 ("frameset", self.endTagFrameset) 2533 ]) 2534 self.endTagHandler.default = self.endTagOther 2535 2536 def processEOF(self): 2537 if self.tree.openElements[-1].name != "html": 2538 self.parser.parseError("eof-in-frameset") 2539 else: 2540 assert self.parser.innerHTML 2541 2542 def processCharacters(self, token): 2543 self.parser.parseError("unexpected-char-in-frameset") 2544 2545 def startTagFrameset(self, token): 2546 self.tree.insertElement(token) 2547 2548 def startTagFrame(self, token): 2549 self.tree.insertElement(token) 2550 self.tree.openElements.pop() 2551 2552 def startTagNoframes(self, token): 2553 return self.parser.phases["inBody"].processStartTag(token) 2554 2555 def startTagOther(self, token): 2556 self.parser.parseError("unexpected-start-tag-in-frameset", 2557 {"name": token["name"]}) 2558 2559 def endTagFrameset(self, token): 2560 if self.tree.openElements[-1].name == "html": 2561 # innerHTML case 2562 self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") 2563 else: 2564 self.tree.openElements.pop() 2565 if (not self.parser.innerHTML and 2566 self.tree.openElements[-1].name != "frameset"): 2567 # If we're not in innerHTML mode and the the current node is not a 2568 # "frameset" element (anymore) then switch. 
2569 self.parser.phase = self.parser.phases["afterFrameset"] 2570 2571 def endTagOther(self, token): 2572 self.parser.parseError("unexpected-end-tag-in-frameset", 2573 {"name": token["name"]}) 2574 2575 class AfterFramesetPhase(Phase): 2576 # http://www.whatwg.org/specs/web-apps/current-work/#after3 2577 def __init__(self, parser, tree): 2578 Phase.__init__(self, parser, tree) 2579 2580 self.startTagHandler = utils.MethodDispatcher([ 2581 ("html", self.startTagHtml), 2582 ("noframes", self.startTagNoframes) 2583 ]) 2584 self.startTagHandler.default = self.startTagOther 2585 2586 self.endTagHandler = utils.MethodDispatcher([ 2587 ("html", self.endTagHtml) 2588 ]) 2589 self.endTagHandler.default = self.endTagOther 2590 2591 def processEOF(self): 2592 # Stop parsing 2593 pass 2594 2595 def processCharacters(self, token): 2596 self.parser.parseError("unexpected-char-after-frameset") 2597 2598 def startTagNoframes(self, token): 2599 return self.parser.phases["inHead"].processStartTag(token) 2600 2601 def startTagOther(self, token): 2602 self.parser.parseError("unexpected-start-tag-after-frameset", 2603 {"name": token["name"]}) 2604 2605 def endTagHtml(self, token): 2606 self.parser.phase = self.parser.phases["afterAfterFrameset"] 2607 2608 def endTagOther(self, token): 2609 self.parser.parseError("unexpected-end-tag-after-frameset", 2610 {"name": token["name"]}) 2611 2612 class AfterAfterBodyPhase(Phase): 2613 def __init__(self, parser, tree): 2614 Phase.__init__(self, parser, tree) 2615 2616 self.startTagHandler = utils.MethodDispatcher([ 2617 ("html", self.startTagHtml) 2618 ]) 2619 self.startTagHandler.default = self.startTagOther 2620 2621 def processEOF(self): 2622 pass 2623 2624 def processComment(self, token): 2625 self.tree.insertComment(token, self.tree.document) 2626 2627 def processSpaceCharacters(self, token): 2628 return self.parser.phases["inBody"].processSpaceCharacters(token) 2629 2630 def processCharacters(self, token): 2631 
self.parser.parseError("expected-eof-but-got-char") 2632 self.parser.phase = self.parser.phases["inBody"] 2633 return token 2634 2635 def startTagHtml(self, token): 2636 return self.parser.phases["inBody"].processStartTag(token) 2637 2638 def startTagOther(self, token): 2639 self.parser.parseError("expected-eof-but-got-start-tag", 2640 {"name": token["name"]}) 2641 self.parser.phase = self.parser.phases["inBody"] 2642 return token 2643 2644 def processEndTag(self, token): 2645 self.parser.parseError("expected-eof-but-got-end-tag", 2646 {"name": token["name"]}) 2647 self.parser.phase = self.parser.phases["inBody"] 2648 return token 2649 2650 class AfterAfterFramesetPhase(Phase): 2651 def __init__(self, parser, tree): 2652 Phase.__init__(self, parser, tree) 2653 2654 self.startTagHandler = utils.MethodDispatcher([ 2655 ("html", self.startTagHtml), 2656 ("noframes", self.startTagNoFrames) 2657 ]) 2658 self.startTagHandler.default = self.startTagOther 2659 2660 def processEOF(self): 2661 pass 2662 2663 def processComment(self, token): 2664 self.tree.insertComment(token, self.tree.document) 2665 2666 def processSpaceCharacters(self, token): 2667 return self.parser.phases["inBody"].processSpaceCharacters(token) 2668 2669 def processCharacters(self, token): 2670 self.parser.parseError("expected-eof-but-got-char") 2671 2672 def startTagHtml(self, token): 2673 return self.parser.phases["inBody"].processStartTag(token) 2674 2675 def startTagNoFrames(self, token): 2676 return self.parser.phases["inHead"].processStartTag(token) 2677 2678 def startTagOther(self, token): 2679 self.parser.parseError("expected-eof-but-got-start-tag", 2680 {"name": token["name"]}) 2681 2682 def processEndTag(self, token): 2683 self.parser.parseError("expected-eof-but-got-end-tag", 2684 {"name": token["name"]}) 2685 2686 return { 2687 "initial": InitialPhase, 2688 "beforeHtml": BeforeHtmlPhase, 2689 "beforeHead": BeforeHeadPhase, 2690 "inHead": InHeadPhase, 2691 # XXX "inHeadNoscript": 
InHeadNoScriptPhase, 2692 "afterHead": AfterHeadPhase, 2693 "inBody": InBodyPhase, 2694 "text": TextPhase, 2695 "inTable": InTablePhase, 2696 "inTableText": InTableTextPhase, 2697 "inCaption": InCaptionPhase, 2698 "inColumnGroup": InColumnGroupPhase, 2699 "inTableBody": InTableBodyPhase, 2700 "inRow": InRowPhase, 2701 "inCell": InCellPhase, 2702 "inSelect": InSelectPhase, 2703 "inSelectInTable": InSelectInTablePhase, 2704 "inForeignContent": InForeignContentPhase, 2705 "afterBody": AfterBodyPhase, 2706 "inFrameset": InFramesetPhase, 2707 "afterFrameset": AfterFramesetPhase, 2708 "afterAfterBody": AfterAfterBodyPhase, 2709 "afterAfterFrameset": AfterAfterFramesetPhase, 2710 # XXX after after frameset 2711 } 2712 2713 2714def impliedTagToken(name, type="EndTag", attributes=None, 2715 selfClosing=False): 2716 if attributes is None: 2717 attributes = {} 2718 return {"type": tokenTypes[type], "name": name, "data": attributes, 2719 "selfClosing": selfClosing} 2720 2721 2722class ParseError(Exception): 2723 """Error in parsed document""" 2724 pass 2725