1from __future__ import absolute_import, division, unicode_literals
2
# Python 2 compatibility: rebind chr to unichr so chr() always produces a
# text (unicode) character; on Python 3 unichr is gone (NameError) and the
# built-in chr already does the right thing.
try:
    chr = unichr # flake8: noqa
except NameError:
    pass
7
8from collections import deque
9
10from .constants import spaceCharacters
11from .constants import entities
12from .constants import asciiLetters, asciiUpper2Lower
13from .constants import digits, hexDigits, EOF
14from .constants import tokenTypes, tagTokenTypes
15from .constants import replacementCharacters
16
17from .inputstream import HTMLInputStream
18
19from .trie import Trie
20
21entitiesTrie = Trie(entities)
22
23
24class HTMLTokenizer(object):
25    """ This class takes care of tokenizing HTML.
26
27    * self.currentToken
28      Holds the token that is currently being processed.
29
30    * self.state
31      Holds a reference to the method to be invoked... XXX
32
33    * self.stream
34      Points to HTMLInputStream object.
35    """
36
37    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
38                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
39
40        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
41        self.parser = parser
42
43        # Perform case conversions?
44        self.lowercaseElementName = lowercaseElementName
45        self.lowercaseAttrName = lowercaseAttrName
46
47        # Setup the initial tokenizer state
48        self.escapeFlag = False
49        self.lastFourChars = []
50        self.state = self.dataState
51        self.escape = False
52
53        # The current token being created
54        self.currentToken = None
55        super(HTMLTokenizer, self).__init__()
56
    def __iter__(self):
        """Drive the state machine, yielding tokens as they become available.

        Processing runs through the state handlers; whenever a token is
        ready it is yielded, which pauses processing until the next token
        is requested.
        """
        # Queue of tokens produced by state handlers but not yet yielded.
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            # Surface input-stream (decoding) errors first, in stream order.
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
72
    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference after "&#" (and "x"/"X").

        Reads a run of decimal (or, with ``isHex``, hexadecimal) digits and
        returns the character they denote — or a replacement: entries in
        ``replacementCharacters`` are substituted, and surrogates or
        code points above U+10FFFF become U+FFFD, each with a ParseError
        token queued.  A trailing ";" is consumed; if absent, a ParseError
        is queued and the terminating character is pushed back.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogate halves and out-of-range values are not representable:
            # substitute U+FFFD REPLACEMENT CHARACTER.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            # Control characters and non-characters: parse error, but the
            # character itself is still produced below.
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Narrow build: encode the astral code point as a UTF-16
                # surrogate pair by hand.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
144
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character reference starting just after "&".

        When ``fromAttribute`` is true the result (or the literal text, if
        nothing matches) is appended to the value of the current attribute;
        otherwise it is queued as a Characters/SpaceCharacters token.
        ``allowedChar`` is the spec's "additional allowed character" (the
        attribute-value delimiter) for which "&" is treated literally.
        """
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
                or (allowedChar is not None and allowedChar == charStack[0])):
            # Not a reference at all: keep the bare "&" and reprocess.
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                # The last char read broke the prefix match (or is EOF), so
                # only charStack[:-1] can contain the entity name.
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    # Legacy entity without a terminating semicolon.
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    # In attribute values, "&notit=" style text stays literal.
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    # Characters consumed past the entity name are kept.
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            # Append to the value of the most recently started attribute.
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
224
225    def processEntityInAttribute(self, allowedChar):
226        """This method replaces the need for "entityInAttributeValueState".
227        """
228        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
229
230    def emitCurrentToken(self):
231        """This method is a generic handler for emitting the tags. It also sets
232        the state to "data" because that's what's needed after a token has been
233        emitted.
234        """
235        token = self.currentToken
236        # Add token to the queue to be yielded
237        if (token["type"] in tagTokenTypes):
238            if self.lowercaseElementName:
239                token["name"] = token["name"].translate(asciiUpper2Lower)
240            if token["type"] == tokenTypes["EndTag"]:
241                if token["data"]:
242                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
243                                            "data": "attributes-in-end-tag"})
244                if token["selfClosing"]:
245                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
246                                            "data": "self-closing-flag-on-end-tag"})
247        self.tokenQueue.append(token)
248        self.state = self.dataState
249
250    # Below are the various tokenizer states worked out.
251    def dataState(self):
252        data = self.stream.char()
253        if data == "&":
254            self.state = self.entityDataState
255        elif data == "<":
256            self.state = self.tagOpenState
257        elif data == "\u0000":
258            self.tokenQueue.append({"type": tokenTypes["ParseError"],
259                                    "data": "invalid-codepoint"})
260            self.tokenQueue.append({"type": tokenTypes["Characters"],
261                                    "data": "\u0000"})
262        elif data is EOF:
263            # Tokenization ends.
264            return False
265        elif data in spaceCharacters:
266            # Directly after emitting a token you switch back to the "data
267            # state". At that point spaceCharacters are important so they are
268            # emitted separately.
269            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
270                                    data + self.stream.charsUntil(spaceCharacters, True)})
271            # No need to update lastFourChars here, since the first space will
272            # have already been appended to lastFourChars and will have broken
273            # any <!-- or --> sequences
274        else:
275            chars = self.stream.charsUntil(("&", "<", "\u0000"))
276            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
277                                    data + chars})
278        return True
279
280    def entityDataState(self):
281        self.consumeEntity()
282        self.state = self.dataState
283        return True
284
285    def rcdataState(self):
286        data = self.stream.char()
287        if data == "&":
288            self.state = self.characterReferenceInRcdata
289        elif data == "<":
290            self.state = self.rcdataLessThanSignState
291        elif data == EOF:
292            # Tokenization ends.
293            return False
294        elif data == "\u0000":
295            self.tokenQueue.append({"type": tokenTypes["ParseError"],
296                                    "data": "invalid-codepoint"})
297            self.tokenQueue.append({"type": tokenTypes["Characters"],
298                                    "data": "\uFFFD"})
299        elif data in spaceCharacters:
300            # Directly after emitting a token you switch back to the "data
301            # state". At that point spaceCharacters are important so they are
302            # emitted separately.
303            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
304                                    data + self.stream.charsUntil(spaceCharacters, True)})
305            # No need to update lastFourChars here, since the first space will
306            # have already been appended to lastFourChars and will have broken
307            # any <!-- or --> sequences
308        else:
309            chars = self.stream.charsUntil(("&", "<", "\u0000"))
310            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
311                                    data + chars})
312        return True
313
314    def characterReferenceInRcdata(self):
315        self.consumeEntity()
316        self.state = self.rcdataState
317        return True
318
319    def rawtextState(self):
320        data = self.stream.char()
321        if data == "<":
322            self.state = self.rawtextLessThanSignState
323        elif data == "\u0000":
324            self.tokenQueue.append({"type": tokenTypes["ParseError"],
325                                    "data": "invalid-codepoint"})
326            self.tokenQueue.append({"type": tokenTypes["Characters"],
327                                    "data": "\uFFFD"})
328        elif data == EOF:
329            # Tokenization ends.
330            return False
331        else:
332            chars = self.stream.charsUntil(("<", "\u0000"))
333            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
334                                    data + chars})
335        return True
336
337    def scriptDataState(self):
338        data = self.stream.char()
339        if data == "<":
340            self.state = self.scriptDataLessThanSignState
341        elif data == "\u0000":
342            self.tokenQueue.append({"type": tokenTypes["ParseError"],
343                                    "data": "invalid-codepoint"})
344            self.tokenQueue.append({"type": tokenTypes["Characters"],
345                                    "data": "\uFFFD"})
346        elif data == EOF:
347            # Tokenization ends.
348            return False
349        else:
350            chars = self.stream.charsUntil(("<", "\u0000"))
351            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
352                                    data + chars})
353        return True
354
355    def plaintextState(self):
356        data = self.stream.char()
357        if data == EOF:
358            # Tokenization ends.
359            return False
360        elif data == "\u0000":
361            self.tokenQueue.append({"type": tokenTypes["ParseError"],
362                                    "data": "invalid-codepoint"})
363            self.tokenQueue.append({"type": tokenTypes["Characters"],
364                                    "data": "\uFFFD"})
365        else:
366            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
367                                    data + self.stream.charsUntil("\u0000")})
368        return True
369
370    def tagOpenState(self):
371        data = self.stream.char()
372        if data == "!":
373            self.state = self.markupDeclarationOpenState
374        elif data == "/":
375            self.state = self.closeTagOpenState
376        elif data in asciiLetters:
377            self.currentToken = {"type": tokenTypes["StartTag"],
378                                 "name": data, "data": [],
379                                 "selfClosing": False,
380                                 "selfClosingAcknowledged": False}
381            self.state = self.tagNameState
382        elif data == ">":
383            # XXX In theory it could be something besides a tag name. But
384            # do we really care?
385            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
386                                    "expected-tag-name-but-got-right-bracket"})
387            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
388            self.state = self.dataState
389        elif data == "?":
390            # XXX In theory it could be something besides a tag name. But
391            # do we really care?
392            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
393                                    "expected-tag-name-but-got-question-mark"})
394            self.stream.unget(data)
395            self.state = self.bogusCommentState
396        else:
397            # XXX
398            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
399                                    "expected-tag-name"})
400            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
401            self.stream.unget(data)
402            self.state = self.dataState
403        return True
404
405    def closeTagOpenState(self):
406        data = self.stream.char()
407        if data in asciiLetters:
408            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
409                                 "data": [], "selfClosing": False}
410            self.state = self.tagNameState
411        elif data == ">":
412            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
413                                    "expected-closing-tag-but-got-right-bracket"})
414            self.state = self.dataState
415        elif data is EOF:
416            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
417                                    "expected-closing-tag-but-got-eof"})
418            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
419            self.state = self.dataState
420        else:
421            # XXX data can be _'_...
422            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
423                                    "expected-closing-tag-but-got-char",
424                                    "datavars": {"data": data}})
425            self.stream.unget(data)
426            self.state = self.bogusCommentState
427        return True
428
429    def tagNameState(self):
430        data = self.stream.char()
431        if data in spaceCharacters:
432            self.state = self.beforeAttributeNameState
433        elif data == ">":
434            self.emitCurrentToken()
435        elif data is EOF:
436            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
437                                    "eof-in-tag-name"})
438            self.state = self.dataState
439        elif data == "/":
440            self.state = self.selfClosingStartTagState
441        elif data == "\u0000":
442            self.tokenQueue.append({"type": tokenTypes["ParseError"],
443                                    "data": "invalid-codepoint"})
444            self.currentToken["name"] += "\uFFFD"
445        else:
446            self.currentToken["name"] += data
447            # (Don't use charsUntil here, because tag names are
448            # very short and it's faster to not do anything fancy)
449        return True
450
451    def rcdataLessThanSignState(self):
452        data = self.stream.char()
453        if data == "/":
454            self.temporaryBuffer = ""
455            self.state = self.rcdataEndTagOpenState
456        else:
457            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
458            self.stream.unget(data)
459            self.state = self.rcdataState
460        return True
461
462    def rcdataEndTagOpenState(self):
463        data = self.stream.char()
464        if data in asciiLetters:
465            self.temporaryBuffer += data
466            self.state = self.rcdataEndTagNameState
467        else:
468            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
469            self.stream.unget(data)
470            self.state = self.rcdataState
471        return True
472
473    def rcdataEndTagNameState(self):
474        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
475        data = self.stream.char()
476        if data in spaceCharacters and appropriate:
477            self.currentToken = {"type": tokenTypes["EndTag"],
478                                 "name": self.temporaryBuffer,
479                                 "data": [], "selfClosing": False}
480            self.state = self.beforeAttributeNameState
481        elif data == "/" and appropriate:
482            self.currentToken = {"type": tokenTypes["EndTag"],
483                                 "name": self.temporaryBuffer,
484                                 "data": [], "selfClosing": False}
485            self.state = self.selfClosingStartTagState
486        elif data == ">" and appropriate:
487            self.currentToken = {"type": tokenTypes["EndTag"],
488                                 "name": self.temporaryBuffer,
489                                 "data": [], "selfClosing": False}
490            self.emitCurrentToken()
491            self.state = self.dataState
492        elif data in asciiLetters:
493            self.temporaryBuffer += data
494        else:
495            self.tokenQueue.append({"type": tokenTypes["Characters"],
496                                    "data": "</" + self.temporaryBuffer})
497            self.stream.unget(data)
498            self.state = self.rcdataState
499        return True
500
501    def rawtextLessThanSignState(self):
502        data = self.stream.char()
503        if data == "/":
504            self.temporaryBuffer = ""
505            self.state = self.rawtextEndTagOpenState
506        else:
507            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
508            self.stream.unget(data)
509            self.state = self.rawtextState
510        return True
511
512    def rawtextEndTagOpenState(self):
513        data = self.stream.char()
514        if data in asciiLetters:
515            self.temporaryBuffer += data
516            self.state = self.rawtextEndTagNameState
517        else:
518            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
519            self.stream.unget(data)
520            self.state = self.rawtextState
521        return True
522
523    def rawtextEndTagNameState(self):
524        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
525        data = self.stream.char()
526        if data in spaceCharacters and appropriate:
527            self.currentToken = {"type": tokenTypes["EndTag"],
528                                 "name": self.temporaryBuffer,
529                                 "data": [], "selfClosing": False}
530            self.state = self.beforeAttributeNameState
531        elif data == "/" and appropriate:
532            self.currentToken = {"type": tokenTypes["EndTag"],
533                                 "name": self.temporaryBuffer,
534                                 "data": [], "selfClosing": False}
535            self.state = self.selfClosingStartTagState
536        elif data == ">" and appropriate:
537            self.currentToken = {"type": tokenTypes["EndTag"],
538                                 "name": self.temporaryBuffer,
539                                 "data": [], "selfClosing": False}
540            self.emitCurrentToken()
541            self.state = self.dataState
542        elif data in asciiLetters:
543            self.temporaryBuffer += data
544        else:
545            self.tokenQueue.append({"type": tokenTypes["Characters"],
546                                    "data": "</" + self.temporaryBuffer})
547            self.stream.unget(data)
548            self.state = self.rawtextState
549        return True
550
551    def scriptDataLessThanSignState(self):
552        data = self.stream.char()
553        if data == "/":
554            self.temporaryBuffer = ""
555            self.state = self.scriptDataEndTagOpenState
556        elif data == "!":
557            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
558            self.state = self.scriptDataEscapeStartState
559        else:
560            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
561            self.stream.unget(data)
562            self.state = self.scriptDataState
563        return True
564
565    def scriptDataEndTagOpenState(self):
566        data = self.stream.char()
567        if data in asciiLetters:
568            self.temporaryBuffer += data
569            self.state = self.scriptDataEndTagNameState
570        else:
571            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
572            self.stream.unget(data)
573            self.state = self.scriptDataState
574        return True
575
576    def scriptDataEndTagNameState(self):
577        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
578        data = self.stream.char()
579        if data in spaceCharacters and appropriate:
580            self.currentToken = {"type": tokenTypes["EndTag"],
581                                 "name": self.temporaryBuffer,
582                                 "data": [], "selfClosing": False}
583            self.state = self.beforeAttributeNameState
584        elif data == "/" and appropriate:
585            self.currentToken = {"type": tokenTypes["EndTag"],
586                                 "name": self.temporaryBuffer,
587                                 "data": [], "selfClosing": False}
588            self.state = self.selfClosingStartTagState
589        elif data == ">" and appropriate:
590            self.currentToken = {"type": tokenTypes["EndTag"],
591                                 "name": self.temporaryBuffer,
592                                 "data": [], "selfClosing": False}
593            self.emitCurrentToken()
594            self.state = self.dataState
595        elif data in asciiLetters:
596            self.temporaryBuffer += data
597        else:
598            self.tokenQueue.append({"type": tokenTypes["Characters"],
599                                    "data": "</" + self.temporaryBuffer})
600            self.stream.unget(data)
601            self.state = self.scriptDataState
602        return True
603
604    def scriptDataEscapeStartState(self):
605        data = self.stream.char()
606        if data == "-":
607            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
608            self.state = self.scriptDataEscapeStartDashState
609        else:
610            self.stream.unget(data)
611            self.state = self.scriptDataState
612        return True
613
614    def scriptDataEscapeStartDashState(self):
615        data = self.stream.char()
616        if data == "-":
617            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
618            self.state = self.scriptDataEscapedDashDashState
619        else:
620            self.stream.unget(data)
621            self.state = self.scriptDataState
622        return True
623
624    def scriptDataEscapedState(self):
625        data = self.stream.char()
626        if data == "-":
627            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
628            self.state = self.scriptDataEscapedDashState
629        elif data == "<":
630            self.state = self.scriptDataEscapedLessThanSignState
631        elif data == "\u0000":
632            self.tokenQueue.append({"type": tokenTypes["ParseError"],
633                                    "data": "invalid-codepoint"})
634            self.tokenQueue.append({"type": tokenTypes["Characters"],
635                                    "data": "\uFFFD"})
636        elif data == EOF:
637            self.state = self.dataState
638        else:
639            chars = self.stream.charsUntil(("<", "-", "\u0000"))
640            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
641                                    data + chars})
642        return True
643
644    def scriptDataEscapedDashState(self):
645        data = self.stream.char()
646        if data == "-":
647            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
648            self.state = self.scriptDataEscapedDashDashState
649        elif data == "<":
650            self.state = self.scriptDataEscapedLessThanSignState
651        elif data == "\u0000":
652            self.tokenQueue.append({"type": tokenTypes["ParseError"],
653                                    "data": "invalid-codepoint"})
654            self.tokenQueue.append({"type": tokenTypes["Characters"],
655                                    "data": "\uFFFD"})
656            self.state = self.scriptDataEscapedState
657        elif data == EOF:
658            self.state = self.dataState
659        else:
660            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
661            self.state = self.scriptDataEscapedState
662        return True
663
664    def scriptDataEscapedDashDashState(self):
665        data = self.stream.char()
666        if data == "-":
667            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
668        elif data == "<":
669            self.state = self.scriptDataEscapedLessThanSignState
670        elif data == ">":
671            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
672            self.state = self.scriptDataState
673        elif data == "\u0000":
674            self.tokenQueue.append({"type": tokenTypes["ParseError"],
675                                    "data": "invalid-codepoint"})
676            self.tokenQueue.append({"type": tokenTypes["Characters"],
677                                    "data": "\uFFFD"})
678            self.state = self.scriptDataEscapedState
679        elif data == EOF:
680            self.state = self.dataState
681        else:
682            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
683            self.state = self.scriptDataEscapedState
684        return True
685
686    def scriptDataEscapedLessThanSignState(self):
687        data = self.stream.char()
688        if data == "/":
689            self.temporaryBuffer = ""
690            self.state = self.scriptDataEscapedEndTagOpenState
691        elif data in asciiLetters:
692            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
693            self.temporaryBuffer = data
694            self.state = self.scriptDataDoubleEscapeStartState
695        else:
696            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
697            self.stream.unget(data)
698            self.state = self.scriptDataEscapedState
699        return True
700
701    def scriptDataEscapedEndTagOpenState(self):
702        data = self.stream.char()
703        if data in asciiLetters:
704            self.temporaryBuffer = data
705            self.state = self.scriptDataEscapedEndTagNameState
706        else:
707            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
708            self.stream.unget(data)
709            self.state = self.scriptDataEscapedState
710        return True
711
712    def scriptDataEscapedEndTagNameState(self):
713        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
714        data = self.stream.char()
715        if data in spaceCharacters and appropriate:
716            self.currentToken = {"type": tokenTypes["EndTag"],
717                                 "name": self.temporaryBuffer,
718                                 "data": [], "selfClosing": False}
719            self.state = self.beforeAttributeNameState
720        elif data == "/" and appropriate:
721            self.currentToken = {"type": tokenTypes["EndTag"],
722                                 "name": self.temporaryBuffer,
723                                 "data": [], "selfClosing": False}
724            self.state = self.selfClosingStartTagState
725        elif data == ">" and appropriate:
726            self.currentToken = {"type": tokenTypes["EndTag"],
727                                 "name": self.temporaryBuffer,
728                                 "data": [], "selfClosing": False}
729            self.emitCurrentToken()
730            self.state = self.dataState
731        elif data in asciiLetters:
732            self.temporaryBuffer += data
733        else:
734            self.tokenQueue.append({"type": tokenTypes["Characters"],
735                                    "data": "</" + self.temporaryBuffer})
736            self.stream.unget(data)
737            self.state = self.scriptDataEscapedState
738        return True
739
740    def scriptDataDoubleEscapeStartState(self):
741        data = self.stream.char()
742        if data in (spaceCharacters | frozenset(("/", ">"))):
743            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
744            if self.temporaryBuffer.lower() == "script":
745                self.state = self.scriptDataDoubleEscapedState
746            else:
747                self.state = self.scriptDataEscapedState
748        elif data in asciiLetters:
749            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
750            self.temporaryBuffer += data
751        else:
752            self.stream.unget(data)
753            self.state = self.scriptDataEscapedState
754        return True
755
756    def scriptDataDoubleEscapedState(self):
757        data = self.stream.char()
758        if data == "-":
759            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
760            self.state = self.scriptDataDoubleEscapedDashState
761        elif data == "<":
762            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
763            self.state = self.scriptDataDoubleEscapedLessThanSignState
764        elif data == "\u0000":
765            self.tokenQueue.append({"type": tokenTypes["ParseError"],
766                                    "data": "invalid-codepoint"})
767            self.tokenQueue.append({"type": tokenTypes["Characters"],
768                                    "data": "\uFFFD"})
769        elif data == EOF:
770            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
771                                    "eof-in-script-in-script"})
772            self.state = self.dataState
773        else:
774            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
775        return True
776
777    def scriptDataDoubleEscapedDashState(self):
778        data = self.stream.char()
779        if data == "-":
780            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
781            self.state = self.scriptDataDoubleEscapedDashDashState
782        elif data == "<":
783            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
784            self.state = self.scriptDataDoubleEscapedLessThanSignState
785        elif data == "\u0000":
786            self.tokenQueue.append({"type": tokenTypes["ParseError"],
787                                    "data": "invalid-codepoint"})
788            self.tokenQueue.append({"type": tokenTypes["Characters"],
789                                    "data": "\uFFFD"})
790            self.state = self.scriptDataDoubleEscapedState
791        elif data == EOF:
792            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
793                                    "eof-in-script-in-script"})
794            self.state = self.dataState
795        else:
796            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
797            self.state = self.scriptDataDoubleEscapedState
798        return True
799
800    def scriptDataDoubleEscapedDashDashState(self):
801        data = self.stream.char()
802        if data == "-":
803            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
804        elif data == "<":
805            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
806            self.state = self.scriptDataDoubleEscapedLessThanSignState
807        elif data == ">":
808            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
809            self.state = self.scriptDataState
810        elif data == "\u0000":
811            self.tokenQueue.append({"type": tokenTypes["ParseError"],
812                                    "data": "invalid-codepoint"})
813            self.tokenQueue.append({"type": tokenTypes["Characters"],
814                                    "data": "\uFFFD"})
815            self.state = self.scriptDataDoubleEscapedState
816        elif data == EOF:
817            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
818                                    "eof-in-script-in-script"})
819            self.state = self.dataState
820        else:
821            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
822            self.state = self.scriptDataDoubleEscapedState
823        return True
824
825    def scriptDataDoubleEscapedLessThanSignState(self):
826        data = self.stream.char()
827        if data == "/":
828            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
829            self.temporaryBuffer = ""
830            self.state = self.scriptDataDoubleEscapeEndState
831        else:
832            self.stream.unget(data)
833            self.state = self.scriptDataDoubleEscapedState
834        return True
835
836    def scriptDataDoubleEscapeEndState(self):
837        data = self.stream.char()
838        if data in (spaceCharacters | frozenset(("/", ">"))):
839            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
840            if self.temporaryBuffer.lower() == "script":
841                self.state = self.scriptDataEscapedState
842            else:
843                self.state = self.scriptDataDoubleEscapedState
844        elif data in asciiLetters:
845            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
846            self.temporaryBuffer += data
847        else:
848            self.stream.unget(data)
849            self.state = self.scriptDataDoubleEscapedState
850        return True
851
852    def beforeAttributeNameState(self):
853        data = self.stream.char()
854        if data in spaceCharacters:
855            self.stream.charsUntil(spaceCharacters, True)
856        elif data in asciiLetters:
857            self.currentToken["data"].append([data, ""])
858            self.state = self.attributeNameState
859        elif data == ">":
860            self.emitCurrentToken()
861        elif data == "/":
862            self.state = self.selfClosingStartTagState
863        elif data in ("'", '"', "=", "<"):
864            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
865                                    "invalid-character-in-attribute-name"})
866            self.currentToken["data"].append([data, ""])
867            self.state = self.attributeNameState
868        elif data == "\u0000":
869            self.tokenQueue.append({"type": tokenTypes["ParseError"],
870                                    "data": "invalid-codepoint"})
871            self.currentToken["data"].append(["\uFFFD", ""])
872            self.state = self.attributeNameState
873        elif data is EOF:
874            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
875                                    "expected-attribute-name-but-got-eof"})
876            self.state = self.dataState
877        else:
878            self.currentToken["data"].append([data, ""])
879            self.state = self.attributeNameState
880        return True
881
    def attributeNameState(self):
        """Accumulate the current attribute's name.

        Uses two flags rather than immediate transitions: ``leavingThisState``
        marks that the name is complete (so it is lower-cased and checked for
        duplicates exactly once), and ``emitToken`` defers emitting on ">"
        until after that normalisation has run.
        """
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Bulk-consume the rest of the ASCII-letter run for speed.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            if self.lowercaseAttrName:
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            # Report (but do not drop) a name that repeats an earlier one.
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
936
937    def afterAttributeNameState(self):
938        data = self.stream.char()
939        if data in spaceCharacters:
940            self.stream.charsUntil(spaceCharacters, True)
941        elif data == "=":
942            self.state = self.beforeAttributeValueState
943        elif data == ">":
944            self.emitCurrentToken()
945        elif data in asciiLetters:
946            self.currentToken["data"].append([data, ""])
947            self.state = self.attributeNameState
948        elif data == "/":
949            self.state = self.selfClosingStartTagState
950        elif data == "\u0000":
951            self.tokenQueue.append({"type": tokenTypes["ParseError"],
952                                    "data": "invalid-codepoint"})
953            self.currentToken["data"].append(["\uFFFD", ""])
954            self.state = self.attributeNameState
955        elif data in ("'", '"', "<"):
956            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
957                                    "invalid-character-after-attribute-name"})
958            self.currentToken["data"].append([data, ""])
959            self.state = self.attributeNameState
960        elif data is EOF:
961            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
962                                    "expected-end-of-tag-but-got-eof"})
963            self.state = self.dataState
964        else:
965            self.currentToken["data"].append([data, ""])
966            self.state = self.attributeNameState
967        return True
968
969    def beforeAttributeValueState(self):
970        data = self.stream.char()
971        if data in spaceCharacters:
972            self.stream.charsUntil(spaceCharacters, True)
973        elif data == "\"":
974            self.state = self.attributeValueDoubleQuotedState
975        elif data == "&":
976            self.state = self.attributeValueUnQuotedState
977            self.stream.unget(data)
978        elif data == "'":
979            self.state = self.attributeValueSingleQuotedState
980        elif data == ">":
981            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
982                                    "expected-attribute-value-but-got-right-bracket"})
983            self.emitCurrentToken()
984        elif data == "\u0000":
985            self.tokenQueue.append({"type": tokenTypes["ParseError"],
986                                    "data": "invalid-codepoint"})
987            self.currentToken["data"][-1][1] += "\uFFFD"
988            self.state = self.attributeValueUnQuotedState
989        elif data in ("=", "<", "`"):
990            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
991                                    "equals-in-unquoted-attribute-value"})
992            self.currentToken["data"][-1][1] += data
993            self.state = self.attributeValueUnQuotedState
994        elif data is EOF:
995            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
996                                    "expected-attribute-value-but-got-eof"})
997            self.state = self.dataState
998        else:
999            self.currentToken["data"][-1][1] += data
1000            self.state = self.attributeValueUnQuotedState
1001        return True
1002
1003    def attributeValueDoubleQuotedState(self):
1004        data = self.stream.char()
1005        if data == "\"":
1006            self.state = self.afterAttributeValueState
1007        elif data == "&":
1008            self.processEntityInAttribute('"')
1009        elif data == "\u0000":
1010            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1011                                    "data": "invalid-codepoint"})
1012            self.currentToken["data"][-1][1] += "\uFFFD"
1013        elif data is EOF:
1014            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1015                                    "eof-in-attribute-value-double-quote"})
1016            self.state = self.dataState
1017        else:
1018            self.currentToken["data"][-1][1] += data +\
1019                self.stream.charsUntil(("\"", "&", "\u0000"))
1020        return True
1021
1022    def attributeValueSingleQuotedState(self):
1023        data = self.stream.char()
1024        if data == "'":
1025            self.state = self.afterAttributeValueState
1026        elif data == "&":
1027            self.processEntityInAttribute("'")
1028        elif data == "\u0000":
1029            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1030                                    "data": "invalid-codepoint"})
1031            self.currentToken["data"][-1][1] += "\uFFFD"
1032        elif data is EOF:
1033            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1034                                    "eof-in-attribute-value-single-quote"})
1035            self.state = self.dataState
1036        else:
1037            self.currentToken["data"][-1][1] += data +\
1038                self.stream.charsUntil(("'", "&", "\u0000"))
1039        return True
1040
1041    def attributeValueUnQuotedState(self):
1042        data = self.stream.char()
1043        if data in spaceCharacters:
1044            self.state = self.beforeAttributeNameState
1045        elif data == "&":
1046            self.processEntityInAttribute(">")
1047        elif data == ">":
1048            self.emitCurrentToken()
1049        elif data in ('"', "'", "=", "<", "`"):
1050            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1051                                    "unexpected-character-in-unquoted-attribute-value"})
1052            self.currentToken["data"][-1][1] += data
1053        elif data == "\u0000":
1054            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1055                                    "data": "invalid-codepoint"})
1056            self.currentToken["data"][-1][1] += "\uFFFD"
1057        elif data is EOF:
1058            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1059                                    "eof-in-attribute-value-no-quotes"})
1060            self.state = self.dataState
1061        else:
1062            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1063                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1064        return True
1065
1066    def afterAttributeValueState(self):
1067        data = self.stream.char()
1068        if data in spaceCharacters:
1069            self.state = self.beforeAttributeNameState
1070        elif data == ">":
1071            self.emitCurrentToken()
1072        elif data == "/":
1073            self.state = self.selfClosingStartTagState
1074        elif data is EOF:
1075            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1076                                    "unexpected-EOF-after-attribute-value"})
1077            self.stream.unget(data)
1078            self.state = self.dataState
1079        else:
1080            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1081                                    "unexpected-character-after-attribute-value"})
1082            self.stream.unget(data)
1083            self.state = self.beforeAttributeNameState
1084        return True
1085
1086    def selfClosingStartTagState(self):
1087        data = self.stream.char()
1088        if data == ">":
1089            self.currentToken["selfClosing"] = True
1090            self.emitCurrentToken()
1091        elif data is EOF:
1092            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1093                                    "data":
1094                                    "unexpected-EOF-after-solidus-in-tag"})
1095            self.stream.unget(data)
1096            self.state = self.dataState
1097        else:
1098            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1099                                    "unexpected-character-after-solidus-in-tag"})
1100            self.stream.unget(data)
1101            self.state = self.beforeAttributeNameState
1102        return True
1103
1104    def bogusCommentState(self):
1105        # Make a new comment token and give it as value all the characters
1106        # until the first > or EOF (charsUntil checks for EOF automatically)
1107        # and emit it.
1108        data = self.stream.charsUntil(">")
1109        data = data.replace("\u0000", "\uFFFD")
1110        self.tokenQueue.append(
1111            {"type": tokenTypes["Comment"], "data": data})
1112
1113        # Eat the character directly after the bogus comment which is either a
1114        # ">" or an EOF.
1115        self.stream.char()
1116        self.state = self.dataState
1117        return True
1118
    def markupDeclarationOpenState(self):
        """Dispatch "<!" to a comment, DOCTYPE, or CDATA section.

        Reads ahead one character at a time onto ``charStack``; if no known
        construct matches, everything consumed is pushed back onto the
        stream and handled as a bogus comment.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                # "<!--": start a comment token.
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Case-insensitive match of the remaining "OCTYPE" letters.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "<![CDATA[" is only honoured in foreign (non-HTML namespace)
            # content, and the match here is case-sensitive.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        # Push back everything consumed (popping reverses the order) so the
        # bogus-comment state sees the same characters again.
        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
1163
1164    def commentStartState(self):
1165        data = self.stream.char()
1166        if data == "-":
1167            self.state = self.commentStartDashState
1168        elif data == "\u0000":
1169            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1170                                    "data": "invalid-codepoint"})
1171            self.currentToken["data"] += "\uFFFD"
1172        elif data == ">":
1173            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1174                                    "incorrect-comment"})
1175            self.tokenQueue.append(self.currentToken)
1176            self.state = self.dataState
1177        elif data is EOF:
1178            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1179                                    "eof-in-comment"})
1180            self.tokenQueue.append(self.currentToken)
1181            self.state = self.dataState
1182        else:
1183            self.currentToken["data"] += data
1184            self.state = self.commentState
1185        return True
1186
1187    def commentStartDashState(self):
1188        data = self.stream.char()
1189        if data == "-":
1190            self.state = self.commentEndState
1191        elif data == "\u0000":
1192            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1193                                    "data": "invalid-codepoint"})
1194            self.currentToken["data"] += "-\uFFFD"
1195        elif data == ">":
1196            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1197                                    "incorrect-comment"})
1198            self.tokenQueue.append(self.currentToken)
1199            self.state = self.dataState
1200        elif data is EOF:
1201            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1202                                    "eof-in-comment"})
1203            self.tokenQueue.append(self.currentToken)
1204            self.state = self.dataState
1205        else:
1206            self.currentToken["data"] += "-" + data
1207            self.state = self.commentState
1208        return True
1209
1210    def commentState(self):
1211        data = self.stream.char()
1212        if data == "-":
1213            self.state = self.commentEndDashState
1214        elif data == "\u0000":
1215            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1216                                    "data": "invalid-codepoint"})
1217            self.currentToken["data"] += "\uFFFD"
1218        elif data is EOF:
1219            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1220                                    "data": "eof-in-comment"})
1221            self.tokenQueue.append(self.currentToken)
1222            self.state = self.dataState
1223        else:
1224            self.currentToken["data"] += data + \
1225                self.stream.charsUntil(("-", "\u0000"))
1226        return True
1227
1228    def commentEndDashState(self):
1229        data = self.stream.char()
1230        if data == "-":
1231            self.state = self.commentEndState
1232        elif data == "\u0000":
1233            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1234                                    "data": "invalid-codepoint"})
1235            self.currentToken["data"] += "-\uFFFD"
1236            self.state = self.commentState
1237        elif data is EOF:
1238            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1239                                    "eof-in-comment-end-dash"})
1240            self.tokenQueue.append(self.currentToken)
1241            self.state = self.dataState
1242        else:
1243            self.currentToken["data"] += "-" + data
1244            self.state = self.commentState
1245        return True
1246
1247    def commentEndState(self):
1248        data = self.stream.char()
1249        if data == ">":
1250            self.tokenQueue.append(self.currentToken)
1251            self.state = self.dataState
1252        elif data == "\u0000":
1253            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1254                                    "data": "invalid-codepoint"})
1255            self.currentToken["data"] += "--\uFFFD"
1256            self.state = self.commentState
1257        elif data == "!":
1258            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1259                                    "unexpected-bang-after-double-dash-in-comment"})
1260            self.state = self.commentEndBangState
1261        elif data == "-":
1262            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263                                    "unexpected-dash-after-double-dash-in-comment"})
1264            self.currentToken["data"] += data
1265        elif data is EOF:
1266            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1267                                    "eof-in-comment-double-dash"})
1268            self.tokenQueue.append(self.currentToken)
1269            self.state = self.dataState
1270        else:
1271            # XXX
1272            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1273                                    "unexpected-char-in-comment"})
1274            self.currentToken["data"] += "--" + data
1275            self.state = self.commentState
1276        return True
1277
1278    def commentEndBangState(self):
1279        data = self.stream.char()
1280        if data == ">":
1281            self.tokenQueue.append(self.currentToken)
1282            self.state = self.dataState
1283        elif data == "-":
1284            self.currentToken["data"] += "--!"
1285            self.state = self.commentEndDashState
1286        elif data == "\u0000":
1287            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1288                                    "data": "invalid-codepoint"})
1289            self.currentToken["data"] += "--!\uFFFD"
1290            self.state = self.commentState
1291        elif data is EOF:
1292            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1293                                    "eof-in-comment-end-bang-state"})
1294            self.tokenQueue.append(self.currentToken)
1295            self.state = self.dataState
1296        else:
1297            self.currentToken["data"] += "--!" + data
1298            self.state = self.commentState
1299        return True
1300
1301    def doctypeState(self):
1302        data = self.stream.char()
1303        if data in spaceCharacters:
1304            self.state = self.beforeDoctypeNameState
1305        elif data is EOF:
1306            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1307                                    "expected-doctype-name-but-got-eof"})
1308            self.currentToken["correct"] = False
1309            self.tokenQueue.append(self.currentToken)
1310            self.state = self.dataState
1311        else:
1312            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1313                                    "need-space-after-doctype"})
1314            self.stream.unget(data)
1315            self.state = self.beforeDoctypeNameState
1316        return True
1317
1318    def beforeDoctypeNameState(self):
1319        data = self.stream.char()
1320        if data in spaceCharacters:
1321            pass
1322        elif data == ">":
1323            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1324                                    "expected-doctype-name-but-got-right-bracket"})
1325            self.currentToken["correct"] = False
1326            self.tokenQueue.append(self.currentToken)
1327            self.state = self.dataState
1328        elif data == "\u0000":
1329            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1330                                    "data": "invalid-codepoint"})
1331            self.currentToken["name"] = "\uFFFD"
1332            self.state = self.doctypeNameState
1333        elif data is EOF:
1334            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1335                                    "expected-doctype-name-but-got-eof"})
1336            self.currentToken["correct"] = False
1337            self.tokenQueue.append(self.currentToken)
1338            self.state = self.dataState
1339        else:
1340            self.currentToken["name"] = data
1341            self.state = self.doctypeNameState
1342        return True
1343
1344    def doctypeNameState(self):
1345        data = self.stream.char()
1346        if data in spaceCharacters:
1347            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1348            self.state = self.afterDoctypeNameState
1349        elif data == ">":
1350            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1351            self.tokenQueue.append(self.currentToken)
1352            self.state = self.dataState
1353        elif data == "\u0000":
1354            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1355                                    "data": "invalid-codepoint"})
1356            self.currentToken["name"] += "\uFFFD"
1357            self.state = self.doctypeNameState
1358        elif data is EOF:
1359            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1360                                    "eof-in-doctype-name"})
1361            self.currentToken["correct"] = False
1362            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1363            self.tokenQueue.append(self.currentToken)
1364            self.state = self.dataState
1365        else:
1366            self.currentToken["name"] += data
1367        return True
1368
1369    def afterDoctypeNameState(self):
1370        data = self.stream.char()
1371        if data in spaceCharacters:
1372            pass
1373        elif data == ">":
1374            self.tokenQueue.append(self.currentToken)
1375            self.state = self.dataState
1376        elif data is EOF:
1377            self.currentToken["correct"] = False
1378            self.stream.unget(data)
1379            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1380                                    "eof-in-doctype"})
1381            self.tokenQueue.append(self.currentToken)
1382            self.state = self.dataState
1383        else:
1384            if data in ("p", "P"):
1385                matched = True
1386                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1387                                 ("i", "I"), ("c", "C")):
1388                    data = self.stream.char()
1389                    if data not in expected:
1390                        matched = False
1391                        break
1392                if matched:
1393                    self.state = self.afterDoctypePublicKeywordState
1394                    return True
1395            elif data in ("s", "S"):
1396                matched = True
1397                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1398                                 ("e", "E"), ("m", "M")):
1399                    data = self.stream.char()
1400                    if data not in expected:
1401                        matched = False
1402                        break
1403                if matched:
1404                    self.state = self.afterDoctypeSystemKeywordState
1405                    return True
1406
1407            # All the characters read before the current 'data' will be
1408            # [a-zA-Z], so they're garbage in the bogus doctype and can be
1409            # discarded; only the latest character might be '>' or EOF
1410            # and needs to be ungetted
1411            self.stream.unget(data)
1412            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1413                                    "expected-space-or-right-bracket-in-doctype", "datavars":
1414                                    {"data": data}})
1415            self.currentToken["correct"] = False
1416            self.state = self.bogusDoctypeState
1417
1418        return True
1419
1420    def afterDoctypePublicKeywordState(self):
1421        data = self.stream.char()
1422        if data in spaceCharacters:
1423            self.state = self.beforeDoctypePublicIdentifierState
1424        elif data in ("'", '"'):
1425            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1426                                    "unexpected-char-in-doctype"})
1427            self.stream.unget(data)
1428            self.state = self.beforeDoctypePublicIdentifierState
1429        elif data is EOF:
1430            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1431                                    "eof-in-doctype"})
1432            self.currentToken["correct"] = False
1433            self.tokenQueue.append(self.currentToken)
1434            self.state = self.dataState
1435        else:
1436            self.stream.unget(data)
1437            self.state = self.beforeDoctypePublicIdentifierState
1438        return True
1439
1440    def beforeDoctypePublicIdentifierState(self):
1441        data = self.stream.char()
1442        if data in spaceCharacters:
1443            pass
1444        elif data == "\"":
1445            self.currentToken["publicId"] = ""
1446            self.state = self.doctypePublicIdentifierDoubleQuotedState
1447        elif data == "'":
1448            self.currentToken["publicId"] = ""
1449            self.state = self.doctypePublicIdentifierSingleQuotedState
1450        elif data == ">":
1451            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1452                                    "unexpected-end-of-doctype"})
1453            self.currentToken["correct"] = False
1454            self.tokenQueue.append(self.currentToken)
1455            self.state = self.dataState
1456        elif data is EOF:
1457            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1458                                    "eof-in-doctype"})
1459            self.currentToken["correct"] = False
1460            self.tokenQueue.append(self.currentToken)
1461            self.state = self.dataState
1462        else:
1463            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1464                                    "unexpected-char-in-doctype"})
1465            self.currentToken["correct"] = False
1466            self.state = self.bogusDoctypeState
1467        return True
1468
1469    def doctypePublicIdentifierDoubleQuotedState(self):
1470        data = self.stream.char()
1471        if data == "\"":
1472            self.state = self.afterDoctypePublicIdentifierState
1473        elif data == "\u0000":
1474            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1475                                    "data": "invalid-codepoint"})
1476            self.currentToken["publicId"] += "\uFFFD"
1477        elif data == ">":
1478            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1479                                    "unexpected-end-of-doctype"})
1480            self.currentToken["correct"] = False
1481            self.tokenQueue.append(self.currentToken)
1482            self.state = self.dataState
1483        elif data is EOF:
1484            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1485                                    "eof-in-doctype"})
1486            self.currentToken["correct"] = False
1487            self.tokenQueue.append(self.currentToken)
1488            self.state = self.dataState
1489        else:
1490            self.currentToken["publicId"] += data
1491        return True
1492
1493    def doctypePublicIdentifierSingleQuotedState(self):
1494        data = self.stream.char()
1495        if data == "'":
1496            self.state = self.afterDoctypePublicIdentifierState
1497        elif data == "\u0000":
1498            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1499                                    "data": "invalid-codepoint"})
1500            self.currentToken["publicId"] += "\uFFFD"
1501        elif data == ">":
1502            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1503                                    "unexpected-end-of-doctype"})
1504            self.currentToken["correct"] = False
1505            self.tokenQueue.append(self.currentToken)
1506            self.state = self.dataState
1507        elif data is EOF:
1508            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1509                                    "eof-in-doctype"})
1510            self.currentToken["correct"] = False
1511            self.tokenQueue.append(self.currentToken)
1512            self.state = self.dataState
1513        else:
1514            self.currentToken["publicId"] += data
1515        return True
1516
1517    def afterDoctypePublicIdentifierState(self):
1518        data = self.stream.char()
1519        if data in spaceCharacters:
1520            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1521        elif data == ">":
1522            self.tokenQueue.append(self.currentToken)
1523            self.state = self.dataState
1524        elif data == '"':
1525            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526                                    "unexpected-char-in-doctype"})
1527            self.currentToken["systemId"] = ""
1528            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1529        elif data == "'":
1530            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1531                                    "unexpected-char-in-doctype"})
1532            self.currentToken["systemId"] = ""
1533            self.state = self.doctypeSystemIdentifierSingleQuotedState
1534        elif data is EOF:
1535            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1536                                    "eof-in-doctype"})
1537            self.currentToken["correct"] = False
1538            self.tokenQueue.append(self.currentToken)
1539            self.state = self.dataState
1540        else:
1541            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1542                                    "unexpected-char-in-doctype"})
1543            self.currentToken["correct"] = False
1544            self.state = self.bogusDoctypeState
1545        return True
1546
1547    def betweenDoctypePublicAndSystemIdentifiersState(self):
1548        data = self.stream.char()
1549        if data in spaceCharacters:
1550            pass
1551        elif data == ">":
1552            self.tokenQueue.append(self.currentToken)
1553            self.state = self.dataState
1554        elif data == '"':
1555            self.currentToken["systemId"] = ""
1556            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1557        elif data == "'":
1558            self.currentToken["systemId"] = ""
1559            self.state = self.doctypeSystemIdentifierSingleQuotedState
1560        elif data == EOF:
1561            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1562                                    "eof-in-doctype"})
1563            self.currentToken["correct"] = False
1564            self.tokenQueue.append(self.currentToken)
1565            self.state = self.dataState
1566        else:
1567            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1568                                    "unexpected-char-in-doctype"})
1569            self.currentToken["correct"] = False
1570            self.state = self.bogusDoctypeState
1571        return True
1572
1573    def afterDoctypeSystemKeywordState(self):
1574        data = self.stream.char()
1575        if data in spaceCharacters:
1576            self.state = self.beforeDoctypeSystemIdentifierState
1577        elif data in ("'", '"'):
1578            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1579                                    "unexpected-char-in-doctype"})
1580            self.stream.unget(data)
1581            self.state = self.beforeDoctypeSystemIdentifierState
1582        elif data is EOF:
1583            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1584                                    "eof-in-doctype"})
1585            self.currentToken["correct"] = False
1586            self.tokenQueue.append(self.currentToken)
1587            self.state = self.dataState
1588        else:
1589            self.stream.unget(data)
1590            self.state = self.beforeDoctypeSystemIdentifierState
1591        return True
1592
1593    def beforeDoctypeSystemIdentifierState(self):
1594        data = self.stream.char()
1595        if data in spaceCharacters:
1596            pass
1597        elif data == "\"":
1598            self.currentToken["systemId"] = ""
1599            self.state = self.doctypeSystemIdentifierDoubleQuotedState
1600        elif data == "'":
1601            self.currentToken["systemId"] = ""
1602            self.state = self.doctypeSystemIdentifierSingleQuotedState
1603        elif data == ">":
1604            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1605                                    "unexpected-char-in-doctype"})
1606            self.currentToken["correct"] = False
1607            self.tokenQueue.append(self.currentToken)
1608            self.state = self.dataState
1609        elif data is EOF:
1610            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1611                                    "eof-in-doctype"})
1612            self.currentToken["correct"] = False
1613            self.tokenQueue.append(self.currentToken)
1614            self.state = self.dataState
1615        else:
1616            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1617                                    "unexpected-char-in-doctype"})
1618            self.currentToken["correct"] = False
1619            self.state = self.bogusDoctypeState
1620        return True
1621
1622    def doctypeSystemIdentifierDoubleQuotedState(self):
1623        data = self.stream.char()
1624        if data == "\"":
1625            self.state = self.afterDoctypeSystemIdentifierState
1626        elif data == "\u0000":
1627            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1628                                    "data": "invalid-codepoint"})
1629            self.currentToken["systemId"] += "\uFFFD"
1630        elif data == ">":
1631            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1632                                    "unexpected-end-of-doctype"})
1633            self.currentToken["correct"] = False
1634            self.tokenQueue.append(self.currentToken)
1635            self.state = self.dataState
1636        elif data is EOF:
1637            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1638                                    "eof-in-doctype"})
1639            self.currentToken["correct"] = False
1640            self.tokenQueue.append(self.currentToken)
1641            self.state = self.dataState
1642        else:
1643            self.currentToken["systemId"] += data
1644        return True
1645
1646    def doctypeSystemIdentifierSingleQuotedState(self):
1647        data = self.stream.char()
1648        if data == "'":
1649            self.state = self.afterDoctypeSystemIdentifierState
1650        elif data == "\u0000":
1651            self.tokenQueue.append({"type": tokenTypes["ParseError"],
1652                                    "data": "invalid-codepoint"})
1653            self.currentToken["systemId"] += "\uFFFD"
1654        elif data == ">":
1655            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1656                                    "unexpected-end-of-doctype"})
1657            self.currentToken["correct"] = False
1658            self.tokenQueue.append(self.currentToken)
1659            self.state = self.dataState
1660        elif data is EOF:
1661            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1662                                    "eof-in-doctype"})
1663            self.currentToken["correct"] = False
1664            self.tokenQueue.append(self.currentToken)
1665            self.state = self.dataState
1666        else:
1667            self.currentToken["systemId"] += data
1668        return True
1669
1670    def afterDoctypeSystemIdentifierState(self):
1671        data = self.stream.char()
1672        if data in spaceCharacters:
1673            pass
1674        elif data == ">":
1675            self.tokenQueue.append(self.currentToken)
1676            self.state = self.dataState
1677        elif data is EOF:
1678            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1679                                    "eof-in-doctype"})
1680            self.currentToken["correct"] = False
1681            self.tokenQueue.append(self.currentToken)
1682            self.state = self.dataState
1683        else:
1684            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1685                                    "unexpected-char-in-doctype"})
1686            self.state = self.bogusDoctypeState
1687        return True
1688
1689    def bogusDoctypeState(self):
1690        data = self.stream.char()
1691        if data == ">":
1692            self.tokenQueue.append(self.currentToken)
1693            self.state = self.dataState
1694        elif data is EOF:
1695            # XXX EMIT
1696            self.stream.unget(data)
1697            self.tokenQueue.append(self.currentToken)
1698            self.state = self.dataState
1699        else:
1700            pass
1701        return True
1702
1703    def cdataSectionState(self):
1704        data = []
1705        while True:
1706            data.append(self.stream.charsUntil("]"))
1707            data.append(self.stream.charsUntil(">"))
1708            char = self.stream.char()
1709            if char == EOF:
1710                break
1711            else:
1712                assert char == ">"
1713                if data[-1][-2:] == "]]":
1714                    data[-1] = data[-1][:-2]
1715                    break
1716                else:
1717                    data.append(char)
1718
1719        data = "".join(data)
1720        # Deal with null here rather than in the parser
1721        nullCount = data.count("\u0000")
1722        if nullCount > 0:
1723            for i in range(nullCount):
1724                self.tokenQueue.append({"type": tokenTypes["ParseError"],
1725                                        "data": "invalid-codepoint"})
1726            data = data.replace("\u0000", "\uFFFD")
1727        if data:
1728            self.tokenQueue.append({"type": tokenTypes["Characters"],
1729                                    "data": data})
1730        self.state = self.dataState
1731        return True
1732