from __future__ import absolute_import, division, unicode_literals

import json
import warnings
import re

from six import unichr

from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants, utils


class TokenizerTestParser(object):
    def __init__(self, initialState, lastStartTag=None):
        self.tokenizer = HTMLTokenizer
        self._state = initialState
        self._lastStartTag = lastStartTag

    def parse(self, stream, encoding=None, innerHTML=False):
        tokenizer = self.tokenizer(stream, encoding)
        self.outputTokens = []

        tokenizer.state = getattr(tokenizer, self._state)
        if self._lastStartTag is not None:
            tokenizer.currentToken = {"type": "startTag",
                                      "name": self._lastStartTag}

        # Map numeric token types back to their names so each token can be
        # dispatched to the matching process* method.
        types = dict((v, k) for k, v in constants.tokenTypes.items())
        for token in tokenizer:
            getattr(self, 'process%s' % types[token["type"]])(token)

        return self.outputTokens

    def processDoctype(self, token):
        self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
                                  token["systemId"], token["correct"]])

    def processStartTag(self, token):
        self.outputTokens.append(["StartTag", token["name"],
                                  dict(token["data"][::-1]), token["selfClosing"]])

    def processEmptyTag(self, token):
        if token["name"] not in constants.voidElements:
            self.outputTokens.append("ParseError")
        self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])

    def processEndTag(self, token):
        self.outputTokens.append(["EndTag", token["name"],
                                  token["selfClosing"]])

    def processComment(self, token):
        self.outputTokens.append(["Comment", token["data"]])

    def processSpaceCharacters(self, token):
        self.outputTokens.append(["Character", token["data"]])
        # After the first space-character token, handle any further
        # SpaceCharacters tokens exactly like ordinary Characters tokens.
        self.processSpaceCharacters = self.processCharacters

    def processCharacters(self, token):
        self.outputTokens.append(["Character", token["data"]])

    def processEOF(self, token):
        pass

    def processParseError(self, token):
        self.outputTokens.append(["ParseError", token["data"]])

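# Illustrative use of TokenizerTestParser (a sketch, not part of the tests;
# the exact chunking of the Character tokens is an assumption, since the
# tokenizer may emit them in several pieces before concatenateCharacterTokens
# below merges them):
#
#   parser = TokenizerTestParser("dataState")
#   parser.parse("<p class=x>hi")
#   -> [["StartTag", "p", {"class": "x"}, False],
#       ["Character", "h"], ["Character", "i"]]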

def concatenateCharacterTokens(tokens):
    outputTokens = []
    for token in tokens:
        if "ParseError" not in token and token[0] == "Character":
            if (outputTokens and "ParseError" not in outputTokens[-1] and
                    outputTokens[-1][0] == "Character"):
                outputTokens[-1][1] += token[1]
            else:
                outputTokens.append(token)
        else:
            outputTokens.append(token)
    return outputTokens

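# concatenateCharacterTokens merges runs of adjacent Character tokens so that
# chunking differences do not affect the comparison, e.g. (illustrative values):
#   concatenateCharacterTokens([["Character", "fo"], ["Character", "o"], "ParseError"])
#   -> [["Character", "foo"], "ParseError"]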

def normalizeTokens(tokens):
    # TODO: convert tests to reflect arrays
    for i, token in enumerate(tokens):
        if token[0] == 'ParseError':
            tokens[i] = token[0]
    return tokens

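# normalizeTokens collapses received ["ParseError", <data>] tokens down to the
# bare "ParseError" string used in the expected output, e.g. (illustrative
# error code):
#   normalizeTokens([["ParseError", "some-error-code"], ["Character", "x"]])
#   -> ["ParseError", ["Character", "x"]]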

def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
                ignoreErrors=False):
    """Test whether the test has passed or failed

    If the ignoreErrorOrder flag is set to True, the relative positions of
    parse errors and non-parse-error tokens are not compared.
    """
    checkSelfClosing = False
    for token in expectedTokens:
        if (token[0] == "StartTag" and len(token) == 4
                or token[0] == "EndTag" and len(token) == 3):
            checkSelfClosing = True
            break

    if not checkSelfClosing:
        for token in receivedTokens:
            if token[0] == "StartTag" or token[0] == "EndTag":
                token.pop()

    if not ignoreErrorOrder and not ignoreErrors:
        return expectedTokens == receivedTokens
    else:
        # Sort the tokens into two groups: non-parse errors and parse errors
        tokens = {"expected": [[], []], "received": [[], []]}
        for tokenType, tokenList in zip(list(tokens.keys()),
                                        (expectedTokens, receivedTokens)):
            for token in tokenList:
                if token != "ParseError":
                    tokens[tokenType][0].append(token)
                else:
                    if not ignoreErrors:
                        tokens[tokenType][1].append(token)
        return tokens["expected"] == tokens["received"]

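# Example of the comparison rules above (illustrative token values):
#   expected = [["StartTag", "p", {}], "ParseError"]
#   received = ["ParseError", ["StartTag", "p", {}, False]]
#   tokensMatch(expected, received, ignoreErrorOrder=True)  -> True
# The trailing selfClosing flag is popped from the received StartTag because
# no expected token carries one, and with ignoreErrorOrder the parse errors
# and the remaining tokens are compared as separate groups.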

_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def unescape(test):
    def decode(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input to be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        try:
            return _surrogateRe.sub(repl, inp)
        except ValueError:
            # This occurs when unichr throws ValueError, which should
            # only happen for a lone surrogate.
            if utils.supports_lone_surrogates:
                raise
            return None

    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        else:
            token[1] = decode(token[1])
            if len(token) > 2:
                # Iterate over a snapshot of the attribute items so the dict
                # can be rewritten safely while looping.
                for key, value in list(token[2].items()):
                    del token[2][key]
                    token[2][decode(key)] = decode(value)
    return test

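# decode() above turns doubly-escaped \uXXXX sequences back into real
# characters, combining a surrogate pair into one non-BMP character, e.g.
# (illustrative): "\\uD83D\\uDE00" decodes to U+1F600, while a lone
# "\\uD800" stays an unpaired surrogate (or makes decode() return None on
# platforms without lone-surrogate support, so the test gets skipped).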

def runTokenizerTest(test):
    warnings.resetwarnings()
    warnings.simplefilter("error")

    expected = concatenateCharacterTokens(test['output'])
    if 'lastStartTag' not in test:
        test['lastStartTag'] = None
    parser = TokenizerTestParser(test['initialState'],
                                 test['lastStartTag'])
    tokens = parser.parse(test['input'])
    tokens = concatenateCharacterTokens(tokens)
    received = normalizeTokens(tokens)
    errorMsg = "\n".join(["\n\nInitial state:",
                          test['initialState'],
                          "\nInput:", test['input'],
                          "\nExpected:", repr(expected),
                          "\nReceived:", repr(received)])
    ignoreErrorOrder = test.get('ignoreErrorOrder', False)
    assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg

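# Shape of a single test case as consumed by runTokenizerTest (illustrative;
# it mirrors an entry from the tokenizer .test JSON files, with the singular
# "initialState" key filled in by testTokenizer below):
#   {"description": "DOCTYPE",
#    "input": "<!DOCTYPE html>",
#    "output": [["DOCTYPE", "html", None, None, True]],
#    "initialState": "dataState"}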

def _doCapitalize(match):
    return match.group(1).upper()

_capitalizeRe = re.compile(r"\W+(\w)").sub


def capitalize(s):
    s = s.lower()
    s = _capitalizeRe(_doCapitalize, s)
    return s

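# capitalize() maps the state names used in the .test files onto the
# HTMLTokenizer attribute naming, e.g. (illustrative):
#   capitalize("Data state")    -> "dataState"
#   capitalize("RCDATA state")  -> "rcdataState"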

def testTokenizer():
    for filename in get_data_files('tokenizer', '*.test'):
        with open(filename) as fp:
            tests = json.load(fp)
            if 'tests' in tests:
                for index, test in enumerate(tests['tests']):
                    if 'initialStates' not in test:
                        test["initialStates"] = ["Data state"]
                    if 'doubleEscaped' in test:
                        test = unescape(test)
                        if test["input"] is None:
                            continue  # Not valid input for this platform
                    for initialState in test["initialStates"]:
                        test["initialState"] = capitalize(initialState)
                        yield runTokenizerTest, test