1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import markupbase
12import re
13
14# Regular expressions used for parsing
15
# All patterns are written as raw strings so that regex escapes such as
# \s or \x00 reach the re module untouched instead of relying on how the
# string-literal parser treats unknown backslash sequences.

# In normal (non-CDATA) mode only '&' and '<' can start something special.
interesting_normal = re.compile(r'[&<]')
# Possible start of an entity or character reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both patterns also consume the single character that terminates the
# reference; goahead() gives it back unless it is ';'.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
tagfind = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')

# Groups: 1 = attribute name, 2 = '=' plus value (if any), 3 = raw value
# (still including surrounding quotes, if it was quoted).
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
55
56
class HTMLParseError(Exception):
    """Error type used by the parser to report malformed input.

    Attributes:
        msg    -- the (non-empty) error description
        lineno -- first item of *position*, or None when unknown
        offset -- second item of *position* (zero-based), or None
    """

    def __init__(self, msg, position=(None, None)):
        # An empty message is treated as a programming error.
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position info is known."""
        text = self.msg
        lineno, offset = self.lineno, self.offset
        if lineno is not None:
            text += ", at line %d" % lineno
        if offset is not None:
            # The stored offset is zero-based; columns are shown one-based.
            text += ", column %d" % (offset + 1)
        return text
73
74
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Lower-case tag names whose start tag makes parse_starttag() switch the
    # parser into CDATA mode (via set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")
96
97
    def __init__(self):
        """Initialize and reset this instance."""
        # All per-instance state is set up by reset(), so a fresh parser and
        # a reset parser start from the same state.
        self.reset()
101
102    def reset(self):
103        """Reset this instance.  Loses all unprocessed data."""
104        self.rawdata = ''
105        self.lasttag = '???'
106        self.interesting = interesting_normal
107        self.cdata_elem = None
108        markupbase.ParserBase.reset(self)
109
110    def feed(self, data):
111        r"""Feed data to the parser.
112
113        Call this as often as you want, with as little or as much text
114        as you want (may include '\n').
115        """
116        self.rawdata = self.rawdata + data
117        self.goahead(0)
118
    def close(self):
        """Handle any buffered data."""
        # A true 'end' argument tells goahead() that no more input follows.
        self.goahead(1)
122
123    def error(self, message):
124        raise HTMLParseError(message, self.getpos())
125
126    __starttag_text = None
127
    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        This is the text recorded by the most recent parse_starttag() call;
        it is None if no start tag has been parsed yet or if the last
        attempt found the tag incomplete.
        """
        return self.__starttag_text
131
132    def set_cdata_mode(self, elem):
133        self.cdata_elem = elem.lower()
134        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
135
136    def clear_cdata_mode(self):
137        self.interesting = interesting_normal
138        self.cdata_elem = None
139
140    # Internal -- handle data as far as reasonable.  May leave state
141    # and data to be processed by a subsequent call.  If 'end' is
142    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible and keep the rest.

        Scans the buffer, calling the handle_*() and parse_*() methods for
        everything that can be interpreted.  Whatever is left over is stored
        back in self.rawdata for the next call.  When *end* is true, leftover
        text is emitted through handle_data() instead of being kept, except
        while in CDATA mode.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                # In CDATA mode self.interesting only matches the closing tag
                # of the CDATA element (see set_cdata_mode); without it,
                # nothing is emitted and the loop stops.
                if self.cdata_elem:
                    break
                j = n
            # Everything before the next interesting position is plain data.
            if i < j: self.handle_data(rawdata[i:j])
            # updatepos() comes from markupbase.ParserBase; presumably it
            # updates line/column tracking and returns j -- confirm there.
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Each parse_* call returns the index just past the construct,
                # or a negative value if the construct is not complete yet.
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # '<' followed by something that starts no construct:
                    # treat the '<' as literal text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the very end of the buffer: cannot tell yet.
                    break
                if k < 0:
                    # Incomplete construct: wait for more data unless this
                    # is the final call.
                    if not end:
                        break
                    # Final call: emit the unfinished construct as data, up to
                    # and including the next '>', or else up to the next '<',
                    # or else just the single '<' character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one terminating character
                    # that the charref pattern always includes.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Only a ';' terminator belongs to the reference; any
                    # other terminating character is left to be parsed next.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    # No valid character reference here.  If a ';' occurs
                    # later in the buffer, emit the '&#' as text; either way
                    # scanning stops for this call.
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    # Lone '&' at the end of the buffer: wait for more data.
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        # On the final call, emit whatever is still unparsed as data -- but
        # not in CDATA mode, where the remainder simply stays buffered.
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
233
    # Internal -- parse html declarations, return end index or -1 if not
    # terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in markupbase
237    def parse_html_declaration(self, i):
238        rawdata = self.rawdata
239        if rawdata[i:i+2] != '<!':
240            self.error('unexpected call to parse_html_declaration()')
241        if rawdata[i:i+4] == '<!--':
242            # this case is actually already handled in goahead()
243            return self.parse_comment(i)
244        elif rawdata[i:i+3] == '<![':
245            return self.parse_marked_section(i)
246        elif rawdata[i:i+9].lower() == '<!doctype':
247            # find the closing >
248            gtpos = rawdata.find('>', i+9)
249            if gtpos == -1:
250                return -1
251            self.handle_decl(rawdata[i+2:gtpos])
252            return gtpos+1
253        else:
254            return self.parse_bogus_comment(i)
255
    # Internal -- parse bogus comment, return end index or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
258    def parse_bogus_comment(self, i, report=1):
259        rawdata = self.rawdata
260        if rawdata[i:i+2] not in ('<!', '</'):
261            self.error('unexpected call to parse_comment()')
262        pos = rawdata.find('>', i+2)
263        if pos == -1:
264            return -1
265        if report:
266            self.handle_comment(rawdata[i+2:pos])
267        return pos + 1
268
269    # Internal -- parse processing instr, return end or -1 if not terminated
270    def parse_pi(self, i):
271        rawdata = self.rawdata
272        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
273        match = piclose.search(rawdata, i+2) # >
274        if not match:
275            return -1
276        j = match.start()
277        self.handle_pi(rawdata[i+2: j])
278        j = match.end()
279        return j
280
281    # Internal -- handle starttag, return end or -1 if not terminated
282    def parse_starttag(self, i):
283        self.__starttag_text = None
284        endpos = self.check_for_whole_start_tag(i)
285        if endpos < 0:
286            return endpos
287        rawdata = self.rawdata
288        self.__starttag_text = rawdata[i:endpos]
289
290        # Now parse the data between i+1 and j into a tag and attrs
291        attrs = []
292        match = tagfind.match(rawdata, i+1)
293        assert match, 'unexpected call to parse_starttag()'
294        k = match.end()
295        self.lasttag = tag = match.group(1).lower()
296
297        while k < endpos:
298            m = attrfind.match(rawdata, k)
299            if not m:
300                break
301            attrname, rest, attrvalue = m.group(1, 2, 3)
302            if not rest:
303                attrvalue = None
304            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
305                 attrvalue[:1] == '"' == attrvalue[-1:]:
306                attrvalue = attrvalue[1:-1]
307            if attrvalue:
308                attrvalue = self.unescape(attrvalue)
309            attrs.append((attrname.lower(), attrvalue))
310            k = m.end()
311
312        end = rawdata[k:endpos].strip()
313        if end not in (">", "/>"):
314            lineno, offset = self.getpos()
315            if "\n" in self.__starttag_text:
316                lineno = lineno + self.__starttag_text.count("\n")
317                offset = len(self.__starttag_text) \
318                         - self.__starttag_text.rfind("\n")
319            else:
320                offset = offset + len(self.__starttag_text)
321            self.handle_data(rawdata[i:endpos])
322            return endpos
323        if end.endswith('/>'):
324            # XHTML-style empty tag: <span attr="value" />
325            self.handle_startendtag(tag, attrs)
326        else:
327            self.handle_starttag(tag, attrs)
328            if tag in self.CDATA_CONTENT_ELEMENTS:
329                self.set_cdata_mode(tag)
330        return endpos
331
332    # Internal -- check to see if we have a complete starttag; return end
333    # or -1 if incomplete.
334    def check_for_whole_start_tag(self, i):
335        rawdata = self.rawdata
336        m = locatestarttagend.match(rawdata, i)
337        if m:
338            j = m.end()
339            next = rawdata[j:j+1]
340            if next == ">":
341                return j + 1
342            if next == "/":
343                if rawdata.startswith("/>", j):
344                    return j + 2
345                if rawdata.startswith("/", j):
346                    # buffer boundary
347                    return -1
348                # else bogus input
349                self.updatepos(i, j + 1)
350                self.error("malformed empty start tag")
351            if next == "":
352                # end of input
353                return -1
354            if next in ("abcdefghijklmnopqrstuvwxyz=/"
355                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
356                # end of input in or before attribute value, or we have the
357                # '/' from a '/>' ending
358                return -1
359            if j > i:
360                return j
361            else:
362                return i + 1
363        raise AssertionError("we should not get here!")
364
365    # Internal -- parse endtag, return end or -1 if incomplete
366    def parse_endtag(self, i):
367        rawdata = self.rawdata
368        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
369        match = endendtag.search(rawdata, i+1) # >
370        if not match:
371            return -1
372        gtpos = match.end()
373        match = endtagfind.match(rawdata, i) # </ + tag + >
374        if not match:
375            if self.cdata_elem is not None:
376                self.handle_data(rawdata[i:gtpos])
377                return gtpos
378            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
379            namematch = tagfind.match(rawdata, i+2)
380            if not namematch:
381                # w3.org/TR/html5/tokenization.html#end-tag-open-state
382                if rawdata[i:i+3] == '</>':
383                    return i+3
384                else:
385                    return self.parse_bogus_comment(i)
386            tagname = namematch.group(1).lower()
387            # consume and ignore other stuff between the name and the >
388            # Note: this is not 100% correct, since we might have things like
389            # </tag attr=">">, but looking for > after tha name should cover
390            # most of the cases and is much simpler
391            gtpos = rawdata.find('>', namematch.end())
392            self.handle_endtag(tagname)
393            return gtpos+1
394
395        elem = match.group(1).lower() # script or style
396        if self.cdata_elem is not None:
397            if elem != self.cdata_elem:
398                self.handle_data(rawdata[i:gtpos])
399                return gtpos
400
401        self.handle_endtag(elem)
402        self.clear_cdata_mode()
403        return gtpos
404
405    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as '<br />'.

        Called by parse_starttag() with the lower-cased tag name and the
        list of (name, value) attribute pairs.  The default implementation
        reports the tag as a start tag immediately followed by an end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
409
410    # Overridable -- handle start tag
411    def handle_starttag(self, tag, attrs):
412        pass
413
414    # Overridable -- handle end tag
415    def handle_endtag(self, tag):
416        pass
417
418    # Overridable -- handle character reference
419    def handle_charref(self, name):
420        pass
421
422    # Overridable -- handle entity reference
423    def handle_entityref(self, name):
424        pass
425
426    # Overridable -- handle data
427    def handle_data(self, data):
428        pass
429
430    # Overridable -- handle comment
431    def handle_comment(self, data):
432        pass
433
434    # Overridable -- handle declaration
435    def handle_decl(self, decl):
436        pass
437
438    # Overridable -- handle processing instruction
439    def handle_pi(self, data):
440        pass
441
442    def unknown_decl(self, data):
443        pass
444
445    # Internal -- helper to remove special character quoting
446    entitydefs = None
447    def unescape(self, s):
448        if '&' not in s:
449            return s
450        def replaceEntities(s):
451            s = s.groups()[0]
452            try:
453                if s[0] == "#":
454                    s = s[1:]
455                    if s[0] in ['x','X']:
456                        c = int(s[1:], 16)
457                    else:
458                        c = int(s)
459                    return unichr(c)
460            except ValueError:
461                return '&#'+s+';'
462            else:
463                # Cannot use name2codepoint directly, because HTMLParser supports apos,
464                # which is not part of HTML 4
465                if HTMLParser.entitydefs is None:
466                    import htmlentitydefs
467                    entitydefs = {'apos':u"'"}
468                    for k, v in htmlentitydefs.name2codepoint.iteritems():
469                        entitydefs[k] = unichr(v)
470                    HTMLParser.entitydefs = entitydefs
471                try:
472                    return self.entitydefs[s]
473                except KeyError:
474                    return '&'+s+';'
475
476        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
477