1"""A parser for HTML and XHTML."""
3# This file is based on sgmllib.py, but the API is slightly different.
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
11import re
12import warnings
13import _markupbase
15from html import unescape
18__all__ = ['HTMLParser']
20# Regular expressions used for parsing
# In ordinary text only '&' and '<' can start markup or a reference.
interesting_normal = re.compile('[&<]')
# An '&' followed by a character that could begin a (possibly truncated)
# named or numeric reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named and numeric references.  Both patterns also consume the single
# character that terminates the reference (';' or anything else).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# Openers / closers of the individual markup constructs.
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')

# Note:
#  1) tagfind/attrfind and locatestarttagend must be kept in sync: if you
#     change one of them, update the others as well;
#  2) changing any of them is very likely to break the parser, so avoid it.
# See http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# The HTML 5 spec, section 8.1.2.2, doesn't allow spaces between "</" and
# the tag name, so this pattern is more lenient than the spec and could
# arguably be tightened.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
63class HTMLParser(_markupbase.ParserBase):
64    """Find tags and other markup and call handler functions.
66    Usage:
67        p = HTMLParser()
68        p.feed(data)
69        ...
70        p.close()
72    Start tags are handled by calling self.handle_starttag() or
73    self.handle_startendtag(); end tags by self.handle_endtag().  The
74    data between tags is passed from the parser to the derived class
75    by calling self.handle_data() with the data as argument (the data
76    may be split up in arbitrary chunks).  If convert_charrefs is
77    True the character references are converted automatically to the
78    corresponding Unicode character (and self.handle_data() is no
79    longer split in chunks), otherwise they are passed by calling
80    self.handle_entityref() or self.handle_charref() with the string
81    containing respectively the named or numeric reference as the
82    argument.
83    """
85    CDATA_CONTENT_ELEMENTS = ("script", "style")
87    def __init__(self, *, convert_charrefs=True):
88        """Initialize and reset this instance.
90        If convert_charrefs is True (the default), all character references
91        are automatically converted to the corresponding Unicode characters.
92        """
93        self.convert_charrefs = convert_charrefs
94        self.reset()
96    def reset(self):
97        """Reset this instance.  Loses all unprocessed data."""
98        self.rawdata = ''
99        self.lasttag = '???'
100        self.interesting = interesting_normal
101        self.cdata_elem = None
102        _markupbase.ParserBase.reset(self)
104    def feed(self, data):
105        r"""Feed data to the parser.
107        Call this as often as you want, with as little or as much text
108        as you want (may include '\n').
109        """
110        self.rawdata = self.rawdata + data
111        self.goahead(0)
113    def close(self):
114        """Handle any buffered data."""
115        self.goahead(1)
117    __starttag_text = None
119    def get_starttag_text(self):
120        """Return full source of start tag: '<...>'."""
121        return self.__starttag_text
123    def set_cdata_mode(self, elem):
124        self.cdata_elem = elem.lower()
125        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
127    def clear_cdata_mode(self):
128        self.interesting = interesting_normal
129        self.cdata_elem = None
131    # Internal -- handle data as far as reasonable.  May leave state
132    # and data to be processed by a subsequent call.  If 'end' is
133    # true, force handling all data as if followed by EOF marker.
134    def goahead(self, end):
135        rawdata = self.rawdata
136        i = 0
137        n = len(rawdata)
138        while i < n:
139            if self.convert_charrefs and not self.cdata_elem:
140                j = rawdata.find('<', i)
141                if j < 0:
142                    # if we can't find the next <, either we are at the end
143                    # or there's more text incoming.  If the latter is True,
144                    # we can't pass the text to handle_data in case we have
145                    # a charref cut in half at end.  Try to determine if
146                    # this is the case before proceeding by looking for an
147                    # & near the end and see if it's followed by a space or ;.
148                    amppos = rawdata.rfind('&', max(i, n-34))
149                    if (amppos >= 0 and
150                        not re.compile(r'[\s;]').search(rawdata, amppos)):
151                        break  # wait till we get all the text
152                    j = n
153            else:
154                match = self.interesting.search(rawdata, i)  # < or &
155                if match:
156                    j = match.start()
157                else:
158                    if self.cdata_elem:
159                        break
160                    j = n
161            if i < j:
162                if self.convert_charrefs and not self.cdata_elem:
163                    self.handle_data(unescape(rawdata[i:j]))
164                else:
165                    self.handle_data(rawdata[i:j])
166            i = self.updatepos(i, j)
167            if i == n: break
168            startswith = rawdata.startswith
169            if startswith('<', i):
170                if starttagopen.match(rawdata, i): # < + letter
171                    k = self.parse_starttag(i)
172                elif startswith("</", i):
173                    k = self.parse_endtag(i)
174                elif startswith("<!--", i):
175                    k = self.parse_comment(i)
176                elif startswith("<?", i):
177                    k = self.parse_pi(i)
178                elif startswith("<!", i):
179                    k = self.parse_html_declaration(i)
180                elif (i + 1) < n:
181                    self.handle_data("<")
182                    k = i + 1
183                else:
184                    break
185                if k < 0:
186                    if not end:
187                        break
188                    k = rawdata.find('>', i + 1)
189                    if k < 0:
190                        k = rawdata.find('<', i + 1)
191                        if k < 0:
192                            k = i + 1
193                    else:
194                        k += 1
195                    if self.convert_charrefs and not self.cdata_elem:
196                        self.handle_data(unescape(rawdata[i:k]))
197                    else:
198                        self.handle_data(rawdata[i:k])
199                i = self.updatepos(i, k)
200            elif startswith("&#", i):
201                match = charref.match(rawdata, i)
202                if match:
203                    name = match.group()[2:-1]
204                    self.handle_charref(name)
205                    k = match.end()
206                    if not startswith(';', k-1):
207                        k = k - 1
208                    i = self.updatepos(i, k)
209                    continue
210                else:
211                    if ";" in rawdata[i:]:  # bail by consuming &#
212                        self.handle_data(rawdata[i:i+2])
213                        i = self.updatepos(i, i+2)
214                    break
215            elif startswith('&', i):
216                match = entityref.match(rawdata, i)
217                if match:
218                    name = match.group(1)
219                    self.handle_entityref(name)
220                    k = match.end()
221                    if not startswith(';', k-1):
222                        k = k - 1
223                    i = self.updatepos(i, k)
224                    continue
225                match = incomplete.match(rawdata, i)
226                if match:
227                    # match.group() will contain at least 2 chars
228                    if end and match.group() == rawdata[i:]:
229                        k = match.end()
230                        if k <= i:
231                            k = n
232                        i = self.updatepos(i, i + 1)
233                    # incomplete
234                    break
235                elif (i + 1) < n:
236                    # not the end of the buffer, and can't be confused
237                    # with some other construct
238                    self.handle_data("&")
239                    i = self.updatepos(i, i + 1)
240                else:
241                    break
242            else:
243                assert 0, "interesting.search() lied"
244        # end while
245        if end and i < n and not self.cdata_elem:
246            if self.convert_charrefs and not self.cdata_elem:
247                self.handle_data(unescape(rawdata[i:n]))
248            else:
249                self.handle_data(rawdata[i:n])
250            i = self.updatepos(i, n)
251        self.rawdata = rawdata[i:]
253    # Internal -- parse html declarations, return length or -1 if not terminated
254    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
255    # See also parse_declaration in _markupbase
256    def parse_html_declaration(self, i):
257        rawdata = self.rawdata
258        assert rawdata[i:i+2] == '<!', ('unexpected call to '
259                                        'parse_html_declaration()')
260        if rawdata[i:i+4] == '<!--':
261            # this case is actually already handled in goahead()
262            return self.parse_comment(i)
263        elif rawdata[i:i+3] == '<![':
264            return self.parse_marked_section(i)
265        elif rawdata[i:i+9].lower() == '<!doctype':
266            # find the closing >
267            gtpos = rawdata.find('>', i+9)
268            if gtpos == -1:
269                return -1
270            self.handle_decl(rawdata[i+2:gtpos])
271            return gtpos+1
272        else:
273            return self.parse_bogus_comment(i)
275    # Internal -- parse bogus comment, return length or -1 if not terminated
276    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
277    def parse_bogus_comment(self, i, report=1):
278        rawdata = self.rawdata
279        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
280                                                'parse_comment()')
281        pos = rawdata.find('>', i+2)
282        if pos == -1:
283            return -1
284        if report:
285            self.handle_comment(rawdata[i+2:pos])
286        return pos + 1
288    # Internal -- parse processing instr, return end or -1 if not terminated
289    def parse_pi(self, i):
290        rawdata = self.rawdata
291        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
292        match = piclose.search(rawdata, i+2) # >
293        if not match:
294            return -1
295        j = match.start()
296        self.handle_pi(rawdata[i+2: j])
297        j = match.end()
298        return j
300    # Internal -- handle starttag, return end or -1 if not terminated
301    def parse_starttag(self, i):
302        self.__starttag_text = None
303        endpos = self.check_for_whole_start_tag(i)
304        if endpos < 0:
305            return endpos
306        rawdata = self.rawdata
307        self.__starttag_text = rawdata[i:endpos]
309        # Now parse the data between i+1 and j into a tag and attrs
310        attrs = []
311        match = tagfind_tolerant.match(rawdata, i+1)
312        assert match, 'unexpected call to parse_starttag()'
313        k = match.end()
314        self.lasttag = tag = match.group(1).lower()
315        while k < endpos:
316            m = attrfind_tolerant.match(rawdata, k)
317            if not m:
318                break
319            attrname, rest, attrvalue = m.group(1, 2, 3)
320            if not rest:
321                attrvalue = None
322            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
323                 attrvalue[:1] == '"' == attrvalue[-1:]:
324                attrvalue = attrvalue[1:-1]
325            if attrvalue:
326                attrvalue = unescape(attrvalue)
327            attrs.append((attrname.lower(), attrvalue))
328            k = m.end()
330        end = rawdata[k:endpos].strip()
331        if end not in (">", "/>"):
332            lineno, offset = self.getpos()
333            if "\n" in self.__starttag_text:
334                lineno = lineno + self.__starttag_text.count("\n")
335                offset = len(self.__starttag_text) \
336                         - self.__starttag_text.rfind("\n")
337            else:
338                offset = offset + len(self.__starttag_text)
339            self.handle_data(rawdata[i:endpos])
340            return endpos
341        if end.endswith('/>'):
342            # XHTML-style empty tag: <span attr="value" />
343            self.handle_startendtag(tag, attrs)
344        else:
345            self.handle_starttag(tag, attrs)
346            if tag in self.CDATA_CONTENT_ELEMENTS:
347                self.set_cdata_mode(tag)
348        return endpos
350    # Internal -- check to see if we have a complete starttag; return end
351    # or -1 if incomplete.
352    def check_for_whole_start_tag(self, i):
353        rawdata = self.rawdata
354        m = locatestarttagend_tolerant.match(rawdata, i)
355        if m:
356            j = m.end()
357            next = rawdata[j:j+1]
358            if next == ">":
359                return j + 1
360            if next == "/":
361                if rawdata.startswith("/>", j):
362                    return j + 2
363                if rawdata.startswith("/", j):
364                    # buffer boundary
365                    return -1
366                # else bogus input
367                if j > i:
368                    return j
369                else:
370                    return i + 1
371            if next == "":
372                # end of input
373                return -1
374            if next in ("abcdefghijklmnopqrstuvwxyz=/"
375                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
376                # end of input in or before attribute value, or we have the
377                # '/' from a '/>' ending
378                return -1
379            if j > i:
380                return j
381            else:
382                return i + 1
383        raise AssertionError("we should not get here!")
385    # Internal -- parse endtag, return end or -1 if incomplete
386    def parse_endtag(self, i):
387        rawdata = self.rawdata
388        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
389        match = endendtag.search(rawdata, i+1) # >
390        if not match:
391            return -1
392        gtpos = match.end()
393        match = endtagfind.match(rawdata, i) # </ + tag + >
394        if not match:
395            if self.cdata_elem is not None:
396                self.handle_data(rawdata[i:gtpos])
397                return gtpos
398            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
399            namematch = tagfind_tolerant.match(rawdata, i+2)
400            if not namematch:
401                # w3.org/TR/html5/tokenization.html#end-tag-open-state
402                if rawdata[i:i+3] == '</>':
403                    return i+3
404                else:
405                    return self.parse_bogus_comment(i)
406            tagname = namematch.group(1).lower()
407            # consume and ignore other stuff between the name and the >
408            # Note: this is not 100% correct, since we might have things like
409            # </tag attr=">">, but looking for > after tha name should cover
410            # most of the cases and is much simpler
411            gtpos = rawdata.find('>', namematch.end())
412            self.handle_endtag(tagname)
413            return gtpos+1
415        elem = match.group(1).lower() # script or style
416        if self.cdata_elem is not None:
417            if elem != self.cdata_elem:
418                self.handle_data(rawdata[i:gtpos])
419                return gtpos
421        self.handle_endtag(elem)
422        self.clear_cdata_mode()
423        return gtpos
425    # Overridable -- finish processing of start+end tag: <tag.../>
426    def handle_startendtag(self, tag, attrs):
427        self.handle_starttag(tag, attrs)
428        self.handle_endtag(tag)
430    # Overridable -- handle start tag
431    def handle_starttag(self, tag, attrs):
432        pass
434    # Overridable -- handle end tag
435    def handle_endtag(self, tag):
436        pass
438    # Overridable -- handle character reference
439    def handle_charref(self, name):
440        pass
442    # Overridable -- handle entity reference
443    def handle_entityref(self, name):
444        pass
446    # Overridable -- handle data
447    def handle_data(self, data):
448        pass
450    # Overridable -- handle comment
451    def handle_comment(self, data):
452        pass
454    # Overridable -- handle declaration
455    def handle_decl(self, decl):
456        pass
458    # Overridable -- handle processing instruction
459    def handle_pi(self, data):
460        pass
462    def unknown_decl(self, data):
463        pass
465    # Internal -- helper to remove special character quoting
466    def unescape(self, s):
467        warnings.warn('The unescape method is deprecated and will be removed '
468                      'in 3.5, use html.unescape() instead.',
469                      DeprecationWarning, stacklevel=2)
470        return unescape(s)