"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


try:
    import markupbase
except ImportError:
    # Python 3 renamed the module to _markupbase.
    import _markupbase as markupbase
import re

try:
    _unichr = unichr
except NameError:
    # Python 3 has no unichr(); chr() returns text there.
    _unichr = chr

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
tagfind = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')

attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg -- the error message (must be non-empty)
        lineno -- first element of *position*, or None
        offset -- second element of *position* (0-based column), or None
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line/column when they are known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is stored 0-based; report it 1-based
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is scanned only for their own end tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Switch to CDATA mode for *elem*.

        From now on the only "interesting" construct is the (case
        insensitive) end tag of *elem*.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        # rawdata is not rebound inside the loop, so bind the method once
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # in CDATA mode keep buffering until the end tag shows up
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # construct is not terminated
                    if not end:
                        break
                    # at EOF: emit the unterminated construct as plain data
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep whatever was not consumed for the next call
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return end index or -1 if not
    # terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return end index or -1 if not
    # terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk between the attributes and the end of the tag:
            # report the whole construct as plain data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside script/style everything but the real end tag is data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for some other element is just CDATA content
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None

    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        References that cannot be resolved (unknown entity names, numeric
        references that are malformed or out of range) are left in the
        text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return _unichr(c)
                except (ValueError, OverflowError):
                    # OverflowError: the number does not even fit the C
                    # integer the character constructor accepts.
                    return '&#' + ref + ';'
            # Cannot use name2codepoint directly, because HTMLParser supports
            # apos, which is not part of HTML 4
            if HTMLParser.entitydefs is None:
                try:
                    from htmlentitydefs import name2codepoint
                except ImportError:
                    # Python 3 location of the same table
                    from html.entities import name2codepoint
                entitydefs = {'apos': u"'"}
                for k, v in name2codepoint.items():
                    entitydefs[k] = _unichr(v)
                HTMLParser.entitydefs = entitydefs
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s)