"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import warnings

from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'


class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or an object with a read() method
            whose return value is used as the markup.
        :param features: A feature name, or a list of feature names,
            passed to builder_registry.lookup() when no builder is
            given. Defaults to DEFAULT_BUILDER_FEATURES.
        :param builder: A tree builder instance to use. If None, one
            is looked up from `features`.
        :param parse_only: Stored as self.parse_only and consulted by
            endData() and handle_starttag() to decide whether
            top-level strings and tags are kept (presumably a
            SoupStrainer -- confirm against callers).
        :param from_encoding: Passed through to the builder's
            prepare_markup().
        :param kwargs: Only deprecated BS3 argument names are
            accepted; each one triggers a warning.

        :raises TypeError: If an unrecognized keyword argument is given.
        :raises FeatureNotFound: If no tree builder matches `features`.
        """

        if 'convertEntities' in kwargs:
            # Drop the ignored argument, as is done for the other
            # deprecated arguments below; otherwise the "unexpected
            # keyword argument" check would reject it right after
            # this warning.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed argument out of kwargs, warning about it.

            Returns the value passed under `old_name`, or None if the
            old name was not used.
            """
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        # The new-style argument wins; the old name is only consulted
        # (and removed from kwargs) when the new one is falsy.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is an argument we don't know.
        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                    or (isinstance(markup, unicode) and not u' ' in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        # The builder may offer several candidate (markup, encoding)
        # combinations. Try each in turn, starting from a fresh tree,
        # and stop at the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
            self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run self.markup through the builder and close the tree."""
        # Reset the builder before handing it the markup.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag.

        Resets the builder, clears the pending-data buffer and both
        tag stacks, and pushes this object as the only open tag.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Not supported on the root object; always raises."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the root object; always raises."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Remove the top tag from the open-tag stack.

        Returns self.currentTag as it stands after the pop, i.e. the
        new top of the stack when the stack is non-empty (not the tag
        that was removed).
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it current.

        Tags whose name is in the builder's preserve_whitespace_tags
        are also recorded on preserve_whitespace_tag_stack.
        """
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Flush the buffered character data into the tree.

        The pieces collected by handle_data() are joined, wrapped in
        `containerClass` and passed to object_was_parsed(). Does
        nothing when no data is pending.
        """
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        `parent` defaults to the current tag and `most_recent_element`
        to the element most recently added to the tree. `o` is linked
        after that element and appended to the parent's contents.
        """
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None if
        nothing was popped."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to (but excluding)
        # index 0, which holds the root object.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        # Only top-level tags are filtered by parse_only.
        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data and close the most recent matching tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a piece of character data until endData() is called."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For XML documents the output is prefixed with an XML
        declaration, which names `eventual_encoding` unless it is None.
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        # The parent decode() takes an indent level, not a flag:
        # None for no pretty-printing, 0 to start indenting at the root.
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML soup.

        Any `features` keyword supplied by the caller is overridden
        with 'xml'.
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no tree builder
    offers the requested features."""
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # Parenthesized single argument: prints the same on Python 2 and
    # is also valid syntax on Python 3.
    print(soup.prettify())