"""Base tree-builder classes and the registry used to pick one by feature."""

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'

# ``basestring`` only exists on Python 2. Fall back to ``str`` so that
# string detection also works when this module runs unmodified on Python 3.
try:
    _string_types = basestring
except NameError:
    _string_types = str


class TreeBuilderRegistry(object):
    """Keeps track of tree builders and finds one by advertised feature."""

    def __init__(self):
        # Maps a feature name to the builders that advertise it, most
        # recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned. Otherwise features are considered in the order given;
        a feature that no registered builder advertises is ignored, and
        every other feature narrows the set of acceptable builders.
        Among the survivors, the winner is the builder that comes first
        in the list for the first feature that had any builders.

        Returns None when nothing is registered, when none of the
        requested features is advertised by any builder, or when no
        single builder advertises all of the recognized features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # .get() rather than indexing, so that asking about an unknown
            # feature does not add an empty entry to the defaultdict.
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if not we_have_the_feature:
                continue
            if candidates is None:
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        self.soup = None

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse the given markup. Subclasses must override this."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return the markup unchanged as the first item of a 4-tuple.

        The base implementation ignores both encoding arguments and
        returns (markup, None, None, False).
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Base implementation: performs no substitution, returns False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and returns it. An empty or None
        `attrs` is returned untouched.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Iterate over a snapshot of the keys, since values are
            # reassigned while looping.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, _string_types):
                        attrs[attr] = whitespace_re.split(value)
                    # Otherwise: html5lib sometimes calls setAttributes
                    # twice for the same tag when rearranging the parse
                    # tree. On the second call the attribute value
                    # here is already a list. If this happens,
                    # leave the value alone rather than trying to
                    # split it again.
        return attrs

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # Each attribute key is indexed with [1] to obtain the name that
        # is passed on to the soup object.
        attrs = dict((key[1], value) for key, value in attrs.items())
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding declarations on a <meta> tag with stand-ins.

        Returns True only when a "charset" attribute was replaced.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): this branch substitutes the attribute but
            # leaves meta_encoding unset, so the method still returns
            # False here -- confirm that callers expect this.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)

def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder subclass
    is set as an attribute of this module, added to this module's
    `__all__` (once), and registered with `builder_registry`. Entries
    that are not classes, or not TreeBuilder subclasses, are skipped.
    """
    # Look this module up under its actual import name rather than a
    # hard-coded one, so this keeps working if the package is relocated.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for non-class objects, so make
        # sure we really have a class before asking.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        if name not in this_module.__all__:
            this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)

class ParserRejectedMarkup(Exception):
    """Exception type for markup a parser will not accept.

    NOTE(review): nothing in this module raises it; presumably the
    builder submodules imported below do -- confirm.
    """

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass