# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import logging
import sys
import tempfile
import unittest
import warnings

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)

# Record whether the lxml-based tree builders can be imported.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

PYTHON_2_PRE_2_7 = (sys.version_info < (2, 7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))


class TestConstructor(SoupTest):
    """Tests of markup passed directly into the constructor."""

    def test_short_unicode_input(self):
        data = u"<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual(u"éé", soup.h1.string)

    def test_embedded_null(self):
        data = u"<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual(u"foo\0bar", soup.h1.string)


class TestDeprecatedConstructorArguments(SoupTest):
    """Old-style keyword arguments still work but issue a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)


class TestWarnings(SoupTest):
    """Tests of the warnings issued for markup that looks like a filename or URL."""

    def test_disk_file_warning(self):
        # The temporary file exists only inside the `with` block; leaving
        # the block closes (and thereby removes) it.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)

        # The file no longer exists, so Beautiful Soup will no longer
        # issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertTrue("looks like a URL" in msg)

        # A URL followed by other text does not trigger the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))


class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document with a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both kinds of quote present, the value is wrapped in
        # double quotes and the embedded double quotes become &quot;.
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that begins
        # an existing entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # The *_containing_entities variant leaves existing entity
        # references alone and escapes only the bare ampersand.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, unicode))
            self.assertEqual(
                unicode_output, self.document_for(ascii_markup.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Always restore logging and the real chardet hook so other
            # tests are unaffected.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        # The charset attribute may be double-quoted, single-quoted or
        # unquoted, with or without a space before the closing slash.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Always restore logging and the real chardet hook so other
            # tests are unaffected.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"

        # But if we run it through detwingle, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            u"\U00010413", # 4-byte char '\xf0\x90\x90\x93' (a Deseret letter)
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)


class TestNamedspacedAttribute(SoupTest):
    """Tests of how NamespacedAttribute objects compare to strings and each other."""

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of attribute values whose charset is rewritten on encode()."""

    def test_charset_meta_attribute_value(self):
        # This method must not share a name with the
        # ContentMetaAttributeValue test below, or the later definition
        # replaces it and this test never runs.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))