"""Tests to ensure that the html5lib tree builder generates good trees."""

import warnings

# html5lib is an optional dependency: record whether its tree builder can be
# imported so the whole test class can be skipped when it is missing.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # The old ``except ImportError, e`` spelling is a syntax error on
    # Python 3, and the bound exception was never used anyway.
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return a fresh ``HTML5TreeBuilder`` instance."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        """Passing ``parse_only`` still parses everything, but warns."""
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            # Make sure the warning is recorded regardless of any filters
            # installed by the test runner or the command line; the filter
            # state is restored when the context manager exits.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        # The strainer is ignored, so the full document comes back.
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # assertTrue(... in ...) is kept instead of assertIn so the test
        # does not require a newer unittest API than the rest of the file.
        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        # Both the outer and the nested table gain a <tbody>, and the
        # unclosed outer <tr>/<table> are closed.
        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        # A table that already has explicit <thead>/<tbody>/<tfoot> is
        # presumably expected to come back unchanged (single-argument form;
        # see ``SoupTest.assertSoupEquals`` to confirm).
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype must not break the tree."""
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is
        # connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        """A mis-nested <em> spanning two <p> tags keeps both <p> tags."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(
            u"<body><p><em>foo</em></p><em>\n</em>"
            u"<p><em>bar<a></a></em></p></body>",
            soup.body.decode())
        # Both paragraphs must still be findable after the <em> is split up.
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as ``test_reparented_markup`` but with a trailing newline."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(
            u"<body><p><em>foo</em></p><em>\n</em>"
            u"<p><em>bar<a></a></em></p>\n</body>",
            soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))