# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.

The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.

Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""

import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry,
    HTMLParserTreeBuilder,
)
from bs4.element import (
    CData,
    Comment,
    Doctype,
    NavigableString,
    SoupStrainer,
    Tag,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)


class TreeTest(SoupTest):
    """Base class adding assertion helpers for lists of selected tags."""

    def assertSelects(self, tags, should_match):
        """Make sure that the given tags have the correct text.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        self.assertEqual([tag.string for tag in tags], should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Make sure that the given tags have the correct IDs.

        This is used in tests that define a bunch of tags, each
        carrying an 'id' attribute, and then select certain tags by
        some mechanism.
        """
        self.assertEqual([tag['id'] for tag in tags], should_match)


class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        self.assertEqual(soup.find("b").string, "2")

    def test_unicode_text_find(self):
        soup = self.soup(u'<h1>Räksmörgås</h1>')
        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        self.assertEqual(2, len(soup.find_all()))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        self.assertEqual(2, len(soup.find_all('a')))


class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # Match any of a number of strings.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')),
                         [u"Foo", u"bar", u'\xbb'])
        # Match anything.
        self.assertEqual(soup.find_all(text=True),
                         [u"Foo", u"bar", u'\xbb'])

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        self.assertSelects(
            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])

        # A limit of 0 means no limit.
        self.assertSelects(
            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])

    def test_calling_a_tag_is_calling_findall(self):
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(soup('a', limit=1), ["1"])
        self.assertSelects(soup.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Create a self-referential list.
        l = []
        l.append(l)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(l))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        result = soup.find_all("a")
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(True)
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(text="foo")
        self.assertTrue(hasattr(result, "source"))


class TestFindAllBasicNamespaces(TreeTest):
    """Test searching for tag and attribute names that contain a colon."""

    def test_find_by_namespaced_name(self):
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        self.assertEqual("4", soup.find("mathml:msqrt").string)
        self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)


class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class (not TreeTest) so no setUp in the MRO is skipped.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])


class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by attribute value."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        peace = u"םולש".encode("utf8")
        data = u'<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))


class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        div = tree.div
        for i, element in enumerate(div.contents):
            self.assertEqual(i, div.index(element))
        self.assertRaises(ValueError, tree.index, 1)


class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        self.assertEqual(self.start.parent['id'], 'bottom')
        self.assertEqual(self.start.parent.parent['id'], 'middle')
        self.assertEqual(self.start.parent.parent.parent['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        top_tag = self.tree.contents[0]
        self.assertEqual(top_tag.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        self.assertSelectsIDs(
            self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
        self.assertSelectsIDs(
            self.start.find_parents('ul', id="middle"), ['middle'])

    def test_find_parent(self):
        self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
        self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')

    def test_parent_of_text_element(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.parent.name, 'b')

    def test_text_element_find_parent(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        parents = [parent['id'] for parent in self.start.parents
                   if parent is not None and 'id' in parent.attrs]
        self.assertEqual(parents, ['bottom', 'middle', 'top'])


class ProximityTest(TreeTest):
    """Shared fixture for the next_element/previous_element tests."""

    def setUp(self):
        # Name this class (not TreeTest) so no setUp in the MRO is skipped.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')


class TestNextOperations(ProximityTest):
    """Test forward navigation, starting from the first <b> tag."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = [node for node in start.next_elements]
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")


class TestPreviousOperations(ProximityTest):
    """Test backward navigation, starting from the "Three" text node."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        self.assertEqual(self.end.previous_element['id'], "3")
        self.assertEqual(self.end.previous_element.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        first = self.tree.find('html')
        self.assertEqual(first.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        self.assertSelects(
            self.end.find_all_previous('b'), ["Three", "Two", "One"])
        self.assertSelects(self.end.find_all_previous(id=1), ["One"])

    def test_find_previous(self):
        self.assertEqual(self.end.find_previous('b')['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        text = self.tree.find(text="Three")
        self.assertEqual(text.find_previous("b").string, "Three")
        self.assertSelects(
            text.find_all_previous("b"), ["Three", "Two", "One"])

    def test_previous_generator(self):
        start = self.tree.find(text="One")
        predecessors = [node for node in start.previous_elements]

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = predecessors
        self.assertEqual(b['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")


class SiblingTest(TreeTest):
    """Shared fixture for the next_sibling/previous_sibling tests."""

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.  (Raw string: "\s" is not a valid
        # escape in an ordinary string literal.)
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)


class TestNextSibling(SiblingTest):
    """Test forward sibling navigation, starting from the span with id 1."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        self.assertEqual(self.start.next_sibling['id'], '2')
        self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.next_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.next_sibling, None)

        last_span = self.tree.find(id="4")
        self.assertEqual(last_span.next_sibling, None)

    def test_find_next_sibling(self):
        self.assertEqual(self.start.find_next_sibling('span')['id'], '2')

    def test_next_siblings(self):
        self.assertSelectsIDs(self.start.find_next_siblings("span"),
                              ['2', '3', '4'])

        self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])

    def test_next_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="Foo")
        self.assertEqual(start.next_sibling.name, 'b')
        self.assertEqual(start.next_sibling.next_sibling, 'baz')

        self.assertSelects(start.find_next_siblings('b'), ['bar'])
        self.assertEqual(start.find_next_sibling(text="baz"), "baz")
        self.assertEqual(start.find_next_sibling(text="nonesuch"), None)


class TestPreviousSibling(SiblingTest):
    """Test backward sibling navigation, starting from the span with id 4."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        self.assertEqual(self.end.previous_sibling['id'], '3')
        self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.previous_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.previous_sibling, None)

        first_span = self.tree.find(id="1")
        self.assertEqual(first_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')

    def test_previous_siblings(self):
        self.assertSelectsIDs(self.end.find_previous_siblings("span"),
                              ['3', '2', '1'])

        self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])

    def test_previous_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="baz")
        self.assertEqual(start.previous_sibling.name, 'b')
        self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(start.find_previous_siblings('b'), ['bar'])
        self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)


class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz")
        self.assertTrue(isinstance(new_tag, Tag))
        self.assertEqual("foo", new_tag.name)
        self.assertEqual(dict(bar="baz"), new_tag.attrs)
        self.assertEqual(None, new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertTrue(isinstance(s, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertTrue(isinstance(s, Comment))


class TestTreeModification(SoupTest):
    """Test operations that change the tree after it has been parsed."""

    def test_attribute_modification(self):
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del(soup.a['id'])
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        # The second line of the markup starts at column 0 on purpose:
        # the expected output below has only a newline between the two
        # paragraphs.
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
<p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_tag_with_itself(self):
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replaceWith('')
        right.replaceWith('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        # Moving the first <b> tag leaves the text on either side of it
        # adjacent, hence the two spaces after "There's".
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        # Removing the <div> leaves the text on either side of it
        # adjacent, hence the two spaces after "Some content.".
        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
1040 self.assertEqual(len(soup.body.contents), 2) 1041 self.assertEqual(extracted.parent, None) 1042 self.assertEqual(extracted.previous_element, None) 1043 self.assertEqual(extracted.next_element.next_element, None) 1044 1045 # The gap where the extracted tag used to be has been mended. 1046 content_1 = soup.find(text="Some content. ") 1047 content_2 = soup.find(text=" More content.") 1048 self.assertEqual(content_1.next_element, content_2) 1049 self.assertEqual(content_1.next_sibling, content_2) 1050 self.assertEqual(content_2.previous_element, content_1) 1051 self.assertEqual(content_2.previous_sibling, content_1) 1052 1053 def test_extract_distinguishes_between_identical_strings(self): 1054 soup = self.soup("<a>foo</a><b>bar</b>") 1055 foo_1 = soup.a.string 1056 bar_1 = soup.b.string 1057 foo_2 = soup.new_string("foo") 1058 bar_2 = soup.new_string("bar") 1059 soup.a.append(foo_2) 1060 soup.b.append(bar_2) 1061 1062 # Now there are two identical strings in the <a> tag, and two 1063 # in the <b> tag. Let's remove the first "foo" and the second 1064 # "bar". 
1065 foo_1.extract() 1066 bar_2.extract() 1067 self.assertEqual(foo_2, soup.a.string) 1068 self.assertEqual(bar_2, soup.b.string) 1069 1070 def test_clear(self): 1071 """Tag.clear()""" 1072 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") 1073 # clear using extract() 1074 a = soup.a 1075 soup.p.clear() 1076 self.assertEqual(len(soup.p.contents), 0) 1077 self.assertTrue(hasattr(a, "contents")) 1078 1079 # clear using decompose() 1080 em = a.em 1081 a.clear(decompose=True) 1082 self.assertEqual(0, len(em.contents)) 1083 1084 def test_string_set(self): 1085 """Tag.string = 'string'""" 1086 soup = self.soup("<a></a> <b><c></c></b>") 1087 soup.a.string = "foo" 1088 self.assertEqual(soup.a.contents, ["foo"]) 1089 soup.b.string = "bar" 1090 self.assertEqual(soup.b.contents, ["bar"]) 1091 1092 def test_string_set_does_not_affect_original_string(self): 1093 soup = self.soup("<a><b>foo</b><c>bar</c>") 1094 soup.b.string = soup.c.string 1095 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") 1096 1097 def test_set_string_preserves_class_of_string(self): 1098 soup = self.soup("<a></a>") 1099 cdata = CData("foo") 1100 soup.a.string = cdata 1101 self.assertTrue(isinstance(soup.a.string, CData)) 1102 1103class TestElementObjects(SoupTest): 1104 """Test various features of element objects.""" 1105 1106 def test_len(self): 1107 """The length of an element is its number of children.""" 1108 soup = self.soup("<top>1<b>2</b>3</top>") 1109 1110 # The BeautifulSoup object itself contains one element: the 1111 # <top> tag. 1112 self.assertEqual(len(soup.contents), 1) 1113 self.assertEqual(len(soup), 1) 1114 1115 # The <top> tag contains three elements: the text node "1", the 1116 # <b> tag, and the text node "3". 
1117 self.assertEqual(len(soup.top), 3) 1118 self.assertEqual(len(soup.top.contents), 3) 1119 1120 def test_member_access_invokes_find(self): 1121 """Accessing a Python member .foo invokes find('foo')""" 1122 soup = self.soup('<b><i></i></b>') 1123 self.assertEqual(soup.b, soup.find('b')) 1124 self.assertEqual(soup.b.i, soup.find('b').find('i')) 1125 self.assertEqual(soup.a, None) 1126 1127 def test_deprecated_member_access(self): 1128 soup = self.soup('<b><i></i></b>') 1129 with warnings.catch_warnings(record=True) as w: 1130 tag = soup.bTag 1131 self.assertEqual(soup.b, tag) 1132 self.assertEqual( 1133 '.bTag is deprecated, use .find("b") instead.', 1134 str(w[0].message)) 1135 1136 def test_has_attr(self): 1137 """has_attr() checks for the presence of an attribute. 1138 1139 Please note note: has_attr() is different from 1140 __in__. has_attr() checks the tag's attributes and __in__ 1141 checks the tag's chidlren. 1142 """ 1143 soup = self.soup("<foo attr='bar'>") 1144 self.assertTrue(soup.foo.has_attr('attr')) 1145 self.assertFalse(soup.foo.has_attr('attr2')) 1146 1147 1148 def test_attributes_come_out_in_alphabetical_order(self): 1149 markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' 1150 self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') 1151 1152 def test_string(self): 1153 # A tag that contains only a text node makes that node 1154 # available as .string. 1155 soup = self.soup("<b>foo</b>") 1156 self.assertEqual(soup.b.string, 'foo') 1157 1158 def test_empty_tag_has_no_string(self): 1159 # A tag with no children has no .stirng. 1160 soup = self.soup("<b></b>") 1161 self.assertEqual(soup.b.string, None) 1162 1163 def test_tag_with_multiple_children_has_no_string(self): 1164 # A tag with no children has no .string. 
1165 soup = self.soup("<a>foo<b></b><b></b></b>") 1166 self.assertEqual(soup.b.string, None) 1167 1168 soup = self.soup("<a>foo<b></b>bar</b>") 1169 self.assertEqual(soup.b.string, None) 1170 1171 # Even if all the children are strings, due to trickery, 1172 # it won't work--but this would be a good optimization. 1173 soup = self.soup("<a>foo</b>") 1174 soup.a.insert(1, "bar") 1175 self.assertEqual(soup.a.string, None) 1176 1177 def test_tag_with_recursive_string_has_string(self): 1178 # A tag with a single child which has a .string inherits that 1179 # .string. 1180 soup = self.soup("<a><b>foo</b></a>") 1181 self.assertEqual(soup.a.string, "foo") 1182 self.assertEqual(soup.string, "foo") 1183 1184 def test_lack_of_string(self): 1185 """Only a tag containing a single text node has a .string.""" 1186 soup = self.soup("<b>f<i>e</i>o</b>") 1187 self.assertFalse(soup.b.string) 1188 1189 soup = self.soup("<b></b>") 1190 self.assertFalse(soup.b.string) 1191 1192 def test_all_text(self): 1193 """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" 1194 soup = self.soup("<a>a<b>r</b> <r> t </r></a>") 1195 self.assertEqual(soup.a.text, "ar t ") 1196 self.assertEqual(soup.a.get_text(strip=True), "art") 1197 self.assertEqual(soup.a.get_text(","), "a,r, , t ") 1198 self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") 1199 1200 def test_get_text_ignores_comments(self): 1201 soup = self.soup("foo<!--IGNORE-->bar") 1202 self.assertEqual(soup.get_text(), "foobar") 1203 1204 self.assertEqual( 1205 soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") 1206 self.assertEqual( 1207 soup.get_text(types=None), "fooIGNOREbar") 1208 1209 def test_all_strings_ignores_comments(self): 1210 soup = self.soup("foo<!--IGNORE-->bar") 1211 self.assertEqual(['foo', 'bar'], list(soup.strings)) 1212 1213class TestCDAtaListAttributes(SoupTest): 1214 1215 """Testing cdata-list attributes like 'class'. 
1216 """ 1217 def test_single_value_becomes_list(self): 1218 soup = self.soup("<a class='foo'>") 1219 self.assertEqual(["foo"],soup.a['class']) 1220 1221 def test_multiple_values_becomes_list(self): 1222 soup = self.soup("<a class='foo bar'>") 1223 self.assertEqual(["foo", "bar"], soup.a['class']) 1224 1225 def test_multiple_values_separated_by_weird_whitespace(self): 1226 soup = self.soup("<a class='foo\tbar\nbaz'>") 1227 self.assertEqual(["foo", "bar", "baz"],soup.a['class']) 1228 1229 def test_attributes_joined_into_string_on_output(self): 1230 soup = self.soup("<a class='foo\tbar'>") 1231 self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) 1232 1233 def test_accept_charset(self): 1234 soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') 1235 self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) 1236 1237 def test_cdata_attribute_applying_only_to_one_tag(self): 1238 data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' 1239 soup = self.soup(data) 1240 # We saw in another test that accept-charset is a cdata-list 1241 # attribute for the <form> tag. But it's not a cdata-list 1242 # attribute for any other tag. 1243 self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) 1244 1245 def test_string_has_immutable_name_property(self): 1246 string = self.soup("s").string 1247 self.assertEqual(None, string.name) 1248 def t(): 1249 string.name = 'foo' 1250 self.assertRaises(AttributeError, t) 1251 1252class TestPersistence(SoupTest): 1253 "Testing features like pickle and deepcopy." 
1254 1255 def setUp(self): 1256 super(TestPersistence, self).setUp() 1257 self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" 1258"http://www.w3.org/TR/REC-html40/transitional.dtd"> 1259<html> 1260<head> 1261<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 1262<title>Beautiful Soup: We called him Tortoise because he taught us.</title> 1263<link rev="made" href="mailto:leonardr@segfault.org"> 1264<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> 1265<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> 1266<meta name="author" content="Leonard Richardson"> 1267</head> 1268<body> 1269<a href="foo">foo</a> 1270<a href="foo"><b>bar</b></a> 1271</body> 1272</html>""" 1273 self.tree = self.soup(self.page) 1274 1275 def test_pickle_and_unpickle_identity(self): 1276 # Pickling a tree, then unpickling it, yields a tree identical 1277 # to the original. 1278 dumped = pickle.dumps(self.tree, 2) 1279 loaded = pickle.loads(dumped) 1280 self.assertEqual(loaded.__class__, BeautifulSoup) 1281 self.assertEqual(loaded.decode(), self.tree.decode()) 1282 1283 def test_deepcopy_identity(self): 1284 # Making a deepcopy of a tree yields an identical tree. 1285 copied = copy.deepcopy(self.tree) 1286 self.assertEqual(copied.decode(), self.tree.decode()) 1287 1288 def test_unicode_pickle(self): 1289 # A tree containing Unicode characters can be pickled. 
1290 html = u"<b>\N{SNOWMAN}</b>" 1291 soup = self.soup(html) 1292 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) 1293 loaded = pickle.loads(dumped) 1294 self.assertEqual(loaded.decode(), soup.decode()) 1295 1296 1297class TestSubstitutions(SoupTest): 1298 1299 def test_default_formatter_is_minimal(self): 1300 markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1301 soup = self.soup(markup) 1302 decoded = soup.decode(formatter="minimal") 1303 # The < is converted back into < but the e-with-acute is left alone. 1304 self.assertEqual( 1305 decoded, 1306 self.document_for( 1307 u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1308 1309 def test_formatter_html(self): 1310 markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1311 soup = self.soup(markup) 1312 decoded = soup.decode(formatter="html") 1313 self.assertEqual( 1314 decoded, 1315 self.document_for("<b><<Sacré bleu!>></b>")) 1316 1317 def test_formatter_minimal(self): 1318 markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1319 soup = self.soup(markup) 1320 decoded = soup.decode(formatter="minimal") 1321 # The < is converted back into < but the e-with-acute is left alone. 1322 self.assertEqual( 1323 decoded, 1324 self.document_for( 1325 u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1326 1327 def test_formatter_null(self): 1328 markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1329 soup = self.soup(markup) 1330 decoded = soup.decode(formatter=None) 1331 # Neither the angle brackets nor the e-with-acute are converted. 1332 # This is not valid HTML, but it's what the user wanted. 
1333 self.assertEqual(decoded, 1334 self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1335 1336 def test_formatter_custom(self): 1337 markup = u"<b><foo></b><b>bar</b>" 1338 soup = self.soup(markup) 1339 decoded = soup.decode(formatter = lambda x: x.upper()) 1340 # Instead of normal entity conversion code, the custom 1341 # callable is called on every string. 1342 self.assertEqual( 1343 decoded, 1344 self.document_for(u"<b><FOO></b><b>BAR</b>")) 1345 1346 def test_formatter_is_run_on_attribute_values(self): 1347 markup = u'<a href="http://a.com?a=b&c=é">e</a>' 1348 soup = self.soup(markup) 1349 a = soup.a 1350 1351 expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>' 1352 1353 self.assertEqual(expect_minimal, a.decode()) 1354 self.assertEqual(expect_minimal, a.decode(formatter="minimal")) 1355 1356 expect_html = u'<a href="http://a.com?a=b&c=é">e</a>' 1357 self.assertEqual(expect_html, a.decode(formatter="html")) 1358 1359 self.assertEqual(markup, a.decode(formatter=None)) 1360 expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' 1361 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) 1362 1363 def test_formatter_skips_script_tag_for_html_documents(self): 1364 doc = """ 1365 <script type="text/javascript"> 1366 console.log("< < hey > > "); 1367 </script> 1368""" 1369 encoded = BeautifulSoup(doc).encode() 1370 self.assertTrue(b"< < hey > >" in encoded) 1371 1372 def test_formatter_skips_style_tag_for_html_documents(self): 1373 doc = """ 1374 <style type="text/css"> 1375 console.log("< < hey > > "); 1376 </style> 1377""" 1378 encoded = BeautifulSoup(doc).encode() 1379 self.assertTrue(b"< < hey > >" in encoded) 1380 1381 def test_prettify_leaves_preformatted_text_alone(self): 1382 soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") 1383 # Everything outside the <pre> tag is reformatted, but everything 1384 # inside is left alone. 
1385 self.assertEqual( 1386 u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', 1387 soup.div.prettify()) 1388 1389 def test_prettify_accepts_formatter(self): 1390 soup = BeautifulSoup("<html><body>foo</body></html>") 1391 pretty = soup.prettify(formatter = lambda x: x.upper()) 1392 self.assertTrue("FOO" in pretty) 1393 1394 def test_prettify_outputs_unicode_by_default(self): 1395 soup = self.soup("<a></a>") 1396 self.assertEqual(unicode, type(soup.prettify())) 1397 1398 def test_prettify_can_encode_data(self): 1399 soup = self.soup("<a></a>") 1400 self.assertEqual(bytes, type(soup.prettify("utf-8"))) 1401 1402 def test_html_entity_substitution_off_by_default(self): 1403 markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" 1404 soup = self.soup(markup) 1405 encoded = soup.b.encode("utf-8") 1406 self.assertEqual(encoded, markup.encode('utf-8')) 1407 1408 def test_encoding_substitution(self): 1409 # Here's the <meta> tag saying that a document is 1410 # encoded in Shift-JIS. 1411 meta_tag = ('<meta content="text/html; charset=x-sjis" ' 1412 'http-equiv="Content-type"/>') 1413 soup = self.soup(meta_tag) 1414 1415 # Parse the document, and the charset apprears unchanged. 1416 self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') 1417 1418 # Encode the document into some encoding, and the encoding is 1419 # substituted into the meta tag. 
1420 utf_8 = soup.encode("utf-8") 1421 self.assertTrue(b"charset=utf-8" in utf_8) 1422 1423 euc_jp = soup.encode("euc_jp") 1424 self.assertTrue(b"charset=euc_jp" in euc_jp) 1425 1426 shift_jis = soup.encode("shift-jis") 1427 self.assertTrue(b"charset=shift-jis" in shift_jis) 1428 1429 utf_16_u = soup.encode("utf-16").decode("utf-16") 1430 self.assertTrue("charset=utf-16" in utf_16_u) 1431 1432 def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): 1433 markup = ('<head><meta content="text/html; charset=x-sjis" ' 1434 'http-equiv="Content-type"/></head><pre>foo</pre>') 1435 1436 # Beautiful Soup used to try to rewrite the meta tag even if the 1437 # meta tag got filtered out by the strainer. This test makes 1438 # sure that doesn't happen. 1439 strainer = SoupStrainer('pre') 1440 soup = self.soup(markup, parse_only=strainer) 1441 self.assertEqual(soup.contents[0].name, 'pre') 1442 1443class TestEncoding(SoupTest): 1444 """Test the ability to encode objects into strings.""" 1445 1446 def test_unicode_string_can_be_encoded(self): 1447 html = u"<b>\N{SNOWMAN}</b>" 1448 soup = self.soup(html) 1449 self.assertEqual(soup.b.string.encode("utf-8"), 1450 u"\N{SNOWMAN}".encode("utf-8")) 1451 1452 def test_tag_containing_unicode_string_can_be_encoded(self): 1453 html = u"<b>\N{SNOWMAN}</b>" 1454 soup = self.soup(html) 1455 self.assertEqual( 1456 soup.b.encode("utf-8"), html.encode("utf-8")) 1457 1458 def test_encoding_substitutes_unrecognized_characters_by_default(self): 1459 html = u"<b>\N{SNOWMAN}</b>" 1460 soup = self.soup(html) 1461 self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") 1462 1463 def test_encoding_can_be_made_strict(self): 1464 html = u"<b>\N{SNOWMAN}</b>" 1465 soup = self.soup(html) 1466 self.assertRaises( 1467 UnicodeEncodeError, soup.encode, "ascii", errors="strict") 1468 1469 def test_decode_contents(self): 1470 html = u"<b>\N{SNOWMAN}</b>" 1471 soup = self.soup(html) 1472 self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) 1473 
1474 def test_encode_contents(self): 1475 html = u"<b>\N{SNOWMAN}</b>" 1476 soup = self.soup(html) 1477 self.assertEqual( 1478 u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( 1479 encoding="utf8")) 1480 1481 def test_deprecated_renderContents(self): 1482 html = u"<b>\N{SNOWMAN}</b>" 1483 soup = self.soup(html) 1484 self.assertEqual( 1485 u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) 1486 1487class TestNavigableStringSubclasses(SoupTest): 1488 1489 def test_cdata(self): 1490 # None of the current builders turn CDATA sections into CData 1491 # objects, but you can create them manually. 1492 soup = self.soup("") 1493 cdata = CData("foo") 1494 soup.insert(1, cdata) 1495 self.assertEqual(str(soup), "<![CDATA[foo]]>") 1496 self.assertEqual(soup.find(text="foo"), "foo") 1497 self.assertEqual(soup.contents[0], "foo") 1498 1499 def test_cdata_is_never_formatted(self): 1500 """Text inside a CData object is passed into the formatter. 1501 1502 But the return value is ignored. 1503 """ 1504 1505 self.count = 0 1506 def increment(*args): 1507 self.count += 1 1508 return "BITTER FAILURE" 1509 1510 soup = self.soup("") 1511 cdata = CData("<><><>") 1512 soup.insert(1, cdata) 1513 self.assertEqual( 1514 b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) 1515 self.assertEqual(1, self.count) 1516 1517 def test_doctype_ends_in_newline(self): 1518 # Unlike other NavigableString subclasses, a DOCTYPE always ends 1519 # in a newline. 
1520 doctype = Doctype("foo") 1521 soup = self.soup("") 1522 soup.insert(1, doctype) 1523 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") 1524 1525 1526class TestSoupSelector(TreeTest): 1527 1528 HTML = """ 1529<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" 1530"http://www.w3.org/TR/html4/strict.dtd"> 1531<html> 1532<head> 1533<title>The title</title> 1534<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> 1535</head> 1536<body> 1537 1538<div id="main" class="fancy"> 1539<div id="inner"> 1540<h1 id="header1">An H1</h1> 1541<p>Some text</p> 1542<p class="onep" id="p1">Some more text</p> 1543<h2 id="header2">An H2</h2> 1544<p class="class1 class2 class3" id="pmulti">Another</p> 1545<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> 1546<h2 id="header3">Another H2</h2> 1547<a id="me" href="http://simonwillison.net/" rel="me">me</a> 1548<span class="s1"> 1549<a href="#" id="s1a1">span1a1</a> 1550<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> 1551<span class="span2"> 1552<a href="#" id="s2a1">span2a1</a> 1553</span> 1554<span class="span3"></span> 1555</span> 1556</div> 1557<p lang="en" id="lang-en">English</p> 1558<p lang="en-gb" id="lang-en-gb">English UK</p> 1559<p lang="en-us" id="lang-en-us">English US</p> 1560<p lang="fr" id="lang-fr">French</p> 1561</div> 1562 1563<div id="footer"> 1564</div> 1565""" 1566 1567 def setUp(self): 1568 self.soup = BeautifulSoup(self.HTML) 1569 1570 def assertSelects(self, selector, expected_ids): 1571 el_ids = [el['id'] for el in self.soup.select(selector)] 1572 el_ids.sort() 1573 expected_ids.sort() 1574 self.assertEqual(expected_ids, el_ids, 1575 "Selector %s, expected [%s], got [%s]" % ( 1576 selector, ', '.join(expected_ids), ', '.join(el_ids) 1577 ) 1578 ) 1579 1580 assertSelect = assertSelects 1581 1582 def assertSelectMultiple(self, *tests): 1583 for selector, expected_ids in tests: 1584 self.assertSelect(selector, expected_ids) 1585 1586 def test_one_tag_one(self): 1587 
els = self.soup.select('title') 1588 self.assertEqual(len(els), 1) 1589 self.assertEqual(els[0].name, 'title') 1590 self.assertEqual(els[0].contents, [u'The title']) 1591 1592 def test_one_tag_many(self): 1593 els = self.soup.select('div') 1594 self.assertEqual(len(els), 3) 1595 for div in els: 1596 self.assertEqual(div.name, 'div') 1597 1598 def test_tag_in_tag_one(self): 1599 els = self.soup.select('div div') 1600 self.assertSelects('div div', ['inner']) 1601 1602 def test_tag_in_tag_many(self): 1603 for selector in ('html div', 'html body div', 'body div'): 1604 self.assertSelects(selector, ['main', 'inner', 'footer']) 1605 1606 def test_tag_no_match(self): 1607 self.assertEqual(len(self.soup.select('del')), 0) 1608 1609 def test_invalid_tag(self): 1610 self.assertRaises(ValueError, self.soup.select, 'tag%t') 1611 1612 def test_header_tags(self): 1613 self.assertSelectMultiple( 1614 ('h1', ['header1']), 1615 ('h2', ['header2', 'header3']), 1616 ) 1617 1618 def test_class_one(self): 1619 for selector in ('.onep', 'p.onep', 'html p.onep'): 1620 els = self.soup.select(selector) 1621 self.assertEqual(len(els), 1) 1622 self.assertEqual(els[0].name, 'p') 1623 self.assertEqual(els[0]['class'], ['onep']) 1624 1625 def test_class_mismatched_tag(self): 1626 els = self.soup.select('div.onep') 1627 self.assertEqual(len(els), 0) 1628 1629 def test_one_id(self): 1630 for selector in ('div#inner', '#inner', 'div div#inner'): 1631 self.assertSelects(selector, ['inner']) 1632 1633 def test_bad_id(self): 1634 els = self.soup.select('#doesnotexist') 1635 self.assertEqual(len(els), 0) 1636 1637 def test_items_in_id(self): 1638 els = self.soup.select('div#inner p') 1639 self.assertEqual(len(els), 3) 1640 for el in els: 1641 self.assertEqual(el.name, 'p') 1642 self.assertEqual(els[1]['class'], ['onep']) 1643 self.assertFalse(els[0].has_attr('class')) 1644 1645 def test_a_bunch_of_emptys(self): 1646 for selector in ('div#main del', 'div#main div.oops', 'div div#main'): 1647 
self.assertEqual(len(self.soup.select(selector)), 0) 1648 1649 def test_multi_class_support(self): 1650 for selector in ('.class1', 'p.class1', '.class2', 'p.class2', 1651 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): 1652 self.assertSelects(selector, ['pmulti']) 1653 1654 def test_multi_class_selection(self): 1655 for selector in ('.class1.class3', '.class3.class2', 1656 '.class1.class2.class3'): 1657 self.assertSelects(selector, ['pmulti']) 1658 1659 def test_child_selector(self): 1660 self.assertSelects('.s1 > a', ['s1a1', 's1a2']) 1661 self.assertSelects('.s1 > a span', ['s1a2s1']) 1662 1663 def test_child_selector_id(self): 1664 self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) 1665 1666 def test_attribute_equals(self): 1667 self.assertSelectMultiple( 1668 ('p[class="onep"]', ['p1']), 1669 ('p[id="p1"]', ['p1']), 1670 ('[class="onep"]', ['p1']), 1671 ('[id="p1"]', ['p1']), 1672 ('link[rel="stylesheet"]', ['l1']), 1673 ('link[type="text/css"]', ['l1']), 1674 ('link[href="blah.css"]', ['l1']), 1675 ('link[href="no-blah.css"]', []), 1676 ('[rel="stylesheet"]', ['l1']), 1677 ('[type="text/css"]', ['l1']), 1678 ('[href="blah.css"]', ['l1']), 1679 ('[href="no-blah.css"]', []), 1680 ('p[href="no-blah.css"]', []), 1681 ('[href="no-blah.css"]', []), 1682 ) 1683 1684 def test_attribute_tilde(self): 1685 self.assertSelectMultiple( 1686 ('p[class~="class1"]', ['pmulti']), 1687 ('p[class~="class2"]', ['pmulti']), 1688 ('p[class~="class3"]', ['pmulti']), 1689 ('[class~="class1"]', ['pmulti']), 1690 ('[class~="class2"]', ['pmulti']), 1691 ('[class~="class3"]', ['pmulti']), 1692 ('a[rel~="friend"]', ['bob']), 1693 ('a[rel~="met"]', ['bob']), 1694 ('[rel~="friend"]', ['bob']), 1695 ('[rel~="met"]', ['bob']), 1696 ) 1697 1698 def test_attribute_startswith(self): 1699 self.assertSelectMultiple( 1700 ('[rel^="style"]', ['l1']), 1701 ('link[rel^="style"]', ['l1']), 1702 ('notlink[rel^="notstyle"]', []), 1703 ('[rel^="notstyle"]', []), 1704 
('link[rel^="notstyle"]', []), 1705 ('link[href^="bla"]', ['l1']), 1706 ('a[href^="http://"]', ['bob', 'me']), 1707 ('[href^="http://"]', ['bob', 'me']), 1708 ('[id^="p"]', ['pmulti', 'p1']), 1709 ('[id^="m"]', ['me', 'main']), 1710 ('div[id^="m"]', ['main']), 1711 ('a[id^="m"]', ['me']), 1712 ) 1713 1714 def test_attribute_endswith(self): 1715 self.assertSelectMultiple( 1716 ('[href$=".css"]', ['l1']), 1717 ('link[href$=".css"]', ['l1']), 1718 ('link[id$="1"]', ['l1']), 1719 ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), 1720 ('div[id$="1"]', []), 1721 ('[id$="noending"]', []), 1722 ) 1723 1724 def test_attribute_contains(self): 1725 self.assertSelectMultiple( 1726 # From test_attribute_startswith 1727 ('[rel*="style"]', ['l1']), 1728 ('link[rel*="style"]', ['l1']), 1729 ('notlink[rel*="notstyle"]', []), 1730 ('[rel*="notstyle"]', []), 1731 ('link[rel*="notstyle"]', []), 1732 ('link[href*="bla"]', ['l1']), 1733 ('a[href*="http://"]', ['bob', 'me']), 1734 ('[href*="http://"]', ['bob', 'me']), 1735 ('[id*="p"]', ['pmulti', 'p1']), 1736 ('div[id*="m"]', ['main']), 1737 ('a[id*="m"]', ['me']), 1738 # From test_attribute_endswith 1739 ('[href*=".css"]', ['l1']), 1740 ('link[href*=".css"]', ['l1']), 1741 ('link[id*="1"]', ['l1']), 1742 ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), 1743 ('div[id*="1"]', []), 1744 ('[id*="noending"]', []), 1745 # New for this test 1746 ('[href*="."]', ['bob', 'me', 'l1']), 1747 ('a[href*="."]', ['bob', 'me']), 1748 ('link[href*="."]', ['l1']), 1749 ('div[id*="n"]', ['main', 'inner']), 1750 ('div[id*="nn"]', ['inner']), 1751 ) 1752 1753 def test_attribute_exact_or_hypen(self): 1754 self.assertSelectMultiple( 1755 ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 1756 ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 1757 ('p[lang|="fr"]', ['lang-fr']), 1758 ('p[lang|="gb"]', []), 1759 ) 1760 1761 def test_attribute_exists(self): 1762 self.assertSelectMultiple( 1763 ('[rel]', 
['l1', 'bob', 'me']), 1764 ('link[rel]', ['l1']), 1765 ('a[rel]', ['bob', 'me']), 1766 ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), 1767 ('p[class]', ['p1', 'pmulti']), 1768 ('[blah]', []), 1769 ('p[blah]', []), 1770 ) 1771 1772 def test_nth_of_type(self): 1773 # Try to select first paragraph 1774 els = self.soup.select('div#inner p:nth-of-type(1)') 1775 self.assertEqual(len(els), 1) 1776 self.assertEqual(els[0].string, u'Some text') 1777 1778 # Try to select third paragraph 1779 els = self.soup.select('div#inner p:nth-of-type(3)') 1780 self.assertEqual(len(els), 1) 1781 self.assertEqual(els[0].string, u'Another') 1782 1783 # Try to select (non-existent!) fourth paragraph 1784 els = self.soup.select('div#inner p:nth-of-type(4)') 1785 self.assertEqual(len(els), 0) 1786 1787 # Pass in an invalid value. 1788 self.assertRaises( 1789 ValueError, self.soup.select, 'div p:nth-of-type(0)') 1790 1791 def test_nth_of_type_direct_descendant(self): 1792 els = self.soup.select('div#inner > p:nth-of-type(1)') 1793 self.assertEqual(len(els), 1) 1794 self.assertEqual(els[0].string, u'Some text') 1795 1796 def test_id_child_selector_nth_of_type(self): 1797 self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) 1798 1799 def test_select_on_element(self): 1800 # Other tests operate on the tree; this operates on an element 1801 # within the tree. 1802 inner = self.soup.find("div", id="main") 1803 selected = inner.select("div") 1804 # The <div id="inner"> tag was selected. The <div id="footer"> 1805 # tag was not. 
1806 self.assertSelectsIDs(selected, ['inner']) 1807 1808 def test_overspecified_child_id(self): 1809 self.assertSelects(".fancy #inner", ['inner']) 1810 self.assertSelects(".normal #inner", []) 1811 1812 def test_adjacent_sibling_selector(self): 1813 self.assertSelects('#p1 + h2', ['header2']) 1814 self.assertSelects('#p1 + h2 + p', ['pmulti']) 1815 self.assertSelects('#p1 + #header2 + .class1', ['pmulti']) 1816 self.assertEqual([], self.soup.select('#p1 + p')) 1817 1818 def test_general_sibling_selector(self): 1819 self.assertSelects('#p1 ~ h2', ['header2', 'header3']) 1820 self.assertSelects('#p1 ~ #header2', ['header2']) 1821 self.assertSelects('#p1 ~ h2 + a', ['me']) 1822 self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me']) 1823 self.assertEqual([], self.soup.select('#inner ~ h2')) 1824 1825 def test_dangling_combinator(self): 1826 self.assertRaises(ValueError, self.soup.select, 'h1 >') 1827 1828 def test_sibling_combinator_wont_select_same_tag_twice(self): 1829 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) 1830