1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
17from bs4.builder import (
18    builder_registry,
19    HTMLParserTreeBuilder,
20)
21from bs4.element import (
22    CData,
23    Comment,
24    Doctype,
25    NavigableString,
26    SoupStrainer,
27    Tag,
28)
29from bs4.testing import (
30    SoupTest,
31    skipIf,
32)
33
# Availability flags for optional tree builders: each one is True when
# builder_registry.lookup() returns something other than None for the
# named feature.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
36
class TreeTest(SoupTest):
    """Base class providing assertion helpers for search results."""

    def assertSelects(self, tags, should_match):
        """Assert that the `.string` of each tag equals `should_match`.

        Intended for tests whose markup is a series of tags that each
        wrap a single string; the search under test is expected to
        pick out a particular sequence of those strings.

        :param tags: An iterable of objects with a `.string` attribute.
        :param should_match: The expected list of strings, in order.
        """
        found_strings = [tag.string for tag in tags]
        self.assertEqual(found_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that the 'id' value of each tag equals `should_match`.

        Intended for tests whose markup gives every interesting tag an
        'id'; the search under test is expected to pick out tags with
        a particular sequence of IDs.

        :param tags: An iterable of objects supporting `tag['id']`.
        :param should_match: The expected list of IDs, in order.
        """
        found_ids = [tag['id'] for tag in tags]
        self.assertEqual(found_ids, should_match)
56
57
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        """find() with a tag name returns the first tag of that name."""
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = soup.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        """find(text=...) can match a string of non-ASCII characters."""
        soup = self.soup(u'<h1>Räksmörgås</h1>')
        found = soup.find(text=u'Räksmörgås')
        self.assertEqual(found, u'Räksmörgås')

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        every_tag = soup.find_all()
        self.assertEqual(2, len(every_tag))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        every_a = soup.find_all('a')
        self.assertEqual(2, len(every_a))
82
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = [u"Foo", u"bar", u'\xbb']

        # A plain string has to equal the text node.
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # A list matches text nodes equal to any one of its members.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # A compiled regular expression is tried against each node.
        self.assertEqual(soup.find_all(text=re.compile('.*')), every_string)
        # True matches every text node.
        self.assertEqual(soup.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        everything = ["1", "2", "3", "4", "5"]

        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        # A limit larger than the number of matches returns all of them.
        self.assertSelects(soup.find_all('a', limit=10), everything)

        # A limit of 0 means no limit.
        self.assertSelects(soup.find_all('a', limit=0), everything)

    def test_calling_a_tag_is_calling_findall(self):
        """Calling a soup or tag object behaves like calling find_all()."""
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        from_root = soup('a', limit=1)
        self.assertSelects(from_root, ["1"])
        from_b = soup.b(id="foo")
        self.assertSelects(from_b, ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        """A list that contains itself is a harmless search value."""
        soup = self.soup("<a></a>")
        # Build a list whose only member is the list itself.
        recursive = []
        recursive.append(recursive)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(recursive))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")

        # The check used here is that each result exposes a `source`
        # attribute, whether the search was by name, by True or by text.
        by_name = soup.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        by_true = soup.find_all(True)
        self.assertTrue(hasattr(by_true, "source"))

        by_text = soup.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
139
140
class TestFindAllBasicNamespaces(TreeTest):
    """Test searching for names that contain a namespace-style prefix."""

    def test_find_by_namespaced_name(self):
        """Prefixed tag names and attribute names can both be searched."""
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')

        sqrt_tag = soup.find("mathml:msqrt")
        self.assertEqual("4", sqrt_tag.string)

        filled_tag = soup.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled_tag.name)
147
148
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        """Parse a small document shared by most tests in this class."""
        # Name this class (not TreeTest) in super() so that every class
        # in the MRO, including TreeTest, takes part in setUp().
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        """A tag name finds every tag with that name, nested or not."""
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        """A tag name can be combined with a `text` restriction."""
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        """find_all() searches only beneath the tag it is called on."""
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        """Calling the tree object is the same as calling find_all()."""
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        """A SoupStrainer can be used as the search criterion."""
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        """A list of names matches tags with any of those names."""
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        """A dict of names matches tags whose name is one of its keys."""
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        """A compiled regular expression is matched against tag names."""
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        """A function taking a tag can decide which tags match."""
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        # The <b> tag is closed with </b>; the fixture previously used a
        # mismatched </a>, which left the result up to parser recovery.
        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</b>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
214
215
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by attribute value."""

    def test_find_all_by_attribute_name(self):
        """A keyword argument restricts the search by that attribute."""
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        """An attribute value can be given as UTF-8 bytes or as text."""
        peace = u"םולש".encode("utf8")
        data = u'<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        """The `attrs` dict reaches attributes a keyword argument can't."""
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        """Tags can be found by any one of their CSS classes."""
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        """A regular expression is tried against each class separately."""
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        """A regex or function given as `attrs` is matched against class."""
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        """A class string matches tags with several classes too."""
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        """A SoupStrainer built from `attrs` can drive the search."""
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        """None as an attribute value matches tags lacking the attribute."""
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        """True as an attribute value matches any tag with the attribute."""
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        """A numeric search value behaves like its string form."""
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        """A list of attribute values matches any value in the list."""
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        """A regular expression can be used as an attribute value."""
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        """A name plus `text` finds tags of that name holding the text."""
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        # "bar" only occurs inside a <b> tag, so no <a> tag matches.
        # (This assertion used to appear twice in a row; the verbatim
        # duplicate was dropped.)
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        """The text may sit several levels below the named tag."""
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        """An attribute restriction can be combined with `text`."""
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
395
396
397
398
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        """index() reports the position of each child within its parent.

        The markup deliberately repeats some elements verbatim, so the
        loop also covers children that look the same as an earlier one.
        """
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        container = tree.div
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))
        # Asking for something that is not a child raises ValueError.
        self.assertRaises(ValueError, tree.index, 1)
415
416
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        """Parse nested <ul> tags and remember the innermost <b> tag."""
        super(TestParentOperations, self).setUp()
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        """.parent climbs one level at a time."""
        first = self.start.parent
        self.assertEqual(first['id'], 'bottom')
        second = first.parent
        self.assertEqual(second['id'], 'middle')
        third = second.parent
        self.assertEqual(third['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        """The parent of a top-level element is the soup object."""
        outermost = self.tree.contents[0]
        self.assertEqual(outermost.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        """The soup object's own parent is None."""
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        """find_parents() returns matching ancestors, nearest first."""
        all_lists = self.start.find_parents('ul')
        self.assertSelectsIDs(all_lists, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        """find_parent() returns the nearest matching ancestor."""
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')

        by_id = self.start.find_parent('ul', id='top')
        self.assertEqual(by_id['id'], 'top')

    def test_parent_of_text_element(self):
        """A text node's parent is the tag that contains it."""
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        """find_parent() can be called on a text node."""
        text_node = self.tree.find(text="Start here")
        enclosing_list = text_node.find_parent('ul')
        self.assertEqual(enclosing_list['id'], 'bottom')

    def test_parent_generator(self):
        """.parents yields each ancestor in turn, nearest first."""
        ancestor_ids = []
        for ancestor in self.start.parents:
            # Only ancestors that carry an 'id' attribute are recorded.
            if ancestor is not None and 'id' in ancestor.attrs:
                ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
466
467
class ProximityTest(TreeTest):
    """Base class for tests of next/previous element navigation."""

    def setUp(self):
        """Parse a document holding three consecutive <b> tags."""
        # Name this class (not TreeTest) in super() so that every class
        # in the MRO, including TreeTest, takes part in setUp().
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
474
475
class TestNextOperations(ProximityTest):
    """Test forward navigation: next_element and the find_next family."""

    def setUp(self):
        """Start every test from the first <b> tag in the document."""
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        """next_element moves to the tag's string, then to the next tag."""
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        """The last text node in the document has no next_element."""
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        """The soup object itself has no next_element."""
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        """find_all_next() returns the matching elements after this one."""
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        # A stray find_all_next(id=3) call whose result was discarded
        # used to precede this assertion; it checked nothing.
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        """find_next() returns the first matching element after this one."""
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        """The find_next family can be called on a text node."""
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        """.next_elements yields every element after the starting one."""
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
515
class TestPreviousOperations(ProximityTest):
    """Test backward navigation: previous_element and find_previous."""

    def setUp(self):
        """Start every test from the final text node, "Three"."""
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        """previous_element steps back to the enclosing tag, then on."""
        one_back = self.end.previous_element
        self.assertEqual(one_back['id'], "3")
        two_back = one_back.previous_element
        self.assertEqual(two_back, "Two")

    def test_previous_of_first_item_is_none(self):
        """The <html> tag has no previous_element."""
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        """Placeholder: the real assertion is disabled (see below)."""
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        """find_all_previous() returns matches in reverse document order."""
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        earlier_b_tags = self.end.find_all_previous('b')
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

        by_id = self.end.find_all_previous(id=1)
        self.assertSelects(by_id, ["One"])

    def test_find_previous(self):
        """find_previous() returns the nearest earlier match."""
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')

        earlier_text = self.end.find_previous(text="One")
        self.assertEqual(earlier_text, "One")

    def test_find_previous_for_text_element(self):
        """The find_previous family can be called on a text node."""
        text_node = self.tree.find(text="Three")
        nearest_b = text_node.find_previous("b")
        self.assertEqual(nearest_b.string, "Three")

        earlier_b_tags = text_node.find_all_previous("b")
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

    def test_previous_generator(self):
        """.previous_elements yields every element before the start."""
        first_text = self.tree.find(text="One")
        predecessors = list(first_text.previous_elements)

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = predecessors
        self.assertEqual(b['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")
565
566
class SiblingTest(TreeTest):
    """Base class for tests of sibling navigation."""

    def setUp(self):
        """Parse four sibling <span> tags, three of which have a child."""
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
        # The pattern is a raw string so that "\s" reaches the regex
        # engine intact instead of being an invalid string escape.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
587
588
class TestNextSibling(SiblingTest):
    """Test next_sibling and the find_next_sibling family."""

    def setUp(self):
        """Start every test from the first top-level <span>."""
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        """The soup object itself has no next sibling."""
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        """next_sibling moves across the same level of the tree."""
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        third = second.next_sibling
        self.assertEqual(third['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        """next_sibling is None for the last element at any level."""
        self.assertEqual(self.tree.html.next_sibling, None)

        # An only child and the last top-level span: neither has a
        # following sibling.
        for span_id in ("1.1", "4"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.next_sibling, None)

    def test_find_next_sibling(self):
        """find_next_sibling() returns the first matching later sibling."""
        following = self.start.find_next_sibling('span')
        self.assertEqual(following['id'], '2')

    def test_next_siblings(self):
        """find_next_siblings() returns every matching later sibling."""
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        only_third = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(only_third, ['3'])

    def test_next_sibling_for_text_element(self):
        """Sibling navigation also works when starting from a text node."""
        soup = self.soup("Foo<b>bar</b>baz")
        foo = soup.find(text="Foo")

        bold = foo.next_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(bold.next_sibling, 'baz')

        self.assertSelects(foo.find_next_siblings('b'), ['bar'])
        self.assertEqual(foo.find_next_sibling(text="baz"), "baz")
        self.assertEqual(foo.find_next_sibling(text="nonesuch"), None)
632
633
class TestPreviousSibling(SiblingTest):
    """Test previous_sibling and the find_previous_sibling family."""

    def setUp(self):
        """Start every test from the last top-level <span>."""
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        """The soup object itself has no previous sibling."""
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        """previous_sibling moves back across the same level of the tree."""
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        second = third.previous_sibling
        self.assertEqual(second['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        """previous_sibling is None for the first element at any level."""
        self.assertEqual(self.tree.html.previous_sibling, None)

        # An only child and the first top-level span: neither has a
        # preceding sibling.
        for span_id in ("1.1", "1"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.previous_sibling, None)

    def test_find_previous_sibling(self):
        """find_previous_sibling() returns the nearest matching sibling."""
        preceding = self.end.find_previous_sibling('span')
        self.assertEqual(preceding['id'], '3')

    def test_previous_siblings(self):
        """find_previous_siblings() returns matches, nearest first."""
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        only_first = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(only_first, ['1'])

    def test_previous_sibling_for_text_element(self):
        """Sibling navigation also works when starting from a text node."""
        soup = self.soup("Foo<b>bar</b>baz")
        baz = soup.find(text="baz")

        bold = baz.previous_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(bold.previous_sibling, 'Foo')

        self.assertSelects(baz.find_previous_siblings('b'), ['bar'])
        self.assertEqual(baz.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(baz.find_previous_sibling(text="nonesuch"), None)
677
678
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        """new_tag() builds a detached Tag with the given name and attrs."""
        soup = self.soup("")
        created = soup.new_tag("foo", bar="baz")
        self.assertTrue(isinstance(created, Tag))
        self.assertEqual("foo", created.name)
        self.assertEqual(dict(bar="baz"), created.attrs)
        self.assertEqual(None, created.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        """How an empty new tag is rendered depends on the soup's builder."""
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        """new_string() returns a NavigableString by default."""
        soup = self.soup("")
        created = soup.new_string("foo")
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        """new_string() can be told which string class to instantiate."""
        soup = self.soup("")
        created = soup.new_string("foo", Comment)
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, Comment))
720
721class TestTreeModification(SoupTest):
722
723    def test_attribute_modification(self):
724        soup = self.soup('<a id="1"></a>')
725        soup.a['id'] = 2
726        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
727        del(soup.a['id'])
728        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
729        soup.a['id2'] = 'foo'
730        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
731
732    def test_new_tag_creation(self):
733        builder = builder_registry.lookup('html')()
734        soup = self.soup("<body></body>", builder=builder)
735        a = Tag(soup, builder, 'a')
736        ol = Tag(soup, builder, 'ol')
737        a['href'] = 'http://foo.com/'
738        soup.body.insert(0, a)
739        soup.body.insert(1, ol)
740        self.assertEqual(
741            soup.body.encode(),
742            b'<body><a href="http://foo.com/"></a><ol></ol></body>')
743
744    def test_append_to_contents_moves_tag(self):
745        doc = """<p id="1">Don't leave me <b>here</b>.</p>
746                <p id="2">Don\'t leave!</p>"""
747        soup = self.soup(doc)
748        second_para = soup.find(id='2')
749        bold = soup.b
750
751        # Move the <b> tag to the end of the second paragraph.
752        soup.find(id='2').append(soup.b)
753
754        # The <b> tag is now a child of the second paragraph.
755        self.assertEqual(bold.parent, second_para)
756
757        self.assertEqual(
758            soup.decode(), self.document_for(
759                '<p id="1">Don\'t leave me .</p>\n'
760                '<p id="2">Don\'t leave!<b>here</b></p>'))
761
762    def test_replace_with_returns_thing_that_was_replaced(self):
763        text = "<a></a><b><c></c></b>"
764        soup = self.soup(text)
765        a = soup.a
766        new_a = a.replace_with(soup.c)
767        self.assertEqual(a, new_a)
768
769    def test_unwrap_returns_thing_that_was_replaced(self):
770        text = "<a><b></b><c></c></a>"
771        soup = self.soup(text)
772        a = soup.a
773        new_a = a.unwrap()
774        self.assertEqual(a, new_a)
775
776    def test_replace_tag_with_itself(self):
777        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
778        soup = self.soup(text)
779        c = soup.c
780        soup.c.replace_with(c)
781        self.assertEqual(soup.decode(), self.document_for(text))
782
783    def test_replace_tag_with_its_parent_raises_exception(self):
784        text = "<a><b></b></a>"
785        soup = self.soup(text)
786        self.assertRaises(ValueError, soup.b.replace_with, soup.a)
787
788    def test_insert_tag_into_itself_raises_exception(self):
789        text = "<a><b></b></a>"
790        soup = self.soup(text)
791        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
792
793    def test_replace_with_maintains_next_element_throughout(self):
794        soup = self.soup('<p><a>one</a><b>three</b></p>')
795        a = soup.a
796        b = a.contents[0]
797        # Make it so the <a> tag has two text children.
798        a.insert(1, "two")
799
800        # Now replace each one with the empty string.
801        left, right = a.contents
802        left.replaceWith('')
803        right.replaceWith('')
804
805        # The <b> tag is still connected to the tree.
806        self.assertEqual("three", soup.b.string)
807
808    def test_replace_final_node(self):
809        soup = self.soup("<b>Argh!</b>")
810        soup.find(text="Argh!").replace_with("Hooray!")
811        new_text = soup.find(text="Hooray!")
812        b = soup.b
813        self.assertEqual(new_text.previous_element, b)
814        self.assertEqual(new_text.parent, b)
815        self.assertEqual(new_text.previous_element.next_element, new_text)
816        self.assertEqual(new_text.next_element, None)
817
818    def test_consecutive_text_nodes(self):
819        # A builder should never create two consecutive text nodes,
820        # but if you insert one next to another, Beautiful Soup will
821        # handle it correctly.
822        soup = self.soup("<a><b>Argh!</b><c></c></a>")
823        soup.b.insert(1, "Hooray!")
824
825        self.assertEqual(
826            soup.decode(), self.document_for(
827                "<a><b>Argh!Hooray!</b><c></c></a>"))
828
829        new_text = soup.find(text="Hooray!")
830        self.assertEqual(new_text.previous_element, "Argh!")
831        self.assertEqual(new_text.previous_element.next_element, new_text)
832
833        self.assertEqual(new_text.previous_sibling, "Argh!")
834        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
835
836        self.assertEqual(new_text.next_sibling, None)
837        self.assertEqual(new_text.next_element, soup.c)
838
839    def test_insert_string(self):
840        soup = self.soup("<a></a>")
841        soup.a.insert(0, "bar")
842        soup.a.insert(0, "foo")
843        # The string were added to the tag.
844        self.assertEqual(["foo", "bar"], soup.a.contents)
845        # And they were converted to NavigableStrings.
846        self.assertEqual(soup.a.contents[0].next_element, "bar")
847
848    def test_insert_tag(self):
849        builder = self.default_builder
850        soup = self.soup(
851            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
852        magic_tag = Tag(soup, builder, 'magictag')
853        magic_tag.insert(0, "the")
854        soup.a.insert(1, magic_tag)
855
856        self.assertEqual(
857            soup.decode(), self.document_for(
858                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
859
860        # Make sure all the relationships are hooked up correctly.
861        b_tag = soup.b
862        self.assertEqual(b_tag.next_sibling, magic_tag)
863        self.assertEqual(magic_tag.previous_sibling, b_tag)
864
865        find = b_tag.find(text="Find")
866        self.assertEqual(find.next_element, magic_tag)
867        self.assertEqual(magic_tag.previous_element, find)
868
869        c_tag = soup.c
870        self.assertEqual(magic_tag.next_sibling, c_tag)
871        self.assertEqual(c_tag.previous_sibling, magic_tag)
872
873        the = magic_tag.find(text="the")
874        self.assertEqual(the.parent, magic_tag)
875        self.assertEqual(the.next_element, c_tag)
876        self.assertEqual(c_tag.previous_element, the)
877
878    def test_append_child_thats_already_at_the_end(self):
879        data = "<a><b></b></a>"
880        soup = self.soup(data)
881        soup.a.append(soup.b)
882        self.assertEqual(data, soup.decode())
883
884    def test_move_tag_to_beginning_of_parent(self):
885        data = "<a><b></b><c></c><d></d></a>"
886        soup = self.soup(data)
887        soup.a.insert(0, soup.d)
888        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
889
890    def test_insert_works_on_empty_element_tag(self):
891        # This is a little strange, since most HTML parsers don't allow
892        # markup like this to come through. But in general, we don't
893        # know what the parser would or wouldn't have allowed, so
894        # I'm letting this succeed for now.
895        soup = self.soup("<br/>")
896        soup.br.insert(1, "Contents")
897        self.assertEqual(str(soup.br), "<br>Contents</br>")
898
899    def test_insert_before(self):
900        soup = self.soup("<a>foo</a><b>bar</b>")
901        soup.b.insert_before("BAZ")
902        soup.a.insert_before("QUUX")
903        self.assertEqual(
904            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))
905
906        soup.a.insert_before(soup.b)
907        self.assertEqual(
908            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
909
910    def test_insert_after(self):
911        soup = self.soup("<a>foo</a><b>bar</b>")
912        soup.b.insert_after("BAZ")
913        soup.a.insert_after("QUUX")
914        self.assertEqual(
915            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
916        soup.b.insert_after(soup.a)
917        self.assertEqual(
918            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
919
920    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
921        soup = self.soup("")
922        tag = soup.new_tag("a")
923        string = soup.new_string("")
924        self.assertRaises(ValueError, string.insert_after, tag)
925        self.assertRaises(NotImplementedError, soup.insert_after, tag)
926        self.assertRaises(ValueError, tag.insert_after, tag)
927
928    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
929        soup = self.soup("")
930        tag = soup.new_tag("a")
931        string = soup.new_string("")
932        self.assertRaises(ValueError, string.insert_before, tag)
933        self.assertRaises(NotImplementedError, soup.insert_before, tag)
934        self.assertRaises(ValueError, tag.insert_before, tag)
935
936    def test_replace_with(self):
937        soup = self.soup(
938                "<p>There's <b>no</b> business like <b>show</b> business</p>")
939        no, show = soup.find_all('b')
940        show.replace_with(no)
941        self.assertEqual(
942            soup.decode(),
943            self.document_for(
944                "<p>There's  business like <b>no</b> business</p>"))
945
946        self.assertEqual(show.parent, None)
947        self.assertEqual(no.parent, soup.p)
948        self.assertEqual(no.next_element, "no")
949        self.assertEqual(no.next_sibling, " business")
950
951    def test_replace_first_child(self):
952        data = "<a><b></b><c></c></a>"
953        soup = self.soup(data)
954        soup.b.replace_with(soup.c)
955        self.assertEqual("<a><c></c></a>", soup.decode())
956
957    def test_replace_last_child(self):
958        data = "<a><b></b><c></c></a>"
959        soup = self.soup(data)
960        soup.c.replace_with(soup.b)
961        self.assertEqual("<a><b></b></a>", soup.decode())
962
963    def test_nested_tag_replace_with(self):
964        soup = self.soup(
965            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
966
967        # Replace the entire <b> tag and its contents ("reserve the
968        # right") with the <f> tag ("refuse").
969        remove_tag = soup.b
970        move_tag = soup.f
971        remove_tag.replace_with(move_tag)
972
973        self.assertEqual(
974            soup.decode(), self.document_for(
975                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
976
977        # The <b> tag is now an orphan.
978        self.assertEqual(remove_tag.parent, None)
979        self.assertEqual(remove_tag.find(text="right").next_element, None)
980        self.assertEqual(remove_tag.previous_element, None)
981        self.assertEqual(remove_tag.next_sibling, None)
982        self.assertEqual(remove_tag.previous_sibling, None)
983
984        # The <f> tag is now connected to the <a> tag.
985        self.assertEqual(move_tag.parent, soup.a)
986        self.assertEqual(move_tag.previous_element, "We")
987        self.assertEqual(move_tag.next_element.next_element, soup.e)
988        self.assertEqual(move_tag.next_sibling, None)
989
990        # The gap where the <f> tag used to be has been mended, and
991        # the word "to" is now connected to the <g> tag.
992        to_text = soup.find(text="to")
993        g_tag = soup.g
994        self.assertEqual(to_text.next_element, g_tag)
995        self.assertEqual(to_text.next_sibling, g_tag)
996        self.assertEqual(g_tag.previous_element, to_text)
997        self.assertEqual(g_tag.previous_sibling, to_text)
998
999    def test_unwrap(self):
1000        tree = self.soup("""
1001            <p>Unneeded <em>formatting</em> is unneeded</p>
1002            """)
1003        tree.em.unwrap()
1004        self.assertEqual(tree.em, None)
1005        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1006
1007    def test_wrap(self):
1008        soup = self.soup("I wish I was bold.")
1009        value = soup.string.wrap(soup.new_tag("b"))
1010        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
1011        self.assertEqual(
1012            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1013
1014    def test_wrap_extracts_tag_from_elsewhere(self):
1015        soup = self.soup("<b></b>I wish I was bold.")
1016        soup.b.next_sibling.wrap(soup.b)
1017        self.assertEqual(
1018            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1019
1020    def test_wrap_puts_new_contents_at_the_end(self):
1021        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
1022        soup.b.next_sibling.wrap(soup.b)
1023        self.assertEqual(2, len(soup.b.contents))
1024        self.assertEqual(
1025            soup.decode(), self.document_for(
1026                "<b>I like being bold.I wish I was bold.</b>"))
1027
1028    def test_extract(self):
1029        soup = self.soup(
1030            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
1031
1032        self.assertEqual(len(soup.body.contents), 3)
1033        extracted = soup.find(id="nav").extract()
1034
1035        self.assertEqual(
1036            soup.decode(), "<html><body>Some content.  More content.</body></html>")
1037        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
1038
1039        # The extracted tag is now an orphan.
1040        self.assertEqual(len(soup.body.contents), 2)
1041        self.assertEqual(extracted.parent, None)
1042        self.assertEqual(extracted.previous_element, None)
1043        self.assertEqual(extracted.next_element.next_element, None)
1044
1045        # The gap where the extracted tag used to be has been mended.
1046        content_1 = soup.find(text="Some content. ")
1047        content_2 = soup.find(text=" More content.")
1048        self.assertEqual(content_1.next_element, content_2)
1049        self.assertEqual(content_1.next_sibling, content_2)
1050        self.assertEqual(content_2.previous_element, content_1)
1051        self.assertEqual(content_2.previous_sibling, content_1)
1052
1053    def test_extract_distinguishes_between_identical_strings(self):
1054        soup = self.soup("<a>foo</a><b>bar</b>")
1055        foo_1 = soup.a.string
1056        bar_1 = soup.b.string
1057        foo_2 = soup.new_string("foo")
1058        bar_2 = soup.new_string("bar")
1059        soup.a.append(foo_2)
1060        soup.b.append(bar_2)
1061
1062        # Now there are two identical strings in the <a> tag, and two
1063        # in the <b> tag. Let's remove the first "foo" and the second
1064        # "bar".
1065        foo_1.extract()
1066        bar_2.extract()
1067        self.assertEqual(foo_2, soup.a.string)
1068        self.assertEqual(bar_2, soup.b.string)
1069
1070    def test_clear(self):
1071        """Tag.clear()"""
1072        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
1073        # clear using extract()
1074        a = soup.a
1075        soup.p.clear()
1076        self.assertEqual(len(soup.p.contents), 0)
1077        self.assertTrue(hasattr(a, "contents"))
1078
1079        # clear using decompose()
1080        em = a.em
1081        a.clear(decompose=True)
1082        self.assertEqual(0, len(em.contents))
1083
1084    def test_string_set(self):
1085        """Tag.string = 'string'"""
1086        soup = self.soup("<a></a> <b><c></c></b>")
1087        soup.a.string = "foo"
1088        self.assertEqual(soup.a.contents, ["foo"])
1089        soup.b.string = "bar"
1090        self.assertEqual(soup.b.contents, ["bar"])
1091
1092    def test_string_set_does_not_affect_original_string(self):
1093        soup = self.soup("<a><b>foo</b><c>bar</c>")
1094        soup.b.string = soup.c.string
1095        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1096
1097    def test_set_string_preserves_class_of_string(self):
1098        soup = self.soup("<a></a>")
1099        cdata = CData("foo")
1100        soup.a.string = cdata
1101        self.assertTrue(isinstance(soup.a.string, CData))
1102
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        # A tag that isn't in the document comes back as None.
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        """Accessing .bTag still finds the <b> tag, but issues a warning."""
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record every warning. Without this, the default filters
            # can swallow a warning that was already issued earlier in
            # the test run, leaving `w` empty and making w[0] below
            # fail with an IndexError.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the `in` operator
        (__contains__). has_attr() checks the tag's attributes and
        `in` checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        """Attributes are output sorted by name, not in source order."""
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        """A tag containing only a text node exposes it as .string."""
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        """A tag with no children has no .string."""
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        """A tag with more than one child has no .string."""
        # <a> contains a string and two tags, so it has no .string.
        # Neither does the (empty) <b> tag inside it.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.a.string, None)
        self.assertEqual(soup.b.string, None)

        # The same goes for a tag sandwiched between two strings.
        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.a.string, None)
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        """A tag whose single child has a .string inherits that .string."""
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        # The same applies to the BeautifulSoup object itself.
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        """get_text() skips comments unless asked for them via `types`."""
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments show up when they are explicitly requested, or when
        # no type restriction is applied at all.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        """The .strings generator does not yield comments."""
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1212
class TestCDAtaListAttributes(SoupTest):
    """Tests of cdata-list attributes such as 'class'."""

    def test_single_value_becomes_list(self):
        """A one-word class attribute is exposed as a one-item list."""
        a_tag = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], a_tag['class'])

    def test_multiple_values_becomes_list(self):
        """Space-separated class names are split into a list."""
        a_tag = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], a_tag['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        """Tabs and newlines separate values just like spaces do."""
        a_tag = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], a_tag['class'])

    def test_attributes_joined_into_string_on_output(self):
        """On output the list is joined back up with single spaces."""
        a_tag = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', a_tag.encode())

    def test_accept_charset(self):
        """accept-charset on a <form> tag is split into a list."""
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        """accept-charset on a tag other than <form> stays a string.

        test_accept_charset shows the attribute being split into a
        list on a <form> tag; on an <a> tag the value is left alone.
        """
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        a_tag = self.soup(markup).a
        self.assertEqual('ISO-8859-1 UTF-8', a_tag['accept-charset'])

    def test_string_has_immutable_name_property(self):
        """A string's .name is None, and assigning to it raises."""
        text_node = self.soup("s").string
        self.assertEqual(None, text_node.name)
        # setattr() performs the same assignment as `text_node.name = 'foo'`.
        self.assertRaises(AttributeError, setattr, text_node, 'name', 'foo')
1251
class TestPersistence(SoupTest):
    """Test that trees survive pickling and deep copying."""

    def setUp(self):
        """Parse a small complete HTML page for the tests to copy."""
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        """A pickle round trip yields a tree identical to the original."""
        # Protocol 2 is requested explicitly.
        pickled = pickle.dumps(self.tree, 2)
        restored = pickle.loads(pickled)

        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        """A deep copy of a tree serializes the same as the original."""
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        """A tree containing Unicode characters can be pickled."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        pickled = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(pickled)
        self.assertEqual(restored.decode(), soup.decode())
1295
1296
1297class TestSubstitutions(SoupTest):
1298
1299    def test_default_formatter_is_minimal(self):
1300        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1301        soup = self.soup(markup)
1302        decoded = soup.decode(formatter="minimal")
1303        # The < is converted back into &lt; but the e-with-acute is left alone.
1304        self.assertEqual(
1305            decoded,
1306            self.document_for(
1307                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1308
1309    def test_formatter_html(self):
1310        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1311        soup = self.soup(markup)
1312        decoded = soup.decode(formatter="html")
1313        self.assertEqual(
1314            decoded,
1315            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
1316
1317    def test_formatter_minimal(self):
1318        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1319        soup = self.soup(markup)
1320        decoded = soup.decode(formatter="minimal")
1321        # The < is converted back into &lt; but the e-with-acute is left alone.
1322        self.assertEqual(
1323            decoded,
1324            self.document_for(
1325                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1326
1327    def test_formatter_null(self):
1328        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1329        soup = self.soup(markup)
1330        decoded = soup.decode(formatter=None)
1331        # Neither the angle brackets nor the e-with-acute are converted.
1332        # This is not valid HTML, but it's what the user wanted.
1333        self.assertEqual(decoded,
1334                          self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
1335
1336    def test_formatter_custom(self):
1337        markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
1338        soup = self.soup(markup)
1339        decoded = soup.decode(formatter = lambda x: x.upper())
1340        # Instead of normal entity conversion code, the custom
1341        # callable is called on every string.
1342        self.assertEqual(
1343            decoded,
1344            self.document_for(u"<b><FOO></b><b>BAR</b>"))
1345
1346    def test_formatter_is_run_on_attribute_values(self):
1347        markup = u'<a href="http://a.com?a=b&c=é">e</a>'
1348        soup = self.soup(markup)
1349        a = soup.a
1350
1351        expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
1352
1353        self.assertEqual(expect_minimal, a.decode())
1354        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
1355
1356        expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
1357        self.assertEqual(expect_html, a.decode(formatter="html"))
1358
1359        self.assertEqual(markup, a.decode(formatter=None))
1360        expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
1361        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
1362
1363    def test_formatter_skips_script_tag_for_html_documents(self):
1364        doc = """
1365  <script type="text/javascript">
1366   console.log("< < hey > > ");
1367  </script>
1368"""
1369        encoded = BeautifulSoup(doc).encode()
1370        self.assertTrue(b"< < hey > >" in encoded)
1371
1372    def test_formatter_skips_style_tag_for_html_documents(self):
1373        doc = """
1374  <style type="text/css">
1375   console.log("< < hey > > ");
1376  </style>
1377"""
1378        encoded = BeautifulSoup(doc).encode()
1379        self.assertTrue(b"< < hey > >" in encoded)
1380
1381    def test_prettify_leaves_preformatted_text_alone(self):
1382        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
1383        # Everything outside the <pre> tag is reformatted, but everything
1384        # inside is left alone.
1385        self.assertEqual(
1386            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
1387            soup.div.prettify())
1388
1389    def test_prettify_accepts_formatter(self):
1390        soup = BeautifulSoup("<html><body>foo</body></html>")
1391        pretty = soup.prettify(formatter = lambda x: x.upper())
1392        self.assertTrue("FOO" in pretty)
1393
1394    def test_prettify_outputs_unicode_by_default(self):
1395        soup = self.soup("<a></a>")
1396        self.assertEqual(unicode, type(soup.prettify()))
1397
1398    def test_prettify_can_encode_data(self):
1399        soup = self.soup("<a></a>")
1400        self.assertEqual(bytes, type(soup.prettify("utf-8")))
1401
1402    def test_html_entity_substitution_off_by_default(self):
1403        markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
1404        soup = self.soup(markup)
1405        encoded = soup.b.encode("utf-8")
1406        self.assertEqual(encoded, markup.encode('utf-8'))
1407
1408    def test_encoding_substitution(self):
1409        # Here's the <meta> tag saying that a document is
1410        # encoded in Shift-JIS.
1411        meta_tag = ('<meta content="text/html; charset=x-sjis" '
1412                    'http-equiv="Content-type"/>')
1413        soup = self.soup(meta_tag)
1414
1415        # Parse the document, and the charset apprears unchanged.
1416        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
1417
1418        # Encode the document into some encoding, and the encoding is
1419        # substituted into the meta tag.
1420        utf_8 = soup.encode("utf-8")
1421        self.assertTrue(b"charset=utf-8" in utf_8)
1422
1423        euc_jp = soup.encode("euc_jp")
1424        self.assertTrue(b"charset=euc_jp" in euc_jp)
1425
1426        shift_jis = soup.encode("shift-jis")
1427        self.assertTrue(b"charset=shift-jis" in shift_jis)
1428
1429        utf_16_u = soup.encode("utf-16").decode("utf-16")
1430        self.assertTrue("charset=utf-16" in utf_16_u)
1431
1432    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
1433        markup = ('<head><meta content="text/html; charset=x-sjis" '
1434                    'http-equiv="Content-type"/></head><pre>foo</pre>')
1435
1436        # Beautiful Soup used to try to rewrite the meta tag even if the
1437        # meta tag got filtered out by the strainer. This test makes
1438        # sure that doesn't happen.
1439        strainer = SoupStrainer('pre')
1440        soup = self.soup(markup, parse_only=strainer)
1441        self.assertEqual(soup.contents[0].name, 'pre')
1442
class TestEncoding(SoupTest):
    """Tests of turning strings and tags into bytestrings."""

    def test_unicode_string_can_be_encoded(self):
        """A string taken from the tree can be encoded on its own."""
        snowman = u"\N{SNOWMAN}"
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        encoded = soup.b.string.encode("utf-8")
        self.assertEqual(encoded, snowman.encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        """A tag holding a non-ASCII string encodes to the same markup."""
        markup = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        """Unencodable characters become numeric character references."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        as_ascii = soup.b.encode("ascii")
        self.assertEqual(as_ascii, b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        """With errors="strict", an unencodable character raises."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        encode = soup.encode
        self.assertRaises(
            UnicodeEncodeError, encode, "ascii", errors="strict")

    def test_decode_contents(self):
        """decode_contents() returns a tag's contents as Unicode."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        contents = soup.b.decode_contents()
        self.assertEqual(u"\N{SNOWMAN}", contents)

    def test_encode_contents(self):
        """encode_contents() returns a tag's contents as a bytestring."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        expected = u"\N{SNOWMAN}".encode("utf8")
        contents = soup.b.encode_contents(encoding="utf8")
        self.assertEqual(expected, contents)

    def test_deprecated_renderContents(self):
        """The old renderContents() name gives UTF-8 encoded contents."""
        soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        expected = u"\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, soup.b.renderContents())
1486
class TestNavigableStringSubclasses(SoupTest):
    """Tests of the special string types, such as CData and Doctype."""

    def test_cdata(self):
        """A hand-made CData object can be inserted and found again.

        None of the current builders turn CDATA sections into CData
        objects, but you can create them manually.
        """
        soup = self.soup("")
        section = CData("foo")
        soup.insert(1, section)

        # It is rendered with CDATA markers, but compares equal to
        # its plain text.
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        self.assertEqual(soup.find(text="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """
        self.count = 0

        def counting_formatter(*args):
            # Count the call, then return something that must never
            # show up in the output.
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        section = CData("<><><>")
        soup.insert(1, section)

        encoded = soup.encode(formatter=counting_formatter)
        self.assertEqual(b"<![CDATA[<><><>]]>", encoded)
        # The formatter was called exactly once.
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        """A Doctype is output with a trailing newline.

        Unlike other NavigableString subclasses, a DOCTYPE always ends
        in a newline.
        """
        declaration = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, declaration)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
1524
1525
1526class TestSoupSelector(TreeTest):
1527
    # Fixture markup that setUp parses before every test in this class.
    # The id attributes are what assertSelects compares against, so each
    # element the tests expect to select carries a unique id.
    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>

<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
</span>
</div>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""
1566
1567    def setUp(self):
1568        self.soup = BeautifulSoup(self.HTML)
1569
1570    def assertSelects(self, selector, expected_ids):
1571        el_ids = [el['id'] for el in self.soup.select(selector)]
1572        el_ids.sort()
1573        expected_ids.sort()
1574        self.assertEqual(expected_ids, el_ids,
1575            "Selector %s, expected [%s], got [%s]" % (
1576                selector, ', '.join(expected_ids), ', '.join(el_ids)
1577            )
1578        )
1579
    # Singular alias for assertSelects; assertSelectMultiple calls it
    # once per (selector, expected_ids) pair.
    assertSelect = assertSelects
1581
1582    def assertSelectMultiple(self, *tests):
1583        for selector, expected_ids in tests:
1584            self.assertSelect(selector, expected_ids)
1585
1586    def test_one_tag_one(self):
1587        els = self.soup.select('title')
1588        self.assertEqual(len(els), 1)
1589        self.assertEqual(els[0].name, 'title')
1590        self.assertEqual(els[0].contents, [u'The title'])
1591
1592    def test_one_tag_many(self):
1593        els = self.soup.select('div')
1594        self.assertEqual(len(els), 3)
1595        for div in els:
1596            self.assertEqual(div.name, 'div')
1597
1598    def test_tag_in_tag_one(self):
1599        els = self.soup.select('div div')
1600        self.assertSelects('div div', ['inner'])
1601
1602    def test_tag_in_tag_many(self):
1603        for selector in ('html div', 'html body div', 'body div'):
1604            self.assertSelects(selector, ['main', 'inner', 'footer'])
1605
1606    def test_tag_no_match(self):
1607        self.assertEqual(len(self.soup.select('del')), 0)
1608
1609    def test_invalid_tag(self):
1610        self.assertRaises(ValueError, self.soup.select, 'tag%t')
1611
1612    def test_header_tags(self):
1613        self.assertSelectMultiple(
1614            ('h1', ['header1']),
1615            ('h2', ['header2', 'header3']),
1616        )
1617
1618    def test_class_one(self):
1619        for selector in ('.onep', 'p.onep', 'html p.onep'):
1620            els = self.soup.select(selector)
1621            self.assertEqual(len(els), 1)
1622            self.assertEqual(els[0].name, 'p')
1623            self.assertEqual(els[0]['class'], ['onep'])
1624
1625    def test_class_mismatched_tag(self):
1626        els = self.soup.select('div.onep')
1627        self.assertEqual(len(els), 0)
1628
1629    def test_one_id(self):
1630        for selector in ('div#inner', '#inner', 'div div#inner'):
1631            self.assertSelects(selector, ['inner'])
1632
1633    def test_bad_id(self):
1634        els = self.soup.select('#doesnotexist')
1635        self.assertEqual(len(els), 0)
1636
1637    def test_items_in_id(self):
1638        els = self.soup.select('div#inner p')
1639        self.assertEqual(len(els), 3)
1640        for el in els:
1641            self.assertEqual(el.name, 'p')
1642        self.assertEqual(els[1]['class'], ['onep'])
1643        self.assertFalse(els[0].has_attr('class'))
1644
1645    def test_a_bunch_of_emptys(self):
1646        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
1647            self.assertEqual(len(self.soup.select(selector)), 0)
1648
1649    def test_multi_class_support(self):
1650        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
1651            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
1652            self.assertSelects(selector, ['pmulti'])
1653
1654    def test_multi_class_selection(self):
1655        for selector in ('.class1.class3', '.class3.class2',
1656                         '.class1.class2.class3'):
1657            self.assertSelects(selector, ['pmulti'])
1658
1659    def test_child_selector(self):
1660        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
1661        self.assertSelects('.s1 > a span', ['s1a2s1'])
1662
1663    def test_child_selector_id(self):
1664        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
1665
1666    def test_attribute_equals(self):
1667        self.assertSelectMultiple(
1668            ('p[class="onep"]', ['p1']),
1669            ('p[id="p1"]', ['p1']),
1670            ('[class="onep"]', ['p1']),
1671            ('[id="p1"]', ['p1']),
1672            ('link[rel="stylesheet"]', ['l1']),
1673            ('link[type="text/css"]', ['l1']),
1674            ('link[href="blah.css"]', ['l1']),
1675            ('link[href="no-blah.css"]', []),
1676            ('[rel="stylesheet"]', ['l1']),
1677            ('[type="text/css"]', ['l1']),
1678            ('[href="blah.css"]', ['l1']),
1679            ('[href="no-blah.css"]', []),
1680            ('p[href="no-blah.css"]', []),
1681            ('[href="no-blah.css"]', []),
1682        )
1683
1684    def test_attribute_tilde(self):
1685        self.assertSelectMultiple(
1686            ('p[class~="class1"]', ['pmulti']),
1687            ('p[class~="class2"]', ['pmulti']),
1688            ('p[class~="class3"]', ['pmulti']),
1689            ('[class~="class1"]', ['pmulti']),
1690            ('[class~="class2"]', ['pmulti']),
1691            ('[class~="class3"]', ['pmulti']),
1692            ('a[rel~="friend"]', ['bob']),
1693            ('a[rel~="met"]', ['bob']),
1694            ('[rel~="friend"]', ['bob']),
1695            ('[rel~="met"]', ['bob']),
1696        )
1697
1698    def test_attribute_startswith(self):
1699        self.assertSelectMultiple(
1700            ('[rel^="style"]', ['l1']),
1701            ('link[rel^="style"]', ['l1']),
1702            ('notlink[rel^="notstyle"]', []),
1703            ('[rel^="notstyle"]', []),
1704            ('link[rel^="notstyle"]', []),
1705            ('link[href^="bla"]', ['l1']),
1706            ('a[href^="http://"]', ['bob', 'me']),
1707            ('[href^="http://"]', ['bob', 'me']),
1708            ('[id^="p"]', ['pmulti', 'p1']),
1709            ('[id^="m"]', ['me', 'main']),
1710            ('div[id^="m"]', ['main']),
1711            ('a[id^="m"]', ['me']),
1712        )
1713
1714    def test_attribute_endswith(self):
1715        self.assertSelectMultiple(
1716            ('[href$=".css"]', ['l1']),
1717            ('link[href$=".css"]', ['l1']),
1718            ('link[id$="1"]', ['l1']),
1719            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
1720            ('div[id$="1"]', []),
1721            ('[id$="noending"]', []),
1722        )
1723
1724    def test_attribute_contains(self):
1725        self.assertSelectMultiple(
1726            # From test_attribute_startswith
1727            ('[rel*="style"]', ['l1']),
1728            ('link[rel*="style"]', ['l1']),
1729            ('notlink[rel*="notstyle"]', []),
1730            ('[rel*="notstyle"]', []),
1731            ('link[rel*="notstyle"]', []),
1732            ('link[href*="bla"]', ['l1']),
1733            ('a[href*="http://"]', ['bob', 'me']),
1734            ('[href*="http://"]', ['bob', 'me']),
1735            ('[id*="p"]', ['pmulti', 'p1']),
1736            ('div[id*="m"]', ['main']),
1737            ('a[id*="m"]', ['me']),
1738            # From test_attribute_endswith
1739            ('[href*=".css"]', ['l1']),
1740            ('link[href*=".css"]', ['l1']),
1741            ('link[id*="1"]', ['l1']),
1742            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
1743            ('div[id*="1"]', []),
1744            ('[id*="noending"]', []),
1745            # New for this test
1746            ('[href*="."]', ['bob', 'me', 'l1']),
1747            ('a[href*="."]', ['bob', 'me']),
1748            ('link[href*="."]', ['l1']),
1749            ('div[id*="n"]', ['main', 'inner']),
1750            ('div[id*="nn"]', ['inner']),
1751        )
1752
1753    def test_attribute_exact_or_hypen(self):
1754        self.assertSelectMultiple(
1755            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1756            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1757            ('p[lang|="fr"]', ['lang-fr']),
1758            ('p[lang|="gb"]', []),
1759        )
1760
1761    def test_attribute_exists(self):
1762        self.assertSelectMultiple(
1763            ('[rel]', ['l1', 'bob', 'me']),
1764            ('link[rel]', ['l1']),
1765            ('a[rel]', ['bob', 'me']),
1766            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
1767            ('p[class]', ['p1', 'pmulti']),
1768            ('[blah]', []),
1769            ('p[blah]', []),
1770        )
1771
1772    def test_nth_of_type(self):
1773        # Try to select first paragraph
1774        els = self.soup.select('div#inner p:nth-of-type(1)')
1775        self.assertEqual(len(els), 1)
1776        self.assertEqual(els[0].string, u'Some text')
1777
1778        # Try to select third paragraph
1779        els = self.soup.select('div#inner p:nth-of-type(3)')
1780        self.assertEqual(len(els), 1)
1781        self.assertEqual(els[0].string, u'Another')
1782
1783        # Try to select (non-existent!) fourth paragraph
1784        els = self.soup.select('div#inner p:nth-of-type(4)')
1785        self.assertEqual(len(els), 0)
1786
1787        # Pass in an invalid value.
1788        self.assertRaises(
1789            ValueError, self.soup.select, 'div p:nth-of-type(0)')
1790
1791    def test_nth_of_type_direct_descendant(self):
1792        els = self.soup.select('div#inner > p:nth-of-type(1)')
1793        self.assertEqual(len(els), 1)
1794        self.assertEqual(els[0].string, u'Some text')
1795
1796    def test_id_child_selector_nth_of_type(self):
1797        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
1798
1799    def test_select_on_element(self):
1800        # Other tests operate on the tree; this operates on an element
1801        # within the tree.
1802        inner = self.soup.find("div", id="main")
1803        selected = inner.select("div")
1804        # The <div id="inner"> tag was selected. The <div id="footer">
1805        # tag was not.
1806        self.assertSelectsIDs(selected, ['inner'])
1807
1808    def test_overspecified_child_id(self):
1809        self.assertSelects(".fancy #inner", ['inner'])
1810        self.assertSelects(".normal #inner", [])
1811
1812    def test_adjacent_sibling_selector(self):
1813        self.assertSelects('#p1 + h2', ['header2'])
1814        self.assertSelects('#p1 + h2 + p', ['pmulti'])
1815        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
1816        self.assertEqual([], self.soup.select('#p1 + p'))
1817
1818    def test_general_sibling_selector(self):
1819        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
1820        self.assertSelects('#p1 ~ #header2', ['header2'])
1821        self.assertSelects('#p1 ~ h2 + a', ['me'])
1822        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
1823        self.assertEqual([], self.soup.select('#inner ~ h2'))
1824
1825    def test_dangling_combinator(self):
1826        self.assertRaises(ValueError, self.soup.select, 'h1 >')
1827
1828    def test_sibling_combinator_wont_select_same_tag_twice(self):
1829        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
1830