1from test.test_support import verbose, run_unittest, import_module
2import re
3from re import Scanner
4import sys
5import string
6import traceback
7from weakref import proxy
8
9# Misc tests from Tim Peters' re.doc
10
11# WARNING: Don't change details in these tests if you don't know
12# what you're doing. Some of these tests were carefully modeled to
13# cover most of the code.
14
15import unittest
16
17class ReTests(unittest.TestCase):
18
19    def test_weakref(self):
20        s = 'QabbbcR'
21        x = re.compile('ab+c')
22        y = proxy(x)
23        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
24
25    def test_search_star_plus(self):
26        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
27        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
28        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
29        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
30        self.assertEqual(re.search('x', 'aaa'), None)
31        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
32        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
33        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
34        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
35        self.assertEqual(re.match('a+', 'xxx'), None)
36
37    def bump_num(self, matchobj):
38        int_value = int(matchobj.group(0))
39        return str(int_value + 1)
40
    def test_basic_re_sub(self):
        # Core re.sub() behaviour: inline flags, callable replacements, the
        # count argument, and escape handling in replacement templates.
        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        # bump_num is invoked for every match and returns the number plus one
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        # the fourth positional argument caps the number of substitutions
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')

        # a callable's return value is inserted verbatim, whereas a template
        # string has its escapes processed
        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        # numeric backreferences: expanded in a template, left alone when the
        # template is escaped or comes from a callable
        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        # \g<name> and \g<number> refer to the same group
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

        # known control escapes are translated by the template parser; the
        # other backslash-letter pairs are expected to pass through unchanged
        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        # control characters already resolved by Python are inserted as-is
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

        # a pattern that matches the empty string at the start still substitutes
        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
68
69    def test_bug_449964(self):
70        # fails for group followed by other escape
71        self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
72                         'xx\bxx\b')
73
74    def test_bug_449000(self):
75        # Test for sub() on escaped characters
76        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
77                         'abc\ndef\n')
78        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
79                         'abc\ndef\n')
80        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
81                         'abc\ndef\n')
82        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
83                         'abc\ndef\n')
84
    def test_bug_1140(self):
        # re.sub(x, y, u'') should return u'', not '', and
        # re.sub(x, y, '') should return '', not u''.
        # Also:
        # re.sub(x, y, unicode(x)) should return unicode(y), and
        # re.sub(x, y, str(x)) should return
        #     str(y) if isinstance(y, str) else unicode(y).
        # Every str/unicode combination of pattern (x) and replacement (y)
        # is exercised below.
        for x in 'x', u'x':
            for y in 'y', u'y':
                # empty unicode subject -> empty unicode result
                z = re.sub(x, y, u'')
                self.assertEqual(z, u'')
                self.assertEqual(type(z), unicode)
                # empty str subject -> empty str result
                z = re.sub(x, y, '')
                self.assertEqual(z, '')
                self.assertEqual(type(z), str)
                # unicode subject -> result is always unicode
                z = re.sub(x, y, unicode(x))
                self.assertEqual(z, y)
                self.assertEqual(type(z), unicode)
                # str subject -> result takes the type of the replacement
                z = re.sub(x, y, str(x))
                self.assertEqual(z, y)
                self.assertEqual(type(z), type(y))
109
110    def test_bug_1661(self):
111        # Verify that flags do not get silently ignored with compiled patterns
112        pattern = re.compile('.')
113        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
114        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
115        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
116        self.assertRaises(ValueError, re.compile, pattern, re.I)
117
118    def test_bug_3629(self):
119        # A regex that triggered a bug in the sre-code validator
120        re.compile("(?P<quote>)(?(quote))")
121
    def test_sub_template_numeric_escape(self):
        # bug 776311 and friends
        # Numeric escapes in a replacement template are either octal
        # character escapes or decimal group references.

        # a leading zero or three octal digits give an octal escape; a
        # non-octal digit ends the escape and is kept as a literal
        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

        # at most three octal digits are consumed; the fourth is a literal
        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

        # short escapes starting with zero
        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

        # octal values above 0377 keep only their low eight bits
        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

        # anything else is a group reference, and the pattern 'x' has no
        # groups, so each of these must raise
        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

        # in python2.3 (etc), these loop endlessly in sre_parser.py
        # (two-digit references to group 11, which exists in these patterns)
        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                         'xz8')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                         'xza')
163
164    def test_qualified_re_sub(self):
165        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
166        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
167
168    def test_bug_114660(self):
169        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
170                         'hello there')
171
    def test_bug_462270(self):
        # Test for empty sub() behaviour, see SF bug #462270
        # 'x*' also matches the empty string, so a '-' is expected at every
        # position that has no 'x', and the 'x' itself becomes a single '-'.
        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
        # 'x+' never matches empty, so only the literal 'x' is replaced
        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
176
    def test_symbolic_refs(self):
        # Malformed or unusable \g<...> references in a replacement
        # template must raise instead of being silently accepted.

        # unterminated or missing group name
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
        # names that are not valid identifiers or numbers
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
        # a well-formed name that the pattern does not define
        self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
        # references to a group that exists but did not take part in the match
        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
        # negative group numbers are rejected
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
187
188    def test_re_subn(self):
189        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
190        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
191        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
192        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
193        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
194
    def test_re_split(self):
        # re.split() with plain, repeating, capturing and non-capturing
        # separators; all cases split the same subject ":a:b::c".
        self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
        self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
        # a capturing group makes the separator text part of the result
        self.assertEqual(re.split("(:*)", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', '::', 'c'])
        # a non-capturing group does not
        self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
        # a repeated group reports only its last repetition
        self.assertEqual(re.split("(:)*", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', ':', 'c'])
        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                         ['', ':', 'a', ':b::', 'c'])
        # with two alternative groups, the one that did not match is None
        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c'])
        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                         ['', 'a', '', '', 'c'])
210
    def test_qualified_re_split(self):
        # The third positional argument limits the number of splits; the
        # unsplit remainder is returned as the final element.
        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
        self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
        # capturing separators are still included when a limit is given
        self.assertEqual(re.split("(:)", ":a:b::c", 2),
                         ['', ':', 'a', ':', 'b::c'])
        self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                         ['', ':', 'a', ':', 'b::c'])
218
219    def test_re_findall(self):
220        self.assertEqual(re.findall(":+", "abc"), [])
221        self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
222        self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
223        self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
224                                                               (":", ":"),
225                                                               (":", "::")])
226
227    def test_bug_117612(self):
228        self.assertEqual(re.findall(r"(a|(b))", "aba"),
229                         [("a", ""),("b", "b"),("a", "")])
230
231    def test_re_match(self):
232        self.assertEqual(re.match('a', 'a').groups(), ())
233        self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
234        self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
235        self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
236        self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
237
238        pat = re.compile('((a)|(b))(c)?')
239        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
240        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
241        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
242        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
243        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
244
245        # A single group
246        m = re.match('(a)', 'a')
247        self.assertEqual(m.group(0), 'a')
248        self.assertEqual(m.group(0), 'a')
249        self.assertEqual(m.group(1), 'a')
250        self.assertEqual(m.group(1, 1), ('a', 'a'))
251
252        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
253        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
254        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
255                         (None, 'b', None))
256        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
257
258    def test_re_groupref_exists(self):
259        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
260                         ('(', 'a'))
261        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
262                         (None, 'a'))
263        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
264        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
265        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
266                         ('a', 'b'))
267        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
268                         (None, 'd'))
269        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
270                         (None, 'd'))
271        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
272                         ('a', ''))
273
274        # Tests for bug #1177831: exercise groups other than the first group
275        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
276        self.assertEqual(p.match('abc').groups(),
277                         ('a', 'b', 'c'))
278        self.assertEqual(p.match('ad').groups(),
279                         ('a', None, 'd'))
280        self.assertEqual(p.match('abd'), None)
281        self.assertEqual(p.match('ac'), None)
282
283
284    def test_re_groupref(self):
285        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
286                         ('|', 'a'))
287        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
288                         (None, 'a'))
289        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
290        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
291        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
292                         ('a', 'a'))
293        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
294                         (None, None))
295
296    def test_groupdict(self):
297        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
298                                  'first second').groupdict(),
299                         {'first':'first', 'second':'second'})
300
301    def test_expand(self):
302        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
303                                  "first second")
304                                  .expand(r"\2 \1 \g<second> \g<first>"),
305                         "second first second first")
306
307    def test_repeat_minmax(self):
308        self.assertEqual(re.match("^(\w){1}$", "abc"), None)
309        self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
310        self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
311        self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
312
313        self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
314        self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
315        self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
316        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
317        self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
318        self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
319        self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
320        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
321
322        self.assertEqual(re.match("^x{1}$", "xxx"), None)
323        self.assertEqual(re.match("^x{1}?$", "xxx"), None)
324        self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
325        self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
326
327        self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
328        self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
329        self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
330        self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
331        self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
332        self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
333        self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
334        self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
335
336        self.assertEqual(re.match("^x{}$", "xxx"), None)
337        self.assertNotEqual(re.match("^x{}$", "x{}"), None)
338
339    def test_getattr(self):
340        self.assertEqual(re.match("(a)", "a").pos, 0)
341        self.assertEqual(re.match("(a)", "a").endpos, 1)
342        self.assertEqual(re.match("(a)", "a").string, "a")
343        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
344        self.assertNotEqual(re.match("(a)", "a").re, None)
345
    def test_special_escapes(self):
        # Word-boundary, anchor and character-category escapes, checked with
        # no flags, with LOCALE, with UNICODE, and on unicode subjects.

        # \b and \B with default flags
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx").group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd").group(1), "bx")
        # ... with LOCALE
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.LOCALE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.LOCALE).group(1), "bx")
        # ... with UNICODE
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.UNICODE).group(1), "bx")
        # ^ and $ are per-line under MULTILINE, while \A and \Z still anchor
        # to the whole string
        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
        # the same boundary and anchor checks on unicode subjects
        self.assertEqual(re.search(r"\b(b.)\b",
                                   u"abcd abc bcd bx").group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   u"abc bcd bc abxd").group(1), "bx")
        self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
        # digit / non-digit / word / non-word / space / non-space categories
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a").group(0), "1aa! a")
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.LOCALE).group(0), "1aa! a")
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
375
376    def test_bigcharset(self):
377        self.assertEqual(re.match(u"([\u2222\u2223])",
378                                  u"\u2222").group(1), u"\u2222")
379        self.assertEqual(re.match(u"([\u2222\u2223])",
380                                  u"\u2222", re.UNICODE).group(1), u"\u2222")
381
382    def test_anyall(self):
383        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
384                         "a\nb")
385        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
386                         "a\n\nb")
387
388    def test_non_consuming(self):
389        self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
390        self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
391        self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
392        self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
393        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
394        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
395        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
396
397        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
398        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
399        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
400        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
401
402    def test_ignore_case(self):
403        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
404        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
405        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
406        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
407        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
408        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
409        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
410        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
411        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
412        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
413
414    def test_category(self):
415        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
416
    def test_getlower(self):
        # _sre.getlower() must map 'A' to 'a' whichever flag value is
        # passed (none, LOCALE or UNICODE).
        import _sre
        self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
        self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
        self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

        # case-insensitive matching on both str and unicode subjects
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
425
426    def test_not_literal(self):
427        self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
428        self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
429
430    def test_search_coverage(self):
431        self.assertEqual(re.search("\s(b)", " b").group(1), "b")
432        self.assertEqual(re.search("a\s", "a ").group(0), "a ")
433
434    def assertMatch(self, pattern, text, match=None, span=None,
435                    matcher=re.match):
436        if match is None and span is None:
437            # the pattern matches the whole text
438            match = text
439            span = (0, len(text))
440        elif match is None or span is None:
441            raise ValueError('If match is not None, span should be specified '
442                             '(and vice versa).')
443        m = matcher(pattern, text)
444        self.assertTrue(m)
445        self.assertEqual(m.group(), match)
446        self.assertEqual(m.span(), span)
447
    def test_re_escape(self):
        # re.escape() on every unicode character with code point 0..255.
        alnum_chars = string.ascii_letters + string.digits
        p = u''.join(unichr(i) for i in range(256))
        for c in p:
            if c in alnum_chars:
                # ASCII letters and digits are returned unchanged
                self.assertEqual(re.escape(c), c)
            elif c == u'\x00':
                # NUL is written as an octal escape
                self.assertEqual(re.escape(c), u'\\000')
            else:
                # everything else gets a backslash prefix
                self.assertEqual(re.escape(c), u'\\' + c)
            # the escaped form must match the original character
            self.assertMatch(re.escape(c), c)
        # and the same holds for the whole string at once
        self.assertMatch(re.escape(p), p)
460
    def test_re_escape_byte(self):
        # Byte-string counterpart of test_re_escape: every value 0..255.
        alnum_chars = (string.ascii_letters + string.digits).encode('ascii')
        p = ''.join(chr(i) for i in range(256))
        for b in p:
            if b in alnum_chars:
                # ASCII letters and digits are returned unchanged
                self.assertEqual(re.escape(b), b)
            elif b == b'\x00':
                # NUL is written as an octal escape
                self.assertEqual(re.escape(b), b'\\000')
            else:
                # everything else gets a backslash prefix
                self.assertEqual(re.escape(b), b'\\' + b)
            # the escaped form must match the original byte
            self.assertMatch(re.escape(b), b)
        # and the same holds for the whole string at once
        self.assertMatch(re.escape(p), p)
473
    def test_re_escape_non_ascii(self):
        # Non-ASCII unicode characters are backslash-escaped, and the
        # escaped text still matches the original string.
        s = u'xxx\u2620\u2620\u2620xxx'
        s_escaped = re.escape(s)
        self.assertEqual(s_escaped, u'xxx\\\u2620\\\u2620\\\u2620xxx')
        self.assertMatch(s_escaped, s)
        # an escaped character can be embedded in a larger pattern; here the
        # search is expected to match s[2:7]
        self.assertMatch(u'.%s+.' % re.escape(u'\u2620'), s,
                         u'x\u2620\u2620\u2620x', (2, 7), re.search)
481
    def test_re_escape_non_ascii_bytes(self):
        # On UTF-8 encoded bytes, every non-alphanumeric byte of a
        # multi-byte sequence is escaped individually.
        b = u'y\u2620y\u2620y'.encode('utf-8')
        b_escaped = re.escape(b)
        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
        self.assertMatch(b_escaped, b)
        # the escaped encoding of U+2620 is found twice in the subject
        res = re.findall(re.escape(u'\u2620'.encode('utf-8')), b)
        self.assertEqual(len(res), 2)
489
    def test_pickling(self):
        # Compiled patterns must round-trip through both pickle
        # implementations (see pickle_test).
        import pickle
        self.pickle_test(pickle)
        import cPickle
        self.pickle_test(cPickle)
        # old pickles expect the _compile() reconstructor in sre module
        import_module("sre", deprecated=True)
        # the import is the check: it raises if sre._compile is missing
        from sre import _compile
498
499    def pickle_test(self, pickle):
500        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
501        s = pickle.dumps(oldpat)
502        newpat = pickle.loads(s)
503        self.assertEqual(oldpat, newpat)
504
505    def test_constants(self):
506        self.assertEqual(re.I, re.IGNORECASE)
507        self.assertEqual(re.L, re.LOCALE)
508        self.assertEqual(re.M, re.MULTILINE)
509        self.assertEqual(re.S, re.DOTALL)
510        self.assertEqual(re.X, re.VERBOSE)
511
    def test_flags(self):
        # Compiling the same pattern with each single flag must succeed
        # and return a pattern object.
        for flag in [re.I, re.M, re.X, re.S, re.L]:
            self.assertNotEqual(re.compile('^pattern$', flag), None)
515
516    def test_sre_character_literals(self):
517        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
518            self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
519            self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
520            self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
521            self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
522            self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
523            self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
524        self.assertRaises(re.error, re.match, "\911", "")
525
526    def test_sre_character_class_literals(self):
527        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
528            self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
529            self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
530            self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
531            self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
532            self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
533            self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
534        self.assertRaises(re.error, re.match, "[\911]", "")
535
536    def test_bug_113254(self):
537        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
538        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
539        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
540
541    def test_bug_527371(self):
542        # bug described in patches 527371/672491
543        self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
544        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
545        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
546        self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
547        self.assertEqual(re.match("((a))", "a").lastindex, 1)
548
549    def test_bug_545855(self):
550        # bug 545855 -- This pattern failed to cause a compile error as it
551        # should, instead provoking a TypeError.
552        self.assertRaises(re.error, re.compile, 'foo[a-')
553
554    def test_bug_418626(self):
555        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
556        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
557        # pattern '*?' on a long string.
558        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
559        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
560                         20003)
561        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
562        # non-simple '*?' still used to hit the recursion limit, before the
563        # non-recursive scheme was implemented.
564        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
565
566    def test_bug_612074(self):
567        pat=u"["+re.escape(u"\u2039")+u"]"
568        self.assertEqual(re.compile(pat) and 1, 1)
569
570    def test_stack_overflow(self):
571        # nasty cases that used to overflow the straightforward recursive
572        # implementation of repeated groups.
573        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
574        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
575        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
576
577    def test_scanner(self):
578        def s_ident(scanner, token): return token
579        def s_operator(scanner, token): return "op%s" % token
580        def s_float(scanner, token): return float(token)
581        def s_int(scanner, token): return int(token)
582
583        scanner = Scanner([
584            (r"[a-zA-Z_]\w*", s_ident),
585            (r"\d+\.\d*", s_float),
586            (r"\d+", s_int),
587            (r"=|\+|-|\*|/", s_operator),
588            (r"\s+", None),
589            ])
590
591        self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
592
593        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
594                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
595                           'op+', 'bar'], ''))
596
597    def test_bug_448951(self):
598        # bug 448951 (similar to 429357, but with single char match)
599        # (Also test greedy matches.)
600        for op in '','?','*':
601            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
602                             (None, None))
603            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
604                             ('a:', 'a'))
605
606    def test_bug_725106(self):
607        # capturing groups in alternatives in repeats
608        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
609                         ('b', 'a'))
610        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
611                         ('c', 'b'))
612        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
613                         ('b', None))
614        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
615                         ('b', None))
616        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
617                         ('b', 'a'))
618        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
619                         ('c', 'b'))
620        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
621                         ('b', None))
622        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
623                         ('b', None))
624
625    def test_bug_725149(self):
626        # mark_stack_base restoring before restoring marks
627        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
628                         ('a', None))
629        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
630                         ('a', None, None))
631
632    def test_bug_764548(self):
633        # bug 764548, re.compile() barfs on str/unicode subclasses
634        try:
635            unicode
636        except NameError:
637            return  # no problem if we have no unicode
638        class my_unicode(unicode): pass
639        pat = re.compile(my_unicode("abc"))
640        self.assertEqual(pat.match("xyz"), None)
641
642    def test_finditer(self):
643        iter = re.finditer(r":+", "a:b::c:::d")
644        self.assertEqual([item.group(0) for item in iter],
645                         [":", "::", ":::"])
646
647    def test_bug_926075(self):
648        try:
649            unicode
650        except NameError:
651            return # no problem if we have no unicode
652        self.assertTrue(re.compile('bug_926075') is not
653                     re.compile(eval("u'bug_926075'")))
654
655    def test_bug_931848(self):
656        try:
657            unicode
658        except NameError:
659            pass
660        pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"')
661        self.assertEqual(re.compile(pattern).split("a.b.c"),
662                         ['a','b','c'])
663
664    def test_bug_581080(self):
665        iter = re.finditer(r"\s", "a b")
666        self.assertEqual(iter.next().span(), (1,2))
667        self.assertRaises(StopIteration, iter.next)
668
669        scanner = re.compile(r"\s").scanner("a b")
670        self.assertEqual(scanner.search().span(), (1, 2))
671        self.assertEqual(scanner.search(), None)
672
673    def test_bug_817234(self):
674        iter = re.finditer(r".*", "asdf")
675        self.assertEqual(iter.next().span(), (0, 4))
676        self.assertEqual(iter.next().span(), (4, 4))
677        self.assertRaises(StopIteration, iter.next)
678
679    def test_bug_6561(self):
680        # '\d' should match characters in Unicode category 'Nd'
681        # (Number, Decimal Digit), but not those in 'Nl' (Number,
682        # Letter) or 'No' (Number, Other).
683        decimal_digits = [
684            u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
685            u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
686            u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
687            ]
688        for x in decimal_digits:
689            self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
690
691        not_decimal_digits = [
692            u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
693            u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
694            u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
695            u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
696            ]
697        for x in not_decimal_digits:
698            self.assertIsNone(re.match('^\d$', x, re.UNICODE))
699
700    def test_empty_array(self):
701        # SF buf 1647541
702        import array
703        for typecode in 'cbBuhHiIlLfd':
704            a = array.array(typecode)
705            self.assertEqual(re.compile("bla").match(a), None)
706            self.assertEqual(re.compile("").match(a).groups(), ())
707
708    def test_inline_flags(self):
709        # Bug #1700
710        upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
711        lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
712
713        p = re.compile(upper_char, re.I | re.U)
714        q = p.match(lower_char)
715        self.assertNotEqual(q, None)
716
717        p = re.compile(lower_char, re.I | re.U)
718        q = p.match(upper_char)
719        self.assertNotEqual(q, None)
720
721        p = re.compile('(?i)' + upper_char, re.U)
722        q = p.match(lower_char)
723        self.assertNotEqual(q, None)
724
725        p = re.compile('(?i)' + lower_char, re.U)
726        q = p.match(upper_char)
727        self.assertNotEqual(q, None)
728
729        p = re.compile('(?iu)' + upper_char)
730        q = p.match(lower_char)
731        self.assertNotEqual(q, None)
732
733        p = re.compile('(?iu)' + lower_char)
734        q = p.match(upper_char)
735        self.assertNotEqual(q, None)
736
737    def test_dollar_matches_twice(self):
738        "$ matches the end of string, and just before the terminating \n"
739        pattern = re.compile('$')
740        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
741        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
742        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
743
744        pattern = re.compile('$', re.MULTILINE)
745        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
746        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
747        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
748
749    def test_dealloc(self):
750        # issue 3299: check for segfault in debug build
751        import _sre
752        # the overflow limit is different on wide and narrow builds and it
753        # depends on the definition of SRE_CODE (see sre.h).
754        # 2**128 should be big enough to overflow on both. For smaller values
755        # a RuntimeError is raised instead of OverflowError.
756        long_overflow = 2**128
757        self.assertRaises(TypeError, re.finditer, "a", {})
758        self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
759
760def run_re_tests():
761    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
762    if verbose:
763        print 'Running re_tests test suite'
764    else:
765        # To save time, only run the first and last 10 tests
766        #tests = tests[:10] + tests[-10:]
767        pass
768
769    for t in tests:
770        sys.stdout.flush()
771        pattern = s = outcome = repl = expected = None
772        if len(t) == 5:
773            pattern, s, outcome, repl, expected = t
774        elif len(t) == 3:
775            pattern, s, outcome = t
776        else:
777            raise ValueError, ('Test tuples should have 3 or 5 fields', t)
778
779        try:
780            obj = re.compile(pattern)
781        except re.error:
782            if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
783            else:
784                print '=== Syntax error:', t
785        except KeyboardInterrupt: raise KeyboardInterrupt
786        except:
787            print '*** Unexpected error ***', t
788            if verbose:
789                traceback.print_exc(file=sys.stdout)
790        else:
791            try:
792                result = obj.search(s)
793            except re.error, msg:
794                print '=== Unexpected exception', t, repr(msg)
795            if outcome == SYNTAX_ERROR:
796                # This should have been a syntax error; forget it.
797                pass
798            elif outcome == FAIL:
799                if result is None: pass   # No match, as expected
800                else: print '=== Succeeded incorrectly', t
801            elif outcome == SUCCEED:
802                if result is not None:
803                    # Matched, as expected, so now we compute the
804                    # result string and compare it to our expected result.
805                    start, end = result.span(0)
806                    vardict={'found': result.group(0),
807                             'groups': result.group(),
808                             'flags': result.re.flags}
809                    for i in range(1, 100):
810                        try:
811                            gi = result.group(i)
812                            # Special hack because else the string concat fails:
813                            if gi is None:
814                                gi = "None"
815                        except IndexError:
816                            gi = "Error"
817                        vardict['g%d' % i] = gi
818                    for i in result.re.groupindex.keys():
819                        try:
820                            gi = result.group(i)
821                            if gi is None:
822                                gi = "None"
823                        except IndexError:
824                            gi = "Error"
825                        vardict[i] = gi
826                    repl = eval(repl, vardict)
827                    if repl != expected:
828                        print '=== grouping error', t,
829                        print repr(repl) + ' should be ' + repr(expected)
830                else:
831                    print '=== Failed incorrectly', t
832
833                # Try the match on a unicode string, and check that it
834                # still succeeds.
835                try:
836                    result = obj.search(unicode(s, "latin-1"))
837                    if result is None:
838                        print '=== Fails on unicode match', t
839                except NameError:
840                    continue # 1.5.2
841                except TypeError:
842                    continue # unicode test case
843
844                # Try the match on a unicode pattern, and check that it
845                # still succeeds.
846                obj=re.compile(unicode(pattern, "latin-1"))
847                result = obj.search(s)
848                if result is None:
849                    print '=== Fails on unicode pattern match', t
850
851                # Try the match with the search area limited to the extent
852                # of the match and see if it still succeeds.  \B will
853                # break (because it won't match at the end or start of a
854                # string), so we'll ignore patterns that feature it.
855
856                if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
857                               and result is not None:
858                    obj = re.compile(pattern)
859                    result = obj.search(s, result.start(0), result.end(0) + 1)
860                    if result is None:
861                        print '=== Failed on range-limited match', t
862
863                # Try the match with IGNORECASE enabled, and check that it
864                # still succeeds.
865                obj = re.compile(pattern, re.IGNORECASE)
866                result = obj.search(s)
867                if result is None:
868                    print '=== Fails on case-insensitive match', t
869
870                # Try the match with LOCALE enabled, and check that it
871                # still succeeds.
872                obj = re.compile(pattern, re.LOCALE)
873                result = obj.search(s)
874                if result is None:
875                    print '=== Fails on locale-sensitive match', t
876
877                # Try the match with UNICODE locale enabled, and check
878                # that it still succeeds.
879                obj = re.compile(pattern, re.UNICODE)
880                result = obj.search(s)
881                if result is None:
882                    print '=== Fails on unicode-sensitive match', t
883
def test_main():
    """Run the ReTests unit tests, then the table-driven re_tests cases."""
    run_unittest(ReTests)
    run_re_tests()
887
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()
890